Unverified commit 56cd3407 authored by Aurelius84, committed by GitHub

[Phi] Migrate Adam and AdamW into Phi (#40351)

* [Phi] Migrate Adam and AdamW into Phi

* fix compile error and unittest ok

* fix compile error and unittest ok

* fix undefined reference to fLI::FLAGS

* test depend on operator

* fix cmake

* fix xpu compile

* fix infrt

* fix amp_type_traits

* fix amp_type_traits

* modify according to reviewer comments

* modify according to reviewer comments

* fix dtype float16

* fix typo

* fix Cmake

* fix code style
Parent aeae81a7
......@@ -113,7 +113,7 @@ function(kernel_library TARGET)
# used for cc_library selected_rows dir target
set(target_suffix "")
if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows_kernel")
if ("${kernel_library_SUB_DIR}" STREQUAL "selected_rows")
set(target_suffix "_sr")
endif()
......
......@@ -58,8 +58,6 @@ class DenseTensor;
DECLARE_bool(benchmark);
DECLARE_bool(check_nan_inf);
DECLARE_bool(enable_unused_var_check);
PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
"number of threads for inner op");
DECLARE_bool(run_kp_kernel);
namespace paddle {
......
......@@ -294,30 +294,30 @@ template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
// add or mul.
namespace scatter {
template <typename T>
template <typename T, typename DeviceContext>
typename std::enable_if<!std::is_integral<T>::value>::type elementwise_add_to(
phi::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
const T* in, T* out) {
phi::funcs::BlasT<DeviceContext, T>* blas, size_t data_len, const T* in,
T* out) {
blas->AXPY(data_len, T(1.f), in, out);
}
template <typename T>
template <typename T, typename DeviceContext>
typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
phi::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
const T* in, T* out) {
phi::funcs::BlasT<DeviceContext, T>* blas, size_t data_len, const T* in,
T* out) {
for (size_t i = 0; i < data_len; i++) {
out[i] += in[i];
}
}
template <typename T>
template <typename T, typename DeviceContext>
typename std::enable_if<std::is_same<T, platform::bfloat16>::value>::type
add_sparse_inputs(const std::vector<const phi::SelectedRows*>& inputs,
const std::unordered_map<int64_t, size_t>& rows_to_id,
int64_t input_width,
const platform::CPUDeviceContext& context, T* out_data) {
int64_t input_width, const DeviceContext& context,
T* out_data) {
#ifndef PADDLE_WITH_MKLDNN
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
#endif
for (auto* input : inputs) {
if (input->rows().size() == 0) {
......@@ -336,22 +336,22 @@ add_sparse_inputs(const std::vector<const phi::SelectedRows*>& inputs,
#else
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id.at(input_rows[i]);
elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
&input_data[i * input_width],
&out_data[out_i * input_width]);
elementwise_add_to<T, DeviceContext>(
&blas, static_cast<size_t>(input_width), &input_data[i * input_width],
&out_data[out_i * input_width]);
}
#endif
}
}
template <typename T>
template <typename T, typename DeviceContext>
typename std::enable_if<!std::is_same<T, platform::bfloat16>::value>::type
add_sparse_inputs(const std::vector<const phi::SelectedRows*>& inputs,
const std::unordered_map<int64_t, size_t>& rows_to_id,
int64_t input_width,
const platform::CPUDeviceContext& context, T* out_data) {
int64_t input_width, const DeviceContext& context,
T* out_data) {
VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name();
auto blas = phi::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
for (auto* input : inputs) {
if (input->rows().size() == 0) {
continue;
......@@ -361,16 +361,16 @@ add_sparse_inputs(const std::vector<const phi::SelectedRows*>& inputs,
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id.at(input_rows[i]);
elementwise_add_to<T>(&blas, static_cast<size_t>(input_width),
&input_data[i * input_width],
&out_data[out_i * input_width]);
elementwise_add_to<T, DeviceContext>(
&blas, static_cast<size_t>(input_width), &input_data[i * input_width],
&out_data[out_i * input_width]);
}
}
}
template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> {
phi::SelectedRows operator()(const platform::CPUDeviceContext& context,
template <typename DeviceContext, typename T>
struct MergeAddImpl {
phi::SelectedRows operator()(const DeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result = false) {
phi::SelectedRows out;
......@@ -378,15 +378,14 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
return out;
}
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result = false) {
void operator()(const DeviceContext& context, const phi::SelectedRows& input,
phi::SelectedRows* output, const bool sorted_result = false) {
std::vector<const phi::SelectedRows*> inputs;
inputs.push_back(&input);
(*this)(context, inputs, output, sorted_result);
}
void operator()(const platform::CPUDeviceContext& context,
void operator()(const DeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result = false) {
if (inputs.size() == 0) {
......@@ -461,7 +460,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
out.set_rows(merge_rows);
phi::funcs::SetConstant<platform::CPUDeviceContext, T> constant_functor;
phi::funcs::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0.f));
std::unordered_map<int64_t, size_t> rows_to_id;
......@@ -469,11 +468,75 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
rows_to_id[merge_rows[i]] = i;
}
add_sparse_inputs<T>(inputs, rows_to_id, input_width, context, out_data);
add_sparse_inputs<T, DeviceContext>(inputs, rows_to_id, input_width,
context, out_data);
}
}
};
template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CPUDeviceContext, T>()(context, input,
sorted_result);
}
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(context, input, output,
sorted_result);
}
void operator()(const platform::CPUDeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(context, inputs, output,
sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::CPUContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const phi::CPUContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<phi::CPUContext, T>()(context, input, sorted_result);
}
void operator()(const phi::CPUContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<phi::CPUContext, T>()(context, input, output, sorted_result);
}
void operator()(const phi::CPUContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result) {
MergeAddImpl<phi::CPUContext, T>()(context, inputs, output, sorted_result);
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \
template struct MergeAddImpl<platform::CPUDeviceContext, dtype>; \
template struct MergeAddImpl<phi::CPUContext, dtype>; \
template struct MergeAdd<platform::CPUDeviceContext, dtype>; \
template struct MergeAdd<phi::CPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(double)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(int)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(int64_t)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::bfloat16)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex<float>)
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(platform::complex<double>)
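The refactor above keeps the shared logic in a single MergeAddImpl<DeviceContext, T> and turns MergeAdd<platform::CPUDeviceContext, T> and MergeAdd<phi::CPUContext, T> into thin wrappers that forward to it, so the legacy fluid context and the new phi context get identical behavior. A minimal, framework-free sketch of that wrapper pattern (all names below are illustrative stand-ins, not Paddle APIs):

#include <iostream>
#include <vector>

// Illustrative stand-ins for the two CPU context types.
struct LegacyCPUContext {};
struct PhiCPUContext {};

// Shared, device-parameterized implementation (plays the role of MergeAddImpl).
template <typename DeviceContext, typename T>
struct MergeAddImplSketch {
  T operator()(const DeviceContext& /*ctx*/, const std::vector<T>& dup_rows) const {
    T merged = T(0);
    for (const T& v : dup_rows) merged += v;  // "merge by adding duplicated rows"
    return merged;
  }
};

// Thin per-context wrappers that only forward (play the role of MergeAdd).
template <typename T>
struct MergeAddSketch {
  T operator()(const LegacyCPUContext& ctx, const std::vector<T>& rows) const {
    return MergeAddImplSketch<LegacyCPUContext, T>()(ctx, rows);
  }
  T operator()(const PhiCPUContext& ctx, const std::vector<T>& rows) const {
    return MergeAddImplSketch<PhiCPUContext, T>()(ctx, rows);
  }
};

int main() {
  PhiCPUContext ctx;
  std::cout << MergeAddSketch<float>()(ctx, {1.f, 2.f, 3.f}) << "\n";  // prints 6
  return 0;
}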
#ifdef PADDLE_WITH_XPU
template <typename T>
struct MergeAdd<platform::XPUDeviceContext, T> {
......@@ -714,17 +777,6 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
}
};
template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex<float>>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex<double>>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::bfloat16>;
#ifdef PADDLE_WITH_XPU
template struct MergeAdd<platform::XPUDeviceContext, float>;
#endif
......
......@@ -319,9 +319,9 @@ __global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
}
}
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
template <typename DeviceContext, typename T>
struct MergeAddImpl {
phi::SelectedRows operator()(const DeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result = false) {
phi::SelectedRows out;
......@@ -329,9 +329,8 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
return out;
}
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result = false) {
void operator()(const DeviceContext& context, const phi::SelectedRows& input,
phi::SelectedRows* output, const bool sorted_result = false) {
framework::Vector<int64_t> input_rows(input.rows());
if (input_rows.size() == 0) {
return;
......@@ -350,7 +349,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> constant_functor;
phi::funcs::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0));
auto* out_data = out.mutable_value()->data<T>();
......@@ -369,7 +368,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
mix_vector_out.CopyToCPU();
}
void operator()(const platform::CUDADeviceContext& context,
void operator()(const DeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result = false) {
if (inputs.size() == 0) {
......@@ -414,7 +413,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
phi::make_ddim({static_cast<int64_t>(merge_rows.size()), input_width}),
context.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> constant_functor;
phi::funcs::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0));
auto* out_data = out.mutable_value()->data<T>();
......@@ -441,15 +440,69 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
}
};
template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::bfloat16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex<float>>;
template struct MergeAdd<platform::CUDADeviceContext,
platform::complex<double>>;
template <typename T>
struct MergeAdd<platform::CUDADeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CUDADeviceContext, T>()(context, input,
sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(context, input, output,
sorted_result);
}
void operator()(const platform::CUDADeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result) {
MergeAddImpl<platform::CUDADeviceContext, T>()(context, inputs, output,
sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::GPUContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const phi::GPUContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<phi::GPUContext, T>()(context, input, sorted_result);
}
void operator()(const phi::GPUContext& context,
const phi::SelectedRows& input, phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<phi::GPUContext, T>()(context, input, output, sorted_result);
}
void operator()(const phi::GPUContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output, const bool sorted_result) {
MergeAddImpl<phi::GPUContext, T>()(context, inputs, output, sorted_result);
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD(dtype) \
template struct MergeAddImpl<platform::CUDADeviceContext, dtype>; \
template struct MergeAddImpl<phi::GPUContext, dtype>; \
template struct MergeAdd<platform::CUDADeviceContext, dtype>; \
template struct MergeAdd<phi::GPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD(float)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(double)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(int)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(int64_t)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::float16)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::bfloat16)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::complex<float>)
TEMPLATE_SPECIALIZED_FOR_MERGEADD(platform::complex<double>)
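The GPU side mirrors the CPU refactor: MergeAddImpl holds the CUDA logic and MergeAdd is specialized for both platform::CUDADeviceContext and phi::GPUContext. Call sites such as the sparse Adam/AdamW kernels later in this diff invoke the functor as below; a hedged sketch that assumes the surrounding Paddle headers and a live device context, not a standalone program:

#include "paddle/fluid/operators/math/selected_rows_functor.h"

namespace scatter = paddle::operators::math::scatter;

// Coalesce duplicated rows of a sparse gradient by adding them; the final
// argument requests sorted output rows, matching the call in the sparse
// Adam kernel further down in this diff.
void MergeDuplicatedRows(const phi::GPUContext& dev_ctx,
                         const phi::SelectedRows& grad,
                         phi::SelectedRows* merged) {
  scatter::MergeAdd<phi::GPUContext, float> merge_func;
  merge_func(dev_ctx, grad, merged, /*sorted_result=*/true);
}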
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
......@@ -12,125 +12,41 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/optimizers/adamw_op.h"
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/infermeta/multiary.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
void AdamOp::InferShape(framework::InferShapeContext *ctx) const {
PADDLE_ENFORCE_EQ(
ctx->HasInput("Param"), true,
platform::errors::NotFound("Input(Param) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Grad"), true,
platform::errors::NotFound("Input(Grad) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment1"), true,
platform::errors::NotFound(
"Input(Moment1) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Moment2"), true,
platform::errors::NotFound(
"Input(Moment2) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
platform::errors::NotFound(
"Input(LearningRate) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta1Pow"), true,
platform::errors::NotFound(
"Input(Beta1Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Beta2Pow"), true,
platform::errors::NotFound(
"Input(Beta2Pow) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
platform::errors::NotFound(
"Output(ParamOut) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment1Out"), true,
platform::errors::NotFound(
"Output(Moment1Out) of AdamOp should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput("Moment2Out"), true,
platform::errors::NotFound(
"Output(Moment2Out) of AdamOp should not be null."));
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_NE(
phi::product(lr_dims), 0,
platform::errors::InvalidArgument(
"The number of LearningRate shall not be 0, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
phi::product(lr_dims)));
PADDLE_ENFORCE_EQ(
phi::product(lr_dims), 1,
platform::errors::InvalidArgument(
"Learning rate should have 1 dimension, but received %d",
phi::product(lr_dims)));
auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]";
PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta1 power accumulator should be greater "
"than 0, but received %d.",
phi::product(beta1_pow_dims)));
auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]";
PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims), 1,
platform::errors::InvalidArgument(
"The size of Beta2 power accumulator should be greater "
"than 0, but received %d.",
phi::product(beta2_pow_dims)));
class AdamOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
auto param_dims = ctx->GetInputDim("Param");
if (ctx->GetInputsVarType("Grad")[0] ==
framework::proto::VarType::LOD_TENSOR) {
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
platform::errors::InvalidArgument(
"Param and Grad input of AdamOp should have same dimension. But "
"received Param dims: [%s], Grad dims: [%s].",
param_dims, ctx->GetInputDim("Grad")));
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type =
OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment1"),
platform::errors::InvalidArgument(
"Param and Moment1 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims, ctx->GetInputDim("Moment1")));
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Moment2"),
platform::errors::InvalidArgument(
"Param and Moment2 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims, ctx->GetInputDim("Moment2")));
ctx->SetOutputDim("ParamOut", param_dims);
ctx->SetOutputDim("Moment1Out", param_dims);
ctx->SetOutputDim("Moment2Out", param_dims);
ctx->SetOutputDim("Beta1PowOut", beta1_pow_dims);
ctx->SetOutputDim("Beta2PowOut", beta2_pow_dims);
}
framework::OpKernelType AdamOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Param");
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType AdamOp::GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (var_name == "Beta1Pow" || var_name == "Beta2Pow" ||
var_name == "SkipUpdate") {
return expected_kernel_type;
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
}
};
class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
public:
......@@ -232,6 +148,10 @@ $$
}
};
class AdamWOp : public AdamOp {
using AdamOp::AdamOp;
};
class AdamWOpMaker : public AdamOpMaker {
public:
void Make() {
......@@ -255,13 +175,23 @@ class AdamWOpMaker : public AdamOpMaker {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
REGISTER_OP_WITHOUT_GRADIENT(adamw, ops::AdamWOp, ops::AdamWOpMaker);
REGISTER_OP_CPU_KERNEL(
adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
DECLARE_INFER_SHAPE_FUNCTOR(adam, AdamInferMetaFunctor,
PD_INFER_META(phi::AdamInferMeta));
REGISTER_OPERATOR(
adam, ops::AdamOp, ops::AdamOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamInferMetaFunctor);
DECLARE_INFER_SHAPE_FUNCTOR(adamw, AdamwInferMetaFunctor,
PD_INFER_META(phi::AdamwInferMeta));
REGISTER_OPERATOR(
adamw, ops::AdamWOp, ops::AdamWOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
AdamwInferMetaFunctor);
REGISTER_OP_VERSION(adam)
.AddCheckpoint(
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T, typename MT>
__global__ void AdamKernelREG(MT beta1, MT beta2, MT epsilon, MT beta1_pow_,
MT beta2_pow_, const MT* moment1, MT* moment1_out,
const MT* moment2, MT* moment2_out, const MT* lr_,
const T* grad, const T* param, T* param_out,
const MT* master_param, MT* master_param_out,
int ndim) {
MT lr = *lr_;
MT beta1_pow = beta1_pow_;
MT beta2_pow = beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
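For reference, the per-element update computed by AdamKernelREG above and AdamKernelMEM below (REG takes the beta-power scalars by value from CPU memory, MEM dereferences device pointers) is the standard bias-corrected Adam step:

  m_t      = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
  v_t      = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
  \theta_t = \theta_{t-1} - \frac{lr}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t}/\sqrt{1 - \beta_2^t} + \epsilon}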
template <typename T, typename MT>
__global__ void AdamKernelMEM(MT beta1, MT beta2, MT epsilon,
const MT* beta1_pow_, const MT* beta2_pow_,
const MT* moment1, MT* moment1_out,
const MT* moment2, MT* moment2_out, const MT* lr_,
const T* grad, const T* param, T* param_out,
const MT* master_param, MT* master_param_out,
int ndim) {
MT lr = *lr_;
MT beta1_pow = *beta1_pow_;
MT beta2_pow = *beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
template <typename T>
__global__ void UpdateBetaPow(T beta1, T beta2, const T* beta1_pow_,
const T* beta2_pow_, T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
template <typename T, typename MT>
__global__ void SparseAdamCUDAKernelREG(
MT beta1, MT beta2, MT epsilon, const MT beta1_pow, const MT beta2_pow,
const MT* mom1_, MT* mom1_out_, const MT* mom2_, MT* mom2_out_,
const MT* lr_, const T* grad_, const T* param_, T* param_out_,
const MT* master_param, MT* master_param_out, const int64_t* rows_,
int64_t row_numel, int64_t row_count, bool lazy_mode, int ndim) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
MT lr = *lr_;
for (; id < ndim; id += blockDim.x * gridDim.x) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_count, id / row_numel);
if (lazy_mode && row_idx < 0) {
return;
} else {
MT mom1 = mom1_[id];
MT mom2 = mom2_[id];
MT p = master_param ? master_param[id] : static_cast<MT>(param_[id]);
MT g = row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
: static_cast<MT>(0);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom =
(sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
// Write back to global memory
mom1_out_[id] = mom1;
mom2_out_[id] = mom2;
param_out_[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
}
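SparseAdamCUDAKernelREG maps each flattened element id to a (row, column) pair and looks the row up in the merged gradient's sorted rows; absent rows contribute a zero gradient, or are skipped entirely in lazy_mode. A self-contained sketch of that lookup, with std::lower_bound standing in for phi::funcs::BinarySearch:

#include <algorithm>
#include <cstdint>

// Return the position of `row` in the sorted `rows` array, or -1 if absent.
int64_t FindRow(const int64_t* rows, int64_t row_count, int64_t row) {
  const int64_t* end = rows + row_count;
  const int64_t* it = std::lower_bound(rows, end, row);
  return (it != end && *it == row) ? static_cast<int64_t>(it - rows) : -1;
}

// For a flattened element id of the dense parameter:
//   row     = id / row_numel;   // which parameter row
//   column  = id % row_numel;   // offset inside that row
//   row_idx = FindRow(rows, row_count, row);
//   g       = (row_idx >= 0) ? grad[row_idx * row_numel + column] : 0;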
template <typename T>
class AdamOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
using paddle::framework::LoDTensor;
using MPDType = typename details::MPTypeTrait<T>::Type;
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
auto* mom1 = ctx.Input<LoDTensor>("Moment1");
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update_tensor,
ctx.device_context(), &skip_update_vec);
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "Adam skip update";
framework::TensorCopy(
*param, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), param_out);
framework::TensorCopy(
*mom1, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom1_out);
framework::TensorCopy(
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
framework::TensorCopy(
*beta1_pow, beta1_pow->place(),
ctx.template device_context<platform::DeviceContext>(),
beta1_pow_out);
framework::TensorCopy(
*beta2_pow, beta2_pow->place(),
ctx.template device_context<platform::DeviceContext>(),
beta2_pow_out);
return;
}
MPDType beta1 = static_cast<MPDType>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
beta1 = static_cast<MPDType>(GetAttrFromTensor(beta1_tensor));
}
MPDType beta2 = static_cast<MPDType>(ctx.Attr<float>("beta2"));
if (ctx.HasInput("Beta2Tensor")) {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
beta2 = static_cast<MPDType>(GetAttrFromTensor(beta2_tensor));
}
MPDType epsilon = static_cast<MPDType>(ctx.Attr<float>("epsilon"));
if (ctx.HasInput("EpsilonTensor")) {
auto* epsilon_tensor = ctx.Input<framework::Tensor>("EpsilonTensor");
PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(EpsilonTensor) size must be 1, but get %d",
epsilon_tensor->numel()));
epsilon = static_cast<MPDType>(GetAttrFromTensor(epsilon_tensor));
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const bool multi_precision = ctx.Attr<bool>("multi_precision");
const LoDTensor* master_param = nullptr;
LoDTensor* master_param_out = nullptr;
if (multi_precision) {
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(has_master, true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
master_param = ctx.Input<LoDTensor>("MasterParam");
master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
}
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision
? master_param_out->mutable_data<MPDType>(ctx.GetPlace())
: nullptr;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if (grad_var->IsType<framework::LoDTensor>()) {
auto* grad = ctx.Input<LoDTensor>("Grad");
// update param and moment
int threads = 512;
int blocks = (param->numel() + threads - 1) / threads;
if (beta1_pow->place() == platform::CPUPlace() &&
beta2_pow->place() == platform::CPUPlace()) {
// Compute with betapow in REG
AdamKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, *beta1_pow->data<MPDType>(),
*beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
if (!use_global_beta_pow) {
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
if (!use_global_beta_pow) {
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
auto* grad = ctx.Input<phi::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
phi::SelectedRows tmp_grad_merge;
const phi::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = grad;
} else {
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
merge_func(ctx.template device_context<platform::CUDADeviceContext>(),
*grad, &tmp_grad_merge, true);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (beta1_pow->place() == platform::CPUPlace() &&
beta2_pow->place() == platform::CPUPlace()) {
int threads = 512;
int ndim = param->numel();
int blocks = (ndim + threads - 1) / threads;
SparseAdamCUDAKernelREG<
T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, *beta1_pow->data<MPDType>(),
*beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad_data, param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode, ndim);
if (!use_global_beta_pow) {
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
SparseAdamFunctor<T, GPUAdam, MPDType> functor(
beta1, beta2, epsilon, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad_data, param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode);
// FIXME(minqiyang): remove BinarySearch in GPU later
platform::ForRange<platform::CUDADeviceContext> for_range(
static_cast<const platform::CUDADeviceContext&>(
ctx.device_context()),
param->numel());
for_range(functor);
if (!use_global_beta_pow) {
// update beta1 and beta2
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable type not supported by adam_op"));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(adam, ops::AdamOpCUDAKernel<float>,
ops::AdamOpCUDAKernel<double>,
ops::AdamOpCUDAKernel<plat::float16>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,9 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <paddle/fluid/operators/optimizers/adamw_op.h>
#pragma once
namespace ops = paddle::operators;
REGISTER_OP_CPU_KERNEL(
adamw, ops::AdamWOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::AdamWOpKernel<paddle::platform::CPUDeviceContext, double>);
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
namespace scatter = paddle::operators::math::scatter;
static inline float GetAttrFromTensor(const framework::Tensor* tensor) {
const float* tensor_data = tensor->data<float>();
framework::Tensor cpu_tensor;
if (platform::is_gpu_place(tensor->place())) {
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
&cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
if (platform::is_xpu_place(tensor->place())) {
paddle::framework::TensorCopySync(*tensor, platform::CPUPlace(),
&cpu_tensor);
tensor_data = cpu_tensor.data<float>();
}
return tensor_data[0];
}
} // namespace operators
} // namespace paddle
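The GetAttrFromTensor helper above lets beta1/beta2/epsilon be supplied as one-element tensors on any place, copying the scalar to CPU first when it lives on GPU or XPU. The kernels in this diff consume it roughly as follows; a hedged sketch assuming an ExecutionContext `ctx` inside an OpKernel::Compute body:

// Attribute value is used unless a runtime tensor overrides it.
float beta1 = ctx.Attr<float>("beta1");
if (ctx.HasInput("Beta1Tensor")) {
  const framework::Tensor* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
  // GetAttrFromTensor copies the one-element tensor to CPU if needed
  // and returns its value.
  beta1 = GetAttrFromTensor(beta1_tensor);
}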
......@@ -15,8 +15,8 @@ limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
namespace paddle {
......
......@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "gflags/gflags.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/adam_op_functor.h"
namespace paddle {
namespace operators {
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/adamw_op.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
template <typename T, typename MT>
__global__ void AdamWKernelREG(MT beta1, MT beta2, MT epsilon, MT coeff,
MT lr_ratio, MT beta1_pow_, MT beta2_pow_,
const MT* moment1, MT* moment1_out,
const MT* moment2, MT* moment2_out,
const MT* lr_, const T* grad, const T* param,
T* param_out, const MT* master_param,
MT* master_param_out, int ndim) {
MT lr = *lr_ * lr_ratio;
MT beta1_pow = beta1_pow_;
MT beta2_pow = beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
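AdamWKernelREG and AdamWKernelMEM differ from the plain Adam kernels only by the decoupled weight-decay step applied before the Adam update, with the effective learning rate scaled by lr_ratio and the decay coefficient coeff (written \lambda below):

  \theta'  = (1 - lr \cdot lr\_ratio \cdot \lambda)\, \theta_{t-1}
  m_t      = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t, \qquad v_t = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
  \theta_t = \theta' - \frac{lr \cdot lr\_ratio}{1 - \beta_1^t} \cdot \frac{m_t}{\sqrt{v_t}/\sqrt{1 - \beta_2^t} + \epsilon}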
template <typename T, typename MT>
__global__ void AdamWKernelMEM(
MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT* beta1_pow_,
const MT* beta2_pow_, const MT* moment1, MT* moment1_out, const MT* moment2,
MT* moment2_out, const MT* lr_, const T* grad, const T* param, T* param_out,
const MT* master_param, MT* master_param_out, int ndim) {
MT lr = *lr_ * lr_ratio;
MT beta1_pow = *beta1_pow_;
MT beta2_pow = *beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
template <typename T>
__global__ void UpdateAdamWBetaPow(T beta1, T beta2, const T* beta1_pow_,
const T* beta2_pow_, T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
template <typename T, typename MT>
__global__ void SparseAdamWCUDAKernelREG(
MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio, const MT beta1_pow,
const MT beta2_pow, const MT* mom1_, MT* mom1_out_, const MT* mom2_,
MT* mom2_out_, const MT* lr_, const T* grad_, const T* param_,
T* param_out_, const MT* master_param, MT* master_param_out,
const int64_t* rows_, int64_t row_numel, int64_t row_count, bool lazy_mode,
int ndim) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
MT lr = *lr_ * lr_ratio;
for (; id < ndim; id += blockDim.x * gridDim.x) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_count, id / row_numel);
if (lazy_mode && row_idx < 0) {
return;
} else {
MT mom1 = static_cast<MT>(mom1_[id]);
MT mom2 = static_cast<MT>(mom2_[id]);
MT p = master_param ? master_param[id] : static_cast<MT>(param_[id]);
MT g = row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
: static_cast<MT>(0);
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom =
(sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
// Write back to global memory
mom1_out_[id] = mom1;
mom2_out_[id] = mom2;
param_out_[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
}
template <typename T>
class AdamWOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
using paddle::framework::LoDTensor;
using MPDType = typename details::MPTypeTrait<T>::Type;
int64_t min_row_size_to_use_multithread =
ctx.Attr<int64_t>("min_row_size_to_use_multithread");
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
bool use_global_beta_pow = ctx.Attr<bool>("use_global_beta_pow");
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
MPDType coeff = static_cast<MPDType>(ctx.Attr<float>("coeff"));
MPDType lr_ratio = static_cast<MPDType>(ctx.Attr<float>("lr_ratio"));
auto* param = ctx.Input<LoDTensor>("Param");
auto* grad_var = ctx.InputVar("Grad");
auto* mom1 = ctx.Input<LoDTensor>("Moment1");
auto* mom2 = ctx.Input<LoDTensor>("Moment2");
auto* lr = ctx.Input<LoDTensor>("LearningRate");
auto* beta1_pow = ctx.Input<LoDTensor>("Beta1Pow");
auto* beta2_pow = ctx.Input<LoDTensor>("Beta2Pow");
auto* param_out = ctx.Output<LoDTensor>("ParamOut");
auto* mom1_out = ctx.Output<LoDTensor>("Moment1Out");
auto* mom2_out = ctx.Output<LoDTensor>("Moment2Out");
auto* beta1_pow_out = ctx.Output<LoDTensor>("Beta1PowOut");
auto* beta2_pow_out = ctx.Output<LoDTensor>("Beta2PowOut");
bool skip_update = false;
if (ctx.HasInput("SkipUpdate")) {
auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update_tensor,
ctx.device_context(), &skip_update_vec);
skip_update = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update) {
VLOG(4) << "Adamw skip update";
framework::TensorCopy(
*param, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), param_out);
framework::TensorCopy(
*mom1, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom1_out);
framework::TensorCopy(
*mom2, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), mom2_out);
framework::TensorCopy(
*beta1_pow, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
beta1_pow_out);
framework::TensorCopy(
*beta2_pow, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(),
beta2_pow_out);
return;
}
// if with_decay = false, coeff = 0
bool with_decay = ctx.Attr<bool>("with_decay");
if (!with_decay) {
coeff = static_cast<float>(0.0);
}
MPDType beta1 = static_cast<MPDType>(ctx.Attr<float>("beta1"));
if (ctx.HasInput("Beta1Tensor")) {
auto* beta1_tensor = ctx.Input<framework::Tensor>("Beta1Tensor");
PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta1Tensor) size must be 1, but get %d",
beta1_tensor->numel()));
beta1 = static_cast<MPDType>(GetAttrFromTensor(beta1_tensor));
}
MPDType beta2 = static_cast<MPDType>(ctx.Attr<float>("beta2"));
if (ctx.HasInput("Beta2Tensor")) {
auto* beta2_tensor = ctx.Input<framework::Tensor>("Beta2Tensor");
PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(Beta2Tensor) size must be 1, but get %d",
beta2_tensor->numel()));
beta2 = static_cast<MPDType>(GetAttrFromTensor(beta2_tensor));
}
MPDType epsilon = static_cast<MPDType>(ctx.Attr<float>("epsilon"));
if (ctx.HasInput("EpsilonTensor")) {
auto* epsilon_tensor = ctx.Input<framework::Tensor>("EpsilonTensor");
PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(EpsilonTensor) size must be 1, but get %d",
epsilon_tensor->numel()));
epsilon = static_cast<MPDType>(GetAttrFromTensor(epsilon_tensor));
}
VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel()
<< "beta2_pow.numel() : " << beta2_pow->numel();
VLOG(3) << "param.numel(): " << param->numel();
PADDLE_ENFORCE_EQ(beta1_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(beta2_pow_out->numel(), 1,
platform::errors::InvalidArgument(
"beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const bool multi_precision = ctx.Attr<bool>("multi_precision");
const LoDTensor* master_param = nullptr;
LoDTensor* master_param_out = nullptr;
if (multi_precision) {
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(has_master, true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
master_param = ctx.Input<LoDTensor>("MasterParam");
master_param_out = ctx.Output<LoDTensor>("MasterParamOut");
}
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision
? master_param_out->mutable_data<MPDType>(ctx.GetPlace())
: nullptr;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
if (grad_var->IsType<framework::LoDTensor>()) {
auto* grad = ctx.Input<LoDTensor>("Grad");
// update param and moment
int threads = 512;
int blocks = (param->numel() + threads - 1) / threads;
if (beta1_pow->place() == platform::CPUPlace() &&
beta2_pow->place() == platform::CPUPlace()) {
// Compute with betapow in REG
AdamWKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, coeff, lr_ratio, *beta1_pow->data<MPDType>(),
*beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
if (!use_global_beta_pow) {
// Cpu update
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
AdamWKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad->data<T>(), param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, param->numel());
if (!use_global_beta_pow) {
// Update with gpu
UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else if (grad_var->IsType<phi::SelectedRows>()) {
auto* grad = ctx.Input<phi::SelectedRows>("Grad");
if (grad->rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad->rows().begin(), grad->rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
phi::SelectedRows tmp_grad_merge;
const phi::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = grad;
} else {
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
merge_func(ctx.template device_context<platform::CUDADeviceContext>(),
*grad, &tmp_grad_merge, true);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(
grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (beta1_pow->place() == platform::CPUPlace() &&
beta2_pow->place() == platform::CPUPlace()) {
int threads = 512;
int ndim = param->numel();
int blocks = (ndim + threads - 1) / threads;
SparseAdamWCUDAKernelREG<
T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1, beta2, epsilon, coeff, lr_ratio, *beta1_pow->data<MPDType>(),
*beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad_data, param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode, ndim);
if (!use_global_beta_pow) {
// Update with cpu
beta1_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta1 * beta1_pow->data<MPDType>()[0];
beta2_pow_out->mutable_data<MPDType>(platform::CPUPlace())[0] =
beta2 * beta2_pow->data<MPDType>()[0];
}
} else {
SparseAdamWFunctor<T, GPUAdamW, MPDType> functor(
beta1, beta2, epsilon, coeff, lr_ratio, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(), mom1->data<MPDType>(),
mom1_out->mutable_data<MPDType>(ctx.GetPlace()),
mom2->data<MPDType>(),
mom2_out->mutable_data<MPDType>(ctx.GetPlace()),
lr->data<MPDType>(), grad_data, param->data<T>(),
param_out->mutable_data<T>(ctx.GetPlace()), master_in_data,
master_out_data, rows, row_numel, grad_merge.rows().size(),
lazy_mode);
// FIXME(minqiyang): remove BinarySearch in GPU later
platform::ForRange<platform::CUDADeviceContext> for_range(
static_cast<const platform::CUDADeviceContext&>(
ctx.device_context()),
param->numel());
for_range(functor);
if (!use_global_beta_pow) {
// update beta1 and beta2
UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1, beta2, beta1_pow->data<MPDType>(),
beta2_pow->data<MPDType>(),
beta1_pow_out->mutable_data<MPDType>(ctx.GetPlace()),
beta2_pow_out->mutable_data<MPDType>(ctx.GetPlace()));
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Variable type not supported by adamw_op"));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(adamw, ops::AdamWOpCUDAKernel<float>,
ops::AdamWOpCUDAKernel<double>,
ops::AdamWOpCUDAKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <paddle/fluid/operators/optimizers/adam_op.h>
namespace paddle {
namespace operators {
class AdamWOp : public AdamOp {
using AdamOp::AdamOp;
};
struct GPUAdamW;
struct CPUAdamW;
template <typename T, typename Flavour>
class AdamWFunctor;
template <typename T>
class AdamWFunctor<T, CPUAdamW> {
private:
const T coeff_;
const T lr_ratio_;
const T* lr_;
T* param_;
public:
AdamWFunctor(const T coeff, const T lr_ratio, const T* lr, T* param)
: coeff_(coeff), lr_ratio_(lr_ratio), lr_(lr), param_(param) {}
inline HOSTDEVICE void operator()(size_t numel) const {
Eigen::Map<Eigen::Array<T, 1, Eigen::Dynamic>> param{
param_, static_cast<Eigen::Index>(numel)};
T lr = *lr_;
// Calculation
param -= lr * lr_ratio_ * coeff_ * param;
}
};
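The CPU AdamW functor only performs the decay step, in place over the whole parameter array via an Eigen map; AdamWOpKernel below then delegates the actual Adam update to AdamOpKernel. A standalone sketch of the same in-place decay, assuming Eigen is available (plain float buffers here, not Paddle tensors):

#include <cstddef>
#include <Eigen/Core>

// In-place decoupled weight decay: param -= lr * lr_ratio * coeff * param.
void ApplyDecoupledDecay(float* param, std::size_t numel,
                         float lr, float lr_ratio, float coeff) {
  Eigen::Map<Eigen::Array<float, 1, Eigen::Dynamic>> p(
      param, static_cast<Eigen::Index>(numel));
  p -= lr * lr_ratio * coeff * p;
}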
template <typename T, typename Flavour, typename MT = T>
class SparseAdamWFunctor;
template <typename T, typename MT>
class SparseAdamWFunctor<T, GPUAdamW, MT> {
private:
MT beta1_;
MT beta2_;
MT epsilon_;
MT coeff_;
MT lr_ratio_;
const MT* beta1_pow_;
const MT* beta2_pow_;
const MT* moment1_;
MT* moment1_out_;
const MT* moment2_;
MT* moment2_out_;
const MT* lr_;
const T* grad_;
const T* param_;
T* param_out_;
const MT* master_param_;
MT* master_param_out_;
const int64_t* rows_;
int64_t row_numel_;
int64_t row_count_;
bool lazy_mode_;
public:
SparseAdamWFunctor(MT beta1, MT beta2, MT epsilon, MT coeff, MT lr_ratio,
const MT* beta1_pow, const MT* beta2_pow, const MT* mom1,
MT* mom1_out, const MT* mom2, MT* mom2_out, const MT* lr,
const T* grad, const T* param, T* param_out,
const MT* master_param, MT* master_param_out,
const int64_t* rows, int64_t row_numel, int64_t row_count,
bool lazy_mode)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
coeff_(coeff),
lr_ratio_(lr_ratio),
beta1_pow_(beta1_pow),
beta2_pow_(beta2_pow),
moment1_(mom1),
moment1_out_(mom1_out),
moment2_(mom2),
moment2_out_(mom2_out),
lr_(lr),
grad_(grad),
param_(param),
param_out_(param_out),
master_param_(master_param),
master_param_out_(master_param_out),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count),
lazy_mode_(lazy_mode) {}
inline HOSTDEVICE void adamw_update(size_t i, MT g) const {
// The following code is the same as dense
MT mom1 = moment1_[i];
MT mom2 = moment2_[i];
MT lr = *lr_ * lr_ratio_;
MT lr_orig = lr;
MT beta1_pow = *beta1_pow_;
MT beta2_pow = *beta2_pow_;
MT p = master_param_ ? master_param_[i] : static_cast<MT>(param_[i]);
// Calculation
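    // Bias correction is folded into the step size: lr is scaled by
    // sqrt(1 - beta2^t) / (1 - beta1^t) and epsilon by sqrt(1 - beta2^t).
    // The decoupled decay below uses the unscaled lr_orig.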
lr *= sqrt(static_cast<MT>(1.0) - beta2_pow) /
(static_cast<MT>(1.0) - beta1_pow);
mom1 = beta1_ * mom1 + (static_cast<MT>(1.0) - beta1_) * g;
mom2 = beta2_ * mom2 + (static_cast<MT>(1.0) - beta2_) * g * g;
p -= lr_orig * coeff_ * p;
p -= lr * (mom1 / (sqrt(mom2) +
epsilon_ * sqrt(static_cast<MT>(1.0) - beta2_pow)));
// Write back to global memory
moment1_out_[i] = mom1;
moment2_out_[i] = mom2;
param_out_[i] = static_cast<T>(p);
if (master_param_out_) {
master_param_out_[i] = p;
}
}
inline HOSTDEVICE void operator()(size_t i) const {
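    // Map the dense element index to a row of the sparse gradient. Rows that
    // are absent get a zero gradient, unless lazy_mode skips them entirely.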
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
if (lazy_mode_ && row_idx < 0) {
return;
} else {
MT g = row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel_ + i % row_numel_])
: static_cast<MT>(0);
adamw_update(i, g);
}
}
};
template <typename DeviceContext, typename T>
class AdamWOpKernel : public AdamOpKernel<DeviceContext, T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
using paddle::framework::LoDTensor;
bool skip_update = false;
// TODO(liupeng):
if (ctx.HasInput("SkipUpdate")) {
VLOG(3) << "Has SkipUpdate";
auto* skip_update_tensor = ctx.Input<framework::Tensor>("SkipUpdate");
PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), 1,
platform::errors::InvalidArgument(
"Input(SkipUpdate) size must be 1, but get %d",
skip_update_tensor->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update_tensor,
ctx.device_context(), &skip_update_vec);
skip_update = skip_update_vec[0];
}
VLOG(3) << "Skip update" << skip_update;
bool with_decay = ctx.Attr<bool>("with_decay");
if (skip_update || !with_decay) {
AdamOpKernel<DeviceContext, T>::Compute(ctx);
return;
}
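    // CPU AdamW: apply the decoupled weight decay in place first, then fall
    // through to the ordinary Adam update below.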
T coeff = static_cast<T>(ctx.Attr<float>("coeff"));
T lr_ratio = static_cast<T>(ctx.Attr<float>("lr_ratio"));
auto* lr = ctx.Input<LoDTensor>("LearningRate");
LoDTensor* param;
if (ctx.HasInput("MasterParam")) {
// TODO(liupeng): master
param = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("MasterParam"));
} else {
param = const_cast<LoDTensor*>(ctx.Input<LoDTensor>("Param"));
}
AdamWFunctor<T, CPUAdamW> functor(coeff, lr_ratio, lr->data<T>(),
param->data<T>());
functor(param->numel());
AdamOpKernel<DeviceContext, T>::Compute(ctx);
}
};
} // namespace operators
} // namespace paddle
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "gflags/gflags.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/adam_op_functor.h"
namespace paddle {
namespace operators {
......
......@@ -11,7 +11,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/adam_op.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace paddle {
namespace operators {
......@@ -82,7 +83,7 @@ class MergedAdamOpKernel : public framework::OpKernel<T> {
size_t param_num = param.size();
for (size_t idx = 0; idx < param_num; idx++) {
AdamFunctor<T, CPUAdam> functor(
phi::funcs::AdamFunctor<T, phi::funcs::CPUAdam> functor(
beta1, beta2, epsilon, beta1_pow[idx]->data<T>(),
beta2_pow[idx]->data<T>(), mom1[idx]->data<T>(),
mom1_out[idx]->mutable_data<T>(ctx.GetPlace()), mom2[idx]->data<T>(),
......
......@@ -32,6 +32,9 @@ ExportedFlagInfoMap *GetMutableExportedFlagInfoMap() {
} // namespace platform
} // namespace paddle
PADDLE_DEFINE_EXPORTED_int32(inner_op_parallelism, 0,
"number of threads for inner op");
/**
* NOTE(paddle-dev): This file is designed to define all public FLAGS.
*/
......
......@@ -66,6 +66,88 @@ void AdadeltaInferMeta(const MetaTensor& param,
avg_squared_update_out->set_dtype(avg_squared_update.dtype());
}
void AdamInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& learning_rate,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1_pow,
const MetaTensor& beta2_pow,
paddle::optional<const MetaTensor&> master_param,
paddle::optional<const MetaTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
MetaTensor* param_out,
MetaTensor* moment1_out,
MetaTensor* moment2_out,
MetaTensor* beta1_pow_out,
MetaTensor* beta2_pow_out,
MetaTensor* master_param_outs) {
auto lr_dims = learning_rate.dims();
PADDLE_ENFORCE_EQ(
phi::product(lr_dims),
1,
errors::InvalidArgument(
"The number of LearningRate shall be 1, but received %d. Maybe "
"the Input variable LearningRate has not "
"been initialized. You may need to confirm "
"if you put exe.run(startup_program) "
"after optimizer.minimize function.",
phi::product(lr_dims)));
auto beta1_pow_dims = beta1_pow.dims();
VLOG(3) << "dims of Beta1Pow : [" << beta1_pow_dims << "]";
PADDLE_ENFORCE_GE(phi::product(beta1_pow_dims),
1,
errors::InvalidArgument(
"The size of Beta1 power accumulator should be greater "
"than 0, but received %d.",
phi::product(beta1_pow_dims)));
auto beta2_pow_dims = beta2_pow.dims();
VLOG(3) << "dims of Beta2Pow : [" << beta2_pow_dims << "]";
PADDLE_ENFORCE_GE(phi::product(beta2_pow_dims),
1,
errors::InvalidArgument(
"The size of Beta2 power accumulator should be greater "
"than 0, but received %d.",
phi::product(beta2_pow_dims)));
auto param_dims = param.dims();
PADDLE_ENFORCE_EQ(
param_dims,
moment1.dims(),
errors::InvalidArgument(
"Param and Moment1 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment1 dims: [%s].",
param_dims,
moment1.dims()));
PADDLE_ENFORCE_EQ(
param_dims,
moment2.dims(),
errors::InvalidArgument(
"Param and Moment2 input of AdamOp should have same dimension. But "
"received Param dims: [%s], Moment2 dims: [%s].",
param_dims,
moment2.dims()));
param_out->set_dims(param_dims);
param_out->set_dtype(param.dtype());
moment1_out->set_dims(param_dims);
moment1_out->set_dtype(moment1.dtype());
moment2_out->set_dims(param_dims);
moment2_out->set_dtype(moment2.dtype());
beta1_pow_out->set_dims(beta1_pow_dims);
beta1_pow_out->set_dtype(beta1_pow.dtype());
beta2_pow_out->set_dims(beta2_pow_dims);
beta2_pow_out->set_dtype(beta2_pow.dtype());
}
void AdamaxInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& learning_rate,
......@@ -122,6 +204,55 @@ void AdamaxInferMeta(const MetaTensor& param,
inf_norm_out->set_dtype(inf_norm.dtype());
}
void AdamwInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& learning_rate,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1_pow,
const MetaTensor& beta2_pow,
paddle::optional<const MetaTensor&> master_param,
paddle::optional<const MetaTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
MetaTensor* param_out,
MetaTensor* moment1_out,
MetaTensor* moment2_out,
MetaTensor* beta1_pow_out,
MetaTensor* beta2_pow_out,
MetaTensor* master_param_outs) {
AdamInferMeta(param,
grad,
learning_rate,
moment1,
moment2,
beta1_pow,
beta2_pow,
master_param,
skip_update,
beta1,
beta2,
epsilon,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
param_out,
moment1_out,
moment2_out,
beta1_pow_out,
beta2_pow_out,
master_param_outs);
}
void AucInferMeta(const MetaTensor& input,
const MetaTensor& label,
const MetaTensor& stat_pos,
......
......@@ -60,6 +60,55 @@ void AdamaxInferMeta(const MetaTensor& param,
MetaTensor* moment_out,
MetaTensor* inf_norm_out);
void AdamInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& learning_rate,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1_pow,
const MetaTensor& beta2_pow,
paddle::optional<const MetaTensor&> master_param,
paddle::optional<const MetaTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
MetaTensor* param_out,
MetaTensor* moment1_out,
MetaTensor* moment2_out,
MetaTensor* beta1_pow_out,
MetaTensor* beta2_pow_out,
MetaTensor* master_param_outs);
void AdamwInferMeta(const MetaTensor& param,
const MetaTensor& grad,
const MetaTensor& learning_rate,
const MetaTensor& moment1,
const MetaTensor& moment2,
const MetaTensor& beta1_pow,
const MetaTensor& beta2_pow,
paddle::optional<const MetaTensor&> master_param,
paddle::optional<const MetaTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
MetaTensor* param_out,
MetaTensor* moment1_out,
MetaTensor* moment2_out,
MetaTensor* beta1_pow_out,
MetaTensor* beta2_pow_out,
MetaTensor* master_param_outs);
void AucInferMeta(const MetaTensor& input,
const MetaTensor& label,
const MetaTensor& stat_pos,
......
......@@ -1811,6 +1811,7 @@ void TileInferMeta(const MetaTensor& x,
if (out_shape[0] == x_dims[0]) {
out->share_lod(x);
}
out->set_dtype(x.dtype());
}
void TopKInferMeta(const MetaTensor& x,
......
......@@ -27,12 +27,14 @@ kernel_library(full_kernel DEPS ${COMMON_KERNEL_DEPS} empty_kernel)
# Some kernels depend on some targets that are not commonly used.
# These targets are not suitable for common dependencies.
# In this case, you need to manually generate them here.
set(MANUAL_BUILD_KERNELS deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel gumbel_softmax_kernel gumbel_softmax_grad_kernel
hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
set(MANUAL_BUILD_KERNELS adam_kernel adamw_kernel deformable_conv_kernel deformable_conv_grad_kernel eigh_kernel
gumbel_softmax_kernel gumbel_softmax_grad_kernel hierarchical_sigmoid_kernel hierarchical_sigmoid_grad_kernel
matrix_power_kernel matrix_power_grad_kernel maxout_kernel maxout_grad_kernel pool_kernel
put_along_axis_kernel put_along_axis_grad_kernel segment_pool_kernel segment_pool_grad_kernel
softmax_kernel softmax_grad_kernel take_along_axis_kernel take_along_axis_grad_kernel
triangular_solve_grad_kernel determinant_grad_kernel reduce_kernel)
kernel_library(adam_kernel DEPS gflags glog flags ${COMMON_KERNEL_DEPS} selected_rows_functor threadpool jit_kernel_helper)
kernel_library(adamw_kernel DEPS ${COMMON_KERNEL_DEPS} adam_kernel)
kernel_library(deformable_conv_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
kernel_library(deformable_conv_grad_kernel DEPS ${COMMON_KERNEL_DEPS} deformable_conv_functor)
kernel_library(eigh_kernel DEPS ${COMMON_KERNEL_DEPS} lapack_function)
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void AdamDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void AdamwDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adam_kernel.h"
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
DECLARE_int32(inner_op_parallelism);
namespace phi {
template <typename T, typename Context>
void AdamDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update_) {
VLOG(4) << "Adam skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
T beta1_ = beta1.to<T>();
T beta2_ = beta2.to<T>();
T epsilon_ = epsilon.to<T>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel();
VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
T beta1_p = beta1_pow.data<T>()[0];
T beta2_p = beta2_pow.data<T>()[0];
if (!use_global_beta_pow) {
dev_ctx.template Alloc<T>(beta1_pow_out)[0] = beta1_ * beta1_p;
dev_ctx.template Alloc<T>(beta2_pow_out)[0] = beta2_ * beta2_p;
}
T* param_out_ptr = dev_ctx.template Alloc<T>(param_out);
T* mom1_out_ptr = dev_ctx.template Alloc<T>(moment1_out);
T* mom2_out_ptr = dev_ctx.template Alloc<T>(moment2_out);
T learning_rate_ =
learning_rate.data<T>()[0] * (sqrt(1 - beta2_p) / (1 - beta1_p));
T eps = epsilon_ * sqrt(1 - beta2_p);
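  // The bias corrections are folded into learning_rate_ and eps above, so the
  // JIT Adam kernel below can apply the plain (uncorrected) update form.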
paddle::operators::jit::adam_attr_t attr(beta1_, beta2_);
int64_t numel = param.numel();
const T* param_ptr = param.data<T>();
const T* mom1_ptr = moment1.data<T>();
const T* mom2_ptr = moment2.data<T>();
const T* grad_ptr = grad.data<T>();
auto adam =
paddle::operators::jit::KernelFuncs<paddle::operators::jit::AdamTuple<T>,
phi::CPUPlace>::Cache()
.At(attr);
static constexpr int64_t chunk_size = 512;
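  // Process the parameter in chunks of 512 elements; with MKLML the chunk
  // loop runs under OpenMP, and the tail shorter than one chunk is handled
  // separately after the loop.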
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for
#endif
for (int64_t i = 0; i < numel / chunk_size; ++i) {
const int64_t offset = i * chunk_size;
adam(beta1_,
beta2_,
-learning_rate_,
eps,
chunk_size,
grad_ptr + offset,
mom1_ptr + offset,
mom2_ptr + offset,
param_ptr + offset,
mom1_out_ptr + offset,
mom2_out_ptr + offset,
param_out_ptr + offset);
}
if (numel % chunk_size != 0) {
const int64_t offset = (numel / chunk_size) * chunk_size;
const int64_t tail_numel = numel % chunk_size;
adam(beta1_,
beta2_,
-learning_rate_,
eps,
tail_numel,
grad_ptr + offset,
mom1_ptr + offset,
mom2_ptr + offset,
param_ptr + offset,
mom1_out_ptr + offset,
mom2_out_ptr + offset,
param_out_ptr + offset);
}
}
} // namespace phi
PD_REGISTER_KERNEL(adam, CPU, ALL_LAYOUT, phi::AdamDenseKernel, float, double) {
}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adamw_kernel.h"
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/adam_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace phi {
template <typename T, typename Context>
void AdamwDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
VLOG(3) << "Skip update" << skip_update_;
if (skip_update_ || !with_decay) {
AdamDenseKernel<T, Context>(dev_ctx,
param,
grad,
learning_rate,
moment1,
moment2,
beta1_pow,
beta2_pow,
master_param,
skip_update,
beta1,
beta2,
epsilon,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
param_out,
moment1_out,
moment2_out,
beta1_pow_out,
beta2_pow_out,
master_param_outs);
return;
}
auto* param_ =
master_param.is_initialized() ? master_param.get_ptr() : &param;
T coeff_ = static_cast<T>(coeff);
T lr_ratio_ = static_cast<T>(lr_ratio);
funcs::AdamWFunctor<T, funcs::CPUAdamW> functor(
coeff_,
lr_ratio_,
learning_rate.data<T>(),
const_cast<T*>(param_->data<T>()));
functor(param_->numel());
AdamDenseKernel<T, Context>(dev_ctx,
param,
grad,
learning_rate,
moment1,
moment2,
beta1_pow,
beta2_pow,
master_param,
skip_update,
beta1,
beta2,
epsilon,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
param_out,
moment1_out,
moment2_out,
beta1_pow_out,
beta2_pow_out,
master_param_outs);
}
} // namespace phi
PD_REGISTER_KERNEL(
adamw, CPU, ALL_LAYOUT, phi::AdamwDenseKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adam_kernel.h"
#include <math.h> // for sqrt in CPU and CUDA
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
template <typename T, typename MT>
__global__ void AdamKernelREG(MT beta1,
MT beta2,
MT epsilon,
MT beta1_pow_,
MT beta2_pow_,
const MT* moment1,
MT* moment1_out,
const MT* moment2,
MT* moment2_out,
const MT* lr_,
const T* grad,
const T* param,
T* param_out,
const MT* master_param,
MT* master_param_out,
int ndim) {
MT lr = *lr_;
MT beta1_pow = beta1_pow_;
MT beta2_pow = beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
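  // Grid-stride loop: each thread strides over the flattened tensor so a
  // single launch covers tensors larger than the grid.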
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
template <typename T, typename MT>
__global__ void AdamKernelMEM(MT beta1,
MT beta2,
MT epsilon,
const MT* beta1_pow_,
const MT* beta2_pow_,
const MT* moment1,
MT* moment1_out,
const MT* moment2,
MT* moment2_out,
const MT* lr_,
const T* grad,
const T* param,
T* param_out,
const MT* master_param,
MT* master_param_out,
int ndim) {
MT lr = *lr_;
MT beta1_pow = *beta1_pow_;
MT beta2_pow = *beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
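// Tiny kernel that advances the scalar beta1^t / beta2^t accumulators on the
// device when they live in GPU memory and use_global_beta_pow is false.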
template <typename T>
__global__ void UpdateBetaPow(T beta1,
T beta2,
const T* beta1_pow_,
const T* beta2_pow_,
T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
template <typename T, typename Context>
void AdamDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update_) {
VLOG(4) << "Adam skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
MPDType beta1_ = beta1.to<MPDType>();
MPDType beta2_ = beta2.to<MPDType>();
MPDType epsilon_ = epsilon.to<MPDType>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
<< "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
// update param and moment
int threads = 512;
int blocks = (param.numel() + threads - 1) / threads;
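  // Dispatch on where the beta_pow accumulators live: if they are on the CPU,
  // their scalar values are passed to the kernel by value (REG) and advanced
  // on the host; otherwise the kernel reads them through device pointers
  // (MEM) and they are advanced by a small GPU kernel, unless
  // use_global_beta_pow is set.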
if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
// Compute with betapow in REG
AdamKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
*beta1_pow.data<MPDType>(),
*beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad.data<T>(),
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
param.numel());
if (!use_global_beta_pow) {
// Cpu update
dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
beta1_ * beta1_pow.data<MPDType>()[0];
dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
beta2_ * beta2_pow.data<MPDType>()[0];
}
} else {
AdamKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad.data<T>(),
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
param.numel());
if (!use_global_beta_pow) {
// Update with gpu
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(beta1_pow_out),
dev_ctx.template Alloc<MPDType>(beta2_pow_out));
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(adam,
GPU,
ALL_LAYOUT,
phi::AdamDenseKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/adamw_kernel.h"
#include <math.h> // for sqrt in CPU and CUDA
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
template <typename T, typename MT>
__global__ void AdamWKernelREG(MT beta1,
MT beta2,
MT epsilon,
MT coeff,
MT lr_ratio,
MT beta1_pow_,
MT beta2_pow_,
const MT* moment1,
MT* moment1_out,
const MT* moment2,
MT* moment2_out,
const MT* lr_,
const T* grad,
const T* param,
T* param_out,
const MT* master_param,
MT* master_param_out,
int ndim) {
MT lr = *lr_ * lr_ratio;
MT beta1_pow = beta1_pow_;
MT beta2_pow = beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
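    // Decoupled weight decay: shrink the parameter by lr * coeff before the
    // Adam moment update.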
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
template <typename T, typename MT>
__global__ void AdamWKernelMEM(MT beta1,
MT beta2,
MT epsilon,
MT coeff,
MT lr_ratio,
const MT* beta1_pow_,
const MT* beta2_pow_,
const MT* moment1,
MT* moment1_out,
const MT* moment2,
MT* moment2_out,
const MT* lr_,
const T* grad,
const T* param,
T* param_out,
const MT* master_param,
MT* master_param_out,
int ndim) {
MT lr = *lr_ * lr_ratio;
MT beta1_pow = *beta1_pow_;
MT beta2_pow = *beta2_pow_;
int id = blockIdx.x * blockDim.x + threadIdx.x;
for (; id < ndim; id += gridDim.x * blockDim.x) {
MT p = master_param ? master_param[id] : static_cast<MT>(param[id]);
MT g = static_cast<MT>(grad[id]);
MT mom1 = static_cast<MT>(moment1[id]);
MT mom2 = static_cast<MT>(moment2[id]);
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom = (sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
moment1_out[id] = mom1;
moment2_out[id] = mom2;
param_out[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
template <typename T>
__global__ void UpdateAdamWBetaPow(T beta1,
T beta2,
const T* beta1_pow_,
const T* beta2_pow_,
T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
template <typename T, typename Context>
void AdamwDenseKernel(const Context& dev_ctx,
const DenseTensor& param,
const DenseTensor& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
MPDType coeff_ = static_cast<MPDType>(coeff);
MPDType lr_ratio_ = static_cast<MPDType>(lr_ratio);
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update_) {
VLOG(4) << "Adamw skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
// if with_decay = false, coeff = 0
if (!with_decay) {
coeff_ = static_cast<MPDType>(0.0);
}
MPDType beta1_ = beta1.to<MPDType>();
MPDType beta2_ = beta2.to<MPDType>();
MPDType epsilon_ = epsilon.to<MPDType>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
<< "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
// update param and moment
int threads = 512;
int blocks = (param.numel() + threads - 1) / threads;
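  // Same CPU/GPU beta_pow dispatch as the dense Adam kernel: scalar values
  // are passed by value when the accumulators live on the CPU, device
  // pointers otherwise.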
if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
// Compute with betapow in REG
AdamWKernelREG<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
coeff_,
lr_ratio_,
*beta1_pow.data<MPDType>(),
*beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad.data<T>(),
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
param.numel());
if (!use_global_beta_pow) {
// Cpu update
dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
beta1_ * beta1_pow.data<MPDType>()[0];
dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
beta2_ * beta2_pow.data<MPDType>()[0];
}
} else {
AdamWKernelMEM<T, MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
coeff_,
lr_ratio_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad.data<T>(),
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
param.numel());
if (!use_global_beta_pow) {
// Update with gpu
UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(beta1_pow_out),
dev_ctx.template Alloc<MPDType>(beta2_pow_out));
}
}
}
} // namespace phi
PD_REGISTER_KERNEL(adamw,
GPU,
ALL_LAYOUT,
phi::AdamwDenseKernel,
float,
double,
phi::dtype::float16) {}
......@@ -16,7 +16,7 @@
#include <type_traits>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/tile_grad_kernel.h"
......@@ -90,8 +90,7 @@ void TileGradKernel(const Context& dev_ctx,
if (just_copy) {
dev_ctx.template Alloc<T>(x_grad);
paddle::framework::TensorCopy(
out_grad, dev_ctx.GetPlace(), dev_ctx, x_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
// TensorCopy may change the dims of dx
x_grad->Resize(x_dims);
} else {
......
set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows_kernel")
set(SELECTED_ROWS_KERNEL_DEPS dense_tensor selected_rows selected_rows_functor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils math_function custom_kernel)
register_kernels(DEPS ${SELECTED_ROWS_KERNEL_DEPS} SUB_DIR "selected_rows")
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AdamDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs);
} // namespace sr
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/selected_rows.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AdamwDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs);
} // namespace sr
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/threadpool.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AdamDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
// skip_update=true, just copy input to output, and TensorCopy will call
// mutable_data
if (skip_update_) {
VLOG(4) << "Adam skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
T beta1_ = beta1.to<T>();
T beta2_ = beta2.to<T>();
T epsilon_ = epsilon.to<T>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel();
VLOG(3) << "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
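  // The sparse Adam functor's binary search requires unique, ascending rows;
  // fall back to MergeAdd when the incoming rows are not strictly sorted.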
phi::SelectedRows tmp_grad_merge;
const phi::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = &grad;
} else {
// merge duplicated rows if any.
// The rows of grad_merge have been sorted inside MergeAdd functor
paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
merge_func(dev_ctx, grad, &tmp_grad_merge, true);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
funcs::SparseAdamFunctor<T, funcs::CPUAdam> functor(
beta1_,
beta2_,
epsilon_,
beta1_pow.data<T>(),
beta2_pow.data<T>(),
moment1.data<T>(),
dev_ctx.template Alloc<T>(moment1_out),
moment2.data<T>(),
dev_ctx.template Alloc<T>(moment2_out),
learning_rate.data<T>(),
grad_data,
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
rows,
row_numel,
grad_merge.rows().size(),
lazy_mode);
// update beta1 and beta2
if (!use_global_beta_pow) {
dev_ctx.template Alloc<T>(beta1_pow_out)[0] =
beta1_ * beta1_pow.data<T>()[0];
dev_ctx.template Alloc<T>(beta2_pow_out)[0] =
beta2_ * beta2_pow.data<T>()[0];
}
if (lazy_mode) {
VLOG(3) << "run cpu lazy mode";
size_t row_count = grad_merge.rows().size();
std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = cpu_rows[row_index] * row_numel + offset;
functor.adam_update(i, grad_data[row_index * row_numel + offset]);
}
}
}
#ifndef _WIN32
else if (FLAGS_inner_op_parallelism > 1 && // NOLINT
min_row_size_to_use_multithread > 0 &&
param.dims()[0] > min_row_size_to_use_multithread) {
VLOG(3) << "use multi thread, inner_op_parallelism="
<< FLAGS_inner_op_parallelism << " min_row_size_to_use_multithread="
<< min_row_size_to_use_multithread;
if (FLAGS_inner_op_parallelism > 10) {
VLOG(1) << "FLAGS_inner_op_parallelism " << FLAGS_inner_op_parallelism
<< " is two large!";
}
auto& grad_rows = grad_merge.rows();
std::unordered_map<size_t, int> row_id_to_grad_row_offset;
size_t param_row_count = param.numel() / row_numel;
if (param_row_count < 1000) {
VLOG(1) << "param_row_count should be larger then 1000 to use "
"multi thread, currently "
<< param_row_count;
}
for (size_t i = 0; i < grad_rows.size(); ++i) {
row_id_to_grad_row_offset[grad_rows[i]] = i;
}
std::vector<std::future<void>> fs;
int64_t line_in_each_thread =
param_row_count / FLAGS_inner_op_parallelism + 1;
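    // Partition the parameter rows evenly across FLAGS_inner_op_parallelism
    // worker threads; rows with no entry in the sparse gradient are updated
    // with a zero gradient.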
for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
int64_t start = i * line_in_each_thread;
int64_t end = (i + 1) * line_in_each_thread;
if (start >= static_cast<int64_t>(param_row_count)) {
break;
}
if (end > static_cast<int64_t>(param_row_count)) {
end = static_cast<int64_t>(param_row_count);
}
fs.push_back(paddle::framework::Async([&functor,
&row_id_to_grad_row_offset,
&grad_data,
row_numel,
start,
end]() {
for (int64_t row_id = start; row_id < end; ++row_id) {
auto iter = row_id_to_grad_row_offset.find(row_id);
if (iter != row_id_to_grad_row_offset.end()) {
for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) {
functor.adam_update(
row_id * row_numel + row_offset,
grad_data[iter->second * row_numel + row_offset]);
}
} else {
for (size_t row_offset = 0U; row_offset < row_numel; ++row_offset) {
functor.adam_update(row_id * row_numel + row_offset, 0);
}
}
}
}));
}
for (size_t i = 0; i < fs.size(); ++i) fs[i].wait();
}
#endif // !_WIN32
else { // NOLINT
functor(param.numel());
}
}
} // namespace sr
} // namespace phi
PD_REGISTER_KERNEL(adam_dense_param_sparse_grad,
CPU,
ALL_LAYOUT,
phi::sr::AdamDenseParamSparseGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/adamw_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/adam_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AdamwDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
VLOG(3) << "Skip update" << skip_update_;
if (skip_update_ || !with_decay) {
AdamDenseParamSparseGradKernel<T, Context>(dev_ctx,
param,
grad,
learning_rate,
moment1,
moment2,
beta1_pow,
beta2_pow,
master_param,
skip_update,
beta1,
beta2,
epsilon,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
param_out,
moment1_out,
moment2_out,
beta1_pow_out,
beta2_pow_out,
master_param_outs);
return;
}
auto* param_ =
master_param.is_initialized() ? master_param.get_ptr() : &param;
T coeff_ = static_cast<T>(coeff);
T lr_ratio_ = static_cast<T>(lr_ratio);
funcs::AdamWFunctor<T, funcs::CPUAdamW> functor(
coeff_,
lr_ratio_,
learning_rate.data<T>(),
const_cast<T*>(param_->data<T>()));
functor(param_->numel());
AdamDenseParamSparseGradKernel<T, Context>(dev_ctx,
param,
grad,
learning_rate,
moment1,
moment2,
beta1_pow,
beta2_pow,
master_param,
skip_update,
beta1,
beta2,
epsilon,
lazy_mode,
min_row_size_to_use_multithread,
multi_precision,
use_global_beta_pow,
param_out,
moment1_out,
moment2_out,
beta1_pow_out,
beta2_pow_out,
master_param_outs);
}
} // namespace sr
} // namespace phi
PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad,
CPU,
ALL_LAYOUT,
phi::sr::AdamwDenseParamSparseGradKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/adam_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
namespace sr {
template <typename T>
__global__ void UpdateBetaPow(T beta1,
T beta2,
const T* beta1_pow_,
const T* beta2_pow_,
T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
template <typename T, typename MT>
__global__ void SparseAdamCUDAKernelREG(MT beta1,
MT beta2,
MT epsilon,
const MT beta1_pow,
const MT beta2_pow,
const MT* mom1_,
MT* mom1_out_,
const MT* mom2_,
MT* mom2_out_,
const MT* lr_,
const T* grad_,
const T* param_,
T* param_out_,
const MT* master_param,
MT* master_param_out,
const int64_t* rows_,
int64_t row_numel,
int64_t row_count,
bool lazy_mode,
int ndim) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
MT lr = *lr_;
for (; id < ndim; id += blockDim.x * gridDim.x) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_count, id / row_numel);
if (lazy_mode && row_idx < 0) {
return;
} else {
MT mom1 = mom1_[id];
MT mom2 = mom2_[id];
MT p = master_param ? master_param[id] : static_cast<MT>(param_[id]);
MT g = row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
: static_cast<MT>(0);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom =
(sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
// Write back to global memory
mom1_out_[id] = mom1;
mom2_out_[id] = mom2;
param_out_[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
}
template <typename T, typename Context>
void AdamDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
  // If skip_update is true, just copy the inputs to the outputs; the copy
  // allocates the output tensors (mutable_data).
if (skip_update_) {
VLOG(4) << "Adam skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
MPDType beta1_ = beta1.to<MPDType>();
MPDType beta2_ = beta2.to<MPDType>();
MPDType epsilon_ = epsilon.to<MPDType>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
<< "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
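  // The CUDA kernels binary-search the rows array, so the gradient rows must
  // be unique and strictly increasing; verify this on a host copy of the rows
  // and fall back to MergeAdd otherwise.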
std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
phi::SelectedRows tmp_grad_merge;
const phi::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = &grad;
} else {
    // Merge duplicated rows if any.
    // The rows of grad_merge have been sorted inside the MergeAdd functor.
paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
merge_func(dev_ctx, grad, &tmp_grad_merge, true);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
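  // Two launch paths: if the beta pow accumulators live on the CPU they are
  // dereferenced on the host and passed to the kernel by value (and updated on
  // the host afterwards); otherwise the functor reads them from device memory
  // and UpdateBetaPow advances them on the device.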
if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
int threads = 512;
int ndim = param.numel();
int blocks = (ndim + threads - 1) / threads;
SparseAdamCUDAKernelREG<T,
MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
*beta1_pow.data<MPDType>(),
*beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad_data,
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
rows,
row_numel,
grad_merge.rows().size(),
lazy_mode,
ndim);
if (!use_global_beta_pow) {
      // Update the beta pow accumulators on the CPU.
dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
beta1_ * beta1_pow.data<MPDType>()[0];
dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
beta2_ * beta2_pow.data<MPDType>()[0];
}
} else {
funcs::SparseAdamFunctor<T, funcs::GPUAdam, MPDType> functor(
beta1_,
beta2_,
epsilon_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad_data,
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
rows,
row_numel,
grad_merge.rows().size(),
lazy_mode);
// FIXME(minqiyang): remove BinarySearch in GPU later
funcs::ForRange<Context> for_range(dev_ctx, param.numel());
for_range(functor);
if (!use_global_beta_pow) {
      // Update beta1_pow and beta2_pow on the device.
UpdateBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(beta1_pow_out),
dev_ctx.template Alloc<MPDType>(beta2_pow_out));
}
}
}
} // namespace sr
} // namespace phi
PD_REGISTER_KERNEL(adam_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::sr::AdamDenseParamSparseGradKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/adamw_kernel.h"
#include <math.h> // for sqrt in CPU and CUDA
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/adam_functors.h"
#include "paddle/phi/kernels/funcs/for_range.h"
namespace phi {
namespace sr {
template <typename T>
__global__ void UpdateAdamWBetaPow(T beta1,
T beta2,
const T* beta1_pow_,
const T* beta2_pow_,
T* beta1_pow_out,
T* beta2_pow_out) {
*beta1_pow_out = beta1 * beta1_pow_[0];
*beta2_pow_out = beta2 * beta2_pow_[0];
}
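// AdamW counterpart of SparseAdamCUDAKernelREG: identical indexing, plus an
// lr_ratio scale on the learning rate and decoupled weight decay controlled
// by coeff.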
template <typename T, typename MT>
__global__ void SparseAdamWCUDAKernelREG(MT beta1,
MT beta2,
MT epsilon,
MT coeff,
MT lr_ratio,
const MT beta1_pow,
const MT beta2_pow,
const MT* mom1_,
MT* mom1_out_,
const MT* mom2_,
MT* mom2_out_,
const MT* lr_,
const T* grad_,
const T* param_,
T* param_out_,
const MT* master_param,
MT* master_param_out,
const int64_t* rows_,
int64_t row_numel,
int64_t row_count,
bool lazy_mode,
int ndim) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
MT lr = *lr_ * lr_ratio;
for (; id < ndim; id += blockDim.x * gridDim.x) {
auto row_idx =
phi::funcs::BinarySearch<int64_t>(rows_, row_count, id / row_numel);
if (lazy_mode && row_idx < 0) {
return;
} else {
MT mom1 = static_cast<MT>(mom1_[id]);
MT mom2 = static_cast<MT>(mom2_[id]);
MT p = master_param ? master_param[id] : static_cast<MT>(param_[id]);
MT g = row_idx >= 0
? static_cast<MT>(grad_[row_idx * row_numel + id % row_numel])
: static_cast<MT>(0);
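      // Decoupled weight decay (AdamW): shrink the parameter by lr * coeff
      // before applying the standard Adam step below.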
p *= (static_cast<MT>(1.0) - lr * coeff);
mom1 = beta1 * mom1 + (static_cast<MT>(1.0) - beta1) * g;
mom2 = beta2 * mom2 + (static_cast<MT>(1.0) - beta2) * g * g;
MT denom =
(sqrt(mom2) / sqrt(static_cast<MT>(1.0) - beta2_pow)) + epsilon;
p += (mom1 / denom) * (-(lr / (static_cast<MT>(1.0) - beta1_pow)));
// Write back to global memory
mom1_out_[id] = mom1;
mom2_out_[id] = mom2;
param_out_[id] = static_cast<T>(p);
if (master_param_out) {
master_param_out[id] = p;
}
}
}
}
template <typename T, typename Context>
void AdamwDenseParamSparseGradKernel(
const Context& dev_ctx,
const DenseTensor& param,
const SelectedRows& grad,
const DenseTensor& learning_rate,
const DenseTensor& moment1,
const DenseTensor& moment2,
const DenseTensor& beta1_pow,
const DenseTensor& beta2_pow,
paddle::optional<const DenseTensor&> master_param,
paddle::optional<const DenseTensor&> skip_update,
const Scalar& beta1,
const Scalar& beta2,
const Scalar& epsilon,
float lr_ratio,
float coeff,
bool with_decay,
bool lazy_mode,
int64_t min_row_size_to_use_multithread,
bool multi_precision,
bool use_global_beta_pow,
DenseTensor* param_out,
DenseTensor* moment1_out,
DenseTensor* moment2_out,
DenseTensor* beta1_pow_out,
DenseTensor* beta2_pow_out,
DenseTensor* master_param_outs) {
using MPDType = typename phi::dtype::MPTypeTrait<T>::Type;
VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow;
MPDType coeff_ = static_cast<MPDType>(coeff);
MPDType lr_ratio_ = static_cast<MPDType>(lr_ratio);
bool skip_update_ = false;
if (skip_update.is_initialized()) {
PADDLE_ENFORCE_EQ(
skip_update->numel(),
1,
errors::InvalidArgument("Input(SkipUpdate) size must be 1, but get %d",
skip_update->numel()));
std::vector<bool> skip_update_vec;
paddle::framework::TensorToVector(*skip_update, dev_ctx, &skip_update_vec);
skip_update_ = skip_update_vec[0];
}
  // If skip_update is true, just copy the inputs to the outputs; the copy
  // allocates the output tensors (mutable_data).
if (skip_update_) {
VLOG(4) << "Adamw skip update";
phi::Copy(dev_ctx, param, dev_ctx.GetPlace(), false, param_out);
phi::Copy(dev_ctx, moment1, dev_ctx.GetPlace(), false, moment1_out);
phi::Copy(dev_ctx, moment2, dev_ctx.GetPlace(), false, moment2_out);
phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, beta1_pow_out);
phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, beta2_pow_out);
return;
}
// if with_decay = false, coeff = 0
if (!with_decay) {
coeff_ = static_cast<MPDType>(0.0);
}
MPDType beta1_ = beta1.to<MPDType>();
MPDType beta2_ = beta2.to<MPDType>();
MPDType epsilon_ = epsilon.to<MPDType>();
VLOG(3) << "beta1_pow.numel() : " << beta1_pow.numel()
<< "beta2_pow.numel() : " << beta2_pow.numel();
VLOG(3) << "param.numel(): " << param.numel();
PADDLE_ENFORCE_EQ(
beta1_pow_out->numel(),
1,
errors::InvalidArgument("beta1 pow output size should be 1, but received "
"value is:%d.",
beta1_pow_out->numel()));
PADDLE_ENFORCE_EQ(
beta2_pow_out->numel(),
1,
errors::InvalidArgument("beta2 pow output size should be 1, but received "
"value is:%d.",
beta2_pow_out->numel()));
const MPDType* master_in_data =
multi_precision ? master_param->data<MPDType>() : nullptr;
MPDType* master_out_data =
multi_precision ? dev_ctx.template Alloc<MPDType>(master_param_outs)
: nullptr;
if (grad.rows().size() == 0) {
VLOG(3) << "grad row size is 0!!";
return;
}
std::vector<int64_t> cpu_rows(grad.rows().begin(), grad.rows().end());
bool is_strict_sorted = true;
for (size_t i = 1; i < cpu_rows.size(); ++i) {
if (cpu_rows[i - 1] >= cpu_rows[i]) {
is_strict_sorted = false;
break;
}
}
phi::SelectedRows tmp_grad_merge;
const phi::SelectedRows* grad_merge_ptr;
if (is_strict_sorted) {
grad_merge_ptr = &grad;
} else {
    // Merge duplicated rows if any.
    // The rows of grad_merge have been sorted inside the MergeAdd functor.
paddle::operators::math::scatter::MergeAdd<Context, T> merge_func;
merge_func(dev_ctx, grad, &tmp_grad_merge, true);
grad_merge_ptr = &tmp_grad_merge;
}
auto& grad_merge = *grad_merge_ptr;
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
auto* grad_merge_rows = &grad_merge.rows();
paddle::framework::MixVector<int64_t> mixv_grad_merge_rows(grad_merge_rows);
const int64_t* rows = mixv_grad_merge_rows.Data(dev_ctx.GetPlace());
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
if (beta1_pow.place() == CPUPlace() && beta2_pow.place() == CPUPlace()) {
int threads = 512;
int ndim = param.numel();
int blocks = (ndim + threads - 1) / threads;
SparseAdamWCUDAKernelREG<T,
MPDType><<<blocks, threads, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
epsilon_,
coeff_,
lr_ratio_,
*beta1_pow.data<MPDType>(),
*beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad_data,
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
rows,
row_numel,
grad_merge.rows().size(),
lazy_mode,
ndim);
if (!use_global_beta_pow) {
      // Update the beta pow accumulators on the CPU.
dev_ctx.template HostAlloc<MPDType>(beta1_pow_out)[0] =
beta1_ * beta1_pow.data<MPDType>()[0];
dev_ctx.template HostAlloc<MPDType>(beta2_pow_out)[0] =
beta2_ * beta2_pow.data<MPDType>()[0];
}
} else {
funcs::SparseAdamWFunctor<T, funcs::GPUAdamW, MPDType> functor(
beta1_,
beta2_,
epsilon_,
coeff_,
lr_ratio_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
moment1.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment1_out),
moment2.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(moment2_out),
learning_rate.data<MPDType>(),
grad_data,
param.data<T>(),
dev_ctx.template Alloc<T>(param_out),
master_in_data,
master_out_data,
rows,
row_numel,
grad_merge.rows().size(),
lazy_mode);
// FIXME(minqiyang): remove BinarySearch in GPU later
funcs::ForRange<Context> for_range(dev_ctx, param.numel());
for_range(functor);
if (!use_global_beta_pow) {
      // Update beta1_pow and beta2_pow on the device.
UpdateAdamWBetaPow<MPDType><<<1, 32, 0, dev_ctx.stream()>>>(
beta1_,
beta2_,
beta1_pow.data<MPDType>(),
beta2_pow.data<MPDType>(),
dev_ctx.template Alloc<MPDType>(beta1_pow_out),
dev_ctx.template Alloc<MPDType>(beta2_pow_out));
}
}
}
} // namespace sr
} // namespace phi
PD_REGISTER_KERNEL(adamw_dense_param_sparse_grad,
GPU,
ALL_LAYOUT,
phi::sr::AdamwDenseParamSparseGradKernel,
float,
double,
phi::dtype::float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
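// Maps the fluid "adam" op onto the phi kernel signature. Scalar attributes
// come either from plain attrs (beta1/beta2/epsilon) or from the optional
// *Tensor inputs when those are fed; the kernel name is chosen by the type of
// Grad (dense tensor vs. selected rows).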
KernelSignature AdamOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::SmallVector<std::string> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam",
"SkipUpdate"};
paddle::SmallVector<std::string> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::SmallVector<std::string> attr_names;
attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
: "beta1");
attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
: "beta2");
attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor"
: "epsilon");
attr_names.emplace_back("lazy_mode");
attr_names.emplace_back("min_row_size_to_use_multithread");
attr_names.emplace_back("multi_precision");
attr_names.emplace_back("use_global_beta_pow");
if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature("adam_dense_param_sparse_grad",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature("adam",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else {
return KernelSignature("unregistered", {}, {}, {});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(adam, phi::AdamOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <string>
#include "paddle/phi/core/compat/op_utils.h"
#include "paddle/utils/small_vector.h"
namespace phi {
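// Same mapping for "adamw"; it only adds the lr_ratio, coeff and with_decay
// attributes on top of the adam signature.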
KernelSignature AdamwOpArgumentMapping(const ArgumentMappingContext& ctx) {
paddle::SmallVector<std::string> in_names = {"Param",
"Grad",
"LearningRate",
"Moment1",
"Moment2",
"Beta1Pow",
"Beta2Pow",
"MasterParam",
"SkipUpdate"};
paddle::SmallVector<std::string> out_names = {"ParamOut",
"Moment1Out",
"Moment2Out",
"Beta1PowOut",
"Beta2PowOut",
"MasterParamOut"};
paddle::SmallVector<std::string> attr_names;
attr_names.emplace_back(ctx.HasInput("Beta1Tensor") ? "Beta1Tensor"
: "beta1");
attr_names.emplace_back(ctx.HasInput("Beta2Tensor") ? "Beta2Tensor"
: "beta2");
attr_names.emplace_back(ctx.HasInput("EpsilonTensor") ? "EpsilonTensor"
: "epsilon");
attr_names.emplace_back("lr_ratio");
attr_names.emplace_back("coeff");
attr_names.emplace_back("with_decay");
attr_names.emplace_back("lazy_mode");
attr_names.emplace_back("min_row_size_to_use_multithread");
attr_names.emplace_back("multi_precision");
attr_names.emplace_back("use_global_beta_pow");
if (ctx.IsSelectedRowsInput("Grad")) {
return KernelSignature("adamw_dense_param_sparse_grad",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else if (ctx.IsDenseTensorInput("Grad")) {
return KernelSignature("adamw",
std::move(in_names),
std::move(attr_names),
std::move(out_names));
} else {
return KernelSignature("unregistered", {}, {}, {});
}
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(adamw, phi::AdamwOpArgumentMapping);
......@@ -16,7 +16,7 @@ import os
import re
import json
skip_list = []
skip_list = ["adam_sig.cc", "adamw_sig.cc"]
def parse_compat_registry(kernel_info):
......