Unverified commit 8ccbb863, authored by QingshuChen and committed by GitHub

add xpu lars_momentum/pow2_decay (#44448)

*test=kunlun
Parent 8037901b
@@ -10,7 +10,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
if(NOT DEFINED XPU_BASE_URL)
set(XPU_BASE_URL_WITHOUT_DATE
"https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220718")
set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220719")
else()
set(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
@@ -19,7 +19,7 @@ endif()
if(NOT DEFINED XPU_XDNN_BASE_URL)
set(XPU_XDNN_BASE_URL_WITHOUT_DATE
"https://klx-sdk-release-public.su.bcebos.com/xdnn/dev")
set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220718")
set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220719")
else()
set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}")
endif()
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace paddle {
namespace operators {
template <typename T>
class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
bool multi_precision = ctx.Attr<bool>("multi_precision");
auto param_out = ctx.MultiOutput<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.MultiOutput<framework::LoDTensor>("VelocityOut");
auto param = ctx.MultiInput<framework::LoDTensor>("Param");
auto velocity = ctx.MultiInput<framework::LoDTensor>("Velocity");
auto learning_rate = ctx.MultiInput<framework::LoDTensor>("LearningRate");
auto grad = ctx.MultiInput<framework::LoDTensor>("Grad");
auto weight_decay_arr = ctx.Attr<std::vector<float>>("lars_weight_decay");
auto master_param = ctx.MultiInput<framework::LoDTensor>("MasterParam");
auto master_param_out =
ctx.MultiOutput<framework::LoDTensor>("MasterParamOut");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T epsilon = ctx.Attr<float>("epsilon");
T rescale_grad = ctx.Attr<float>("rescale_grad");
std::vector<T*> param_list;
std::vector<T*> grad_list;
std::vector<T*> param_out_list;
std::vector<float*> velocity_list;
std::vector<float*> velocity_out_list;
std::vector<float*> lrs;
std::vector<int> param_sizes;
std::vector<float*> master_param_list;
std::vector<float*> master_param_out_list;
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
param_list.push_back(const_cast<T*>(param[i]->data<T>()));
grad_list.push_back(const_cast<T*>(grad[i]->data<T>()));
param_out_list.push_back(param_out[i]->mutable_data<T>(ctx.GetPlace()));
velocity_list.push_back(const_cast<float*>(velocity[i]->data<float>()));
velocity_out_list.push_back(
velocity_out[i]->mutable_data<float>(ctx.GetPlace()));
lrs.push_back(const_cast<float*>(learning_rate[i]->data<float>()));
param_sizes.push_back(param[i]->numel());
PADDLE_ENFORCE_EQ(
param_list[i],
param_out_list[i],
platform::errors::InvalidArgument(
"Input(Param) and Output(ParamOut) must be the same Tensors."));
PADDLE_ENFORCE_EQ(velocity_list[i],
velocity_out_list[i],
platform::errors::InvalidArgument(
"Input(Velocity) and Output(VelocityOut) must be "
"the same Tensors."));
if (multi_precision) {
master_param_list.push_back(
const_cast<float*>(master_param[i]->data<float>()));
master_param_out_list.push_back(
master_param_out[i]->mutable_data<float>(ctx.GetPlace()));
PADDLE_ENFORCE_EQ(master_param_list[i],
master_param_out_list[i],
platform::errors::InvalidArgument(
"Input(MasterParam) and Output(MasterParamOut) "
"must be the same Tensors."));
} else {
master_param_list.push_back(nullptr);
master_param_out_list.push_back(nullptr);
}
}
auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
int r = lars_momentum(dev_ctx.x_context(),
param_list,
grad_list,
velocity_list,
lrs,
master_param_list,
param_out_list,
velocity_out_list,
master_param_out_list,
weight_decay_arr,
param_sizes,
mu,
lars_coeff,
epsilon,
rescale_grad);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "lars_momentum");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel<float>);
#endif
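For reference, below is a minimal NumPy sketch of the per-tensor LARS update that the fused xdnn lars_momentum call above is assumed to perform; the function name, the zero-norm guard, and the exact placement of rescale_grad are illustrative assumptions, not taken from the xdnn API.

import numpy as np

def lars_momentum_ref(param, grad, velocity, lr, lars_weight_decay,
                      mu, lars_coeff, epsilon, rescale_grad=1.0):
    """Single-tensor LARS step (reference sketch, not the xdnn implementation)."""
    g = grad * rescale_grad  # assumed: the gradient is rescaled before use
    p_norm = np.linalg.norm(param)
    g_norm = np.linalg.norm(g)
    local_lr = lr
    if p_norm > 0.0 and g_norm > 0.0:
        # Trust ratio: scale the learning rate by ||param|| relative to the
        # weight-decayed gradient norm.
        local_lr = lr * lars_coeff * p_norm / (g_norm + lars_weight_decay * p_norm + epsilon)
    velocity_out = mu * velocity + local_lr * (g + lars_weight_decay * param)
    param_out = param - velocity_out  # Param/ParamOut share storage in the kernel above
    return param_out, velocity_out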
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h"
#include "paddle/fluid/platform/macros.h"
#include "paddle/phi/backends/xpu/enforce_xpu.h"
namespace paddle {
namespace operators {
template <typename T>
class Pow2DecayWithLinearWarmupXPUOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const {
const auto *lr = ctx.Input<framework::Tensor>("LearningRate");
const auto *step = ctx.Input<framework::Tensor>("Step");
auto *lr_out = ctx.Output<framework::Tensor>("LearningRateOut");
auto *step_out = ctx.Output<framework::Tensor>("StepOut");
PADDLE_ENFORCE_EQ(
lr,
lr_out,
platform::errors::InvalidArgument("Input(LearningRate) and "
"Output(LearningRateOut) "
"must be the same."));
PADDLE_ENFORCE_NOT_NULL(lr,
platform::errors::InvalidArgument(
"Input(LearingRate) should not be nullptr."));
PADDLE_ENFORCE_EQ(step,
step_out,
platform::errors::InvalidArgument(
"Input(Step) and Output(StepOut) must be the same."));
PADDLE_ENFORCE_NOT_NULL(step,
platform::errors::InvalidArgument(
"Input(Step) should not be nullptr."));
PADDLE_ENFORCE_EQ(
step->IsInitialized(),
true,
platform::errors::InvalidArgument("Input(Step) must be initialized."));
auto warmup_steps = static_cast<size_t>(ctx.Attr<int64_t>("warmup_steps"));
auto total_steps = static_cast<size_t>(ctx.Attr<int64_t>("total_steps"));
PADDLE_ENFORCE_LE(warmup_steps,
total_steps,
platform::errors::InvalidArgument(
"warmup_steps must not be larger than total_steps."));
auto base_lr = ctx.Attr<float>("base_lr");
auto end_lr = ctx.Attr<float>("end_lr");
auto *lr_data = lr_out->data<T>();
auto *step_data = step_out->data<int64_t>();
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
int r = xpu::pow2_decay_with_linear_warmup(dev_ctx.x_context(),
lr_data,
step_data,
warmup_steps,
total_steps,
base_lr,
end_lr);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "pow2_decay_with_linear_warmup");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(pow2_decay_with_linear_warmup,
ops::Pow2DecayWithLinearWarmupXPUOpKernel<float>);
#endif
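For reference, a short Python sketch of the learning-rate schedule the xdnn pow2_decay_with_linear_warmup call above is assumed to compute: linear warmup from 0 to base_lr over warmup_steps, then a power-2 polynomial decay down to end_lr, which is also how the unit test below builds its Python reference (Pow2Warmup). The 1-based step handling and boundary conditions here are assumptions.

def pow2_decay_with_linear_warmup_ref(step, warmup_steps, total_steps, base_lr, end_lr):
    # 'step' is the updated step counter (StepOut = Step + 1 in the op).
    if step <= warmup_steps:
        # Linear warmup: 0 -> base_lr over the first warmup_steps steps.
        return base_lr * step / warmup_steps
    if step < total_steps:
        # Quadratic (power=2) polynomial decay: base_lr -> end_lr.
        factor = 1.0 - (step - warmup_steps) / (total_steps - warmup_steps)
        return (base_lr - end_lr) * factor * factor + end_lr
    # After total_steps the learning rate stays at end_lr.
    return end_lr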
@@ -71,6 +71,8 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"clip", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"coalesce_tensor",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"concat_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
@@ -255,6 +257,8 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"label_smooth",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"lars_momentum",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"layer_norm_grad",
@@ -334,6 +338,8 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP16, XPUPlace())})},
{"pow", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"pow_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"pow2_decay_with_linear_warmup",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"prior_box", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"range",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
......
@@ -88,6 +88,7 @@ xpu_test_op_type_white_list = [
'dropout_float16',
'dropout_grad_float16',
"grad_add_float32", # no api for grad_add, skip
"lars_momentum_float32",
"resnet_unit",
"resnet_unit_grad"
]
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from paddle.fluid import core
import sys
sys.path.append("..")
from op_test import OpTest
alignment = 256
import paddle
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
paddle.enable_static()
class XPUTestCoalesceTensorOp(XPUOpTestWrapper):
def __init__(self):
self.op_name = 'coalesce_tensor'
self.use_dynamic_create_class = False
class TestAllocContinuousSpace(XPUOpTest):
def setUp(self):
self.op_type = "coalesce_tensor"
self.use_xpu = True
self.dtype, self.fluid_dtype = self.init_dtype()
attrs = self.init_attr()
self.copy_data = attrs["copy_data"]
self.constant = attrs["constant"]
self.set_constant = attrs["set_constant"]
self.Inputs = self.init_input()
self.Outputs, self.FusedOutput = self.init_output(
self.Inputs, self.set_constant, self.constant)
self.inputs = {'Input': self.Inputs}
self.attrs = attrs
self.outputs = {
'Output': self.Outputs,
'FusedOutput': self.FusedOutput
}
def init_dtype(self):
return np.float32, int(core.VarDesc.VarType.FP32)
def init_input(self):
inputs = []
inputs.append(("x1", np.random.random([20, 3]).astype(self.dtype)))
inputs.append(("x2", np.random.random([20]).astype(self.dtype)))
inputs.append(("x3", np.random.random([1]).astype(self.dtype)))
inputs.append(("x4", np.random.random([200,
30]).astype(self.dtype)))
inputs.append(("x5", np.random.random([30]).astype(self.dtype)))
inputs.append(("x6", np.random.random([1]).astype(self.dtype)))
return inputs
def init_attr(self):
return {
"copy_data": True,
"set_constant": False,
"constant": 0.0,
"dtype": self.fluid_dtype
}
def init_output(self, input_list, set_constant, constant):
inputs = []
outputs = input_list
for input in input_list:
length = len(input[1].flatten())
aligned_len = (length + alignment) / alignment * alignment
out = np.zeros(int(aligned_len))
out[0:length] = input[1].flatten()
inputs.append(out)
coalesce_tensor_var = np.concatenate([input for input in inputs])
if set_constant:
coalesce_tensor_var = np.ones(
(len(coalesce_tensor_var))) * constant
outputs = [(out[0],
np.ones(out[1].shape).astype(self.dtype) * constant)
for out in outputs]
return outputs, coalesce_tensor_var
def test_check_output(self):
self.check_output_with_place(place=core.XPUPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
class TestAllocContinuousSpace2(TestAllocContinuousSpace):
def init_attr(self):
return {
"copy_data": False,
"set_constant": True,
"constant": 0.5,
"dtype": self.fluid_dtype,
"user_defined_size_of_dtype": 2
}
def test_check_output(self):
self.check_output_with_place(place=core.XPUPlace(0),
no_check_set=["FusedOutput"],
atol=1e-5)
support_types = get_xpu_op_support_types('coalesce_tensor')
for stype in support_types:
create_test_class(globals(), XPUTestCoalesceTensorOp, stype)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.contrib.layers.nn import pow2_decay_with_linear_warmup
from paddle.optimizer.lr import LinearWarmup
from paddle.optimizer.lr import PolynomialDecay
import unittest
import sys
sys.path.append("..")
from op_test import OpTest
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import record_op_test
def gen_pow2_warmup_op_lr(warmup_steps, total_steps, base_lr, end_lr, place):
main = paddle.static.Program()
startup = paddle.static.Program()
with paddle.static.program_guard(main, startup):
lr = pow2_decay_with_linear_warmup(warmup_steps, total_steps, base_lr,
end_lr)
exe = paddle.static.Executor(place)
with paddle.static.scope_guard(paddle.static.Scope()):
exe.run(startup)
while True:
lr_np = exe.run(main, fetch_list=[lr])[0]
yield lr_np[0]
class Pow2Warmup(LinearWarmup):
def __init__(self, warmup_steps, total_steps, base_lr, end_lr):
assert total_steps > warmup_steps
lr_sch = PolynomialDecay(learning_rate=base_lr,
decay_steps=total_steps - warmup_steps,
end_lr=end_lr,
power=2)
super(Pow2Warmup, self).__init__(learning_rate=lr_sch,
warmup_steps=warmup_steps,
start_lr=0.0,
end_lr=base_lr)
def gen_pow2_warmup_py_lr(warmup_steps, total_steps, base_lr, end_lr, place):
lr_sch = Pow2Warmup(warmup_steps, total_steps, base_lr, end_lr)
lr_sch.step()
while True:
yield lr_sch()
lr_sch.step()
class TestPowWarmup(unittest.TestCase):
def setUp(self):
paddle.enable_static()
self.op_type = 'pow2_decay_with_linear_warmup'
self.params = {
'warmup_steps': 30,
'total_steps': 100,
'base_lr': 0.02,
'end_lr': 0.001,
}
self.step_num = 1000
def check_with_place(self, place):
kwargs = dict(self.params)
kwargs['place'] = place
lr_sch_op = gen_pow2_warmup_op_lr(**kwargs)
lr_sch_py = gen_pow2_warmup_py_lr(**kwargs)
for i, (lr_op, lr_py) in enumerate(zip(lr_sch_op, lr_sch_py)):
self.assertLess(abs(lr_op - lr_py), 1e-6)
if i > self.step_num:
break
def test_main(self):
self.check_with_place(paddle.XPUPlace(0))
record_op_test("pow2_decay_with_linear_warmup", "float32")
if __name__ == "__main__":
unittest.main()