Unverified commit 4d805e6a, authored by Yuang Liu, committed by GitHub

multi precision for lars op and lars optimizer (#33280)

Parent commit fc5b3a99
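For context, a minimal usage sketch of the new knobs this commit adds to the Python API (the optimizer call mirrors the unit test further down in this diff; the surrounding training program and AMP setup are omitted, and the numbers are illustrative):

import paddle.fluid as fluid

# Hypothetical hyperparameters; only multi_precision and rescale_grad are new in this commit.
optimizer = fluid.optimizer.LarsMomentumOptimizer(
    learning_rate=0.001,
    momentum=0.9,
    multi_precision=True,    # keep an FP32 master copy of each FP16 parameter
    rescale_grad=1.0 / 32)   # often chosen as 1.0 / batch_size
# optimizer.minimize(loss) would then emit lars_momentum ops with MasterParam wired in.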
......@@ -34,6 +34,7 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("LearningRate",
"(LoDTensor, default LoDTensor<float>) "
"Input learning rate");
AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable();
AddOutput("ParamOut",
"(LoDTensor) This output is updated parameter. "
......@@ -41,6 +42,10 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("VelocityOut",
"(LoDTensor) This output is updated velocity. "
"It shared memory with Input(Velocity).");
AddOutput("MasterParamOut",
"The updated FP32 master weight for AMP. "
"It shared memory with Input(MasterParam).")
.AsDispensable();
AddAttr<float>("mu", "(float) Momentum coefficient");
AddAttr<float>("lars_coeff", "(float, default 0.001) LARS coefficient.")
......@@ -51,6 +56,15 @@ class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<float>("epsilon",
"(float, default 0.0) epsilon to avoid Division by Zero.")
.SetDefault(0.0);
AddAttr<bool>("multi_precision",
"(bool, default false) "
"Whether to use multi-precision during weight updating.")
.SetDefault(false);
AddAttr<float>(
"rescale_grad",
"(float, default 1.0) Multiply the gradient with `rescale_grad`"
"before updating. Often choose to be `1.0/batch_size`.")
.SetDefault(1.0f);
AddComment(R"DOC(
Lars Momentum Optimizer.
......
......@@ -13,36 +13,64 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/optimizers/lars_momentum_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v,
const T* learning_rate, const T mu,
const int64_t num, const T lars_coeff,
const T lars_weight_decay, const T* p_norm,
const T* g_norm, T* p_out, T* v_out,
const T epsilon) {
T lr = learning_rate[0];
T local_lr = learning_rate[0];
using MultiPrecisionType = typename details::MPTypeTrait<T>::Type;
template <typename T, typename MT>
__global__ void MomentumLarsKernel(
const T* p, const T* g, const MT* v,
const MultiPrecisionType<T>* learning_rate, const MT mu, const int64_t num,
const MT lars_coeff, const MT lars_weight_decay,
const MultiPrecisionType<T>* p_norm, const MultiPrecisionType<T>* g_norm,
T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out,
const MultiPrecisionType<T> rescale_grad) {
const MT lr = static_cast<MT>(learning_rate[0]);
MT local_lr = lr;
const MT p_n = static_cast<MT>(p_norm[0]);
const MT g_n = static_cast<MT>(g_norm[0]);
if (lars_weight_decay > static_cast<MT>(0) && p_n > static_cast<MT>(0) &&
g_n > static_cast<MT>(0)) {
local_lr =
lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon);
}
CUDA_KERNEL_LOOP(i, num) {
if (lars_weight_decay > 0 && p_norm[0] > 0 && g_norm[0] > 0) {
local_lr = lr * lars_coeff * p_norm[0] /
(g_norm[0] + lars_weight_decay * p_norm[0] + epsilon);
}
MT grad = static_cast<MT>(g[i]) * static_cast<MT>(rescale_grad);
MT param = master_p ? master_p[i] : static_cast<MT>(p[i]);
MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param);
MT p_new = param - v_new;
T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]);
v_out[i] = v_new;
p_out[i] = p[i] - v_new;
p_out[i] = static_cast<T>(p_new);
if (master_p_out) master_p_out[i] = p_new;
}
}
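For readers less comfortable with CUDA, the update performed by the new kernel above can be written as a plain NumPy reference. This is a sketch of the same arithmetic, not part of the commit; argument names are illustrative and the defaults follow the unit test later in this diff.

import numpy as np

def lars_momentum_reference(p, g, v, lr, master_p=None, mu=0.0001,
                            lars_coeff=0.001, lars_weight_decay=0.0005,
                            epsilon=0.0, rescale_grad=1.0):
    # Norms are taken in FP32: the param norm from the (possibly FP16) param,
    # the grad norm from the rescaled gradient, matching the Eigen code below.
    grad = g.astype(np.float32) * rescale_grad
    p_norm = np.sqrt(np.square(p.astype(np.float32)).sum())
    g_norm = np.sqrt(np.square(grad).sum())
    local_lr = lr
    if lars_weight_decay > 0 and p_norm > 0 and g_norm > 0:
        local_lr = lr * lars_coeff * p_norm / (
            g_norm + lars_weight_decay * p_norm + epsilon)
    # The update itself runs on the FP32 master copy when one is provided.
    param = master_p if master_p is not None else p.astype(np.float32)
    v_new = mu * v + local_lr * (grad + lars_weight_decay * param)
    p_new = param - v_new
    return p_new.astype(p.dtype), v_new, p_new  # ParamOut, VelocityOut, MasterParamOut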
template <typename DeviceContext, typename T>
class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
using MPDType = MultiPrecisionType<T>;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const bool multi_precision = ctx.Attr<bool>("multi_precision");
if (multi_precision) {
InnerCompute<MPDType>(ctx, multi_precision);
} else {
InnerCompute<T>(ctx, multi_precision);
}
}
private:
template <typename MT>
void InnerCompute(const framework::ExecutionContext& ctx,
const bool multi_precision) const {
auto param_out = ctx.Output<framework::LoDTensor>("ParamOut");
auto velocity_out = ctx.Output<framework::LoDTensor>("VelocityOut");
auto param = ctx.Input<framework::LoDTensor>("Param");
......@@ -50,18 +78,40 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
auto grad = ctx.Input<framework::LoDTensor>("Grad");
auto learning_rate = ctx.Input<framework::LoDTensor>("LearningRate");
const framework::Tensor* master_param = nullptr;
framework::Tensor* master_param_out = nullptr;
if (multi_precision) {
bool has_master =
ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut");
PADDLE_ENFORCE_EQ(has_master, true,
platform::errors::InvalidArgument(
"The Input(MasterParam) and Output(MasterParamOut) "
"should not be null when "
"the attr `multi_precision` is true"));
master_param = ctx.Input<framework::Tensor>("MasterParam");
master_param_out = ctx.Output<framework::Tensor>("MasterParamOut");
}
const MT* master_p = multi_precision ? master_param->data<MT>() : nullptr;
MT* master_p_out = multi_precision
? master_param_out->mutable_data<MT>(ctx.GetPlace())
: nullptr;
T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
MT* v_out = velocity_out->mutable_data<MT>(ctx.GetPlace());
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T lars_weight_decay = ctx.Attr<float>("lars_weight_decay");
T epsilon = ctx.Attr<float>("epsilon");
MT mu = static_cast<MT>(ctx.Attr<float>("mu"));
MT lars_coeff = static_cast<MT>(ctx.Attr<float>("lars_coeff"));
MT lars_weight_decay =
static_cast<MT>(ctx.Attr<float>("lars_weight_decay"));
MT epsilon = static_cast<MT>(ctx.Attr<float>("epsilon"));
MPDType rescale_grad =
static_cast<MPDType>(ctx.Attr<float>("rescale_grad"));
auto* p = param->data<T>();
auto* v = velocity->data<T>();
auto* g = grad->data<T>();
auto* lr = learning_rate->data<T>();
auto* v = velocity->data<MT>();
auto* lr = learning_rate->data<MPDType>();
int block = 512;
int grid = (param->numel() + block - 1) / block;
......@@ -72,17 +122,24 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
framework::Tensor p_norm_t, g_norm_t;
p_norm_t.Resize({1});
g_norm_t.Resize({1});
auto* p_norm_data = p_norm_t.mutable_data<T>(ctx.GetPlace());
auto* g_norm_data = g_norm_t.mutable_data<T>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<T>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<T>::From(g_norm_t);
auto* p_norm_data = p_norm_t.mutable_data<MPDType>(ctx.GetPlace());
auto* g_norm_data = g_norm_t.mutable_data<MPDType>(ctx.GetPlace());
auto ep_norm = framework::EigenScalar<MPDType>::From(p_norm_t);
auto eg_norm = framework::EigenScalar<MPDType>::From(g_norm_t);
auto* place = ctx.template device_context<DeviceContext>().eigen_device();
ep_norm.device(*place) = eigen_p.square().sum().sqrt();
eg_norm.device(*place) = eigen_g.square().sum().sqrt();
MomentumLarsKernel<<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
// Eigen does not support an FP16 l2-norm, so compute the norms in MPDType.
ep_norm.device(*place) =
eigen_p.template cast<MPDType>().square().sum().sqrt();
eg_norm.device(*place) =
(eigen_g.template cast<MPDType>() * rescale_grad).square().sum().sqrt();
MomentumLarsKernel<
T, MT><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay,
p_norm_data, g_norm_data, p_out, v_out, epsilon);
p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out,
rescale_grad);
}
};
......@@ -93,4 +150,6 @@ namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
lars_momentum,
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>);
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::LarsMomentumOpCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>);
......@@ -135,6 +135,9 @@ class MomentumOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("ParamOut", param_dim);
ctx->SetOutputDim("VelocityOut", param_dim);
if (ctx->HasOutput("MasterParamOut")) {
ctx->SetOutputDim("MasterParamOut", param_dim);
}
}
framework::OpKernelType GetExpectedKernelType(
......
......@@ -73,7 +73,7 @@ def resnet_cifar10(input, depth=32):
return pool
def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
def train(use_pure_fp16=True, use_nesterov=False, optimizer=""):
classdim = 10
data_shape = [3, 32, 32]
BATCH_SIZE = 32
......@@ -96,12 +96,17 @@ def train(use_pure_fp16=True, use_nesterov=False, use_adam=False):
# Test program
test_program = train_program.clone(for_test=True)
if use_adam:
if optimizer == "Adam":
optimizer = paddle.optimizer.AdamW(
learning_rate=0.001,
epsilon=1e-8,
weight_decay=0.0,
multi_precision=True)
elif optimizer == "Lars":
optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer(
learning_rate=0.001,
momentum=0.9,
multi_precision=use_pure_fp16)
else:
optimizer = paddle.optimizer.Momentum(
learning_rate=0.001,
......@@ -169,9 +174,11 @@ class TestImageMultiPrecision(unittest.TestCase):
if not fluid.core.is_compiled_with_cuda():
return
def do_test(use_nesterov=False, use_adam=False):
if use_adam:
def do_test(use_nesterov=False, optimizer=""):
if optimizer == "Adam":
suffix = "use Adam"
elif optimizer == "Lars":
suffix = "use Lars"
else:
suffix = "with Nesterov" if use_nesterov else "without Nesterov"
with self.scope_prog_guard():
......@@ -180,14 +187,14 @@ class TestImageMultiPrecision(unittest.TestCase):
train_loss_fp16, test_loss_fp16 = train(
use_pure_fp16=True,
use_nesterov=use_nesterov,
use_adam=use_adam)
optimizer=optimizer)
with self.scope_prog_guard():
print("-----------------FP32 Train {}-----------------".format(
suffix))
train_loss_fp32, test_loss_fp32 = train(
use_pure_fp16=False,
use_nesterov=use_nesterov,
use_adam=use_adam)
optimizer=optimizer)
self.assertTrue(
np.allclose(
......@@ -208,7 +215,8 @@ class TestImageMultiPrecision(unittest.TestCase):
do_test(use_nesterov=False)
do_test(use_nesterov=True)
do_test(use_adam=True)
do_test(optimizer="Adam")
do_test(optimizer="Lars")
@contextlib.contextmanager
def scope_prog_guard(self):
......
......@@ -1725,6 +1725,9 @@ class LarsMomentumOptimizer(Optimizer):
For details, please refer to :ref:`api_guide_Name`. Default is None.
exclude_from_weight_decay (list[str], optional): Name string of layers which will be exclude from lars weight decay. Default is None.
epsilon (float, optional): Epsilon to avoid division by zero when calculating the local learning rate. Default is 0.
multi_precision (bool, optional): Whether to use multi-precision during weight updating. Default is False.
rescale_grad (float, optional): Multiply the gradient with `rescale_grad` \
before updating. Often chosen to be `1.0/batch_size`. Default is 1.0.
Examples:
.. code-block:: python
......@@ -1758,7 +1761,9 @@ class LarsMomentumOptimizer(Optimizer):
grad_clip=None,
name=None,
exclude_from_weight_decay=None,
epsilon=0):
epsilon=0,
multi_precision=False,
rescale_grad=1.0):
assert learning_rate is not None
assert momentum is not None
super(LarsMomentumOptimizer, self).__init__(
......@@ -1776,16 +1781,70 @@ class LarsMomentumOptimizer(Optimizer):
self._exclude_from_weight_decay = []
else:
self._exclude_from_weight_decay = exclude_from_weight_decay
self._multi_precision = multi_precision
self._rescale_grad = float(rescale_grad)
self._master_weights = {}
def _create_master_weight(self, param):
assert isinstance(self.helper, LayerHelper)
var_name = param.name + '_fp32_master'
var_name = unique_name.generate(var_name)
var = layers.create_global_var(
name=var_name,
shape=param.shape,
value=0,
dtype='float32',
persistable=True)
block = self.helper.startup_program.global_block()
block.append_op(
type="cast",
inputs={"X": [param]},
outputs={"Out": [var]},
attrs={
"in_dtype": param.dtype,
"out_dtype": core.VarDesc.VarType.FP32
})
self._master_weights[param.name] = var
return var
def _get_accumulator(self, name, param):
"""Utility function to fetch an accumulator for a parameter
Args:
name: name of the accumulator
param: parameter variable for which accumulator is to be fetched
Returns:
accumulator variable for the parameter
"""
if self._name is not None:
name = self._name + "_" + name
find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16
target_param = self._master_weights[
param.name] if find_master else param
target_name = target_param.name
if (name not in self._accumulators or
target_name not in self._accumulators[name]):
raise Exception("Accumulator {} does not exist for parameter {}".
format(name, target_name))
return self._accumulators[name][target_name]
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
for p in parameters:
if self._multi_precision and p.dtype == core.VarDesc.VarType.FP16:
master_p = self._create_master_weight(p)
self._add_accumulator(self._velocity_acc_str, master_p)
continue
if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision:
warnings.warn(
"Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence."
"Consider using multi_precision=True option of the Lars optimizer."
)
self._add_accumulator(self._velocity_acc_str, p)
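The warning above is about FP16 round-off: a small step applied directly to an FP16 weight can be rounded away entirely, while the FP32 master weight keeps accumulating it. A tiny standalone illustration (hypothetical numbers, not part of this change):

import numpy as np

w_fp16 = np.float16(1.0)
w_fp32 = np.float32(1.0)
step = 1e-4  # a small per-iteration update

for _ in range(100):
    w_fp16 = np.float16(w_fp16 - np.float16(step))  # rounds back to 1.0 every time
    w_fp32 = np.float32(w_fp32 - np.float32(step))  # FP32 keeps every update

print(w_fp16)  # 1.0  -- all 100 updates were lost to FP16 rounding
print(w_fp32)  # ~0.99, as expected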
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
_lars_weight_decay = self._lars_weight_decay
param_name = param_and_grad[0].name
if len(self._exclude_from_weight_decay) > 0:
......@@ -1796,25 +1855,40 @@ class LarsMomentumOptimizer(Optimizer):
velocity_acc = self._get_accumulator(self._velocity_acc_str,
param_and_grad[0])
lr = self._create_param_lr(param_and_grad)
find_master = self._multi_precision and param_and_grad[
0].dtype == core.VarDesc.VarType.FP16
master_weight = (self._master_weights[param_and_grad[0].name]
if find_master else None)
attrs = {
"mu": self._momentum,
"lars_coeff": self._lars_coeff,
"lars_weight_decay": _lars_weight_decay,
"multi_precision": find_master,
"rescale_grad": self._rescale_grad
}
inputs = {
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": lr
}
outputs = {"ParamOut": param_and_grad[0], "VelocityOut": velocity_acc}
if find_master:
inputs["MasterParam"] = master_weight
outputs["MasterParamOut"] = master_weight
# create the momentum optimize op
momentum_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"Velocity": velocity_acc,
"LearningRate": self._create_param_lr(param_and_grad)
},
outputs={
"ParamOut": param_and_grad[0],
"VelocityOut": velocity_acc
},
attrs={
"mu": self._momentum,
"lars_coeff": self._lars_coeff,
"lars_weight_decay": _lars_weight_decay,
"epsilon": self._epsilon
},
inputs=inputs,
outputs=outputs,
attrs=attrs,
stop_gradient=True)
return momentum_op
......
......@@ -134,6 +134,64 @@ class TestMomentumOp2(OpTest):
self.check_output()
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestLarsMomentumOpWithMP(OpTest):
def setUp(self):
self.op_type = "lars_momentum"
master_param = np.random.random((123, 321)).astype("float32")
param = master_param.astype("float16")
grad = np.random.random((123, 321)).astype("float16")
velocity = np.zeros((123, 321)).astype("float32")
learning_rate = np.array([0.001]).astype("float32")
mu = 0.0001
lars_coeff = 0.001
lars_weight_decay = 0.0005
rescale_grad = 1.0
self.inputs = {
'Param': param,
'Grad': grad,
'Velocity': velocity,
'LearningRate': learning_rate,
'MasterParam': master_param,
}
self.attrs = {
'mu': mu,
'lars_coeff': lars_coeff,
'lars_weight_decay': lars_weight_decay,
'multi_precision': True,
'rescale_grad': rescale_grad
}
fp32_grad = grad.astype("float32")
pnorm = np.sqrt(np.square(master_param).sum())
gnorm = np.sqrt(np.square(fp32_grad).sum())
local_lr = learning_rate * lars_coeff * pnorm / (
gnorm + lars_weight_decay * pnorm)
fp32_grad = fp32_grad * rescale_grad
velocity_out = mu * velocity + local_lr * (fp32_grad + lars_weight_decay
* master_param)
p_new = master_param - velocity_out
param_out = p_new.astype("float16")
master_param_out = p_new
self.outputs = {
'ParamOut': param_out,
'VelocityOut': velocity_out,
'MasterParamOut': master_param_out
}
def test_check_output(self):
paddle.enable_static()
if core.is_compiled_with_cuda():
place = fluid.CUDAPlace(0)
if core.is_float16_supported(place):
self.check_output_with_place(place)
class TestLarsMomentumOp(OpTest):
def setUp(self):
self.op_type = "lars_momentum"
......