Unverified · Commit d1e8b1e2 authored by Yiqun Liu, committed by GitHub

Cherry-pick fixes for operator precision. (#52705)

* Fix the scale kernel for low-precision types (cherry-pick of #50998).

* Fix the FP16 precision problem of add_n. (#50129)

* Change squared_l2_norm to reuse ReduceKernel and register fp16 and bf16 kernels (cherry-pick of #48315).

* Cherry-pick the fix of MPTypeTrait in KP, implemented in #50993.

* Cherry-pick the multi-precision support of AdamW for bf16 (#48041).

* Fix a compilation error.

* Cherry-pick the fix of CubTensorReduceImpl for bfloat16 in #50993.

* Fix unittest.

---------
Co-authored-by: liuruyan <44316842+liuruyan@users.noreply.github.com>
Parent d12588d2
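All of the kernel changes in this commit target the same failure mode: accumulating or transforming fp16/bf16 values directly in the storage type drifts, while accumulating in a float intermediate (the MPTypeTrait pattern used throughout the diff) and casting the result back once does not. The snippet below is a minimal NumPy sketch of that effect; it is an illustration only, uses no Paddle code, and the tensor size is arbitrary.

```python
# Illustrative sketch only, not Paddle code.
import numpy as np

np.random.seed(0)
xs = np.random.random(4096).astype(np.float16)  # many small fp16 addends, sum ~ 2048

# Accumulate entirely in fp16: once the running sum is large,
# each new addend loses most of its bits to rounding.
acc_fp16 = np.float16(0.0)
for v in xs:
    acc_fp16 = np.float16(acc_fp16 + v)

# Accumulate in fp32 and cast back once at the end (the MPType pattern).
acc_mp = np.float16(xs.astype(np.float32).sum())

reference = xs.astype(np.float64).sum()
print("fp16 accumulation error:", abs(float(acc_fp16) - reference))  # large drift
print("fp32 accumulation error:", abs(float(acc_mp) - reference))    # within one fp16 ulp
```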
......@@ -986,9 +986,11 @@ template <typename Tx,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value,
static
typename std::enable_if<!std::is_same<Tx, phi::dtype::float16>::value &&
!std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
......@@ -1037,6 +1039,23 @@ CubTensorReduceImpl(const Tx* x_data,
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
}
template <typename Tx,
typename Ty,
template <typename>
class ReduceOp,
typename TransformOp>
static typename std::enable_if<std::is_same<Tx, phi::dtype::bfloat16>::value,
void>::type
CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be bfloat16 when using cub::DeviceReduce::Reduce()."));
}
#endif // PADDLE_WITH_XPU_KP
template <typename Tx,
......@@ -1081,7 +1100,8 @@ void ReduceKernel(const KPDevice& dev_ctx,
config.SetOutputData(y_data, dev_ctx, &tmp);
constexpr bool kIsTxFP16 = std::is_same<Tx, phi::dtype::float16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16;
constexpr bool kIsTxBF16 = std::is_same<Tx, phi::dtype::bfloat16>::value;
bool use_cub_reduce = config.reduce_num == numel && !kIsTxFP16 && !kIsTxBF16;
#ifndef PADDLE_WITH_XPU_KP
if (use_cub_reduce) {
if (is_mean) {
......
......@@ -14,10 +14,10 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
namespace phi {
......@@ -38,16 +38,18 @@ __global__ void Sum2CUDAKernel(const T *in_0,
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
T total(read_dst ? out[id] : static_cast<T>(0));
MPType total(read_dst ? static_cast<MPType>(out[id])
: static_cast<MPType>(0));
for (int i = 0; i < in_size; ++i) {
const T *tmp = in[i];
if (tmp) {
total += tmp[id];
total += static_cast<MPType>(tmp[id]);
}
}
out[id] = total;
out[id] = static_cast<T>(total);
id += blockDim.x * gridDim.x;
}
}
......@@ -116,11 +118,12 @@ void AddNKernel(const Context &dev_ctx,
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
auto in_0_e = EigenVector<T>::Flatten(in_0).template cast<MPType>();
auto in_1_e = EigenVector<T>::Flatten(in_1).template cast<MPType>();
result.device(place) = (in_0_e + in_1_e).template cast<T>();
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
......
......@@ -15,28 +15,30 @@ limitations under the License. */
#include "paddle/phi/kernels/scale_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
namespace phi {
template <typename InT>
template <typename DataT, typename ParamT>
struct ScaleFunctor {
InT bias;
InT scale;
ParamT bias;
ParamT scale;
bool bias_after_scale;
ScaleFunctor(InT scale_data, InT bias_data, bool is_bias_after_sacle)
ScaleFunctor(ParamT scale_data, ParamT bias_data, bool is_bias_after_sacle)
: bias(bias_data),
scale(scale_data),
bias_after_scale(is_bias_after_sacle) {}
__device__ __forceinline__ InT operator()(const InT x) const {
__device__ __forceinline__ DataT operator()(const DataT x) const {
if (bias_after_scale) {
return scale * x + bias;
return static_cast<DataT>(scale * static_cast<ParamT>(x) + bias);
} else {
return scale * (x + bias);
return static_cast<DataT>(scale * (static_cast<ParamT>(x) + bias));
}
}
};
......@@ -48,16 +50,21 @@ void ScaleKernel(const Context& dev_ctx,
float bias,
bool bias_after_scale,
DenseTensor* out) {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
std::vector<const DenseTensor*> inputs;
std::vector<DenseTensor*> outputs;
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.template Alloc<T>(out);
if (x.numel() <= 0 || (!x.IsInitialized())) {
return;
}
phi::funcs::ElementwiseKernel<T>(
dev_ctx,
inputs,
&outputs,
ScaleFunctor<T>(scale.to<T>(), static_cast<T>(bias), bias_after_scale));
ScaleFunctor<T, MT>(
scale.to<MT>(), static_cast<MT>(bias), bias_after_scale));
}
} // namespace phi
......
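The ScaleFunctor change above keeps scale and bias in ParamT (the fp32 MPType) and performs the arithmetic there, instead of first casting the attributes to the tensor dtype. Below is a minimal NumPy sketch of the kind of error this avoids; the trigger values are illustrative and not taken from #50998, but any scale that overflows or underflows the low-precision type behaves the same way.

```python
# Illustrative sketch only, not Paddle code; values chosen to overflow fp16.
import numpy as np

x = np.full(4, 2.0 ** -10, dtype=np.float16)  # 0.0009765625, exact in fp16
scale, bias = 131072.0, 0.0                   # 2**17 overflows fp16 but not fp32

# Old behaviour: cast the attributes to the tensor dtype before computing.
old = np.float16(scale) * x + np.float16(bias)    # np.float16(131072.0) is already inf

# New behaviour: keep the attributes in fp32, compute in fp32, cast once.
new = (np.float32(scale) * x.astype(np.float32) + np.float32(bias)).astype(np.float16)

print(old)  # [inf inf inf inf]
print(new)  # [128. 128. 128. 128.]
```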
......@@ -15,12 +15,47 @@
#include "paddle/phi/kernels/squared_l2_norm_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squared_l2_norm_grad_kernel_impl.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
namespace phi {
/**
* x*y*2.0
*/
template <typename T>
struct DoubleMulFunctor {
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a * static_cast<T>(2.0f);
}
};
template <typename T, typename Context>
void SquaredL2NormGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
DenseTensor* dx) {
dev_ctx.template Alloc<T>(dx);
PADDLE_ENFORCE_EQ(
dout.numel(),
1,
phi::errors::InvalidArgument(
"Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar."));
std::vector<const DenseTensor*> ins{&x, &dout};
std::vector<DenseTensor*> outs{dx};
funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>(
dev_ctx, ins, &outs, -1, phi::DoubleMulFunctor<T>());
}
} // namespace phi
PD_REGISTER_KERNEL(squared_l2_norm_grad,
GPU,
ALL_LAYOUT,
phi::SquaredL2NormGradKernel,
float,
double) {}
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
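DoubleMulFunctor implements the analytic gradient of y = sum(x_i^2), namely dy/dx_i = 2 * x_i, scaled by the incoming scalar gradient dout that the broadcast pairs with every element of x. A quick NumPy check of that identity against central finite differences (illustrative only):

```python
# Illustrative check only, not Paddle code.
import numpy as np

np.random.seed(0)
x = np.random.randn(8).astype(np.float64)
dout = 3.0  # incoming gradient of the scalar output

analytic = 2.0 * x * dout  # what DoubleMulFunctor computes, broadcast over x

eps = 1e-6                 # central differences are exact for a quadratic, up to rounding
numeric = np.array([
    ((np.sum((x + d) ** 2) - np.sum((x - d) ** 2)) / (2 * eps)) * dout
    for d in eps * np.eye(8)
])

print(np.max(np.abs(analytic - numeric)))  # ~1e-9, the two agree
```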
......@@ -15,9 +15,34 @@
#include "paddle/phi/kernels/squared_l2_norm_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squared_l2_norm_kernel_impl.h"
#include "paddle/phi/kernels/funcs/reduce_function.h"
PD_REGISTER_KERNEL(
squared_l2_norm, GPU, ALL_LAYOUT, phi::SquaredL2NormKernel, float, double) {
namespace phi {
template <typename T, typename Context>
void SquaredL2NormKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
dev_ctx.template Alloc<T>(out);
std::vector<int> origin_reduce_dims;
for (size_t i = 0; i < x.dims().size(); i++) {
origin_reduce_dims.push_back(i);
}
phi::funcs::ReduceKernel<T, T, kps::AddFunctor, kps::SquareFunctor<T, T>>(
dev_ctx, x, out, kps::SquareFunctor<T, T>(), origin_reduce_dims, false);
}
} // namespace phi
PD_REGISTER_KERNEL(squared_l2_norm,
GPU,
ALL_LAYOUT,
phi::SquaredL2NormKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16) {}
......@@ -52,6 +52,12 @@ class MPTypeTrait<phi::dtype::float16> {
using Type = float;
};
template <>
class MPTypeTrait<phi::dtype::bfloat16> {
public:
using Type = float;
};
/**
* @brief Will be used in BlockYReduce, get the index of reduce_num in shared
* memory.
......
......@@ -32,8 +32,11 @@ from .framework import default_main_program
from paddle import _C_ops, _legacy_C_ops
__all__ = [
'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue',
'ClipGradByNorm', 'ClipGradByGlobalNorm'
'set_gradient_clip',
'ErrorClipByValue',
'ClipGradByValue',
'ClipGradByNorm',
'ClipGradByGlobalNorm',
]
_clip_by_global_norm_using_mp_type_flag = False
......@@ -52,7 +55,8 @@ def _clip_by_global_norm_using_mp_type(*args):
def _cast_to_mp_type_if_enabled(x):
if (x.dtype == core.VarDesc.VarType.FP16
if (
x.dtype == core.VarDesc.VarType.FP16
or x.dtype == core.VarDesc.VarType.BF16
) and _clip_by_global_norm_using_mp_type():
return x.astype(core.VarDesc.VarType.FP32)
......@@ -66,8 +70,7 @@ def _squared_l2_norm(x):
"""
x = _cast_to_mp_type_if_enabled(x)
if core.is_compiled_with_xpu(
) or x.dtype == core.VarDesc.VarType.FP16 or x.dtype == core.VarDesc.VarType.BF16:
if core.is_compiled_with_xpu():
square = layers.square(x)
sum_square = layers.reduce_sum(square)
return sum_square
......@@ -78,7 +81,9 @@ def _squared_l2_norm(x):
return _legacy_C_ops.squared_l2_norm(x)
op_type = 'squared_l2_norm'
check_variable_and_dtype(x, 'x', ['float32', 'float64'], op_type)
check_variable_and_dtype(
x, 'x', ['float32', 'float64', 'float16', 'uint16'], op_type
)
helper = LayerHelper(op_type, **locals())
out = helper.create_variable_for_type_inference(x.dtype)
......@@ -89,7 +94,6 @@ def _squared_l2_norm(x):
class BaseErrorClipAttr(object):
def __str__(self):
raise NotImplementedError()
......@@ -164,8 +168,9 @@ def error_clip_callback(block, context):
for grad_n in [n for n in op_desc.output_arg_names() if n in grad_to_var]:
fwd_var = block._var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if not (error_clip is None
or isinstance(error_clip, BaseErrorClipAttr)):
if not (
error_clip is None or isinstance(error_clip, BaseErrorClipAttr)
):
raise TypeError(
"Variable's error_clip should be an instance of BaseErrorClipAttr or None."
)
......@@ -174,7 +179,6 @@ def error_clip_callback(block, context):
class ClipGradBase(object):
def __init__(self):
super(ClipGradBase, self).__init__()
......@@ -197,7 +201,8 @@ class ClipGradBase(object):
warnings.warn(
"'set_gradient_clip' will be ineffective, because you have "
"set 'need_clip' in 'ParamAttr'. So, 'set_gradient_clip' "
"is redundant and you can remove it.")
"is redundant and you can remove it."
)
break
return self._static_clip(params_grads)
......@@ -252,7 +257,7 @@ class ClipGradByValue(ClipGradBase):
def __init__(self, max, min=None):
super(ClipGradByValue, self).__init__()
if min is None:
assert (max > 0.0)
assert max > 0.0
min = -max
self.max = float(max)
self.min = float(min)
......@@ -468,10 +473,9 @@ class ClipGradByGlobalNorm(ClipGradBase):
sdg.step()
"""
def __init__(self,
clip_norm,
group_name="default_group",
auto_skip_clip=False):
def __init__(
self, clip_norm, group_name="default_group", auto_skip_clip=False
):
super(ClipGradByGlobalNorm, self).__init__()
self.clip_norm = float(clip_norm)
self.group_name = group_name
......@@ -503,7 +507,10 @@ class ClipGradByGlobalNorm(ClipGradBase):
merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
sum_square = _squared_l2_norm(merge_grad)
if sum_square.dtype == core.VarDesc.VarType.FP16 or sum_square.dtype == core.VarDesc.VarType.BF16:
if (
sum_square.dtype == core.VarDesc.VarType.FP16
or sum_square.dtype == core.VarDesc.VarType.BF16
):
sum_square_list_fp16.append(sum_square)
elif sum_square.dtype == core.VarDesc.VarType.FP32:
sum_square_list_fp32.append(sum_square)
......@@ -511,8 +518,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
sum_square_list.append(sum_square)
# all parameters have been filterd out
if len(sum_square_list) + len(sum_square_list_fp16) + len(
sum_square_list_fp32) == 0:
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
+ len(sum_square_list_fp32)
== 0
):
return params_grads
sum_dtype = 'float64' if len(sum_square_list) > 0 else "float32"
......@@ -531,22 +542,23 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var.append(global_norm_var_fp64)
global_norm_var = paddle.add_n(global_norm_var)
global_norm_var = layers.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(shape=[1],
dtype=global_norm_var.dtype,
value=self.clip_norm)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
need_clip = False
if not self.auto_skip_clip: # always apply clip
need_clip = True
clip_var = layers.elementwise_div(x=max_global_norm,
y=layers.elementwise_max(
x=global_norm_var,
y=max_global_norm))
clip_var = layers.elementwise_div(
x=max_global_norm,
y=layers.elementwise_max(x=global_norm_var, y=max_global_norm),
)
elif global_norm_var > max_global_norm:
# only when global_norm_var > max_global_norm, grad need clip
need_clip = True
clip_var = layers.elementwise_div(x=max_global_norm,
y=global_norm_var)
clip_var = layers.elementwise_div(
x=max_global_norm, y=global_norm_var
)
for p, g in params_grads:
if g is None:
......@@ -556,8 +568,11 @@ class ClipGradByGlobalNorm(ClipGradBase):
continue
# TODO(wangxi): use inplace elementwise_mul
if need_clip:
clip_input = (clip_var.astype(g.dtype)
if clip_var.dtype != g.dtype else clip_var)
clip_input = (
clip_var.astype(g.dtype)
if clip_var.dtype != g.dtype
else clip_var
)
new_grad = layers.elementwise_mul(g, clip_input)
params_and_grads.append((p, new_grad))
else:
......@@ -581,7 +596,8 @@ class ClipGradByGlobalNorm(ClipGradBase):
if g.type == core.VarDesc.VarType.SELECTED_ROWS:
merge_grad = layers.merge_selected_rows(g)
merge_grad = layers.get_tensor_from_selected_rows(
merge_grad)
merge_grad
)
sum_square = _squared_l2_norm(merge_grad)
if sum_square.dtype == core.VarDesc.VarType.FP16:
sum_square_list_fp16.append(sum_square)
......@@ -591,8 +607,12 @@ class ClipGradByGlobalNorm(ClipGradBase):
sum_square_list.append(sum_square)
# all parameters have been filterd out
if len(sum_square_list) + len(sum_square_list_fp16) + len(
sum_square_list_fp32) == 0:
if (
len(sum_square_list)
+ len(sum_square_list_fp16)
+ len(sum_square_list_fp32)
== 0
):
return params_grads
with p.block.program._optimized_guard([p, g]):
......@@ -601,10 +621,14 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var = []
if len(sum_square_list_fp16) > 0:
global_norm_var_fp16 = layers.sums(sum_square_list_fp16)
if sum_square_list_fp32 or sum_square_list or not _allow_pure_fp16_global_norm_clip(
if (
sum_square_list_fp32
or sum_square_list
or not _allow_pure_fp16_global_norm_clip()
):
global_norm_var.append(
global_norm_var_fp16.astype(sum_dtype))
global_norm_var_fp16.astype(sum_dtype)
)
else:
global_norm_var.append(global_norm_var_fp16)
if len(sum_square_list_fp32) > 0:
......@@ -613,23 +637,28 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var.append(global_norm_var_fp32)
else:
global_norm_var.append(
global_norm_var_fp32.astype(sum_dtype))
global_norm_var_fp32.astype(sum_dtype)
)
if len(sum_square_list) > 0:
# fp64
global_norm_var_other_dtype = layers.sums(sum_square_list)
global_norm_var.append(global_norm_var_other_dtype)
global_norm_var = layers.sums(global_norm_var) if len(
global_norm_var) > 1 else global_norm_var[0]
global_norm_var = (
layers.sums(global_norm_var)
if len(global_norm_var) > 1
else global_norm_var[0]
)
global_norm_var = layers.sqrt(x=global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1],
dtype=global_norm_var.dtype,
value=self.clip_norm)
scale_var = layers.elementwise_div(x=max_global_norm,
y=layers.elementwise_max(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
scale_var = layers.elementwise_div(
x=max_global_norm,
y=global_norm_var))
y=layers.elementwise_max(
x=max_global_norm, y=global_norm_var
),
)
param_new_grad_name_dict = dict()
for p, g in params_grads:
if g is None:
......@@ -641,29 +670,32 @@ class ClipGradByGlobalNorm(ClipGradBase):
with p.block.program._optimized_guard([p, g]):
new_g = _cast_to_mp_type_if_enabled(g)
# inplace
scale_input = (scale_var.astype('float16') if
new_g.dtype == core.VarDesc.VarType.FP16 and
scale_var.dtype != core.VarDesc.VarType.FP16
else scale_var)
scale_input = (
scale_var.astype('float16')
if new_g.dtype == core.VarDesc.VarType.FP16
and scale_var.dtype != core.VarDesc.VarType.FP16
else scale_var
)
# NOTE(Yuang Liu): For pure dp with gradient merge, the p and g
# will be in different blocks with the gradient clip related ops.
# We need to handle the correct block, otherwise will encounter
# a 'NotFoundError' during compile time.
block = default_main_program().current_block()
block.append_op(type='elementwise_mul',
inputs={
'X': new_g,
'Y': scale_input
},
outputs={'Out': new_g})
block.append_op(
type='elementwise_mul',
inputs={'X': new_g, 'Y': scale_input},
outputs={'Out': new_g},
)
if new_g is not g:
block.append_op(type='cast',
block.append_op(
type='cast',
inputs={'X': new_g},
outputs={'Out': g},
attrs={
'in_dtype': new_g.dtype,
'out_dtype': g.dtype
})
'out_dtype': g.dtype,
},
)
param_new_grad_name_dict[p.name] = g.name
params_and_grads.append((p, g))
......@@ -676,7 +708,8 @@ class ClipGradByGlobalNorm(ClipGradBase):
context[self.group_name] = []
context[self.group_name + "_clip_value"] = self.clip_norm
context[self.group_name + "_clip"] = layers.fill_constant(
shape=[1], dtype=grad.dtype, value=self.clip_norm)
shape=[1], dtype=grad.dtype, value=self.clip_norm
)
else:
if not self.clip_norm == context[self.group_name + "_clip_value"]:
raise ValueError(
......@@ -699,20 +732,19 @@ class ClipGradByGlobalNorm(ClipGradBase):
group_norm_var = layers.sums(input=self.context[self.group_name])
group_norm_var = layers.sqrt(x=group_norm_var)
clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div(x=clip_var,
y=layers.elementwise_max(
group_scale_var = layers.elementwise_div(
x=clip_var,
y=group_norm_var))
assert group_scale_var.shape == (1, )
y=layers.elementwise_max(x=clip_var, y=group_norm_var),
)
assert group_scale_var.shape == (1,)
self.context[group_scale_name] = group_scale_var
# inplace
param.block.append_op(type='elementwise_mul',
inputs={
'X': grad,
'Y': self.context[group_scale_name]
},
outputs={'Out': grad})
param.block.append_op(
type='elementwise_mul',
inputs={'X': grad, 'Y': self.context[group_scale_name]},
outputs={'Out': grad},
)
return param, grad
......@@ -807,22 +839,26 @@ def set_gradient_clip(clip, param_list=None, program=None):
"""
warnings.warn("Caution! 'set_gradient_clip' is not recommended "
warnings.warn(
"Caution! 'set_gradient_clip' is not recommended "
"and may be deprecated in future! "
"We recommend a new strategy: set 'grad_clip' "
"when initializing the 'optimizer'. "
"This method can reduce the mistakes, please "
"refer to documention of 'optimizer'.")
"refer to documention of 'optimizer'."
)
if not isinstance(clip, ClipGradBase):
raise TypeError(
"'clip' should be an instance of ClipGradBase's derived class")
"'clip' should be an instance of ClipGradBase's derived class"
)
if program is None:
program = framework.default_main_program()
for op in program.block(0).ops:
if 'op_namescope' in op.all_attrs() and "optimizer" in op.attr(
"op_namescope"):
"op_namescope"
):
warnings.warn(
"'minimize' has been invoked before, this will make 'set_gradient_clip' "
"be ineffective! Please invoke 'set_gradient_clip' before 'minimize'."
......@@ -847,14 +883,16 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip'):
with p.block.program._optimized_guard([p, g]), framework.name_scope(
'gradient_clip'
):
clip_attr = getattr(p, 'gradient_clip_attr', None)
if clip_attr is None:
return param_grads
if not isinstance(clip_attr, ClipGradBase):
raise TypeError(
"clip attribute should be an instance of GradientClipBase")
"clip attribute should be an instance of GradientClipBase"
)
clip_attr._process_context(context=context, param=p, grad=g)
......@@ -863,8 +901,9 @@ def append_gradient_clip_ops(param_grads):
for p, g in param_grads:
if g is None:
continue
with p.block.program._optimized_guard(
[p, g]), framework.name_scope('gradient_clip'):
with p.block.program._optimized_guard([p, g]), framework.name_scope(
'gradient_clip'
):
param, new_grad = clip_attr._create_operators(param=p, grad=g)
param_new_grad_name_dict[param.name] = new_grad.name
res.append([param, new_grad])
......@@ -888,12 +927,16 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict):
continue
block_id_list.append(block_id)
for op in param.block.program.global_block().ops:
if op.has_attr("op_namescope") and "gradient_clip" in op.attr(
"op_namescope") and op.attr('op_role_var'):
if (
op.has_attr("op_namescope")
and "gradient_clip" in op.attr("op_namescope")
and op.attr('op_role_var')
):
param_name = op.attr('op_role_var')[0]
if param_name in param_new_grad_name_dict:
correct_p_g = [
param_name, param_new_grad_name_dict[param_name]
param_name,
param_new_grad_name_dict[param_name],
]
op._set_attr('op_role_var', correct_p_g)
......
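Both the dynamic-graph and static-graph paths above compute the same rule: the global norm is the square root of the summed squared L2 norms of every gradient (via _squared_l2_norm), and each gradient is multiplied by clip_norm / max(global_norm, clip_norm). The sketch below restates that rule in plain NumPy; the helper name and shapes are hypothetical and this is not the Paddle API.

```python
# Illustrative sketch only; clip_by_global_norm is a hypothetical helper, not Paddle API.
import numpy as np

def clip_by_global_norm(grads, clip_norm):
    """Scale every gradient so the joint L2 norm does not exceed clip_norm."""
    global_norm = np.sqrt(sum(np.sum(g.astype(np.float32) ** 2) for g in grads))
    scale = clip_norm / max(global_norm, clip_norm)  # <= 1.0, a no-op when already small
    return [(g * scale).astype(g.dtype) for g in grads]

np.random.seed(0)
grads = [np.random.randn(4, 4).astype(np.float16), np.random.randn(8).astype(np.float32)]
clipped = clip_by_global_norm(grads, clip_norm=1.0)
print(np.sqrt(sum(np.sum(g.astype(np.float32) ** 2) for g in clipped)))  # ~1.0 after clipping
```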
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle
class TestAddnOp(unittest.TestCase):
def setUp(self):
np.random.seed(20)
l = 32
self.x_np = np.random.random([l, 16, 256])
def check_main(self, x_np, dtype, axis=None):
paddle.disable_static()
x = []
for i in range(x_np.shape[0]):
val = paddle.to_tensor(x_np[i].astype(dtype))
val.stop_gradient = False
x.append(val)
y = paddle.add_n(x)
x_g = paddle.grad(y, x)
y_np = y.numpy().astype('float32')
x_g_np = []
for val in x_g:
x_g_np.append(val.numpy().astype('float32'))
paddle.enable_static()
return y_np, x_g_np
def test_add_n_fp16(self):
if not paddle.is_compiled_with_cuda():
return
y_np_16, x_g_np_16 = self.check_main(self.x_np, 'float16')
y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
np.testing.assert_allclose(y_np_16, y_np_32, rtol=1e-03)
for i in range(len(x_g_np_32)):
np.testing.assert_allclose(x_g_np_16[i], x_g_np_32[i], rtol=1e-03)
def test_add_n_api(self):
if not paddle.is_compiled_with_cuda():
return
y_np_32, x_g_np_32 = self.check_main(self.x_np, 'float32')
y_np_gt = np.sum(self.x_np, axis=0).astype('float32')
np.testing.assert_allclose(y_np_32, y_np_gt, rtol=1e-06)
if __name__ == "__main__":
unittest.main()
This diff has been collapsed.
......@@ -440,16 +440,22 @@ class Optimizer(object):
return self._opti_name_list
def _create_global_learning_rate(self):
# lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr
# The lr var can't be float16 or bfloat16; for pure fp16 or bf16 training, the dtype of lr needs extra handling.
_lr_dtype = (
paddle.get_default_dtype() if self._dtype is None else self._dtype
)
_lr_dtype = (
paddle.float32
if (
(
paddle.get_default_dtype() != "float16"
and _lr_dtype == paddle.float16
)
or (
paddle.get_default_dtype() != "bfloat16"
and _lr_dtype == paddle.bfloat16
)
)
else _lr_dtype
)
if isinstance(self._learning_rate, LRScheduler):
......
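Read as a plain rule, the new expression keeps the learning-rate variable in float32 whenever the optimizer dtype resolves to fp16 or bf16, unless the user has deliberately set the matching low-precision global default dtype. The stand-alone restatement below is hypothetical (the function name and plain-string dtypes are illustrative) and only mirrors the selection logic shown in the diff:

```python
# Hypothetical restatement of the _lr_dtype selection, not Paddle code.
def resolve_lr_dtype(opt_dtype, default_dtype):
    """Mirror of the _lr_dtype selection in _create_global_learning_rate."""
    lr_dtype = default_dtype if opt_dtype is None else opt_dtype
    low_precision = (
        (lr_dtype == "float16" and default_dtype != "float16")
        or (lr_dtype == "bfloat16" and default_dtype != "bfloat16")
    )
    return "float32" if low_precision else lr_dtype

assert resolve_lr_dtype("float16", "float32") == "float32"   # AMP: keep lr in fp32
assert resolve_lr_dtype("bfloat16", "float32") == "float32"  # same rule now covers bf16
assert resolve_lr_dtype(None, "float16") == "float16"        # a pure fp16 default is respected
```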