diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
index 5f3c60df9a080a552e3c05293ffc3233bb017c60..e5bab3cae4fab5d0ab744773af1c6ed871519a71 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/scope_guard.h"
+#include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/dynload/cublasLt.h"
 #include "paddle/fluid/platform/float16.h"
@@ -63,6 +64,9 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
     if (std::is_same<T, paddle::platform::float16>::value) {
       mat_type = CUDA_R_16F;
     }
+    if (std::is_same<T, paddle::platform::bfloat16>::value) {
+      mat_type = CUDA_R_16BF;
+    }
     if (std::is_same<T, double>::value) {
       mat_type = CUDA_R_64F;
       scale_type = CUDA_R_64F;
@@ -354,6 +358,9 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
     if (std::is_same<T, paddle::platform::float16>::value) {
       mat_type = CUDA_R_16F;
     }
+    if (std::is_same<T, paddle::platform::bfloat16>::value) {
+      mat_type = CUDA_R_16BF;
+    }
     if (std::is_same<T, double>::value) {
       mat_type = CUDA_R_64F;
       scale_type = CUDA_R_64F;
@@ -688,12 +695,15 @@ REGISTER_OP_CUDA_KERNEL(
     fused_gemm_epilogue,
     ops::FusedGemmEpilogueKernel<phi::GPUContext, float>,
     ops::FusedGemmEpilogueKernel<phi::GPUContext, double>,
-    ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::float16>);
+    ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::float16>,
+    ops::FusedGemmEpilogueKernel<phi::GPUContext, paddle::platform::bfloat16>);
 
 REGISTER_OP_CUDA_KERNEL(
     fused_gemm_epilogue_grad,
     ops::FusedGemmEpilogueGradKernel<phi::GPUContext, float>,
     ops::FusedGemmEpilogueGradKernel<phi::GPUContext, double>,
     ops::FusedGemmEpilogueGradKernel<phi::GPUContext,
-                                     paddle::platform::float16>);
+                                     paddle::platform::float16>,
+    ops::FusedGemmEpilogueGradKernel<phi::GPUContext,
+                                     paddle::platform::bfloat16>);
 #endif
diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h
index b99d6de5dbbb42f5b8390cf450c91a351d276879..96eddf09237d9834f66263cafabf22b9346e889c 100644
--- a/paddle/fluid/platform/device/gpu/gpu_primitives.h
+++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h
@@ -198,61 +198,6 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr, T value) {
   CudaAtomicAdd(arr + index, value);
 }
-
-#ifdef PADDLE_WITH_CUDA
-/*
- * One thead block deals with elementwise atomicAdd for vector of len.
- * @in: [x1, x2, x3, ...]
- * @out:[y1+x1, y2+x2, y3+x3, ...]
- * */
-template <typename T,
-          typename std::enable_if<
-              !std::is_same<platform::float16, T>::value>::type * = nullptr>
-__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
-    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
-  for (int i = tid; i < len; i += threads_per_block) {
-    CudaAtomicAdd(&out[i], in[i]);
-  }
-}
-
-// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
-template <typename T,
-          typename std::enable_if<
-              std::is_same<platform::float16, T>::value>::type * = nullptr>
-__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
-    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
-#if ((CUDA_VERSION < 10000) || \
-     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
-  for (int i = tid; i < len; i += threads_per_block) {
-    CudaAtomicAdd(&out[i], in[i]);
-  }
-#else
-  int i = 0;
-  int loops = len / 2 * 2;
-
-  bool aligned_half2 =
-      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
-
-  if (aligned_half2) {
-    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
-      __half2 value2;
-      T value_1 = in[i];
-      T value_2 = in[i + 1];
-      value2.x = *reinterpret_cast<__half *>(&value_1);
-      value2.y = *reinterpret_cast<__half *>(&value_2);
-      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
-    }
-    for (; i < len; i += threads_per_block) {
-      fastAtomicAdd(out, i, len, in[i]);
-    }
-  } else {
-    for (int i = tid; i < len; i += threads_per_block) {
-      fastAtomicAdd(out, i, len, in[i]);
-    }
-  }
-#endif
-}
-#endif
 #endif
 
 // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16.
@@ -601,5 +546,61 @@ CUDA_ATOMIC_WRAPPER(Min, float16) {
 }
 #endif
 
+#ifdef PADDLE_CUDA_FP16
+#ifdef PADDLE_WITH_CUDA
+/*
+ * One thead block deals with elementwise atomicAdd for vector of len.
+ * @in: [x1, x2, x3, ...]
+ * @out:[y1+x1, y2+x2, y3+x3, ...]
+ * */
+template <typename T,
+          typename std::enable_if<
+              !std::is_same<platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+}
+
+// Note: assume that len is even. If len is odd, call fastAtomicAdd directly.
+template <typename T,
+          typename std::enable_if<
+              std::is_same<platform::float16, T>::value>::type * = nullptr>
+__device__ __forceinline__ void VectorizedAtomicAddPerBlock(
+    const int64_t len, int tid, int threads_per_block, const T *in, T *out) {
+#if ((CUDA_VERSION < 10000) || \
+     (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700)))
+  for (int i = tid; i < len; i += threads_per_block) {
+    CudaAtomicAdd(&out[i], in[i]);
+  }
+#else
+  int i = 0;
+  int loops = len / 2 * 2;
+
+  bool aligned_half2 =
+      (reinterpret_cast<std::uintptr_t>(out) % sizeof(__half2) == 0);
+
+  if (aligned_half2) {
+    for (i = tid * 2; i < loops; i += threads_per_block * 2) {
+      __half2 value2;
+      T value_1 = in[i];
+      T value_2 = in[i + 1];
+      value2.x = *reinterpret_cast<__half *>(&value_1);
+      value2.y = *reinterpret_cast<__half *>(&value_2);
+      atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2);
+    }
+    for (; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  } else {
+    for (int i = tid; i < len; i += threads_per_block) {
+      fastAtomicAdd(out, i, len, in[i]);
+    }
+  }
+#endif
+}
+#endif
+#endif
 }  // namespace platform
 }  // namespace paddle
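Note: the relocation above moves `VectorizedAtomicAddPerBlock` inside the `PADDLE_CUDA_FP16` guard, after the fp16 `CudaAtomicAdd` wrappers it depends on. The fp16 specialization packs two adjacent values into a `__half2` so that both lanes are updated by a single 32-bit hardware atomic. A minimal standalone sketch of that packing trick; the kernel below is illustrative, not Paddle API, and `atomicAdd(__half2*, __half2)` requires sm_60 or newer:

```cpp
#include <cuda_fp16.h>

// Each block adds `in` element-wise into `out`, two fp16 lanes per atomic.
// Assumes len is even and out is 4-byte aligned, mirroring the
// preconditions checked by the relocated Paddle helper.
__global__ void BlockVectorizedAtomicAdd(const __half *in, __half *out,
                                         int len) {
  for (int i = threadIdx.x * 2; i < len; i += blockDim.x * 2) {
    __half2 v = __halves2half2(in[i], in[i + 1]);       // pack two fp16 values
    atomicAdd(reinterpret_cast<__half2 *>(&out[i]), v);  // one 32-bit atomic
  }
}
```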
diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc
index 2c969cc43d2f1a03a95e4294aa30536b62a63a99..01b07c438a5270d0290a7cccd3a71401e8311388 100644
--- a/paddle/phi/kernels/empty_kernel.cc
+++ b/paddle/phi/kernels/empty_kernel.cc
@@ -88,6 +88,7 @@ PD_REGISTER_KERNEL(empty,
                    int64_t,
                    bool,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 51420c5ecb6dcb48ead8027ff52706874af98dc7..2af106ca38c48c32a65dc388900ff28d727d9eec 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -2169,12 +2169,14 @@ struct CudaSeluFunctor : public BaseActivationFunctor<T> {
   }
 
   __device__ __forceinline__ T operator()(const T x) const {
-    T res = x;
-    if (res <= zero) {
+    using MT =
+        typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type;
+    MT res = static_cast<MT>(x);
+    if (x <= zero) {
       res = alpha * expf(res) - alpha;
     }
     res *= scale;
-    return res;
+    return static_cast<T>(res);
   }
 
  private:
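Note: SELU is `scale * x` for positive inputs and `scale * alpha * (e^x - 1)` otherwise. Evaluating the exponential directly in fp16 or bf16 (11- and 8-bit mantissas) loses most of the precision, so the functor now widens to a compute type of at least fp32 and only casts back on return. A hedged host-side reference of the same promote-compute-demote pattern; names are illustrative:

```cpp
#include <cmath>
#include <type_traits>

// Reference SELU: fp16/bf16 storage types compute in float,
// while float/double compute natively.
template <typename T>
T SeluRef(T x) {
  using MT =
      typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type;
  const MT alpha = static_cast<MT>(1.6732632423543772);  // standard SELU alpha
  const MT scale = static_cast<MT>(1.0507009873554805);  // standard SELU scale
  MT res = static_cast<MT>(x);
  if (res <= MT(0)) {
    res = alpha * std::exp(res) - alpha;  // exp evaluated in at least fp32
  }
  res *= scale;
  return static_cast<T>(res);
}
```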
diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu
index 0b749f5c009a5d08cea109c9b1abbb25551b4deb..0c5a3408872c47494682304e47107be391fbe6c9 100644
--- a/paddle/phi/kernels/funcs/eigen/broadcast.cu
+++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu
@@ -84,6 +84,7 @@ INSTANTIATION(EigenBroadcast, int);
 INSTANTIATION(EigenBroadcast, int64_t);
 INSTANTIATION(EigenBroadcastGrad, bool);
 INSTANTIATION(EigenBroadcastGrad, float);
+INSTANTIATION(EigenBroadcastGrad, dtype::bfloat16);
 INSTANTIATION(EigenBroadcastGrad, dtype::float16);
 INSTANTIATION(EigenBroadcastGrad, double);
 INSTANTIATION(EigenBroadcastGrad, dtype::complex<float>);
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index 53f727ec51a39b3faed9ecf8684790d7e4670694..b947c70cb89d495d6fe9f58fe889119e4e8e54a9 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -449,4 +449,5 @@ PD_REGISTER_KERNEL(pow_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 0e9e754a99706675ae1f130aa118246f137ba5cf..e57332c40756af5e6b3e87f1ed8d966124945553 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -265,5 +265,12 @@ PD_REGISTER_KERNEL(pow,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
-PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
+PD_REGISTER_KERNEL(selu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SeluKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu
index d44f6d2800da5f3fa015394ca342a4e14eee1081..0597311e219da7c1116b36f69deb844ed23b522d 100644
--- a/paddle/phi/kernels/gpu/adam_kernel.cu
+++ b/paddle/phi/kernels/gpu/adam_kernel.cu
@@ -372,7 +372,8 @@ PD_REGISTER_KERNEL(adam,
                    phi::AdamDenseKernel,
                    float,
                    double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   // Skip beta1_pow, beta2_pow, skip_update data transform
   kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
   kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
@@ -385,7 +386,8 @@ PD_REGISTER_KERNEL(merged_adam,
                    phi::MergedAdamKernel,
                    float,
                    double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   // Skip beta1_pow, beta2_pow data transform
   kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND);
   kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND);
diff --git a/paddle/phi/kernels/gpu/clip_grad_kernel.cu b/paddle/phi/kernels/gpu/clip_grad_kernel.cu
index 4566e8468ec1640dbc5b506b03199a27792b1373..60d311a2555a0d50249a44fada1adf1627b12e54 100644
--- a/paddle/phi/kernels/gpu/clip_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/clip_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip_grad,
                    double,
                    int,
                    int64_t,
+                   phi::dtype::bfloat16,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/clip_kernel.cu b/paddle/phi/kernels/gpu/clip_kernel.cu
index 9e0050db7fdbf178acb4fe5cf7174ebc951fc465..e8d519a5d3a2b902eb6f71f6a8bafda0dc40b970 100644
--- a/paddle/phi/kernels/gpu/clip_kernel.cu
+++ b/paddle/phi/kernels/gpu/clip_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
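Note: `adam` and `merged_adam` now accept bf16 parameters, and, as with fp16, the moment accumulators stay in fp32 (see the `_add_moments_pows` change in `python/paddle/optimizer/adam.py` further down). A simplified, hedged single-element sketch of why that matters; names and signature are illustrative, not the Paddle kernel's:

```cpp
#include <cmath>

// One Adam step for one parameter. The gradient may arrive as bf16, but the
// moments m/v and the master copy of the parameter are fp32: bf16's 8-bit
// mantissa cannot absorb the tiny per-step updates without stalling.
void AdamStepRef(float* param, float grad, float* m, float* v, int t,
                 float lr = 1e-3f, float beta1 = 0.9f, float beta2 = 0.999f,
                 float eps = 1e-8f) {
  *m = beta1 * *m + (1.0f - beta1) * grad;
  *v = beta2 * *v + (1.0f - beta2) * grad * grad;
  const float m_hat = *m / (1.0f - std::pow(beta1, t));  // bias correction
  const float v_hat = *v / (1.0f - std::pow(beta2, t));
  *param -= lr * m_hat / (std::sqrt(v_hat) + eps);
  // The bf16 parameter tensor is then produced by rounding *param.
}
```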
diff --git a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
index 6694216214c315c5449200821667c92e1e35697b..e10d01ce9e4a572563ecd42b2b83a0668dbfa4c8 100644
--- a/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_grad_kernel.cu
@@ -256,7 +256,8 @@ PD_REGISTER_KERNEL(embedding_grad,
                    phi::EmbeddingGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
 
 PD_REGISTER_KERNEL(embedding_sparse_grad,
                    GPU,
@@ -264,4 +265,5 @@ PD_REGISTER_KERNEL(embedding_sparse_grad,
                    phi::EmbeddingSparseGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/embedding_kernel.cu b/paddle/phi/kernels/gpu/embedding_kernel.cu
index 90f3cc8d36032bab199236dd0d45cf2c4d82ac96..bb22fea5f6493dab2b2f9baa485abb5224e8aad8 100644
--- a/paddle/phi/kernels/gpu/embedding_kernel.cu
+++ b/paddle/phi/kernels/gpu/embedding_kernel.cu
@@ -125,4 +125,5 @@ PD_REGISTER_KERNEL(embedding,
                    phi::EmbeddingKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
index 1f33d5c901f297d2dd07d8f2a94ec64d34c3cec3..b1ffa921f912b760480d3a7a63a7f1db7ac9360c 100644
--- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu
@@ -99,4 +99,5 @@ PD_REGISTER_KERNEL(gelu_grad,
                    phi::GeluGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu
index 509a5ccf4d177f6b6186cc7697bdd18888d18d45..e0792c387d7510c7fdf7016bb7a0940462e13a66 100644
--- a/paddle/phi/kernels/gpu/gelu_kernel.cu
+++ b/paddle/phi/kernels/gpu/gelu_kernel.cu
@@ -93,4 +93,5 @@ PD_REGISTER_KERNEL(gelu,
                    phi::GeluKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
index e9f820a318482c1aa01164110661c13194b49599..fb7f1a2325790c56b76e448f1be51c577dd8e5cf 100644
--- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu
@@ -509,4 +509,5 @@ PD_REGISTER_KERNEL(pad3d_grad,
                    phi::Pad3dGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu
index d1b1d70667673e76de93711643490c5465d4bbb9..fa85c650bc8542bb7331cb8c906a8ddf6193170d 100644
--- a/paddle/phi/kernels/gpu/pad3d_kernel.cu
+++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu
@@ -583,6 +583,7 @@ PD_REGISTER_KERNEL(pad3d,
                    ALL_LAYOUT,
                    phi::Pad3dKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
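Note: `gelu` and `gelu_grad` join the bf16 list. For reference, the tanh-approximated GELU is `0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`; a hedged host-side sketch with the math done in fp32, mirroring the promote-compute-demote pattern of the SELU change above (names illustrative, not the Paddle kernel):

```cpp
#include <cmath>

// Reference tanh-approximation GELU; the storage type T may be fp16/bf16,
// but the transcendental math runs in fp32.
template <typename T>
T GeluTanhRef(T x) {
  const float kKappa = 0.7978845608f;  // sqrt(2 / pi)
  const float mx = static_cast<float>(x);
  const float inner = kKappa * (mx + 0.044715f * mx * mx * mx);
  return static_cast<T>(0.5f * mx * (1.0f + std::tanh(inner)));
}
```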
diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
index 1414fb9df0b41ca5c857d04b2251dabe05c8a36f..5c88bbbf425325f0b1111e886eb83e06deb8b162 100644
--- a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu
@@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(pixel_shuffle_grad,
                    ALL_LAYOUT,
                    phi::PixelShuffleGradKernel,
                    float,
-                   double) {}
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
index e43d6f961236afcd51b29760c34d626464b3d0ca..09eb0485a297fac8d652619ffc437cd0b0758e9d 100644
--- a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
+++ b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu
@@ -18,5 +18,11 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h"
 
-PD_REGISTER_KERNEL(
-    pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {}
+PD_REGISTER_KERNEL(pixel_shuffle,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::PixelShuffleKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu
index 0ed299413c1726f617dee9a8b5b4bf1d79d30efe..c715831ffc7ffcef400eb7ff11551cf5d636f055 100644
--- a/paddle/phi/kernels/gpu/selu_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu
@@ -18,5 +18,10 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h"
 
-PD_REGISTER_KERNEL(
-    selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {}
+PD_REGISTER_KERNEL(selu_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SeluGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/tile_grad_kernel.cu b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
index c092609e623d3f4f3dc4b3d77b1c973e6ddfbcf3..d1e356df401a881ec2b55e8576cbabbf14e35d7d 100644
--- a/paddle/phi/kernels/gpu/tile_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/tile_grad_kernel.cu
@@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile_grad,
                    double,
                    int,
                    int64_t,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu
index 709dddcb82c7e76940d850750916758a31ea4f44..4c411bfb9cd5a312b58ea6fc8d19f50965afc38a 100644
--- a/paddle/phi/kernels/gpu/where_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu
@@ -25,10 +25,10 @@ __global__ void WhereGradCUDAKernel(
   int idx = blockDim.x * blockIdx.x + threadIdx.x;
   for (; idx < N; idx += blockDim.x * gridDim.x) {
     if (dx != nullptr) {
-      dx[idx] = cond[idx] ? dout[idx] : 0.;
+      dx[idx] = cond[idx] ? dout[idx] : static_cast<T>(0.);
     }
     if (dy != nullptr) {
-      dy[idx] = cond[idx] ? 0. : dout[idx];
+      dy[idx] = cond[idx] ? static_cast<T>(0.) : dout[idx];
     }
   }
 }
@@ -61,6 +61,8 @@ PD_REGISTER_KERNEL(where_grad,
                    GPU,
                    ALL_LAYOUT,
                    phi::WhereGradKernel,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    float,
                    double,
                    int,
diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu
index 441be02b99efa266c55590a79b21263f05ab46d8..09a974fbc2340097502b0e2a77840fd4f0d739e7 100644
--- a/paddle/phi/kernels/gpu/where_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_kernel.cu
@@ -45,5 +45,13 @@ void WhereKernel(const Context& ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {}
+PD_REGISTER_KERNEL(where,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::WhereKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h
index 288f7bb9b793e673e0639d2f1f326c16631b49aa..0725b141252bcb3408ad501d504bab979dd3f59e 100644
--- a/paddle/phi/kernels/impl/selu_kernel_impl.h
+++ b/paddle/phi/kernels/impl/selu_kernel_impl.h
@@ -57,14 +57,17 @@ struct SeluGradFunctor {
         dx_data_ptr_(dx_data_ptr) {}
 
   HOSTDEVICE void operator()(size_t idx) const {
-    T y_ele = y_data_ptr_[idx];
-    T dy_ele = dy_data_ptr_[idx];
+    using MT =
+        typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type;
 
-    float tmp = scale_;
+    auto y_ele = static_cast<MT>(y_data_ptr_[idx]);
+    auto dy_ele = static_cast<MT>(dy_data_ptr_[idx]);
+
+    auto tmp = static_cast<MT>(scale_);
     if (y_ele <= 0) {
-      tmp = y_ele + la_;
+      tmp = y_ele + static_cast<MT>(la_);
     }
-    dx_data_ptr_[idx] = dy_ele * tmp;
+    dx_data_ptr_[idx] = static_cast<T>(dy_ele * tmp);
   }
 
   const T* y_data_ptr_;
   const T* dy_data_ptr_;
diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py
index e6f2e17c05f243c0df9797bbbbc645cfd797a6ec..e9e364585260304ab20bc4f5960e0aeff912b23d 100644
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -50,8 +50,9 @@ def _clip_by_global_norm_using_mp_type(*args):
 
 
 def _cast_to_mp_type_if_enabled(x):
-    if x.dtype == core.VarDesc.VarType.FP16 and _clip_by_global_norm_using_mp_type(
-    ):
+    if (x.dtype == core.VarDesc.VarType.FP16
+            or x.dtype == core.VarDesc.VarType.BF16
+        ) and _clip_by_global_norm_using_mp_type():
         return x.astype(core.VarDesc.VarType.FP32)
     else:
         return x
@@ -63,7 +64,8 @@ def _squared_l2_norm(x):
     """
 
     x = _cast_to_mp_type_if_enabled(x)
-    if core.is_compiled_with_xpu() or x.dtype == core.VarDesc.VarType.FP16:
+    if core.is_compiled_with_xpu(
+    ) or x.dtype == core.VarDesc.VarType.FP16 or x.dtype == core.VarDesc.VarType.BF16:
         square = layers.square(x)
         sum_square = layers.reduce_sum(square)
         return sum_square
@@ -499,7 +501,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
 
                 sum_square = _squared_l2_norm(merge_grad)
-                if sum_square.dtype == core.VarDesc.VarType.FP16:
+                if sum_square.dtype == core.VarDesc.VarType.FP16 or sum_square.dtype == core.VarDesc.VarType.BF16:
                     sum_square_list_fp16.append(sum_square)
                 elif sum_square.dtype == core.VarDesc.VarType.FP32:
                     sum_square_list_fp32.append(sum_square)
@@ -552,8 +554,8 @@ class ClipGradByGlobalNorm(ClipGradBase):
                     continue
                 # TODO(wangxi): use inplace elementwise_mul if need
                 if need_clip:
-                    clip_input = (clip_var.astype('float16') if g.dtype
-                                  == core.VarDesc.VarType.FP16 else clip_var)
+                    clip_input = (clip_var.astype(g.dtype)
+                                  if clip_var.dtype != g.dtype else clip_var)
                     new_grad = layers.elementwise_mul(g, clip_input)
                     params_and_grads.append((p, new_grad))
                 else:
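Note: the `ClipGradByGlobalNorm` changes above generalize the existing fp16 bucketing to bf16 and cast `clip_var` to whatever dtype the gradient actually has. For orientation, a hedged plain-C++ miniature of the computation the class performs; illustrative only, not Paddle's implementation:

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Global-norm clipping in miniature: partial squared norms of fp16/bf16
// gradients are accumulated in fp32 (the sum_square_list_fp16 bucket above),
// and the resulting scale is cast back to each gradient's own dtype.
float GlobalNormClipScale(const std::vector<std::vector<float>>& grads,
                          float max_norm) {
  float sum_sq = 0.0f;  // fp32 accumulator, regardless of gradient dtype
  for (const auto& g : grads) {
    for (float v : g) sum_sq += v * v;
  }
  const float global_norm = std::sqrt(sum_sq);
  // Shrink only when the global norm exceeds the threshold;
  // otherwise the scale is exactly 1.
  return max_norm / std::max(global_norm, max_norm);
}
```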
diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py
index 4f8122121b62c0b960ef79ca02b6d5befea7e237..41d22e778fc3ff7c8d125adae24a7c97a19b53df 100644
--- a/python/paddle/optimizer/adam.py
+++ b/python/paddle/optimizer/adam.py
@@ -275,7 +275,7 @@ class Adam(Optimizer):
 
     def _add_moments_pows(self, p):
         acc_dtype = p.dtype
-        if acc_dtype == core.VarDesc.VarType.FP16:
+        if acc_dtype == core.VarDesc.VarType.FP16 or acc_dtype == core.VarDesc.VarType.BF16:
             acc_dtype = core.VarDesc.VarType.FP32
         self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype)
         self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype)
diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py
index b5946459d344c42b910507b14f383ff8dd8279d4..144620f3c6ea4eb41560905d47f6e4006d1d17f6 100644
--- a/python/paddle/tensor/stat.py
+++ b/python/paddle/tensor/stat.py
@@ -159,8 +159,10 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None):
 
     u = mean(x, axis, True, name)
     out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name)
-    n = paddle.cast(paddle.numel(x), x.dtype) \
-        / paddle.cast(paddle.numel(out), x.dtype)
+    dtype = x.dtype
+    n = paddle.cast(paddle.numel(x), paddle.int64) \
+        / paddle.cast(paddle.numel(out), paddle.int64)
+    n = n.astype(dtype)
     if unbiased:
         one_const = paddle.ones([1], x.dtype)
         n = where(n > one_const, n - 1., one_const)
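Note: the `var` change computes the element-count ratio in int64 and only then casts to the data dtype. Counts above 65504 overflow fp16 to inf, and bf16 keeps roughly 8 bits of mantissa, so `numel(x) / numel(out)` evaluated in 16-bit floats is wrong for large tensors. A hedged, self-contained C++ illustration that emulates bf16 rounding by truncating an fp32 value's low mantissa bits:

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

// Emulate bf16 rounding: keep only the top 16 bits of the fp32 encoding.
float ToBf16(float v) {
  std::uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= 0xFFFF0000u;  // truncate the low 16 mantissa bits
  std::memcpy(&v, &bits, sizeof(v));
  return v;
}

int main() {
  const std::int64_t numel_x = 1000001, numel_out = 1;
  // Old order: cast the counts to the data dtype first, then divide;
  // the cast alone already corrupts the count (prints 999424).
  float bad = ToBf16(static_cast<float>(numel_x)) /
              ToBf16(static_cast<float>(numel_out));
  // New order: divide exactly in int64, then cast the small quotient.
  float good = static_cast<float>(numel_x / numel_out);
  std::printf("bf16 counts: %.0f, int64 counts: %.0f\n", bad, good);
}
```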