Unverified · Commit c0163837 · authored by Leo Chen · committed by GitHub

Fix compile problem when cuda_arch < 6000 (#29576)

* fix compile problem when cuda_arch < 6000

* refine code

* refine code
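Background for the fix: hrsqrt() is a half-precision device intrinsic that only exists when compiling for GPU architectures with native fp16 math, so an unguarded half specialization of rsqrt fails to compile for older targets (the cuda_arch < 6000 case in the title), while the generic template can fall back to 1 / sqrt(val). The sketch below is a minimal, self-contained illustration of that guard-plus-fallback pattern, not the PaddlePaddle source: rsqrt_demo, DEMO_FP16_MATH_AVAILABLE, and rsqrt_demo_kernel are names invented for the demo, and the >= 530 threshold is an assumption standing in for Paddle's CUDA_ARCH_FP16_SUPPORTED macro.

// guard_fallback_demo.cu -- illustrative sketch only, not the PaddlePaddle source.
#include <cstdio>
#include <cuda_fp16.h>

// Hypothetical stand-in for Paddle's CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__);
// the 530 threshold is an assumption for this demo.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
#define DEMO_FP16_MATH_AVAILABLE 1
#else
#define DEMO_FP16_MATH_AVAILABLE 0
#endif

// Generic fallback: compiles for any floating type on any architecture.
template <typename T>
__inline__ __device__ T rsqrt_demo(const T val) {
  return static_cast<T>(1) / sqrt(val);
}

// Fast float path via the rsqrtf intrinsic.
template <>
__inline__ __device__ float rsqrt_demo(const float val) {
  return rsqrtf(val);
}

#if DEMO_FP16_MATH_AVAILABLE
// hrsqrt is only available on fp16-capable architectures, so keep the
// half specialization behind the guard to avoid the compile error.
template <>
__inline__ __device__ half rsqrt_demo(const half val) {
  return hrsqrt(val);
}
#endif

__global__ void rsqrt_demo_kernel(const float *in, float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = rsqrt_demo(in[i]);
}

int main() {
  const int n = 4;
  float h_in[n] = {1.f, 4.f, 16.f, 64.f}, h_out[n];
  float *d_in, *d_out;
  cudaMalloc(&d_in, n * sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);
  rsqrt_demo_kernel<<<1, 32>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("rsqrt(%g) = %g\n", h_in[i], h_out[i]);
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}

Building this demo with and without an fp16-capable target (for example nvcc -arch=sm_52 versus -arch=sm_70) should succeed in both cases; only the generic and float paths are exercised by the kernel.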
Parent 79a41a9e
@@ -109,7 +109,7 @@ struct PairForLayerNormAddFunctor {
 template <typename T>
 __inline__ __device__ T rsqrt(const T val) {
-  return ::rsqrt(val);
+  return static_cast<T>(1) / sqrt(val);
 }
 
 template <>
@@ -117,10 +117,17 @@ __inline__ __device__ float rsqrt(const float val) {
   return rsqrtf(val);
 }
 
+template <>
+__inline__ __device__ double rsqrt(const double val) {
+  return ::rsqrt(val);
+}
+
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
 template <>
 __inline__ __device__ half rsqrt(const half val) {
   return hrsqrt(val);
 }
+#endif
 
 template <typename T, typename U, int BlockDim>
 __global__ void LayerNormForward(const T *x, const U *scale, const U *bias,
@@ -841,6 +848,7 @@ class LayerNormKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    using U = LayerNormParamType<T>;
     const float epsilon = ctx.Attr<float>("epsilon");
     auto *scale = ctx.Input<Tensor>("Scale");
     auto *bias = ctx.Input<Tensor>("Bias");
@@ -854,12 +862,10 @@ class LayerNormKernel<platform::CUDADeviceContext, T>
     const auto x_dims = x->dims();
     auto *x_data = x->data<T>();
     auto *y_data = y->mutable_data<T>(ctx.GetPlace());
-    auto *mean_data = mean->mutable_data<LayerNormParamType<T>>(ctx.GetPlace());
-    auto *var_data = var->mutable_data<LayerNormParamType<T>>(ctx.GetPlace());
-    auto *scale_data =
-        (scale == nullptr ? nullptr : scale->data<LayerNormParamType<T>>());
-    auto *bias_data =
-        (bias == nullptr ? nullptr : bias->data<LayerNormParamType<T>>());
+    auto *mean_data = mean->mutable_data<U>(ctx.GetPlace());
+    auto *var_data = var->mutable_data<U>(ctx.GetPlace());
+    auto *scale_data = (scale == nullptr ? nullptr : scale->data<U>());
+    auto *bias_data = (bias == nullptr ? nullptr : bias->data<U>());
 
     auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
     int batch_size = static_cast<int>(matrix_dim[0]);
@@ -869,7 +875,7 @@ class LayerNormKernel<platform::CUDADeviceContext, T>
     switch (GetDesiredBlockDim(feature_size)) {
       FIXED_BLOCK_DIM_CASE(
-          LayerNormForward<T, LayerNormParamType<T>,
+          LayerNormForward<T, U,
                            kBlockDim><<<batch_size, kBlockDim, 0, stream>>>(
               x_data, scale_data, bias_data, y_data, mean_data, var_data,
               epsilon, feature_size));