diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
index 0eed1a4f5e71f690c2a426164667422b0628bcae..b6f34a8549d3bb9fe4b497740969105605542313 100644
--- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc
@@ -60,6 +60,8 @@ class LayerNormOpConverter : public OpConverter {
       // the shape of mean and variance will be determine in configuPlugin.
       std::vector<int64_t> mean_shape{1};
       std::vector<int64_t> variance_shape{1};
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::LayerNormPluginDynamic* plugin =
           new plugin::LayerNormPluginDynamic(
               static_cast<const float*>(bias_weight.get().values),
@@ -69,7 +71,8 @@ class LayerNormOpConverter : public OpConverter {
               begin_norm_axis,
               eps,
               mean_shape,
-              variance_shape);
+              variance_shape,
+              with_fp16);
       layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin);
     } else {
       int statis_num = 1;
@@ -78,6 +81,8 @@ class LayerNormOpConverter : public OpConverter {
       }
       std::vector<int64_t> mean_shape{statis_num};
       std::vector<int64_t> variance_shape{statis_num};
+      bool with_fp16 =
+          engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
       plugin::LayerNormPlugin* plugin = new plugin::LayerNormPlugin(
           static_cast<const float*>(bias_weight.get().values),
           bias_weight.get().count,
@@ -86,7 +91,8 @@ class LayerNormOpConverter : public OpConverter {
           begin_norm_axis,
           eps,
           mean_shape,
-          variance_shape);
+          variance_shape,
+          with_fp16);
       layernorm_layer = engine_->AddPlugin(
           &X, 1, reinterpret_cast<plugin::PluginTensorRT*>(plugin));
     }
diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
index da4ebdc6cb6e81c6df97918fdc987db7ee05a769..09a93d661bd35150c13353462c6a5db5d89967d8 100644
--- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.cu
@@ -26,7 +26,30 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
-int LayerNormPlugin::initialize() TRT_NOEXCEPT { return 0; }
+int LayerNormPlugin::initialize() TRT_NOEXCEPT {
+  cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size());
+  cudaMemcpy(bias_gpu_,
+             bias_.data(),
+             bias_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size());
+  cudaMemcpy(scale_gpu_,
+             scale_.data(),
+             scale_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+  return 0;
+}
+
+void LayerNormPlugin::terminate() TRT_NOEXCEPT {
+  if (bias_gpu_) {
+    cudaFree(bias_gpu_);
+    bias_gpu_ = nullptr;
+  }
+  if (scale_gpu_) {
+    cudaFree(scale_gpu_);
+    scale_gpu_ = nullptr;
+  }
+}
 
 nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT {
@@ -37,6 +60,18 @@ nvinfer1::Dims LayerNormPlugin::getOutputDimensions(
   return output_dims;
 }
 
+bool LayerNormPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT {
+  if (with_fp16_) {
+    return ((type == nvinfer1::DataType::kFLOAT ||
+             type == nvinfer1::DataType::kHALF) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  } else {
+    return ((type == nvinfer1::DataType::kFLOAT) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  }
+}
+
 int LayerNormPlugin::enqueue(int batch_size,
                              const void *const *inputs,
 #if IS_TRT_VERSION_LT(8000)
@@ -48,8 +83,6 @@ int LayerNormPlugin::enqueue(int batch_size,
 #endif
                              cudaStream_t stream) TRT_NOEXCEPT {
   const auto &input_dims = this->getInputDims(0);
-  const float *input = reinterpret_cast<const float *>(inputs[0]);
-  float *output = reinterpret_cast<float *const *>(outputs)[0];
   int begin_norm_axis = begin_norm_axis_;
   float eps = eps_;
 
@@ -92,42 +125,76 @@ int LayerNormPlugin::enqueue(int batch_size,
                         feature_size,
                         bias_.size()));
 
-  scale_t.Resize(phi::make_ddim({feature_size}));
-  bias_t.Resize(phi::make_ddim({feature_size}));
-  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
-  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
   int device_id;
   cudaGetDevice(&device_id);
-  float *scale_d = scale_t.mutable_data<float>(platform::CUDAPlace(device_id));
-  float *bias_d = bias_t.mutable_data<float>(platform::CUDAPlace(device_id));
+  mean_t.Resize(phi::make_ddim({batched_mean_shape}));
+  variance_t.Resize(phi::make_ddim({batched_variance_shape}));
   float *mean_d = mean_t.mutable_data<float>(platform::CUDAPlace(device_id));
   float *variance_d =
       variance_t.mutable_data<float>(platform::CUDAPlace(device_id));
-  cudaMemcpyAsync(scale_d,
-                  scale_.data(),
-                  sizeof(float) * feature_size,
-                  cudaMemcpyHostToDevice,
-                  stream);
-  cudaMemcpyAsync(bias_d,
-                  bias_.data(),
-                  sizeof(float) * feature_size,
-                  cudaMemcpyHostToDevice,
-                  stream);
-
-  phi::LayerNormDirectCUDAFunctor<float> layer_norm;
-  layer_norm(stream,
-             input,
-             input_shape,
-             bias_d,
-             scale_d,
-             output,
-             mean_d,
-             variance_d,
-             begin_norm_axis,
-             eps);
+  auto input_type = getDataType();
+  if (input_type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp32";
+    const float *input = reinterpret_cast<const float *>(inputs[0]);
+    float *output = static_cast<float *>(outputs[0]);
+    phi::LayerNormDirectCUDAFunctor<float, float> layer_norm;
+    layer_norm(stream,
+               input,
+               input_shape,
+               bias_gpu_,
+               scale_gpu_,
+               output,
+               mean_d,
+               variance_d,
+               begin_norm_axis,
+               eps);
+  } else if (input_type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16";
LayerNorm-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = static_cast(outputs[0]); + phi::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, + input, + input_shape, + bias_gpu_, + scale_gpu_, + output, + mean_d, + variance_d, + begin_norm_axis, + eps); + } else { + PADDLE_THROW(platform::errors::Fatal( + "The LayerNorm TRT Plugin's input type should be float or half.")); + } return cudaGetLastError() != cudaSuccess; } +int LayerNormPluginDynamic::initialize() TRT_NOEXCEPT { + cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size()); + cudaMemcpy(bias_gpu_, + bias_.data(), + bias_.size() * sizeof(float), + cudaMemcpyHostToDevice); + cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size()); + cudaMemcpy(scale_gpu_, + scale_.data(), + scale_.size() * sizeof(float), + cudaMemcpyHostToDevice); + return 0; +} + +void LayerNormPluginDynamic::terminate() TRT_NOEXCEPT { + if (bias_gpu_) { + cudaFree(bias_gpu_); + bias_gpu_ = nullptr; + } + if (scale_gpu_) { + cudaFree(scale_gpu_); + scale_gpu_ = nullptr; + } +} + nvinfer1::DimsExprs LayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputDims, @@ -154,9 +221,14 @@ bool LayerNormPluginDynamic::supportsFormatCombination( nb_inputs + nb_outputs)); const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { - // TODO(Shangzhizhou) FP16 support - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + if (with_fp16_) { + return ((in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::PluginFormat::kLINEAR)); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; // output @@ -187,6 +259,11 @@ nvinfer1::DataType LayerNormPluginDynamic::getOutputDataType( "The LayerNormPlugin only has one input, so the " "index value should be 0, but get %d.", index)); + PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT || + input_types[0] == nvinfer1::DataType::kHALF), + true, + platform::errors::InvalidArgument( + "The input type should be half or float")); return input_types[0]; } @@ -249,42 +326,40 @@ int LayerNormPluginDynamic::enqueue( "but got feature_size:%d, bias's size:%d.", feature_size, bias_.size())); + int device_id; cudaGetDevice(&device_id); + mean_t.Resize(phi::make_ddim(mean_shape_)); + variance_t.Resize(phi::make_ddim(variance_shape_)); + float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); + float *variance_d = + variance_t.mutable_data(platform::CUDAPlace(device_id)); auto input_type = input_desc[0].type; if (input_type == nvinfer1::DataType::kFLOAT) { VLOG(1) << "TRT Plugin DataType selected. 
LayerNorm-->fp32"; const float *input = reinterpret_cast(inputs[0]); float *output = static_cast(outputs[0]); - scale_t.Resize(phi::make_ddim({feature_size})); - bias_t.Resize(phi::make_ddim({feature_size})); - mean_t.Resize(phi::make_ddim(mean_shape_)); - variance_t.Resize(phi::make_ddim(variance_shape_)); - - float *scale_d = - scale_t.mutable_data(platform::CUDAPlace(device_id)); - float *bias_d = bias_t.mutable_data(platform::CUDAPlace(device_id)); - float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); - float *variance_d = - variance_t.mutable_data(platform::CUDAPlace(device_id)); - - cudaMemcpyAsync(scale_d, - scale_.data(), - sizeof(float) * feature_size, - cudaMemcpyHostToDevice, - stream); - cudaMemcpyAsync(bias_d, - bias_.data(), - sizeof(float) * feature_size, - cudaMemcpyHostToDevice, - stream); - - phi::LayerNormDirectCUDAFunctor layer_norm; + phi::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, + input, + input_shape, + bias_gpu_, + scale_gpu_, + output, + mean_d, + variance_d, + begin_norm_axis, + eps); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. LayerNorm-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = static_cast(outputs[0]); + phi::LayerNormDirectCUDAFunctor layer_norm; layer_norm(stream, input, input_shape, - bias_d, - scale_d, + bias_gpu_, + scale_gpu_, output, mean_d, variance_d, @@ -292,7 +367,7 @@ int LayerNormPluginDynamic::enqueue( eps); } else { PADDLE_THROW(platform::errors::Fatal( - "The LayerNorm TRT Plugin's input type should be float.")); + "The LayerNorm TRT Plugin's input type should be float or half.")); } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h index a8ccabb3cff597f37f49237b3043b0b9273318b2..014a1eca0306ac02c995d6b03ccbe5de80a57c2f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/layer_norm_op_plugin.h @@ -31,8 +31,6 @@ namespace plugin { class LayerNormPlugin : public PluginTensorRT { std::vector bias_; std::vector scale_; - framework::Tensor scale_t; - framework::Tensor bias_t; framework::Tensor mean_t; framework::Tensor variance_t; int begin_norm_axis_; @@ -40,12 +38,16 @@ class LayerNormPlugin : public PluginTensorRT { std::vector mean_shape_; std::vector variance_shape_; + // data on devices + float* bias_gpu_{nullptr}; + float* scale_gpu_{nullptr}; + public: size_t getSerializationSize() const TRT_NOEXCEPT override { return getBaseSerializationSize() + SerializedSize(bias_) + SerializedSize(scale_) + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + SerializedSize(mean_shape_) + - SerializedSize(variance_shape_); + SerializedSize(variance_shape_) + SerializedSize(with_fp16_); } // TRT will call this func when we need to serialize the configuration of @@ -59,6 +61,7 @@ class LayerNormPlugin : public PluginTensorRT { SerializeValue(&buffer, eps_); SerializeValue(&buffer, mean_shape_); SerializeValue(&buffer, variance_shape_); + SerializeValue(&buffer, with_fp16_); } LayerNormPlugin(const float* bias, @@ -68,11 +71,13 @@ class LayerNormPlugin : public PluginTensorRT { int begin_norm_axis, float eps, std::vector mean_shape, - std::vector variance_shape) + std::vector variance_shape, + bool with_fp16) : begin_norm_axis_(begin_norm_axis), eps_(eps), mean_shape_(mean_shape), variance_shape_(variance_shape) { + with_fp16_ = 
     bias_.resize(bias_num);
     scale_.resize(scale_num);
     std::copy(bias, bias + bias_num, bias_.data());
@@ -89,24 +94,33 @@ class LayerNormPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &eps_);
     DeserializeValue(&serialData, &serialLength, &mean_shape_);
     DeserializeValue(&serialData, &serialLength, &variance_shape_);
+    DeserializeValue(&serialData, &serialLength, &with_fp16_);
   }
   ~LayerNormPlugin() {}
   int initialize() TRT_NOEXCEPT override;
+  void terminate() TRT_NOEXCEPT override;
 
   LayerNormPlugin* clone() const TRT_NOEXCEPT override {
-    return new LayerNormPlugin(bias_.data(),
-                               bias_.size(),
-                               scale_.data(),
-                               scale_.size(),
-                               begin_norm_axis_,
-                               eps_,
-                               mean_shape_,
-                               variance_shape_);
+    auto ptr = new LayerNormPlugin(bias_.data(),
+                                   bias_.size(),
+                                   scale_.data(),
+                                   scale_.size(),
+                                   begin_norm_axis_,
+                                   eps_,
+                                   mean_shape_,
+                                   variance_shape_,
+                                   with_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const TRT_NOEXCEPT override {
     return "layernorm_plugin";
   }
+  bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format)
+      const TRT_NOEXCEPT override;
+
   int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims* inputs,
@@ -150,11 +164,13 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
                          int begin_norm_axis,
                          float eps,
                          std::vector<int64_t> mean_shape,
-                         std::vector<int64_t> variance_shape)
+                         std::vector<int64_t> variance_shape,
+                         bool with_fp16)
       : begin_norm_axis_(begin_norm_axis),
         eps_(eps),
         mean_shape_(mean_shape),
         variance_shape_(variance_shape) {
+    with_fp16_ = with_fp16;
     bias_.resize(bias_num);
     scale_.resize(scale_num);
     std::copy(bias, bias + bias_num, bias_.data());
@@ -168,28 +184,35 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &eps_);
     DeserializeValue(&serialData, &serialLength, &mean_shape_);
     DeserializeValue(&serialData, &serialLength, &variance_shape_);
+    DeserializeValue(&serialData, &serialLength, &with_fp16_);
   }
   nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
-    return new LayerNormPluginDynamic(bias_.data(),
-                                      bias_.size(),
-                                      scale_.data(),
-                                      scale_.size(),
-                                      begin_norm_axis_,
-                                      eps_,
-                                      mean_shape_,
-                                      variance_shape_);
+    auto ptr = new LayerNormPluginDynamic(bias_.data(),
+                                          bias_.size(),
+                                          scale_.data(),
+                                          scale_.size(),
+                                          begin_norm_axis_,
+                                          eps_,
+                                          mean_shape_,
+                                          variance_shape_,
+                                          with_fp16_);
+    ptr->bias_gpu_ = bias_gpu_;
+    ptr->scale_gpu_ = scale_gpu_;
+    return ptr;
   }
 
   const char* getPluginType() const TRT_NOEXCEPT override {
     return "layernorm_plugin_dynamic";
   }
   int getNbOutputs() const TRT_NOEXCEPT override { return 1; }
-  int initialize() TRT_NOEXCEPT override { return 0; }
+  int initialize() TRT_NOEXCEPT override;
+  void terminate() TRT_NOEXCEPT override;
 
   size_t getSerializationSize() const TRT_NOEXCEPT override {
     return SerializedSize(bias_) + SerializedSize(scale_) +
            SerializedSize(begin_norm_axis_) + SerializedSize(eps_) +
-           SerializedSize(mean_shape_) + SerializedSize(variance_shape_);
+           SerializedSize(mean_shape_) + SerializedSize(variance_shape_) +
+           SerializedSize(with_fp16_);
   }
 
   void serialize(void* buffer) const TRT_NOEXCEPT override {
@@ -199,6 +222,7 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
     SerializeValue(&buffer, eps_);
     SerializeValue(&buffer, mean_shape_);
     SerializeValue(&buffer, variance_shape_);
+    SerializeValue(&buffer, with_fp16_);
   }
 
   nvinfer1::DimsExprs getOutputDimensions(int output_index,
@@ -240,14 +264,15 @@ class LayerNormPluginDynamic : public DynamicPluginTensorRT {
  private:
   std::vector<float> bias_;
   std::vector<float> scale_;
-  framework::Tensor scale_t;
-  framework::Tensor bias_t;
   framework::Tensor mean_t;
   framework::Tensor variance_t;
   int begin_norm_axis_;
   float eps_;
   std::vector<int64_t> mean_shape_;
   std::vector<int64_t> variance_shape_;
+  // data on devices
+  float* bias_gpu_{nullptr};
+  float* scale_gpu_{nullptr};
 };
 
 class LayerNormPluginDynamicCreator : public TensorRTPluginCreator {
diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h
index 899eae3efb45bc8573160554d05eed5d20979208..af3d4cec39737fd36a212cc1e2ac8e3800165deb 100644
--- a/paddle/fluid/operators/layer_norm_kernel.cu.h
+++ b/paddle/fluid/operators/layer_norm_kernel.cu.h
@@ -379,7 +379,8 @@ __global__ void LayerNormForward(
   var_val = BlockReduceSum<U>(var_val, shared_var);
 
   if (threadIdx.x == 0) {
-    auto scale = static_cast<float>(1.) / static_cast<float>(feature_size);
+    auto scale = static_cast<U>(static_cast<float>(1.) /
+                                static_cast<float>(feature_size));
     auto tmp = mean_val * scale;
     mean[blockIdx.x] = mean_share = static_cast<U>(tmp);
     var_share = static_cast<U>(var_val * scale - mean_share * mean_share);
diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
index 4922ba0cc2786bb4e57b1740a4370ff3f8d8cc52..f2ad08c68005882815bac9d8b7c5da42b7d9bd8d 100644
--- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu
+++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu
@@ -21,24 +21,24 @@ namespace phi {
-template <typename T>
-void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
-                                               const T *input,
-                                               std::vector<int> input_shape,
-                                               const T *bias,
-                                               const T *scale,
-                                               T *output,
-                                               T *mean,
-                                               T *variance,
-                                               int begin_norm_axis,
-                                               float eps) {
+template <typename T, typename U>
+void LayerNormDirectCUDAFunctor<T, U>::operator()(gpuStream_t stream,
+                                                  const T *input,
+                                                  std::vector<int> input_shape,
+                                                  const U *bias,
+                                                  const U *scale,
+                                                  T *output,
+                                                  U *mean,
+                                                  U *variance,
+                                                  int begin_norm_axis,
+                                                  float eps) {
   const auto x_dims = phi::make_ddim(input_shape);
   auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis);
   int64_t batch_size = static_cast<int64_t>(matrix_dim[0]);
   int64_t feature_size = static_cast<int64_t>(matrix_dim[1]);
   switch (paddle::operators::GetDesiredBlockDim(feature_size)) {
     FIXED_BLOCK_DIM_CASE(
-        paddle::operators::LayerNormForward<T, T, kBlockDim>
+        paddle::operators::LayerNormForward<T, U, kBlockDim>
         <<<batch_size, kBlockDim, 0, stream>>>(
             input, scale, bias, output, mean, variance, eps, feature_size));
     default:
@@ -49,7 +49,10 @@ void LayerNormDirectCUDAFunctor<T>::operator()(gpuStream_t stream,
   }
 }
 
-template class LayerNormDirectCUDAFunctor<float>;
+template class LayerNormDirectCUDAFunctor<float, float>;
+#if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
+template class LayerNormDirectCUDAFunctor<half, float>;
+#endif
 
 template <typename T, typename Context>
 void LayerNormKernel(const Context &dev_ctx,
diff --git a/paddle/phi/kernels/layer_norm_kernel.h b/paddle/phi/kernels/layer_norm_kernel.h
index 26c04b61af96b96f53f1c6ea787500a62eb85df4..28ffdfd47719b3c1778b2912cacd31ef3eb05679 100644
--- a/paddle/phi/kernels/layer_norm_kernel.h
+++ b/paddle/phi/kernels/layer_norm_kernel.h
@@ -32,17 +32,17 @@ void LayerNormKernel(const Context& ctx,
                      DenseTensor* variance);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-template <typename T>
+template <typename T, typename U>
 class LayerNormDirectCUDAFunctor {
  public:
   void operator()(gpuStream_t stream,
                   const T* input,
                   std::vector<int> input_shape,
-                  const T* bias,
-                  const T* scale,
+                  const U* bias,
+                  const U* scale,
                   T* output,
-                  T* mean,
-                  T* variance,
+                  U* mean,
+                  U* variance,
                   int begin_norm_axis,
                   float eps);
 };