PaddlePaddle / Paddle — commit 3f2a665a (unverified)
Authored by Guoxia Wang on Nov 30, 2021; committed via GitHub on Nov 30, 2021.
support data_format='NHWC' for prelu channel mode (#37019)
* support data_format='NHWC' for prelu channel mode
Parent commit: 0c82e3a0
Showing 16 changed files with 425 additions and 130 deletions (+425, -130):
  paddle/fluid/inference/tensorrt/convert/prelu_op.cc                          +8   -3
  paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu                    +4   -2
  paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h                     +15  -7
  paddle/fluid/operators/math/prelu.cu                                         +26  -7
  paddle/fluid/operators/math/prelu.h                                          +2   -1
  paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc                             +14  -5
  paddle/fluid/operators/prelu_op.cc                                           +30  -6
  paddle/fluid/operators/prelu_op.cu                                           +24  -9
  paddle/fluid/operators/prelu_op.h                                            +46  -22
  python/paddle/fluid/layers/nn.py                                             +24  -5
  python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py     +11  -3
  python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py   +51  -32
  python/paddle/fluid/tests/unittests/test_imperative_layers.py                +3   -2
  python/paddle/fluid/tests/unittests/test_prelu_op.py                         +130 -15
  python/paddle/nn/functional/activation.py                                    +25  -7
  python/paddle/nn/layer/activation.py                                         +12  -4
paddle/fluid/inference/tensorrt/convert/prelu_op.cc  (+8, -3)

@@ -34,6 +34,11 @@ class PReluOpConverter : public OpConverter {
   auto* input = engine_->GetITensor(op_desc.Input("X")[0]);
   // Get attrs
   std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode"));
+  std::string data_format = "NCHW";
+  if (op_desc.HasAttr("data_format")) {
+    data_format =
+        BOOST_GET_CONST(std::string, op_desc.GetAttr("data_format"));
+  }
   auto* alpha_var = scope.FindVar(op_desc.Input("Alpha")[0]);
   auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();

@@ -47,7 +52,7 @@ class PReluOpConverter : public OpConverter {
   nvinfer1::ILayer* layer = nullptr;
   if (engine_->with_dynamic_shape()) {
     plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic(
-        alpha_data, alpha_tensor_temp->numel(), mode);
+        alpha_data, alpha_tensor_temp->numel(), mode, data_format);
     layer = engine_->AddDynamicPlugin(&input, input_num, plugin);
   } else {
 #if IS_TRT_VERSION_GE(7000)

@@ -74,8 +79,8 @@ class PReluOpConverter : public OpConverter {
     layer = TRT_ENGINE_ADD_LAYER(engine_, ParametricReLU, *input,
                                  *alpha_layer_output);
 #else
-    plugin::PReluPlugin* plugin =
-        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
+    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(
+        alpha_data, alpha_tensor_temp->numel(), mode, data_format);
     layer = engine_->AddPlugin(&input, input_num, plugin);
 #endif
   }
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu  (+4, -2)

@@ -69,10 +69,11 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
   }
   if (mode_ == "channel") {
+    bool channel_last = data_format_ == "NHWC";
     operators::math::PreluChannelWiseDirectCUDAFunctor<float>
         prelu_channel_wise;
     prelu_channel_wise(stream, input, alpha, output, input_dims.d[0],
-                       input_dims.d[1], numel);
+                       input_dims.d[1], channel_last, numel);
   } else if (mode_ == "element") {
     operators::math::PreluElementWiseDirectCUDAFunctor<float>
         prelu_element_wise;

@@ -168,10 +169,11 @@ int PReluPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc,
   }
   if (mode_ == "channel") {
+    bool channel_last = data_format_ == "NHWC";
     operators::math::PreluChannelWiseDirectCUDAFunctor<float>
         prelu_channel_wise;
     prelu_channel_wise(stream, input, alpha, output, input_dims.d[0],
-                       input_dims.d[1], numel);
+                       input_dims.d[1], channel_last, numel);
   } else if (mode_ == "element") {
     operators::math::PreluElementWiseDirectCUDAFunctor<float>
         prelu_element_wise;
paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h  (+15, -7)

@@ -32,11 +32,12 @@ class PReluPlugin : public PluginTensorRT {
   std::vector<float> weight_;
   float* p_gpu_weight_;
   std::string mode_;
+  std::string data_format_;

  public:
   size_t getSerializationSize() const TRT_NOEXCEPT override {
     return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
-           SerializedSize(weight_);
+           SerializedSize(data_format_.c_str()) + SerializedSize(weight_);
   }

   // TRT will call this func when we need to serialize the configuration of

@@ -46,11 +47,12 @@ class PReluPlugin : public PluginTensorRT {
     serializeBase(buffer);
     SerializeValue(&buffer, weight_);
     SerializeValue(&buffer, mode_.c_str());
+    SerializeValue(&buffer, data_format_.c_str());
   }

   PReluPlugin(const float* weight, const int weight_num,
-              std::string const& mode)
-      : mode_(mode) {
+              std::string const& mode, std::string const& data_format)
+      : mode_(mode), data_format_(data_format) {
     weight_.resize(weight_num);
     std::copy(weight, weight + weight_num, weight_.data());
   }

@@ -63,13 +65,17 @@ class PReluPlugin : public PluginTensorRT {
     const char* prelu_mode;
     DeserializeValue(&serialData, &serialLength, &prelu_mode);
     mode_ = std::string(prelu_mode);
+    const char* prelu_data_format;
+    DeserializeValue(&serialData, &serialLength, &prelu_data_format);
+    data_format_ = std::string(prelu_data_format);
   }
   ~PReluPlugin() {}
   int initialize() TRT_NOEXCEPT override;
   void terminate() TRT_NOEXCEPT override;

   PReluPlugin* clone() const TRT_NOEXCEPT override {
-    auto* ptr = new PReluPlugin(weight_.data(), weight_.size(), mode_);
+    auto* ptr =
+        new PReluPlugin(weight_.data(), weight_.size(), mode_, data_format_);
     ptr->p_gpu_weight_ = p_gpu_weight_;
     return ptr;
   }

@@ -108,8 +114,8 @@ REGISTER_TRT_PLUGIN_V2(PReluPluginCreator);
 class PReluPluginDynamic : public DynamicPluginTensorRT {
  public:
   PReluPluginDynamic(const float* weight, const int weight_num,
-                     std::string const& mode)
-      : mode_(mode) {
+                     std::string const& mode, std::string const& data_format)
+      : mode_(mode), data_format_(data_format) {
     weight_.resize(weight_num);
     std::copy(weight, weight + weight_num, weight_.data());
   }

@@ -117,7 +123,8 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
   PReluPluginDynamic(void const* serialData, size_t serialLength);
   ~PReluPluginDynamic() {}
   nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override {
-    auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
+    auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_,
+                                      data_format_);
     ptr->p_gpu_weight_ = p_gpu_weight_;
     return ptr;
   }

@@ -167,6 +174,7 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
   std::vector<float> weight_;
   float* p_gpu_weight_;
   std::string mode_;
+  std::string data_format_;
 };
 #endif
paddle/fluid/operators/math/prelu.cu  (+26, -7)

@@ -25,9 +25,9 @@ inline static int PADDLE_GET_BLOCKS(const int N) {
 }

 template <typename T>
-__global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
-                                       T *output, size_t channel_num,
-                                       size_t plane_size, size_t numel) {
+__global__ void PReluChannelFirstWiseKernel(const T *input, const T *alpha,
+                                            T *output, size_t channel_num,
+                                            size_t plane_size, size_t numel) {
   CUDA_KERNEL_LOOP(index, numel) {
     size_t temp = index / plane_size;
     size_t channel_index = temp % channel_num;

@@ -38,6 +38,19 @@ __global__ void PReluChannelWiseKernel(const T *input, const T *alpha,
   }
 }

+template <typename T>
+__global__ void PReluChannelLastWiseKernel(const T *input, const T *alpha,
+                                           T *output, size_t channel_num,
+                                           size_t numel) {
+  CUDA_KERNEL_LOOP(index, numel) {
+    size_t channel_index = index % channel_num;
+    T scale = alpha[channel_index];
+    T x = input[index];
+    T zero = static_cast<T>(0);
+    output[index] = (x > zero) ? x : scale * x;
+  }
+}
+
 template <typename T>
 __global__ void PReluElementWiseKernel(const T *input, const T *alpha,
                                        T *output, size_t spatial_size,

@@ -65,10 +78,16 @@ __global__ void PReluScalarKernel(const T *input, const T *alpha, T *output,
 template <typename T>
 void PreluChannelWiseDirectCUDAFunctor<T>::operator()(
     gpuStream_t stream, const T *input, const T *alpha, T *output,
-    size_t batch_size, size_t channel, size_t numel) {
-  PReluChannelWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
-                           stream>>>(input, alpha, output, channel,
-                                     numel / batch_size / channel, numel);
+    size_t batch_size, size_t channel, bool channel_last, size_t numel) {
+  if (channel_last) {
+    PReluChannelLastWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0,
+                                 stream>>>(input, alpha, output, channel,
+                                           numel);
+  } else {
+    PReluChannelFirstWiseKernel<<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS,
+                                  0, stream>>>(
+        input, alpha, output, channel, numel / batch_size / channel, numel);
+  }
 }

 template <typename T>
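The two channel-wise kernels above differ only in how they recover the channel index from a flattened element index. A minimal NumPy sketch of that indexing (illustrative only, not part of the patch; the function name is hypothetical):

import numpy as np

def prelu_channel_reference(x, alpha, channel_last):
    # x: (N, C, H, W) when channel_last is False, (N, H, W, C) when True.
    # alpha: 1-D array with one negative slope per channel.
    flat = x.reshape(-1)
    out = np.empty_like(flat)
    if channel_last:
        channel_num = x.shape[-1]
        for i, v in enumerate(flat):
            c = i % channel_num                  # as in PReluChannelLastWiseKernel
            out[i] = v if v > 0 else alpha[c] * v
    else:
        channel_num = x.shape[1]
        plane_size = x.shape[2] * x.shape[3]
        for i, v in enumerate(flat):
            c = (i // plane_size) % channel_num  # as in PReluChannelFirstWiseKernel
            out[i] = v if v > 0 else alpha[c] * v
    return out.reshape(x.shape)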
paddle/fluid/operators/math/prelu.h  (+2, -1)

@@ -31,7 +31,8 @@ template <typename T>
 class PreluChannelWiseDirectCUDAFunctor {
  public:
   void operator()(gpuStream_t stream, const T *input, const T *alpha,
-                  T *output, size_t batch_size, size_t channel, size_t numel);
+                  T *output, size_t batch_size, size_t channel,
+                  bool channel_last, size_t numel);
 };

 template <typename T>
paddle/fluid/operators/mkldnn/prelu_mkldnn_op.cc  (+14, -5)

@@ -34,7 +34,7 @@ class PReluMKLDNNHandler
                      const dnnl::engine engine, platform::Place cpu_place,
                      const Tensor* x, const Tensor* weights,
                      const std::string& uniq_name, const std::string& mode,
-                     bool is_test = false)
+                     const std::string& data_format, bool is_test = false)
       : platform::MKLDNNHandlerT<T, dnnl::prelu_forward, dnnl::prelu_backward>(
             dev_ctx, engine, cpu_place,
             platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),

@@ -49,8 +49,13 @@ class PReluMKLDNNHandler
     if (weights->dims().size() != x->dims().size()) {
       auto new_weights_dims = std::vector<int64_t>(x->dims().size(), 1);
       if (mode == "channel") {
-        new_weights_dims[1] =
-            *std::max_element(weights_dims.begin(), weights_dims.end());
+        if (data_format == "NHWC") {
+          new_weights_dims[x->dims().size() - 1] =
+              *std::max_element(weights_dims.begin(), weights_dims.end());
+        } else {
+          new_weights_dims[1] =
+              *std::max_element(weights_dims.begin(), weights_dims.end());
+        }
       }
       weights_dims = std::move(new_weights_dims);
     }

@@ -110,9 +115,11 @@ class PReluMKLDNNKernel : public framework::OpKernel<T> {
     auto* out = ctx.Output<Tensor>("Out");
     const bool is_test = ctx.Attr<bool>("is_test");
     const auto mode = ctx.Attr<std::string>("mode");
+    const auto data_format = ctx.Attr<std::string>("data_format");

     PReluMKLDNNHandler<T> handler(dev_ctx, onednn_engine, ctx.GetPlace(), x,
-                                  alpha, ctx.InputName("X"), mode, is_test);
+                                  alpha, ctx.InputName("X"), mode, data_format,
+                                  is_test);

     auto src_memory_p = handler.AcquireSrcMemory(x);
     auto weights_memory_p =

@@ -149,9 +156,11 @@ class PReluGradMKLDNNKernel : public framework::OpKernel<T> {
     auto* alpha = ctx.Input<Tensor>("Alpha");
     const bool is_test = ctx.Attr<bool>("is_test");
     const auto mode = ctx.Attr<std::string>("mode");
+    const auto data_format = ctx.Attr<std::string>("data_format");

     PReluMKLDNNHandler<T> handler(dev_ctx, onednn_engine, ctx.GetPlace(), x,
-                                  alpha, framework::GradVarName("X"), mode);
+                                  alpha, framework::GradVarName("X"), mode,
+                                  data_format);

     auto src_memory_p = handler.AcquireSrcMemory(x);
     auto weights_memory_p =
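The oneDNN handler above broadcasts the 1-D alpha to the rank of the input, placing the channel extent on axis 1 for NCHW and on the last axis for NHWC. A small sketch of that rule (illustrative, not part of the patch; the function name is hypothetical):

def broadcast_alpha_dims(x_rank, channels, data_format="NCHW"):
    # Mirrors how new_weights_dims is filled in PReluMKLDNNHandler.
    dims = [1] * x_rank
    if data_format == "NHWC":
        dims[x_rank - 1] = channels
    else:
        dims[1] = channels
    return dims

assert broadcast_alpha_dims(4, 16, "NCHW") == [1, 16, 1, 1]
assert broadcast_alpha_dims(4, 16, "NHWC") == [1, 1, 1, 16]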
paddle/fluid/operators/prelu_op.cc  (+30, -6)

@@ -38,12 +38,6 @@ class PReluOp : public framework::OperatorWithKernel {
                             "But recevied alpha's size: %d.",
                             product(ctx->GetInputDim("Alpha"))));
     } else if (mode == "channel") {
-      PADDLE_ENFORCE_EQ(product(ctx->GetInputDim("Alpha")), x_dim[1],
-                        platform::errors::InvalidArgument(
-                            "For mode 'channel', size of weight Alpha must be "
-                            "equal to the number of channels of input(x). But "
-                            "recevied alpha's size: %d, x_dim[1]: %d",
-                            product(ctx->GetInputDim("Alpha")), x_dim[1]));
       auto x_rank = x_dim.size();
       PADDLE_ENFORCE_GE(x_rank, 2,
                         platform::errors::InvalidArgument(

@@ -51,6 +45,33 @@ class PReluOp : public framework::OperatorWithKernel {
                             "equal or larger than 2. But recevied X's "
                             "rank: %d",
                             x_rank));
+      const std::string data_format_str =
+          ctx->Attrs().Get<std::string>("data_format");
+      PADDLE_ENFORCE_EQ(
+          data_format_str == "NCHW" || data_format_str == "NHWC", true,
+          platform::errors::InvalidArgument(
+              "For mode 'channel', data_format must be one of "
+              "NCHW and NHWC. But recevied data_format: %s",
+              data_format_str));
+      if (data_format_str == "NCHW") {
+        PADDLE_ENFORCE_EQ(
+            product(ctx->GetInputDim("Alpha")) == x_dim[1], true,
+            platform::errors::InvalidArgument(
+                "For mode 'channel', size of weight Alpha must be "
+                "equal to the number of channels of input(x). But "
+                "recevied alpha's size: %d, x_dim[1]: %d",
+                product(ctx->GetInputDim("Alpha")), x_dim[1]));
+      } else {
+        PADDLE_ENFORCE_EQ(
+            product(ctx->GetInputDim("Alpha")) == x_dim[x_rank - 1], true,
+            platform::errors::InvalidArgument(
+                "For mode 'channel', size of weight Alpha must be "
+                "equal to the number of channels of input(x). But "
+                "recevied alpha's size: %d, x_dim[%d]: %d",
+                product(ctx->GetInputDim("Alpha")), x_rank - 1,
+                x_dim[x_rank - 1]));
+      }
     } else if (mode == "element") {
       auto alpha_dim = ctx->GetInputDim("Alpha");
       auto alpha_rank = alpha_dim.size();

@@ -134,6 +155,9 @@ There are modes:
 )DOC");
     AddAttr<std::string>("mode", "The mode for inputs to share weights.")
         .SetDefault("all");
+    AddAttr<std::string>("data_format",
+                         "Data format that specifies the layout of input")
+        .SetDefault("NCHW");
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false)
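With this change the 'channel'-mode shape check reads the channel count from dim 1 for NCHW and from the last dim for NHWC. A hedged sketch of the rule (not part of the patch; the helper name is hypothetical):

def expected_alpha_size(x_shape, data_format):
    # Alpha must have exactly one slope per channel of x.
    assert data_format in ("NCHW", "NHWC")
    return x_shape[1] if data_format == "NCHW" else x_shape[-1]

assert expected_alpha_size([2, 100, 3, 4], "NCHW") == 100
assert expected_alpha_size([2, 3, 4, 100], "NHWC") == 100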
paddle/fluid/operators/prelu_op.cu  (+24, -9)

@@ -42,17 +42,22 @@ class CUDAPReluKernel : public framework::OpKernel<T> {
     const T* alpha_ptr = alpha->data<T>();
     auto& mode = context.Attr<std::string>("mode");
+    auto& data_format = context.Attr<std::string>("data_format");

     int numel = x->numel();
     auto dim = x->dims();
+    auto x_rank = dim.size();

-    VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", numel:"
-            << numel;
+    VLOG(4) << "dim[0]:" << dim[0] << ", dim[1]:" << dim[1] << ", dim["
+            << x_rank - 1 << "]:" << dim[x_rank - 1] << ", numel:" << numel;

     if (mode == "channel") {
+      bool channel_last = data_format == "NHWC";
+      size_t channel = channel_last ? dim[x_rank - 1] : dim[1];
       math::PreluChannelWiseDirectCUDAFunctor<T> prelu_channel_wise;
       prelu_channel_wise(context.cuda_device_context().stream(), x_ptr,
-                         alpha_ptr, o_ptr, dim[0], dim[1], numel);
+                         alpha_ptr, o_ptr, dim[0], channel, channel_last,
+                         numel);
     } else if (mode == "element") {
       math::PreluElementWiseDirectCUDAFunctor<T> prelu_element_wise;
       prelu_element_wise(context.cuda_device_context().stream(), x_ptr,

@@ -65,7 +70,7 @@ class CUDAPReluKernel : public framework::OpKernel<T> {
   }
 };

-enum PRELU_MODE { Element, Channel, Scalar };
+enum PRELU_MODE { Element, ChannelFirst, ChannelLast, Scalar };

 template <typename T>
 __global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,

@@ -78,10 +83,13 @@ __global__ void PReluOpGradKernel(const T* x_ptr, const T* alpha_ptr,
     if (mode == Element) {
       size_t element_index = index % spatial_size;
       scale = alpha_ptr[element_index];
-    } else if (mode == Channel) {
+    } else if (mode == ChannelFirst) {
       size_t temp = index / plane_size;
       size_t channel_index = temp % channel_num;
       scale = alpha_ptr[channel_index];
+    } else if (mode == ChannelLast) {
+      size_t channel_index = index % channel_num;
+      scale = alpha_ptr[channel_index];
     } else {
       scale = alpha_ptr[0];
     }

@@ -105,11 +113,13 @@ class PreluOpGradFunctor {
     }
     size_t plane_size = numel / input_dims[0] / input_dims[1];
     size_t spatial_size = numel / input_dims[0];
+    size_t channel =
+        mode == ChannelLast ? input_dims[input_dims.size() - 1] : input_dims[1];
     PReluOpGradKernel<
         T><<<PADDLE_GET_BLOCKS(numel), CUDA_NUM_THREADS, 0, stream>>>(
-        x, alpha, dy, dx, dalpha, input_dims[1], plane_size, spatial_size,
-        numel, mode);
+        x, alpha, dy, dx, dalpha, channel, plane_size, spatial_size, numel,
+        mode);
   }
 };

@@ -140,9 +150,11 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
     if (!dx && !dalpha) return;

     auto& mode = context.Attr<std::string>("mode");
+    auto& data_format = context.Attr<std::string>("data_format");

     int numel = x->numel();
     auto dim = x->dims();
+    auto x_rank = dim.size();
     std::vector<int> input_shape = framework::vectorize<int>(dim);
     auto stream = context.cuda_device_context().stream();

@@ -157,10 +169,12 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
     }
     PRELU_MODE m;
+    bool channel_last = false;
     if (mode == "element") {
       m = Element;
     } else if (mode == "channel") {
-      m = Channel;
+      channel_last = data_format == "NHWC";
+      m = channel_last ? ChannelLast : ChannelFirst;
     } else {
       m = Scalar;
     }

@@ -172,7 +186,8 @@ class CUDAPReluGradKernel : public framework::OpKernel<T> {
     std::vector<int> reduce_dims;
     for (size_t i = 0; i < dim.size(); i++) {
-      if (mode == "channel" && i == 1) continue;
+      if (mode == "channel" && !channel_last && i == 1) continue;
+      if (mode == "channel" && channel_last && i == dim.size() - 1) continue;
       if (mode == "element" && i != 0) continue;
       reduce_dims.push_back(i);
     }
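In the gradient kernel the axes reduced when accumulating the alpha gradient now depend on the layout: everything except axis 1 for NCHW, everything except the last axis for NHWC. A sketch mirroring the reduce_dims loop above (illustrative only, not part of the patch):

def prelu_grad_reduce_dims(x_shape, mode, data_format="NCHW"):
    # Returns the axes summed over when reducing dalpha, as in CUDAPReluGradKernel.
    channel_last = data_format == "NHWC"
    reduce_dims = []
    for i in range(len(x_shape)):
        if mode == "channel" and not channel_last and i == 1:
            continue
        if mode == "channel" and channel_last and i == len(x_shape) - 1:
            continue
        if mode == "element" and i != 0:
            continue
        reduce_dims.append(i)
    return reduce_dims

assert prelu_grad_reduce_dims([8, 16, 32, 32], "channel", "NCHW") == [0, 2, 3]
assert prelu_grad_reduce_dims([8, 32, 32, 16], "channel", "NHWC") == [0, 1, 2]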
paddle/fluid/operators/prelu_op.h  (+46, -22)

@@ -33,19 +33,27 @@ class PReluKernel : public framework::OpKernel<T> {
     const T* alpha_ptr = alpha->data<T>();
     auto& mode = context.Attr<std::string>("mode");
+    auto& data_format = context.Attr<std::string>("data_format");

     int numel = x->numel();
     auto dim = x->dims();
     int index = 0;
     int i = 0;
     if (mode == "channel") {
-      int temp = 1;
-      for (int j = 2; j < dim.size(); j++) {
-        temp *= dim[j];
-      }
-      for (i = 0; i < numel; i++) {
-        index = (i / temp) % dim[1];
-        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+      if (data_format == "NCHW") {
+        int temp = 1;
+        for (int j = 2; j < dim.size(); j++) {
+          temp *= dim[j];
+        }
+        for (i = 0; i < numel; i++) {
+          index = (i / temp) % dim[1];
+          o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          index = i % dim[dim.size() - 1];
+          o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
+        }
       }
     } else if (mode == "element") {
       int temp = 1;

@@ -77,6 +85,7 @@ class PReluGradKernel : public framework::OpKernel<T> {
     const T* x_ptr = x->data<T>();
     const T* dout_ptr = dout->data<T>();
     std::string mode = context.Attr<std::string>("mode");
+    auto& data_format = context.Attr<std::string>("data_format");
     int numel = x->numel();
     auto dim = x->dims();
     int index = 0;

@@ -84,14 +93,22 @@ class PReluGradKernel : public framework::OpKernel<T> {
     if (dx) {
       T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
       if (mode == "channel") {
-        int temp = 1;
-        for (int j = 2; j < dim.size(); j++) {
-          temp *= dim[j];
-        }
-        for (i = 0; i < numel; i++) {
-          index = (i / temp) % dim[1];
-          dx_ptr[i] = x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+        if (data_format == "NCHW") {
+          int temp = 1;
+          for (int j = 2; j < dim.size(); j++) {
+            temp *= dim[j];
+          }
+          for (i = 0; i < numel; i++) {
+            index = (i / temp) % dim[1];
+            dx_ptr[i] =
+                x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+          }
+        } else {
+          for (i = 0; i < numel; i++) {
+            index = i % dim[dim.size() - 1];
+            dx_ptr[i] =
+                x_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+          }
         }
       } else if (mode == "element") {
         int temp = 1;

@@ -116,13 +133,20 @@ class PReluGradKernel : public framework::OpKernel<T> {
       memset(dalpha_ptr, 0, sizeof(T) * dalpha->numel());
       if (mode == "channel") {
-        int temp = 1;
-        for (int j = 2; j < dim.size(); j++) {
-          temp *= dim[j];
-        }
-        for (i = 0; i < numel; i++) {
-          index = (i / temp) % dim[1];
-          dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        if (data_format == "NCHW") {
+          int temp = 1;
+          for (int j = 2; j < dim.size(); j++) {
+            temp *= dim[j];
+          }
+          for (i = 0; i < numel; i++) {
+            index = (i / temp) % dim[1];
+            dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+          }
+        } else {
+          for (i = 0; i < numel; i++) {
+            index = i % dim[dim.size() - 1];
+            dalpha_ptr[index] += x_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+          }
         }
       } else if (mode == "element") {
         int temp = 1;
python/paddle/fluid/layers/nn.py  (+24, -5)

@@ -9791,7 +9791,7 @@ def swish(x, beta=1.0, name=None):
 @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu")
-def prelu(x, mode, param_attr=None, name=None):
+def prelu(x, mode, param_attr=None, data_format="NCHW", name=None):
     r"""
     prelu activation.

@@ -9818,6 +9818,9 @@ def prelu(x, mode, param_attr=None, name=None):
         name (str, optional): Name for the operation (optional, default is None). \
             For more information, please refer to :ref:`api_guide_Name`.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".

     Returns:
         Tensor: A tensor with the same shape and data type as x.

@@ -9839,17 +9842,32 @@ def prelu(x, mode, param_attr=None, name=None):
     helper = LayerHelper('prelu', **locals())
     if mode not in ['all', 'channel', 'element']:
         raise ValueError('mode should be one of all, channel, element.')
+
     alpha_shape = [1]
+    # NOTE(): The input of this API should be ``N,C,...`` format,
+    # which means x.shape[0] is batch_size and x.shape[0] is channel.
     if mode == 'channel':
+
+        true_data_format = [
+            'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC'
+        ]
+        if data_format not in true_data_format:
+            raise ValueError(
+                "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
+                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format))
+
+        data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
+
         assert len(
             x.shape
         ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'"
         #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]).
         # To be consistent with Prelu, it is simplified.
         #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version.
-        alpha_shape = [1, x.shape[1], 1, 1]
+        #NOTE(GuoxiaWang): support NHWC data format
+        if data_format == 'NHWC':
+            alpha_shape = [1, 1, 1, x.shape[1]]
+        else:
+            alpha_shape = [1, x.shape[1], 1, 1]
     elif mode == 'element':
         assert len(
             x.shape

@@ -9867,7 +9885,8 @@ def prelu(x, mode, param_attr=None, name=None):
         type="prelu",
         inputs={"X": x,
                 'Alpha': alpha},
-        attrs={"mode": mode},
+        attrs={"mode": mode,
+               "data_format": data_format},
         outputs={"Out": out})
     return out
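A possible static-graph usage of the updated API, assuming a Paddle build that includes this patch (shapes are made up for illustration):

import paddle

paddle.enable_static()
# Channels-last input: the alpha parameter is created with shape [1, 1, 1, C].
x = paddle.static.data(name='x', shape=[2, 32, 32, 16], dtype='float32')
y = paddle.static.nn.prelu(x, mode='channel', data_format='NHWC')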
python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_prelu_op.py  (+11, -3)

@@ -44,8 +44,12 @@ class TestMkldnnPreluOp(MkldnnAutoScanTest):
             if len(kwargs['in_shape']) <= 1:
                 # not valid case, just return 0
                 return np.zeros((1)).astype(np.float32)
-            return np.random.random(kwargs['in_shape'][1]).astype(np.float32)
+            if kwargs['data_format'] == 'NCHW':
+                return np.random.random(kwargs['in_shape'][1]).astype(
+                    np.float32)
+            else:
+                return np.random.random(kwargs['in_shape'][-1]).astype(
+                    np.float32)
         else:
             if len(kwargs['in_shape']) <= 1:
                 # not valid case, just return 0

@@ -57,7 +61,10 @@ class TestMkldnnPreluOp(MkldnnAutoScanTest):
             inputs={"X": ["input_data"],
                     "Alpha": ["alpha_weight"]},
             outputs={"Out": ["output_data"]},
-            attrs={"mode": kwargs['mode']})
+            attrs={
+                "mode": kwargs['mode'],
+                "data_format": kwargs['data_format']
+            })

         program_config = ProgramConfig(
             ops=[prelu_op],

@@ -82,6 +89,7 @@ class TestMkldnnPreluOp(MkldnnAutoScanTest):
     @given(
         mode=st.sampled_from(['all', 'channel', 'element']),
        data_format=st.sampled_from(['NCHW', 'NHWC']),
         in_shape=st.lists(
             st.integers(
                 min_value=1, max_value=32), min_size=1, max_size=4))
python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_prelu.py  (+51, -32)

@@ -39,7 +39,8 @@ class TrtConvertPreluTest(TrtLayerAutoScanTest):
         def generate_alpha(attrs: List[Dict[str, Any]], dim1, dim2, dim3):
             if attrs[0]["mode"] == "all":
                 return np.random.random(size=(1)).astype(np.float32)
-            elif attrs[0]["mode"] == "channel":
+            elif attrs[0]["mode"] == "channel" and attrs[0][
+                    "data_format"] == "NCHW":
                 shape = [1]
                 if dim1 != 0:
                     shape.append(dim1)

@@ -48,6 +49,16 @@ class TrtConvertPreluTest(TrtLayerAutoScanTest):
                 if dim3 != 0:
                     shape.append(1)
                 return np.random.random(size=shape).astype(np.float32)
+            elif attrs[0]["mode"] == "channel" and attrs[0][
+                    "data_format"] == "NHWC":
+                shape = [1]
+                if dim1 != 0:
+                    shape.append(1)
+                if dim2 != 0:
+                    shape.append(1)
+                if dim3 != 0:
+                    shape.append(dim3)
+                return np.random.random(size=shape).astype(np.float32)
             elif attrs[0]["mode"] == "element":
                 shape = [1]
                 if dim1 != 0:

@@ -72,37 +83,45 @@ class TrtConvertPreluTest(TrtLayerAutoScanTest):
                     continue
                 for mode in ["all", "channel", "element"]:
-                    if mode == "channel" and dim1 == 0:
-                        continue
-                    dics = [{"mode": mode}]
-                    ops_config = [{
-                        "op_type": "prelu",
-                        "op_inputs": {
-                            "X": ["input_data"],
-                            "Alpha": ["alpha_weight"]
-                        },
-                        "op_outputs": {
-                            "Out": ["output_data"]
-                        },
-                        "op_attrs": dics[0]
-                    }]
-                    ops = self.generate_op_config(ops_config)
-
-                    program_config = ProgramConfig(
-                        ops=ops,
-                        weights={
-                            "alpha_weight": TensorConfig(
-                                data_gen=partial(generate_alpha, dics, dim1,
-                                                 dim2, dim3))
-                        },
-                        inputs={
-                            "input_data": TensorConfig(data_gen=partial(
-                                generate_input, batch, dim1, dim2, dim3)),
-                        },
-                        outputs=["output_data"])
-
-                    yield program_config
+                    for data_format in ['NCHW', 'NHWC']:
+                        if mode == "channel" and dim1 == 0 and data_format == "NCHW":
+                            continue
+                        if mode == "channel" and dim3 == 0 and data_format == "NHWC":
+                            continue
+                        dics = [{"mode": mode, "data_format": data_format}]
+                        ops_config = [{
+                            "op_type": "prelu",
+                            "op_inputs": {
+                                "X": ["input_data"],
+                                "Alpha": ["alpha_weight"]
+                            },
+                            "op_outputs": {
+                                "Out": ["output_data"]
+                            },
+                            "op_attrs": dics[0]
+                        }]
+                        ops = self.generate_op_config(ops_config)
+
+                        program_config = ProgramConfig(
+                            ops=ops,
+                            weights={
+                                "alpha_weight": TensorConfig(
+                                    data_gen=partial(generate_alpha, dics,
+                                                     dim1, dim2, dim3))
+                            },
+                            inputs={
+                                "input_data": TensorConfig(data_gen=partial(
+                                    generate_input, batch, dim1, dim2, dim3)),
+                            },
+                            outputs=["output_data"])
+
+                        yield program_config

     def sample_predictor_configs(
             self, program_config) -> (paddle_infer.Config, List[int], float):
python/paddle/fluid/tests/unittests/test_imperative_layers.py  (+3, -2)

@@ -41,10 +41,11 @@ class TestLayerPrint(unittest.TestCase):
         self.assertEqual(
             str(module), 'Hardtanh(min=-1.0, max=1.0, name=Hardtanh)')

-        module = nn.PReLU(1, 0.25, name="PReLU")
+        module = nn.PReLU(1, 0.25, name="PReLU", data_format="NCHW")
         self.assertEqual(
             str(module),
-            'PReLU(num_parameters=1, init=0.25, dtype=float32, name=PReLU)')
+            'PReLU(num_parameters=1, data_format=NCHW, init=0.25, dtype=float32, name=PReLU)'
+        )

         module = nn.ReLU()
         self.assertEqual(str(module), 'ReLU()')
python/paddle/fluid/tests/unittests/test_prelu_op.py  (+130, -15)

@@ -163,10 +163,18 @@ class PReluTest(OpTest):
         # zero.
         x_np[np.abs(x_np) < 0.005] = 0.02

-        if self.attrs == {'mode': "all"}:
+        if self.attrs == {'mode': "all", "data_format": "NCHW"
+                          } or self.attrs == {'mode': "all",
+                                              "data_format": "NHWC"}:
             alpha_np = np.random.uniform(-1, -0.5, (1))
-        elif self.attrs == {'mode': "channel"}:
+        elif self.attrs == {'mode': "channel", "data_format": "NCHW"}:
             alpha_np = np.random.uniform(-1, -0.5, [1, self.x_shape[1], 1, 1])
+        elif self.attrs == {'mode': "channel", "data_format": "NHWC"}:
+            alpha_np = np.random.uniform(-1, -0.5, [1, 1, 1, self.x_shape[-1]])
         else:
             alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:])
         alpha_np = alpha_np.astype(self.dtype)

@@ -176,11 +184,14 @@ class PReluTest(OpTest):
         # NOTE(zhiqu): reshape inputs['Alpha'] from [1, 100, 1, 1] to [1, 100] + [1]*len(x.shape[2:])
         # since np operands could not be broadcast together with shapes (1,100,2,2,2,3) (1,100,1,1)
         reshaped_alpha = self.inputs['Alpha']
-        if self.attrs == {'mode': "channel"}:
+        if self.attrs == {'mode': "channel", "data_format": "NCHW"}:
             reshaped_alpha = np.reshape(
                 self.inputs['Alpha'],
                 [1, self.x_shape[1]] + [1] * len(self.x_shape[2:]))
+        elif self.attrs == {'mode': "channel", "data_format": "NHWC"}:
+            reshaped_alpha = np.reshape(
+                self.inputs['Alpha'],
+                [1] + [1] * len(self.x_shape[1:-1]) + [self.x_shape[-1]])
         out_np = np.maximum(self.inputs['X'], 0.)
         out_np = out_np + np.minimum(self.inputs['X'], 0.) * reshaped_alpha
         assert out_np is not self.inputs['X']

@@ -193,7 +204,7 @@ class PReluTest(OpTest):
         self.x_shape = [2, 100, 3, 4]

     def init_attr(self):
-        self.attrs = {'mode': "channel"}
+        self.attrs = {'mode': "channel", "data_format": "NCHW"}

     def test_check_output(self):
         self.check_output()

@@ -210,7 +221,18 @@ class TestModeAll(PReluTest):
         self.x_shape = [2, 3, 4, 5]

     def init_attr(self):
-        self.attrs = {'mode': "all"}
+        self.attrs = {'mode': "all", "data_format": "NCHW"}
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+)
+class TestModeAllNHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [2, 3, 4, 50]
+
+    def init_attr(self):
+        self.attrs = {'mode': "all", "data_format": "NHWC"}


 class TestModeElt(PReluTest):

@@ -218,7 +240,15 @@ class TestModeElt(PReluTest):
         self.x_shape = [3, 2, 5, 10]

     def init_attr(self):
-        self.attrs = {'mode': "element"}
+        self.attrs = {'mode': "element", "data_format": "NCHW"}
+
+
+class TestModeEltNHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [3, 2, 5, 10]
+
+    def init_attr(self):
+        self.attrs = {'mode': "element", "data_format": "NHWC"}


 @skip_check_grad_ci(

@@ -229,7 +259,18 @@ class TestModeAllRank3(PReluTest):
         self.x_shape = [1, 200, 3]

     def init_attr(self):
-        self.attrs = {'mode': "all"}
+        self.attrs = {'mode': "all", "data_format": "NCHW"}
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+)
+class TestModeAllRank3NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [1, 200, 3]
+
+    def init_attr(self):
+        self.attrs = {'mode': "all", "data_format": "NHWC"}


 @skip_check_grad_ci(

@@ -240,7 +281,18 @@ class TestModeAllRank6(PReluTest):
         self.x_shape = [1, 2, 3, 4, 5, 6]

     def init_attr(self):
-        self.attrs = {'mode': "all"}
+        self.attrs = {'mode': "all", "data_format": "NCHW"}
+
+
+@skip_check_grad_ci(
+    reason="[skip shape check] Input(Alpha) must be 1-D and only has one data in 'all' mode"
+)
+class TestModeAllRank6NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [1, 2, 3, 4, 5, 6]
+
+    def init_attr(self):
+        self.attrs = {'mode': "all", "data_format": "NHWC"}


 class TestModeChannelRank3(PReluTest):

@@ -248,7 +300,15 @@ class TestModeChannelRank3(PReluTest):
         self.x_shape = [1, 200, 3]

     def init_attr(self):
-        self.attrs = {'mode': "channel"}
+        self.attrs = {'mode': "channel", "data_format": "NCHW"}
+
+
+class TestModeChannelRank3NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [1, 3, 100]
+
+    def init_attr(self):
+        self.attrs = {'mode': "channel", "data_format": "NHWC"}


 class TestModeChannelRank6(PReluTest):

@@ -256,7 +316,15 @@ class TestModeChannelRank6(PReluTest):
         self.x_shape = [1, 100, 2, 2, 2, 2]

     def init_attr(self):
-        self.attrs = {'mode': "channel"}
+        self.attrs = {'mode': "channel", "data_format": "NCHW"}
+
+
+class TestModeChannelRank6NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [1, 2, 2, 2, 2, 100]
+
+    def init_attr(self):
+        self.attrs = {'mode': "channel", "data_format": "NHWC"}


 class TestModeElementRank3(PReluTest):

@@ -264,7 +332,15 @@ class TestModeElementRank3(PReluTest):
         self.x_shape = [3, 10, 10]

     def init_attr(self):
-        self.attrs = {'mode': "element"}
+        self.attrs = {'mode': "element", "data_format": "NCHW"}
+
+
+class TestModeElementRank3NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [3, 10, 10]
+
+    def init_attr(self):
+        self.attrs = {'mode': "element", "data_format": "NHWC"}


 class TestModeElementRank6(PReluTest):

@@ -272,7 +348,15 @@ class TestModeElementRank6(PReluTest):
         self.x_shape = [3, 2, 2, 4, 5, 2]

     def init_attr(self):
-        self.attrs = {'mode': "element"}
+        self.attrs = {'mode': "element", "data_format": "NCHW"}
+
+
+class TestModeElementRank6NHWC(PReluTest):
+    def init_input_shape(self):
+        self.x_shape = [3, 2, 2, 4, 5, 2]
+
+    def init_attr(self):
+        self.attrs = {'mode': "element", "data_format": "NHWC"}


 def create_test_fp16_class(parent,

@@ -311,9 +395,16 @@ create_test_fp16_class(TestModeChannelRank3)
 create_test_fp16_class(TestModeChannelRank6)
 create_test_fp16_class(TestModeElementRank3)
 create_test_fp16_class(TestModeElementRank6)
+create_test_fp16_class(TestModeEltNHWC)
+create_test_fp16_class(TestModeAllRank3NHWC)
+create_test_fp16_class(TestModeAllRank6NHWC)
+create_test_fp16_class(TestModeChannelRank3NHWC)
+create_test_fp16_class(TestModeChannelRank6NHWC)
+create_test_fp16_class(TestModeElementRank3NHWC)
+create_test_fp16_class(TestModeElementRank6NHWC)


-def prelu_t(x, mode, param_attr=None, name=None):
+def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'):
     helper = fluid.layer_helper.LayerHelper('prelu', **locals())
     alpha_shape = [1, x.shape[1], 1, 1]
     dtype = helper.input_dtype(input_param_name='x')

@@ -328,13 +419,19 @@ def prelu_t(x, mode, param_attr=None, name=None):
         type="prelu",
         inputs={"X": x,
                 'Alpha': alpha},
-        attrs={"mode": mode},
+        attrs={"mode": mode,
+               'data_format': data_format},
         outputs={"Out": out})
     return out


 # error message test if mode is not one of 'all', 'channel', 'element'
 class TestModeError(unittest.TestCase):
     def setUp(self):
         self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
         ) else paddle.CPUPlace()
         self.x_np = np.ones([1, 2, 3, 4]).astype('float32')

     def test_mode_error(self):
         main_program = Program()
         with fluid.program_guard(main_program, Program()):

@@ -344,6 +441,24 @@ class TestModeError(unittest.TestCase):
             except Exception as e:
                 assert (e.args[0].find('InvalidArgument') != -1)

+    def test_data_format_error1(self):
+        main_program = Program()
+        with fluid.program_guard(main_program, Program()):
+            x = fluid.data(name='x', shape=[2, 3, 4, 5])
+            try:
+                y = prelu_t(x, 'channel', data_format='N')
+            except Exception as e:
+                assert (e.args[0].find('InvalidArgument') != -1)
+
+    def test_data_format_error2(self):
+        main_program = Program()
+        with fluid.program_guard(main_program, Program()):
+            x = fluid.data(name='x', shape=[2, 3, 4, 5])
+            try:
+                y = paddle.static.nn.prelu(x, 'channel', data_format='N')
+            except ValueError as e:
+                pass
+

 if __name__ == "__main__":
     unittest.main()
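The test builds its reference output by reshaping alpha so it broadcasts along the channel axis of the chosen layout. A condensed NumPy sketch of that reference (not part of the patch; the function name is hypothetical):

import numpy as np

def prelu_channel_ref(x, alpha, data_format="NCHW"):
    # Reshape alpha to broadcast against x on the channel axis of the layout.
    if data_format == "NCHW":
        shape = [1, x.shape[1]] + [1] * (x.ndim - 2)
    else:
        shape = [1] + [1] * (x.ndim - 2) + [x.shape[-1]]
    a = np.reshape(alpha, shape)
    return np.maximum(x, 0.) + np.minimum(x, 0.) * a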
python/paddle/nn/functional/activation.py  (+25, -7)

@@ -442,7 +442,7 @@ def leaky_relu(x, negative_slope=0.01, name=None):
     return out


-def prelu(x, weight, name=None):
+def prelu(x, weight, data_format="NCHW", name=None):
     """
     prelu activation.

@@ -456,6 +456,8 @@ def prelu(x, weight, name=None):
             The weight shape is [1] or [in], where `in` is the input channel of ``x``.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".

     Returns:
         A Tensor with the same data type and shape as ``x`` .

@@ -490,19 +492,34 @@ def prelu(x, weight, name=None):
     assert len(weight.shape
                ) == 1, "The dim count of weight shape should be 1 in prelu()."

+    # NOTE(): The input of this API should be ``N,C,...`` format,
+    # which means x.shape[0] is batch_size and x.shape[0] is channel.
     mode = 'all'
     if weight.shape[0] > 1:
+
+        true_data_format = [
+            'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC'
+        ]
+        if data_format not in true_data_format:
+            raise ValueError(
+                "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', "
+                "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format))
+
+        data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC'
+
         assert len(
             x.shape
         ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]."
-        assert weight.shape[0] == x.shape[
-            1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+
+        #NOTE(GuoxiaWang): support NHWC data format
+        if data_format == 'NHWC':
+            assert weight.shape[0] == x.shape[
+                -1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+        else:
+            assert weight.shape[0] == x.shape[
+                1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]."
+
         mode = 'channel'

     if in_dygraph_mode():
-        return _C_ops.prelu(x, weight, 'mode', mode)
+        return _C_ops.prelu(x, weight, 'mode', mode, 'data_format', data_format)

     helper = LayerHelper('prelu', **locals())
     out = helper.create_variable_for_type_inference(x.dtype)

@@ -511,7 +528,8 @@ def prelu(x, weight, name=None):
         inputs={"X": x,
                 "Alpha": weight},
         outputs={"Out": out},
-        attrs={"mode": mode})
+        attrs={"mode": mode,
+               "data_format": data_format})
     return out
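A possible dygraph usage of the updated functional API, assuming a Paddle build that includes this patch; with NHWC data the weight length must match the last dimension of x instead of dim 1:

import paddle
import paddle.nn.functional as F

x = paddle.randn([2, 8, 8, 16])   # NHWC: channels last
w = paddle.full([16], 0.25)       # one slope per channel -> 'channel' mode
y = F.prelu(x, w, data_format="NHWC")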
python/paddle/nn/layer/activation.py  (+12, -4)

@@ -376,6 +376,8 @@ class PReLU(Layer):
             Default is None. For more information, please refer to :ref:`api_paddle_ParamAttr`.
         name (str, optional): Name for the operation (optional, default is None).
             For more information, please refer to :ref:`api_guide_Name`.
+        data_format(str, optional): Data format that specifies the layout of input.
+            It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW".

     Shape:
         - input: Tensor with any shape. Default dtype is float32.

@@ -406,13 +408,18 @@ class PReLU(Layer):
             #    [ 6. ,  7. ,  8. ,  9. ]]]]
     """

-    def __init__(self, num_parameters=1, init=0.25, weight_attr=None,
-                 name=None):
+    def __init__(self,
+                 num_parameters=1,
+                 init=0.25,
+                 weight_attr=None,
+                 data_format="NCHW",
+                 name=None):
         super(PReLU, self).__init__()
         self._num_parameters = num_parameters
         self._init = init
         self._weight_attr = weight_attr
         self._name = name
+        self._data_format = data_format

         self._weight = self.create_parameter(
             attr=self._weight_attr,

@@ -422,12 +429,13 @@ class PReLU(Layer):
             default_initializer=Constant(self._init))

     def forward(self, x):
-        return F.prelu(x, self._weight)
+        return F.prelu(x, self._weight, data_format=self._data_format)

     def extra_repr(self):
         name_str = ', name={}'.format(self._name) if self._name else ''
-        return 'num_parameters={}, init={}, dtype={}{}'.format(
-            self._num_parameters, self._init, self._dtype, name_str)
+        return 'num_parameters={}, data_format={}, init={}, dtype={}{}'.format(
+            self._num_parameters, self._data_format, self._init, self._dtype,
+            name_str)


 class ReLU(Layer):
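A possible layer-level usage, assuming a Paddle build that includes this patch; the new data_format argument is stored on the layer, forwarded to F.prelu, and reported by extra_repr():

import paddle
import paddle.nn as nn

m = nn.PReLU(num_parameters=16, init=0.25, data_format="NHWC")
out = m(paddle.randn([2, 8, 8, 16]))  # channels-last input
print(m)  # PReLU(num_parameters=16, data_format=NHWC, init=0.25, dtype=float32)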