Fix PReLU bug: not fuse PReLU anymore.

79e6e50b · liuqi · de12a6df · 79e6e50b · 79e6e50b · 79e6e50b
34 changed file
--- a/mace/kernels/activation.h
+++ b/mace/kernels/activation.h
@@ -46,8 +46,7 @@ void DoActivation(const T *input_ptr,
                  T *output_ptr,
                  const index_t size,
                  const ActivationType type,
-                  const float relux_max_limit,
+                  const float relux_max_limit) {
-                  const float prelu_alpha) {
  MACE_CHECK(DataTypeToEnum<T>::value != DataType::DT_HALF);
  switch (type) {
@@ -66,17 +65,6 @@ void DoActivation(const T *input_ptr,
                                 static_cast<T>(relux_max_limit));
      }
      break;
-    case PRELU:
-#pragma omp parallel for
-      for (index_t i = 0; i < size; ++i) {
-        T in = input_ptr[i];
-        if (in < 0) {
-          output_ptr[i] = in * prelu_alpha;
-        } else {
-          output_ptr[i] = in;
-        }
-      }
-      break;
    case TANH:
 #pragma omp parallel for
      for (index_t i = 0; i < size; ++i) {
@@ -95,45 +83,70 @@ void DoActivation(const T *input_ptr,
  }
 }
+template <typename T>
+void PReLUActivation(const T *input_ptr,
+                     const index_t size,
+                     const index_t input_chan,
+                     const T *alpha_ptr,
+                     T *output_ptr) {
+#pragma omp parallel for
+  for (index_t i = 0; i < size; ++i) {
+    const index_t chan_idx = i % input_chan;
+    T in = input_ptr[i];
+    if (in < 0) {
+      output_ptr[i] = in * alpha_ptr[chan_idx];
+    } else {
+      output_ptr[i] = in;
+    }
+  }
+}
 template <DeviceType D, typename T>
 class ActivationFunctor {
 public:
-  ActivationFunctor(ActivationType type, T relux_max_limit, T prelu_alpha)
+  ActivationFunctor(ActivationType type, T relux_max_limit)
      : activation_(type),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future) {
+  void operator()(const Tensor *input, 
+                  const Tensor *alpha,
+                  Tensor *output,
+                  StatsFuture *future) {
    const T *input_ptr = input->data<T>();
    T *output_ptr = output->mutable_data<T>();
-    DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_,
+    if (activation_ == PRELU) {
-                 prelu_alpha_);
+      const T *alpha_ptr = alpha == nullptr ? nullptr : alpha->data<T>();
+      PReLUActivation(input_ptr, output->size(), input->dim(3), alpha_ptr, output_ptr); 
+    } else {
+      DoActivation(input_ptr, output_ptr, output->size(), activation_, relux_max_limit_);
+    }
  }
 private:
  ActivationType activation_;
  T relux_max_limit_;
-  T prelu_alpha_;
 };
 template <>
 void ActivationFunctor<DeviceType::NEON, float>::operator()(
-    const Tensor *input, Tensor *output, StatsFuture *future);
+    const Tensor *input, const Tensor *alpha, Tensor *output, StatsFuture *future);
 template <typename T>
 class ActivationFunctor<DeviceType::OPENCL, T> {
 public:
-  ActivationFunctor(ActivationType type, T relux_max_limit, T prelu_alpha)
+  ActivationFunctor(ActivationType type, T relux_max_limit)
      : activation_(type),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
-  void operator()(const Tensor *input, Tensor *output, StatsFuture *future);
+  void operator()(const Tensor *input,
+                  const Tensor *alpha,
+                  Tensor *output,
+                  StatsFuture *future);
 private:
  ActivationType activation_;
  T relux_max_limit_;
-  T prelu_alpha_;
  cl::Kernel kernel_;
  std::string tuning_key_prefix_;
 };

--- a/mace/kernels/batch_norm.h
+++ b/mace/kernels/batch_norm.h
@@ -21,27 +21,23 @@ namespace kernels {
 struct BatchNormFunctorBase {
  BatchNormFunctorBase(bool folded_constant,
                       const ActivationType activation,
-                       const float relux_max_limit,
+                       const float relux_max_limit)
-                       const float prelu_alpha)
      : folded_constant_(folded_constant),
        activation_(activation),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
  const bool folded_constant_;
  const ActivationType activation_;
  const float relux_max_limit_;
-  const float prelu_alpha_;
 };
 template <DeviceType D, typename T>
 struct BatchNormFunctor : BatchNormFunctorBase {
  BatchNormFunctor(const bool folded_constant,
                   const ActivationType activation,
-                   const float relux_max_limit,
+                   const float relux_max_limit)
-                   const float prelu_alpha)
      : BatchNormFunctorBase(
-            folded_constant, activation, relux_max_limit, prelu_alpha) {}
+            folded_constant, activation, relux_max_limit) {}
  void operator()(const Tensor *input,
                  const Tensor *scale,
@@ -132,7 +128,7 @@ struct BatchNormFunctor : BatchNormFunctorBase {
      }
    }
    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
-                 relux_max_limit_, prelu_alpha_);
+                 relux_max_limit_);
  }
 };
@@ -150,10 +146,9 @@ template <typename T>
 struct BatchNormFunctor<DeviceType::OPENCL, T> : BatchNormFunctorBase {
  BatchNormFunctor(const bool folded_constant,
                   const ActivationType activation,
-                   const float relux_max_limit,
+                   const float relux_max_limit)
-                   const float prelu_alpha)
      : BatchNormFunctorBase(
-            folded_constant, activation, relux_max_limit, prelu_alpha) {}
+            folded_constant, activation, relux_max_limit) {}
  void operator()(const Tensor *input,
                  const Tensor *scale,
                  const Tensor *offset,

--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -182,15 +182,13 @@ struct Conv2dFunctorBase {
                    const std::vector<int> &paddings,
                    const int *dilations,
                    const ActivationType activation,
-                    const float relux_max_limit,
+                    const float relux_max_limit)
-                    const float prelu_alpha)
      : strides_(strides),
        padding_type_(padding_type),
        paddings_(paddings),
        dilations_(dilations),
        activation_(activation),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
  const int *strides_;  // [stride_h, stride_w]
  const Padding padding_type_;
@@ -198,7 +196,6 @@ struct Conv2dFunctorBase {
  const int *dilations_;  // [dilation_h, dilation_w]
  const ActivationType activation_;
  const float relux_max_limit_;
-  const float prelu_alpha_;
 };
 template <DeviceType D, typename T>
@@ -208,15 +205,13 @@ struct Conv2dFunctor : Conv2dFunctorBase {
                const std::vector<int> &paddings,
                const int *dilations,
                const ActivationType activation,
-                const float relux_max_limit,
+                const float relux_max_limit)
-                const float prelu_alpha)
      : Conv2dFunctorBase(strides,
                          padding_type,
                          paddings,
                          dilations,
                          activation,
-                          relux_max_limit,
+                          relux_max_limit) {}
-                          prelu_alpha) {}
  void operator()(const Tensor *input,   // NHWC
                  const Tensor *filter,  // HWOI
@@ -622,7 +617,7 @@ struct Conv2dFunctor : Conv2dFunctorBase {
      }
    }
    DoActivation(output_data, output_data, output->NumElements(), activation_,
-                 relux_max_limit_, prelu_alpha_);
+                 relux_max_limit_);
  }
 };
@@ -640,15 +635,13 @@ struct Conv2dFunctor<DeviceType::OPENCL, T> : Conv2dFunctorBase {
                const std::vector<int> &paddings,
                const int *dilations,
                const ActivationType activation,
-                const float relux_max_limit,
+                const float relux_max_limit)
-                const float prelu_alpha)
      : Conv2dFunctorBase(strides,
                          padding_type,
                          paddings,
                          dilations,
                          activation,
-                          relux_max_limit,
+                          relux_max_limit) {}
-                          prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *filter,

--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -241,15 +241,13 @@ struct DepthwiseConv2dFunctorBase {
                             const std::vector<int> &paddings,
                             const int *dilations,
                             const ActivationType activation,
-                             const float relux_max_limit,
+                             const float relux_max_limit)
-                             const float prelu_alpha)
      : strides_(strides),
        padding_type_(padding_type),
        paddings_(paddings),
        dilations_(dilations),
        activation_(activation),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
  const int *strides_;  // [stride_h, stride_w]
  const Padding padding_type_;
@@ -257,7 +255,6 @@ struct DepthwiseConv2dFunctorBase {
  const int *dilations_;  // [dilation_h, dilation_w]
  const ActivationType activation_;
  const float relux_max_limit_;
-  const float prelu_alpha_;
 };
 template <DeviceType D, typename T>
@@ -267,15 +264,13 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
-                         const float relux_max_limit,
+                         const float relux_max_limit)
-                         const float prelu_alpha)
      : DepthwiseConv2dFunctorBase(strides,
                                   padding_type,
                                   paddings,
                                   dilations,
                                   activation,
-                                   relux_max_limit,
+                                   relux_max_limit) {}
-                                   prelu_alpha) {}
  void operator()(const Tensor *input,   // NHWC
                  const Tensor *filter,  // HWIM
@@ -408,7 +403,7 @@ struct DepthwiseConv2dFunctor : public DepthwiseConv2dFunctorBase {
    output_ptr = output->mutable_data<T>();
    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
-                 relux_max_limit_, prelu_alpha_);
+                 relux_max_limit_);
  }
 };
@@ -428,15 +423,13 @@ struct DepthwiseConv2dFunctor<DeviceType::OPENCL, T>
                         const std::vector<int> &paddings,
                         const int *dilations,
                         const ActivationType activation,
-                         const float relux_max_limit,
+                         const float relux_max_limit)
-                         const float prelu_alpha)
      : DepthwiseConv2dFunctorBase(strides,
                                   padding_type,
                                   paddings,
                                   dilations,
                                   activation,
-                                   relux_max_limit,
+                                   relux_max_limit) {}
-                                   prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *filter,

--- a/mace/kernels/fully_connected.h
+++ b/mace/kernels/fully_connected.h
@@ -15,23 +15,19 @@ namespace kernels {
 struct FullyConnectedBase {
  FullyConnectedBase(const ActivationType activation,
-                     const float relux_max_limit,
+                     const float relux_max_limit)
-                     const float prelu_alpha)
      : activation_(activation),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit){}
-        prelu_alpha_(prelu_alpha) {}
  const ActivationType activation_;
  const float relux_max_limit_;
-  const float prelu_alpha_;
 };
 template<DeviceType D, typename T>
 struct FullyConnectedFunctor : FullyConnectedBase {
  FullyConnectedFunctor(const ActivationType activation,
-                        const float relux_max_limit,
+                        const float relux_max_limit) :
-                        const float prelu_alpha) :
+      FullyConnectedBase(activation, relux_max_limit) {}
-      FullyConnectedBase(activation, relux_max_limit, prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *weight,
@@ -70,16 +66,15 @@ struct FullyConnectedFunctor : FullyConnectedBase {
    }
    DoActivation(output_ptr, output_ptr, output->NumElements(), activation_,
-                 relux_max_limit_, prelu_alpha_);
+                 relux_max_limit_);
  }
 };
 template<typename T>
 struct FullyConnectedFunctor<DeviceType::OPENCL, T> : FullyConnectedBase {
  FullyConnectedFunctor(const ActivationType activation,
-                        const float relux_max_limit,
+                        const float relux_max_limit) :
-                        const float prelu_alpha) :
+      FullyConnectedBase(activation, relux_max_limit) {}
-      FullyConnectedBase(activation, relux_max_limit, prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *weight,

--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -14,6 +14,7 @@ namespace kernels {
 template <typename T>
 void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
+                                                          const Tensor *alpha,
                                                          Tensor *output,
                                                          StatsFuture *future) {
  const index_t batch = input->dim(0);
@@ -60,8 +61,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
        runtime->BuildKernel("activation", kernel_name, built_options);
    int idx = 0;
    kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
+    if (activation_ == PRELU) {
+      kernel_.setArg(idx++, *(static_cast<const cl::Image2D *>(alpha->buffer())));
+    }
    kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
-    kernel_.setArg(idx++, static_cast<float>(prelu_alpha_));
    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
  }

--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -50,9 +50,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -79,7 +76,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
    }
    kernel_.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
    kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, prelu_alpha_);
  }
  const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),

--- a/mace/kernels/opencl/cl/activation.cl
+++ b/mace/kernels/opencl/cl/activation.cl
 #include <common.h>
 __kernel void activation(__read_only image2d_t input,
+#ifdef USE_PRELU
+                         __read_only image2d_t alpha,
+#endif
                         __private const float relux_max_limit,
-                         __private const float prelu_alpha,
                         __write_only image2d_t output) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
@@ -11,7 +13,12 @@ __kernel void activation(__read_only image2d_t input,
  const int pos = mad24(ch_blk, width, w);
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
-  DATA_TYPE4 out = do_activation(in, relux_max_limit, prelu_alpha);
+#ifdef USE_PRELU
+  DATA_TYPE4 prelu_alpha = READ_IMAGET(alpha, SAMPLER, (int2)(ch_blk, 0));
+  DATA_TYPE4 out = do_activation(in, prelu_alpha, relux_max_limit);
+#else
+  DATA_TYPE4 out = do_activation(in, relux_max_limit);
+#endif
  WRITE_IMAGET(output, (int2)(pos, hb), out);
 }
--- a/mace/kernels/opencl/cl/batch_norm.cl
+++ b/mace/kernels/opencl/cl/batch_norm.cl
@@ -9,8 +9,7 @@ __kernel void batch_norm(__read_only image2d_t input,
                         __private const float epsilon,
 #endif
                         __write_only image2d_t output,
-                         __private const float relux_max_limit,
+                         __private const float relux_max_limit) {
-                         __private const float prelu_alpha) {
  const int ch_blk = get_global_id(0);
  const int w = get_global_id(1);
  const int hb = get_global_id(2);
@@ -35,8 +34,8 @@ __kernel void batch_norm(__read_only image2d_t input,
  DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
  DATA_TYPE4 out = mad(in, bn_scale, bn_offset);
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out = do_activation(out, relux_max_limit, prelu_alpha);
+  out = do_activation(out, relux_max_limit);
 #endif
  WRITE_IMAGET(output, (int2)(pos, hb), out);

--- a/mace/kernels/opencl/cl/common.h
+++ b/mace/kernels/opencl/cl/common.h
@@ -22,8 +22,10 @@ __constant sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP |
 inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
-                                __private const float relux_max_limit,
+#ifdef USE_PRELU
-                                __private const float prelu_alpha) {
+                                DATA_TYPE4 prelu_alpha,
+#endif
+                                __private const float relux_max_limit) {
  DATA_TYPE4 out;
 #ifdef USE_RELU
  out = fmax(in, 0);

--- a/mace/kernels/opencl/cl/conv_2d.cl
+++ b/mace/kernels/opencl/cl/conv_2d.cl
@@ -7,7 +7,6 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
 #endif
                      __write_only image2d_t output,
                      __private const float relux_max_limit,
-                      __private const float prelu_alpha,
                      __private const int in_height,
                      __private const int in_width,
                      __private const int in_ch_blks,
@@ -112,11 +111,11 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
    }
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out0 = do_activation(out0, relux_max_limit);
-  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit);
-  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit);
-  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit);
 #endif
  const int out_x_base = mul24(out_ch_blk, out_width);

--- a/mace/kernels/opencl/cl/conv_2d_1x1.cl
+++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl
@@ -7,7 +7,6 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
                          __write_only image2d_t output,
                          __private const float relux_max_limit,
-                          __private const float prelu_alpha,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
@@ -86,11 +85,11 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
    filter_x_base += 4;
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out0 = do_activation(out0, relux_max_limit);
-  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit);
-  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit);
-  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit);
 #endif
  const int out_x_base = mul24(out_ch_blk, width);

--- a/mace/kernels/opencl/cl/conv_2d_3x3.cl
+++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl
@@ -7,7 +7,6 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
 #endif
                          __write_only image2d_t output,
                          __private const float relux_max_limit,
-                          __private const float prelu_alpha,
                          __private const int in_height,
                          __private const int in_width,
                          __private const int in_ch_blks,
@@ -120,12 +119,12 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
    }
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out0 = do_activation(out0, relux_max_limit);
-  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit);
-  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit);
-  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit);
-  out4 = do_activation(out4, relux_max_limit, prelu_alpha);
+  out4 = do_activation(out4, relux_max_limit);
 #endif
  const int out_x_base = mul24(out_ch_blk, out_width);

--- a/mace/kernels/opencl/cl/depthwise_conv2d.cl
+++ b/mace/kernels/opencl/cl/depthwise_conv2d.cl
@@ -8,7 +8,6 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
 #endif
                               __write_only image2d_t output,
                               __private const float relux_max_limit,
-                               __private const float prelu_alpha,
                               __private const short in_height,
                               __private const short in_width,
                               __private const short in_ch_blks,
@@ -103,11 +102,11 @@ __kernel void depthwise_conv2d(__read_only image2d_t input, /* [c%4 * w * c/4, h
    in_hb_idx += dilation_h;
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out0 = do_activation(out0, relux_max_limit);
-  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit);
-  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit);
-  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit);
 #endif
  const short out_x_base = mul24(out_ch_blk, out_width);
@@ -134,7 +133,6 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
 #endif
                                  __write_only image2d_t output,
                                  __private const DATA_TYPE relux_max_limit,
-                                  __private const DATA_TYPE prelu_alpha,
                                  __private const short in_height,
                                  __private const short in_width,
                                  __private const short in_ch_blks,
@@ -220,11 +218,11 @@ __kernel void depthwise_conv2d_s1(__read_only image2d_t input, /* [c%4 * w * c/4
    in_hb_idx += 1;
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  out0 = do_activation(out0, relux_max_limit, prelu_alpha);
+  out0 = do_activation(out0, relux_max_limit);
-  out1 = do_activation(out1, relux_max_limit, prelu_alpha);
+  out1 = do_activation(out1, relux_max_limit);
-  out2 = do_activation(out2, relux_max_limit, prelu_alpha);
+  out2 = do_activation(out2, relux_max_limit);
-  out3 = do_activation(out3, relux_max_limit, prelu_alpha);
+  out3 = do_activation(out3, relux_max_limit);
 #endif
  const short out_x_base = mul24(out_ch_blk, out_width);

--- a/mace/kernels/opencl/cl/fully_connected.cl
+++ b/mace/kernels/opencl/cl/fully_connected.cl
@@ -10,8 +10,7 @@ __kernel void fully_connected(__read_only image2d_t input,
                              __private const int input_height,
                              __private const int input_width,
                              __private const int input_channel,
-                              __private const float relux_max_limit,
+                              __private const float relux_max_limit) {
-                              __private const float prelu_alpha) {
  const int batch_idx = get_global_id(0);
  const int out_blk_idx = get_global_id(1);
  const int input_chan_blk = (input_channel + 3) >> 2;
@@ -51,8 +50,8 @@ __kernel void fully_connected(__read_only image2d_t input,
    input_coord.y++;
  }
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  result = do_activation(result, relux_max_limit, prelu_alpha);
+  result = do_activation(result, relux_max_limit);
 #endif
  WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
 }
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -115,8 +115,7 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
                                             __private const int out_width,
                                             __private const int round_hw,
                                             __private const int round_w,
-                                             __private const float relux_max_limit,
+                                             __private const float relux_max_limit) {
-                                             __private const float prelu_alpha) {
  const int width_idx = get_global_id(0);
  const int height_idx = get_global_id(1);
  const int out_channel = get_global_size(1);
@@ -183,11 +182,11 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
 #endif
-#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
-  in0[0] = do_activation(in0[0], relux_max_limit, prelu_alpha);
+  in0[0] = do_activation(in0[0], relux_max_limit);
-  in0[1] = do_activation(in0[1], relux_max_limit, prelu_alpha);
+  in0[1] = do_activation(in0[1], relux_max_limit);
-  in1[0] = do_activation(in1[0], relux_max_limit, prelu_alpha);
+  in1[0] = do_activation(in1[0], relux_max_limit);
-  in1[1] = do_activation(in1[1], relux_max_limit, prelu_alpha);
+  in1[1] = do_activation(in1[1], relux_max_limit);
 #endif
  WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]);
@@ -205,6 +204,4 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
    WRITE_IMAGET(output, (int2)(coord_x + 1, coord_y + 1), in1[1]);
  }
 }
--- a/mace/kernels/opencl/conv_2d_opencl.cc
+++ b/mace/kernels/opencl/conv_2d_opencl.cc
@@ -17,7 +17,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit,
-                             const float prelu_alpha,
                             const DataType dt,
                             Tensor *output,
                             StatsFuture *future);
@@ -31,7 +30,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit,
-                             const float prelu_alpha,
                             const DataType dt,
                             Tensor *output,
                             StatsFuture *future);
@@ -45,7 +43,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit,
-                         const float prelu_alpha,
                         const DataType dt,
                         Tensor *output,
                         StatsFuture *future);
@@ -60,7 +57,7 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      cl::Kernel *kernel,
      const Tensor *input, const Tensor *filter, const Tensor *bias, const int stride,
      const int *padding, const int *dilations, const ActivationType activation,
-      const float relux_max_limit, const float prelu_alpha, const DataType dt,
+      const float relux_max_limit, const DataType dt,
      Tensor *output, StatsFuture *future);
  // Selection matrix: kernel_size x stride_size
  static const Conv2dOpenclFunction selector[5] =
@@ -99,12 +96,10 @@ void Conv2dFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
      selector[kernel_h - 1] != nullptr) {
    auto conv2d_func = selector[kernel_h - 1];
    conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_,
-                relux_max_limit_, prelu_alpha_, DataTypeToEnum<T>::value,
+                relux_max_limit_, DataTypeToEnum<T>::value, output, future);
-                output, future);
  } else {
    Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-                 activation_, relux_max_limit_, prelu_alpha_,
+                 activation_, relux_max_limit_, DataTypeToEnum<T>::value, output, future);
-                 DataTypeToEnum<T>::value, output, future);
  }
 }

--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -19,7 +19,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit,
-                             const float prelu_alpha,
                             const DataType dt,
                             Tensor *output,
                             StatsFuture *future) {
@@ -56,9 +55,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -86,7 +82,6 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel,
                          *(static_cast<const cl::Image2D *>(output->buffer())));
    // FIXME handle flexable data type: half not supported
    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, prelu_alpha);
    kernel->setArg(idx++, static_cast<int>(input_height));
    kernel->setArg(idx++, static_cast<int>(input_width));
    kernel->setArg(idx++, static_cast<int>(input_channel_blocks));

--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -21,7 +21,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
                             const int *dilations,
                             const ActivationType activation,
                             const float relux_max_limit,
-                             const float prelu_alpha,
                             const DataType dt,
                             Tensor *output,
                             StatsFuture *future) {
@@ -51,9 +50,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -80,7 +76,6 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel,
    kernel->setArg(idx++,
                          *(static_cast<const cl::Image2D *>(output->buffer())));
    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, prelu_alpha);
    kernel->setArg(idx++, static_cast<int>(input->dim(1)));
    kernel->setArg(idx++, static_cast<int>(input->dim(2)));
    kernel->setArg(idx++, static_cast<int>(input_channel_blocks));

--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -21,7 +21,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
                         const int *dilations,
                         const ActivationType activation,
                         const float relux_max_limit,
-                         const float prelu_alpha,
                         const DataType dt,
                         Tensor *output,
                         StatsFuture *future) {
@@ -51,9 +50,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -80,7 +76,6 @@ extern void Conv2dOpencl(cl::Kernel *kernel,
    kernel->setArg(idx++,
                          *(static_cast<const cl::Image2D *>(output->buffer())));
    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, prelu_alpha);
    kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
    kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
    kernel->setArg(idx++, static_cast<uint32_t>(input_channel_blocks));

--- a/mace/kernels/opencl/depthwise_conv_opencl.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl.cc
@@ -20,7 +20,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
                     const int *dilations,
                     const ActivationType activation,
                     const float relux_max_limit,
-                     const float prelu_alpha,
                     const DataType dt,
                     Tensor *output,
                     StatsFuture *future) {
@@ -69,9 +68,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -96,7 +92,6 @@ void DepthwiseConv2d(cl::Kernel *kernel,
    kernel->setArg(
        idx++, *(static_cast<const cl::Image2D *>(output->buffer())));
    kernel->setArg(idx++, relux_max_limit);
-    kernel->setArg(idx++, prelu_alpha);
    kernel->setArg(idx++, static_cast<short>(input_height));
    kernel->setArg(idx++, static_cast<short>(input_width));
    kernel->setArg(idx++, static_cast<short>(input_channel_blocks));
@@ -140,8 +135,8 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
                 << " is not implemented yet, using slow version";
    // TODO(heliangliang) The CPU/NEON kernel should map the buffer
    DepthwiseConv2dFunctor<DeviceType::CPU, float>(
-        strides_, padding_type_, paddings_, dilations_, activation_, relux_max_limit_,
+        strides_, padding_type_, paddings_, dilations_, activation_,
-        prelu_alpha_)(input, filter, bias, output, future);
+        relux_max_limit_)(input, filter, bias, output, future);
    return;
  }
@@ -169,7 +164,7 @@ void DepthwiseConv2dFunctor<DeviceType::OPENCL, T>::operator()(
  output->ResizeImage(output_shape, output_image_shape);
  DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_,
-                  activation_, relux_max_limit_, prelu_alpha_,
+                  activation_, relux_max_limit_, 
                  DataTypeToEnum<T>::value, output, future);
 }

--- a/mace/kernels/opencl/fully_connected_opencl.cc
+++ b/mace/kernels/opencl/fully_connected_opencl.cc
@@ -48,9 +48,6 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
      case RELUX:
        built_options.emplace("-DUSE_RELUX");
        break;
-      case PRELU:
-        built_options.emplace("-DUSE_PRELU");
-        break;
      case TANH:
        built_options.emplace("-DUSE_TANH");
        break;
@@ -78,7 +75,6 @@ void FullyConnectedFunctor<DeviceType::OPENCL, T>::operator()(
    kernel_.setArg(idx++, static_cast<int>(input->dim(3)));
    // FIXME handle flexable data type: half not supported
    kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, prelu_alpha_);
  }
  const uint32_t gws[2] = {

--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -129,7 +129,6 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
    kernel_.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
    kernel_.setArg(idx++, static_cast<uint32_t>(round_w));
    kernel_.setArg(idx++, relux_max_limit_);
-    kernel_.setArg(idx++, prelu_alpha_);
  }
  const uint32_t gws[2] = {static_cast<uint32_t>(input_tensor->dim(2)),

--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -58,21 +58,18 @@ struct WinogradInverseTransformFunctorBase {
                                      const int height,
                                      const int width,
                                      const ActivationType activation,
-                                      const float relux_max_limit,
+                                      const float relux_max_limit)
-                                      const float prelu_alpha)
      : batch_(batch),
        height_(height),
        width_(width),
        activation_(activation),
-        relux_max_limit_(relux_max_limit),
+        relux_max_limit_(relux_max_limit) {}
-        prelu_alpha_(prelu_alpha) {}
  const int batch_;
  const int height_;
  const int width_;
  const ActivationType activation_;
  const float relux_max_limit_;
-  const float prelu_alpha_;
 };
 template<DeviceType D, typename T>
@@ -81,9 +78,8 @@ struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
-                                  const float relux_max_limit,
+                                  const float relux_max_limit)
-                                  const float prelu_alpha)
+      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
-      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *bias,
@@ -100,9 +96,8 @@ struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseT
                                  const int height,
                                  const int width,
                                  const ActivationType activation,
-                                  const float relux_max_limit,
+                                  const float relux_max_limit)
-                                  const float prelu_alpha)
+      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit) {}
-      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}
  void operator()(const Tensor *input,
                  const Tensor *bias,

--- a/mace/ops/activation.h
+++ b/mace/ops/activation.h
@@ -18,15 +18,15 @@ class ActivationOp : public Operator<D, T> {
        functor_(kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
-    const Tensor *input_tensor = this->inputs_[0];
+    const Tensor *input_tensor = this->Input(0);
+    const Tensor *alpha_tensor = this->InputSize() >= 2 ? this->Input(1) : nullptr;
    Tensor *output_tensor = this->outputs_[0];
    output_tensor->ResizeLike(input_tensor);
-    functor_(input_tensor, output_tensor, future);
+    functor_(input_tensor, alpha_tensor, output_tensor, future);
    return true;
  }

--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -213,17 +213,22 @@ void TestSimplePrelu() {
  // Add input data
  net.AddInputFromArray<D, float>(
      "Input", {2, 2, 2, 2},
-      {-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
+      {-7, 7, -6, 6, -5, -5, -4, -4, -3, 3, -2, 2, -1, -1, 0, 0});
+  net.AddInputFromArray<D, float>(
+      "Alpha", {2},
+      {2.0, 3.0});
  if (D == DeviceType::OPENCL) {
    BufferToImage<D, float>(net, "Input", "InputImage",
                            kernels::BufferType::IN_OUT_CHANNEL);
+    BufferToImage<D, float>(net, "Alpha", "AlphaImage",
+                            kernels::BufferType::ARGUMENT);
    OpDefBuilder("Activation", "PreluTest")
        .Input("InputImage")
+        .Input("AlphaImage")
        .Output("OutputImage")
        .AddStringArg("activation", "PRELU")
-        .AddFloatArg("alpha", 2.0)
        .Finalize(net.NewOperatorDef());
    // Run
@@ -235,9 +240,9 @@ void TestSimplePrelu() {
  } else {
    OpDefBuilder("Activation", "PreluTest")
        .Input("Input")
+        .Input("Alpha")
        .Output("Output")
        .AddStringArg("activation", "PRELU")
-        .AddFloatArg("alpha", 2.0)
        .Finalize(net.NewOperatorDef());
    // Run
@@ -245,7 +250,7 @@ void TestSimplePrelu() {
  }
  auto expected = CreateTensor<float>(
-      {2, 2, 2, 2}, {-14, 7, -12, 6, -10, 5, -8, 4, -6, 3, -4, 2, -2, 1, 0, 0});
+      {2, 2, 2, 2}, {-14, 7, -12, 6, -10, -15, -8, -12, -6, 3, -4, 2, -2, -3, 0, 0});
  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }

--- a/mace/ops/batch_norm.h
+++ b/mace/ops/batch_norm.h
@@ -16,7 +16,7 @@ class BatchNormOp : public Operator<D, T> {
 public:
  BatchNormOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<D, T>(operator_def, ws),
-        functor_(false, kernels::ActivationType::NOOP, 0.0f, 0.0f) {
+        functor_(false, kernels::ActivationType::NOOP, 0.0f) {
    epsilon_ = OperatorBase::GetSingleArgument<float>("epsilon",
                                                      static_cast<float>(1e-4));
  }

--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -23,7 +23,6 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
                 this->paddings_,
                 this->dilations_.data(),
                 kernels::ActivationType::NOOP,
-                 0.0f,
                 0.0f) {}
  bool Run(StatsFuture *future) override {

--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -412,9 +412,9 @@ static void TestComplexConvNxNS12(const std::vector<index_t> &shape, const int s
 }
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
                                                   1);
-  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32}, 
+  TestComplexConvNxNS12<DeviceType::OPENCL, float>({32, 16, 16, 32},
                                                   2);
 }

--- a/mace/ops/depthwise_conv2d.h
+++ b/mace/ops/depthwise_conv2d.h
@@ -26,8 +26,7 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/folded_batch_norm.h
+++ b/mace/ops/folded_batch_norm.h
@@ -19,8 +19,7 @@ class FoldedBatchNormOp : public Operator<D, T> {
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/fully_connected.h
+++ b/mace/ops/fully_connected.h
@@ -19,8 +19,7 @@ class FullyConnectedOp : public Operator<D, T> {
            kernels::StringToActivationType(
                OperatorBase::GetSingleArgument<std::string>("activation",
                                                             "NOOP")),
-            OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+            OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-            OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/fused_conv_2d.h
+++ b/mace/ops/fused_conv_2d.h
@@ -25,8 +25,7 @@ class FusedConv2dOp : public ConvPool2dOpBase<D, T> {
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input = this->Input(INPUT);

--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -24,8 +24,7 @@ class WinogradInverseTransformOp : public Operator<D, T> {
                 kernels::StringToActivationType(
                     OperatorBase::GetSingleArgument<std::string>("activation",
                                                                  "NOOP")),
-                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f)) {}
-                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
  bool Run(StatsFuture *future) override {
    const Tensor *input_tensor = this->Input(INPUT);