Commit 27314662 authored by 刘托

Merge branch 'support_caffe_relu_slope' into 'master'

Support Caffe ReLU with negative_slope

See merge request !910
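
For reference, the `leakyrelu_coefficient` argument threaded through the kernels below carries Caffe's `negative_slope`, and the CPU implementation in this change computes the standard leaky ReLU:

$$f(x) = \max(x, 0) + \alpha \cdot \min(x, 0) = \begin{cases} x, & x \ge 0 \\ \alpha x, & x < 0 \end{cases}$$

where $\alpha$ is `leakyrelu_coefficient` (default `0.0f`, which reduces to plain ReLU).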
......@@ -36,9 +36,11 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
: Operation(context),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit",
0.0f)) {}
0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -58,7 +60,7 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
alpha_ptr, output_ptr);
} else {
DoActivation(input_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
relux_max_limit_, leakyrelu_coefficient_);
}
return MaceStatus::MACE_SUCCESS;
}
......@@ -66,6 +68,7 @@ class ActivationOp<DeviceType::CPU, float> : public Operation {
private:
ActivationType activation_;
float relux_max_limit_;
float leakyrelu_coefficient_;
};
......@@ -80,11 +83,14 @@ class ActivationOp<DeviceType::GPU, T> : public Operation {
"NOOP"));
auto relux_max_limit = static_cast<T>(
Operation::GetOptionalArg<float>("max_limit", 0.0f));
auto leakyrelu_coefficient = static_cast<T>(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f));
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(
new opencl::image::ActivationKernel<T>(type, relux_max_limit));
new opencl::image::ActivationKernel<T>(type, relux_max_limit,
leakyrelu_coefficient));
} else {
MACE_NOT_IMPLEMENTED;
}
......
......@@ -62,7 +62,8 @@ void DoActivation(const T *input_ptr,
T *output_ptr,
const index_t size,
const ActivationType type,
const float relux_max_limit) {
const float relux_max_limit,
const float leakyrelu_coefficient) {
MACE_CHECK(DataTypeToEnum<T>::value != DataType::DT_HALF);
switch (type) {
......@@ -97,7 +98,7 @@ void DoActivation(const T *input_ptr,
#pragma omp parallel for schedule(runtime)
for (index_t i = 0; i < size; ++i) {
output_ptr[i] = std::max(input_ptr[i], static_cast<T>(0))
+ std::min(input_ptr[i], static_cast<T>(0)) * relux_max_limit;
+ leakyrelu_coefficient * std::min(input_ptr[i], static_cast<T>(0));
}
break;
default:
......@@ -110,7 +111,8 @@ inline void DoActivation(const float *input_ptr,
float *output_ptr,
const index_t size,
const ActivationType type,
const float relux_max_limit) {
const float relux_max_limit,
const float leakyrelu_coefficient) {
switch (type) {
case NOOP:
break;
......@@ -133,7 +135,7 @@ inline void DoActivation(const float *input_ptr,
}
break;
case LEAKYRELU:
LeakyReluNeon(input_ptr, relux_max_limit, size, output_ptr);
LeakyReluNeon(input_ptr, leakyrelu_coefficient, size, output_ptr);
break;
default:
LOG(FATAL) << "Unknown activation type: " << type;
......
......@@ -52,6 +52,42 @@ TEST_F(ActivationOpTest, OPENCLSimpleRelu) {
TestSimpleRelu<DeviceType::GPU>();
}
namespace {
template <DeviceType D>
void TestSimpleLeakyRelu() {
OpsTestNet net;
// Add input data
net.AddInputFromArray<D, float>(
"Input", {2, 2, 2, 2},
{-7, 7, -6, 6, -5, 5, -4, 4, -3, 3, -2, 2, -1, 1, 0, 0});
OpDefBuilder("Activation", "ReluTest")
.Input("Input")
.Output("Output")
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.Finalize(net.NewOperatorDef());
// Run
net.RunOp(D);
auto expected = net.CreateTensor<float>(
{2, 2, 2, 2},
{-0.7, 7, -0.6, 6, -0.5, 5, -0.4, 4, -0.3, 3, -0.2, 2, -0.1, 1, 0, 0});
ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
}
} // namespace
TEST_F(ActivationOpTest, CPUSimpleLeakyRelu) {
TestSimpleLeakyRelu<DeviceType::CPU>();
}
TEST_F(ActivationOpTest, OPENCLSimpleLeakyRelu) {
TestSimpleLeakyRelu<DeviceType::GPU>();
}
namespace {
template <DeviceType D>
void TestUnalignedSimpleRelu() {
......
......@@ -35,10 +35,12 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
explicit BatchNormOp(OpConstructContext *context)
: Operation(context),
epsilon_(Operation::GetOptionalArg<float>("epsilon",
static_cast<float>(1e-4))),
static_cast<float>(1e-4))),
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
MaceStatus Run(OpContext *context) override {
MACE_UNUSED(context);
......@@ -121,7 +123,7 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
}
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
relux_max_limit_, leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -130,6 +132,7 @@ class BatchNormOp<DeviceType::CPU, float> : public Operation {
float epsilon_;
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
protected:
MACE_OP_INPUT_TAGS(INPUT, SCALE, OFFSET, MEAN, VAR);
......@@ -148,11 +151,13 @@ class BatchNormOp<DeviceType::GPU, T> : public Operation {
ActivationType activation = ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation", "NOOP"));
float relux_max_limit = Operation::GetOptionalArg<float>("max_limit", 0.0f);
float leakyrelu_coefficient = Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f);
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
mem_type = MemoryType::GPU_IMAGE;
kernel_.reset(new opencl::image::BatchNormKernel<T>(
epsilon, activation, relux_max_limit));
epsilon, activation, relux_max_limit, leakyrelu_coefficient));
} else {
MACE_NOT_IMPLEMENTED;
}
......
......@@ -88,8 +88,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Scale", {channels}, true, false);
net.AddRandomInput<DeviceType::GPU, float>("Offset", {channels}, true, false);
net.AddRandomInput<DeviceType::GPU, float>("Mean", {channels}, true);
net.AddRandomInput<DeviceType::GPU, float>("Var", {channels}, true);
......@@ -105,6 +105,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("OutputNCHW")
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.Finalize(net.NewOperatorDef());
// run cpu
......@@ -126,6 +128,8 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
.Input("Var")
.AddFloatArg("epsilon", 1e-3)
.Output("Output")
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.Finalize(net.NewOperatorDef());
// Tuning
......
......@@ -58,6 +58,8 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)),
is_filter_transformed_(false) {}
MaceStatus Run(OpContext *context) override {
......@@ -520,7 +522,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
relux_max_limit_, leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -703,6 +705,7 @@ class Conv2dOp<DeviceType::CPU, float> : public ConvPool2dOpBase {
private:
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
bool is_filter_transformed_;
SGemm sgemm_;
......@@ -721,7 +724,9 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
MaceStatus Run(OpContext *context) override {
const Tensor *input = this->Input(INPUT);
......@@ -944,6 +949,7 @@ class Conv2dOp<DeviceType::CPU, uint8_t> : public ConvPool2dOpBase {
private:
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
private:
MACE_OP_INPUT_TAGS(INPUT, FILTER, BIAS);
......@@ -961,6 +967,8 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)),
wino_block_size_(Operation::GetOptionalArg<int>("wino_block_size", 0)) {
MemoryType mem_type;
if (context->device()->gpu_runtime()->UseImageMemory()) {
......@@ -1007,12 +1015,13 @@ class Conv2dOp<DeviceType::GPU, T> : public ConvPool2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_,
wino_block_size_, output);
leakyrelu_coefficient_, wino_block_size_, output);
}
private:
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
std::unique_ptr<OpenCLConv2dKernel> kernel_;
int wino_block_size_;
......
......@@ -527,8 +527,9 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -541,6 +542,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......@@ -564,6 +567,8 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddIntArg("wino_block_size", wino_blk_size)
.Finalize(net.NewOperatorDef());
......
......@@ -292,7 +292,8 @@ class Deconv2dOp<DeviceType::CPU, float> : public Deconv2dOpBase {
output_data,
output->size(),
activation_,
relux_max_limit_);
relux_max_limit_,
leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -443,7 +444,8 @@ class Deconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), in_paddings.data(), activation_,
relux_max_limit_, out_shape, output);
relux_max_limit_, leakyrelu_coefficient_,
out_shape, output);
}
private:
......
......@@ -47,7 +47,9 @@ class Deconv2dOpBase : public Operation {
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(
Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(
Operation::GetOptionalArg<float>("leakyrelu_coefficient", 0.0f)) {}
static void CalcDeconvShape_Caffe(
const index_t *input_shape, // NHWC
......@@ -191,6 +193,7 @@ class Deconv2dOpBase : public Operation {
const FrameworkType model_type_;
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
};
template <typename T>
......
......@@ -377,8 +377,9 @@ void TestComplexDeconvNxN(const int batch,
// Add input data
net.AddRandomInput<D, T>("Input", {batch, height, width, input_channels});
net.AddRandomInput<D, T>(
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true);
net.AddRandomInput<D, T>("Bias", {output_channels}, true);
"Filter", {output_channels, input_channels, kernel_h, kernel_w}, true,
false);
net.AddRandomInput<D, T>("Bias", {output_channels}, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
int out_h = 0;
......@@ -418,6 +419,8 @@ void TestComplexDeconvNxN(const int batch,
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type)
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
......@@ -454,6 +457,8 @@ void TestComplexDeconvNxN(const int batch,
.AddIntsArg("strides", {stride_h, stride_w})
.AddIntsArg("padding_values", paddings)
.AddIntArg("framework_type", model_type)
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
} else {
......
......@@ -49,10 +49,13 @@ class DepthwiseConv2dOpBase : public ConvPool2dOpBase {
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
protected:
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
};
template <DeviceType D, class T>
......@@ -218,7 +221,7 @@ class DepthwiseConv2dOp<DeviceType::CPU, float> : public DepthwiseConv2dOpBase {
}
DoActivation(output_data, output_data, output->size(), activation_,
relux_max_limit_);
relux_max_limit_, leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -524,7 +527,7 @@ class DepthwiseConv2dOp<DeviceType::GPU, T> : public DepthwiseConv2dOpBase {
return kernel_->Compute(context, input, filter, bias,
strides_.data(), padding_type_, paddings_,
dilations_.data(), activation_, relux_max_limit_,
output);
leakyrelu_coefficient_, output);
}
private:
......
......@@ -244,10 +244,10 @@ void TestNxNS12(const index_t height, const index_t width) {
net.AddRandomInput<DeviceType::GPU, float>(
"Input", {batch, height, width, channel});
net.AddRandomInput<DeviceType::GPU, float>(
"Filter", {multiplier, channel, kernel_h, kernel_w}, true);
"Filter", {multiplier, channel, kernel_h, kernel_w}, true, false);
net.AddRandomInput<DeviceType::GPU, float>("Bias",
{multiplier * channel},
true);
true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -260,8 +260,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.Finalize(net.NewOperatorDef());
// Run on cpu
......@@ -283,8 +283,8 @@ void TestNxNS12(const index_t height, const index_t width) {
.AddIntArg("padding", type)
.AddIntsArg("dilations", {1, 1})
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.AddStringArg("activation", "RELUX")
.AddFloatArg("max_limit", 6.0)
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1)
.Finalize(net.NewOperatorDef());
net.RunOp(DeviceType::GPU);
......
......@@ -281,7 +281,8 @@ class DepthwiseDeconv2dOp<DeviceType::CPU, float>
output_data,
output->size(),
activation_,
relux_max_limit_);
relux_max_limit_,
leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -458,6 +459,7 @@ class DepthwiseDeconv2dOp<DeviceType::GPU, T> : public Deconv2dOpBase {
group_,
activation_,
relux_max_limit_,
leakyrelu_coefficient_,
out_shape,
output);
}
......
......@@ -185,12 +185,13 @@ void RandomTest(index_t batch,
GenerateRandomRealTypeData({multiplier, channel, kernel, kernel},
&filter_data);
net.AddInputFromArray<DeviceType::GPU, float>(
"Filter", {multiplier, channel, kernel, kernel}, filter_data, true);
"Filter", {multiplier, channel, kernel, kernel}, filter_data, true,
false);
std::vector<float> bias_data(channel * multiplier);
GenerateRandomRealTypeData({channel * multiplier}, &bias_data);
net.AddInputFromArray<DeviceType::GPU, float>("Bias",
{channel * multiplier},
bias_data, true);
bias_data, true, false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -203,6 +204,8 @@ void RandomTest(index_t batch,
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channel)
.AddIntsArg("dilations", {1, 1})
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1f)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<float>::value))
.Finalize(net.NewOperatorDef());
// Run
......@@ -224,6 +227,8 @@ void RandomTest(index_t batch,
.AddIntsArg("strides", {stride, stride})
.AddIntsArg("padding_values", {padding, padding})
.AddIntArg("group", channel)
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1f)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -41,10 +41,13 @@ class FullyConnectedOpBase : public Operation {
activation_(ops::StringToActivationType(
Operation::GetOptionalArg<std::string>("activation",
"NOOP"))),
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)) {}
relux_max_limit_(Operation::GetOptionalArg<float>("max_limit", 0.0f)),
leakyrelu_coefficient_(Operation::GetOptionalArg<float>(
"leakyrelu_coefficient", 0.0f)) {}
protected:
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
MACE_OP_INPUT_TAGS(INPUT, WEIGHT, BIAS);
MACE_OP_OUTPUT_TAGS(OUTPUT);
......@@ -104,7 +107,7 @@ class FullyConnectedOp<DeviceType::CPU, float> : public FullyConnectedOpBase {
}
DoActivation(output_ptr, output_ptr, output->size(), activation_,
relux_max_limit_);
relux_max_limit_, leakyrelu_coefficient_);
return MaceStatus::MACE_SUCCESS;
}
......@@ -226,7 +229,8 @@ class FullyConnectedOp<DeviceType::GPU, T> : public FullyConnectedOpBase {
"The shape of Weight: ", MakeString(weight->shape()),
" don't match.");
return kernel_->Compute(
context, input, weight, bias, activation_, relux_max_limit_, output);
context, input, weight, bias, activation_, relux_max_limit_,
leakyrelu_coefficient_, output);
}
private:
......
......@@ -123,10 +123,11 @@ void Random(const index_t batch,
// Add input data
net.AddRandomInput<DeviceType::GPU, float>("Input",
{batch, height, width, channels});
{batch, height, width, channels}, false, false);
net.AddRandomInput<DeviceType::GPU, float>(
"Weight", {out_channel, channels, height, width}, true);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}, true);
"Weight", {out_channel, channels, height, width}, true, false);
net.AddRandomInput<DeviceType::GPU, float>("Bias", {out_channel}, true,
false);
net.TransformDataFormat<DeviceType::CPU, float>("Input", NHWC, "InputNCHW",
NCHW);
......@@ -135,6 +136,8 @@ void Random(const index_t batch,
.Input("Weight")
.Input("Bias")
.Output("OutputNCHW")
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1f)
.Finalize(net.NewOperatorDef());
// run cpu
......@@ -152,6 +155,8 @@ void Random(const index_t batch,
.Input("Weight")
.Input("Bias")
.Output("Output")
.AddStringArg("activation", "LEAKYRELU")
.AddFloatArg("leakyrelu_coefficient", 0.1f)
.AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
.Finalize(net.NewOperatorDef());
......
......@@ -38,6 +38,7 @@ extern MaceStatus Conv2d1x1(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future);
......@@ -52,6 +53,7 @@ extern MaceStatus Conv2dGeneral(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future);
......@@ -81,6 +83,7 @@ class Conv2dKernel : public OpenCLConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) override;
......@@ -120,6 +123,7 @@ MaceStatus Conv2dKernel<T>::Compute(
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) {
MACE_UNUSED(winograd_blk_size);
......@@ -221,14 +225,14 @@ MaceStatus Conv2dKernel<T>::Compute(
return conv2d::Conv2d1x1(
context, &kernels_[1], pad_input, filter, bias, strides,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
leakyrelu_coefficient, input_changed, output, &conv_future);
};
} else {
conv_func = [&](const Tensor *pad_input, Tensor *output) -> MaceStatus {
return conv2d::Conv2dGeneral(
context, &kernels_[1], pad_input, filter, bias, strides, dilations,
DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &conv_future);
leakyrelu_coefficient, input_changed, output, &conv_future);
};
}
MACE_RETURN_IF_ERROR(conv_func(padded_input_ptr, output));
......
......@@ -32,6 +32,7 @@ MaceStatus Conv2d1x1(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
......@@ -71,6 +72,9 @@ MaceStatus Conv2d1x1(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -106,6 +110,7 @@ MaceStatus Conv2d1x1(OpContext *context,
kernel->setArg(idx++, strides[0]);
kernel->setArg(idx++, strides[1]);
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
......
......@@ -33,6 +33,7 @@ MaceStatus Conv2dGeneral(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
......@@ -76,6 +77,9 @@ MaceStatus Conv2dGeneral(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -120,6 +124,7 @@ MaceStatus Conv2dGeneral(OpContext *context,
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
......
......@@ -33,6 +33,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future) {
......@@ -76,6 +77,9 @@ MaceStatus DepthwiseConv2d(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -116,6 +120,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
kernel->setArg(idx++, static_cast<int32_t>(
dilations[1] * in_channel));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, *(output->opencl_buffer()));
}
......
......@@ -39,6 +39,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const DataType dt,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output,
StatsFuture *future);
......@@ -60,6 +61,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) override;
private:
......@@ -81,6 +83,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
StatsFuture pad_future, dw_conv_future;
index_t filter_w = filter->dim(3);
......@@ -175,7 +178,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
depthwise::DepthwiseConv2d(
context, &kernels_[1], padded_input_ptr, filter, bias, strides,
dilations, DataTypeToEnum<T>::v(), activation, relux_max_limit,
input_changed, output, &dw_conv_future));
leakyrelu_coefficient, input_changed, output, &dw_conv_future));
MergeMultipleFutureWaitFn({pad_future, dw_conv_future}, context->future());
return MaceStatus::MACE_SUCCESS;
}
......
......@@ -7,6 +7,7 @@ __kernel void activation(OUT_OF_RANGE_PARAMS
__read_only image2d_t alpha,
#endif
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__write_only image2d_t output) {
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
......@@ -24,9 +25,9 @@ __kernel void activation(OUT_OF_RANGE_PARAMS
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
#ifdef USE_PRELU
DATA_TYPE4 prelu_alpha = READ_IMAGET(alpha, SAMPLER, (int2)(ch_blk, 0));
DATA_TYPE4 out = do_activation(in, prelu_alpha, relux_max_limit);
DATA_TYPE4 out = do_activation(in, prelu_alpha, relux_max_limit, leakyrelu_coefficient);
#else
DATA_TYPE4 out = do_activation(in, relux_max_limit);
DATA_TYPE4 out = do_activation(in, relux_max_limit, leakyrelu_coefficient);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out);
......
......@@ -11,7 +11,8 @@ __kernel void batch_norm(OUT_OF_RANGE_PARAMS
__private const float epsilon,
#endif
__write_only image2d_t output,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
const int ch_blk = get_global_id(0);
const int w = get_global_id(1);
const int hb = get_global_id(2);
......@@ -43,8 +44,8 @@ __kernel void batch_norm(OUT_OF_RANGE_PARAMS
DATA_TYPE4 in = READ_IMAGET(input, SAMPLER, (int2)(pos, hb));
DATA_TYPE4 out = mad(in, bn_scale, bn_offset);
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out = do_activation(out, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out = do_activation(out, relux_max_limit, leakyrelu_coefficient);
#endif
WRITE_IMAGET(output, (int2)(pos, hb), out);
......
......@@ -86,7 +86,8 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
#ifdef USE_PRELU
DATA_TYPE4 prelu_alpha,
#endif
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
DATA_TYPE4 out;
#ifdef USE_RELU
out = fmax(in, (DATA_TYPE)0);
......@@ -104,7 +105,7 @@ inline DATA_TYPE4 do_activation(DATA_TYPE4 in,
out = do_sigmoid(in);
#endif
#ifdef USE_LEAKYRELU
out = fmax(in, (DATA_TYPE)0) * relux_max_limit;
out = select(leakyrelu_coefficient * in, in, in >= (DATA_TYPE)0);
#endif
return out;
}
......
......@@ -9,6 +9,7 @@ __kernel void conv_2d(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
......@@ -123,11 +124,11 @@ __kernel void conv_2d(OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
const int out_x_base = mul24(out_ch_blk, out_width);
......
......@@ -9,6 +9,7 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
......@@ -96,11 +97,11 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
filter_x_base += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
const int out_x_base = mul24(out_ch_blk, width);
......
......@@ -17,6 +17,7 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
__private const int stride_h,
__private const int stride_w,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
......@@ -79,9 +80,9 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
in_offset += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
......
......@@ -9,6 +9,7 @@ __kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const int in_height,
__private const int in_width,
__private const int in_ch_blks,
......@@ -128,12 +129,12 @@ __kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
out4 = do_activation(out4, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
out4 = do_activation(out4, relux_max_limit, leakyrelu_coefficient);
#endif
const int out_x_base = mul24(out_ch_blk, out_width);
......
......@@ -22,6 +22,7 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
......@@ -107,11 +108,11 @@ __kernel void conv2d(BUFFER_OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
......
......@@ -9,6 +9,7 @@ __kernel void deconv_2d(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const int in_height,
__private const int in_width,
__private const int in_channels,
......@@ -127,12 +128,12 @@ __kernel void deconv_2d(OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
out4 = do_activation(out4, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
out4 = do_activation(out4, relux_max_limit, leakyrelu_coefficient);
#endif
int2 out_pos;
......
......@@ -10,6 +10,7 @@ __kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const short in_height,
__private const short in_width,
__private const short in_ch_blks,
......@@ -112,11 +113,11 @@ __kernel void depthwise_conv2d(OUT_OF_RANGE_PARAMS
in_hb_idx += dilation_h;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
const short out_x_base = mul24(out_ch_blk, out_width);
......@@ -145,6 +146,7 @@ __kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const DATA_TYPE relux_max_limit,
__private const DATA_TYPE leakyrelu_coefficient,
__private const short in_height,
__private const short in_width,
__private const short in_ch_blks,
......@@ -238,11 +240,11 @@ __kernel void depthwise_conv2d_s1(OUT_OF_RANGE_PARAMS
in_hb_idx += 1;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
const short out_x_base = mul24(out_ch_blk, out_width);
......
......@@ -22,6 +22,7 @@ __kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
__private const int dilated_h_offset,
__private const int dilated_w_offset,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__global OUT_DATA_TYPE *output) {
const int out_wc_blk_idx = get_global_id(0);
const int out_hb_idx = get_global_id(1);
......@@ -85,11 +86,11 @@ __kernel void depthwise_conv2d(BUFFER_OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
#endif
int out_offset = mad24(mad24(mad24(batch_idx, out_height, out_height_idx),
......
......@@ -9,6 +9,7 @@ __kernel void depthwise_deconv2d(OUT_OF_RANGE_PARAMS
#endif
__write_only image2d_t output,
__private const float relux_max_limit,
__private const float leakyrelu_coefficient,
__private const int in_height,
__private const int in_width,
__private const int out_height,
......@@ -108,12 +109,12 @@ __kernel void depthwise_deconv2d(OUT_OF_RANGE_PARAMS
}
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit);
out1 = do_activation(out1, relux_max_limit);
out2 = do_activation(out2, relux_max_limit);
out3 = do_activation(out3, relux_max_limit);
out4 = do_activation(out4, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0 = do_activation(out0, relux_max_limit, leakyrelu_coefficient);
out1 = do_activation(out1, relux_max_limit, leakyrelu_coefficient);
out2 = do_activation(out2, relux_max_limit, leakyrelu_coefficient);
out3 = do_activation(out3, relux_max_limit, leakyrelu_coefficient);
out4 = do_activation(out4, relux_max_limit, leakyrelu_coefficient);
#endif
......@@ -146,4 +147,4 @@ __kernel void depthwise_deconv2d(OUT_OF_RANGE_PARAMS
out_pos.x += stride_w;
WRITE_IMAGET(output, out_pos, out4);
}
}
\ No newline at end of file
}
......@@ -12,7 +12,8 @@ __kernel void fully_connected(OUT_OF_RANGE_PARAMS
__private const int input_height,
__private const int input_width,
__private const int input_channel,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
const int batch_idx = get_global_id(0);
const int out_blk_idx = get_global_id(1);
const int input_chan_blk = (input_channel + 3) >> 2;
......@@ -56,8 +57,8 @@ __kernel void fully_connected(OUT_OF_RANGE_PARAMS
input_coord.y++;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit, leakyrelu_coefficient);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
......@@ -77,7 +78,8 @@ __kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
__private const int input_width,
__private const int in_chan_blks,
__private const int out_blks,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
const int inter_out_idx = get_global_id(0);
const int width_blk_idx = get_global_id(1);
const int width_blk_count = global_size_dim1;
......@@ -147,8 +149,8 @@ __kernel void fully_connected_width(OUT_OF_RANGE_PARAMS
inter_idx += 4;
}
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
result = do_activation(result, relux_max_limit, leakyrelu_coefficient);
#endif
WRITE_IMAGET(output, (int2)(out_blk_idx, batch_idx), result);
......
......@@ -127,7 +127,8 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
__private const int out_width,
__private const int round_hw,
__private const int round_w,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
......@@ -203,11 +204,11 @@ __kernel void winograd_inverse_transform_2x2(OUT_OF_RANGE_PARAMS
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
in0[0] = do_activation(in0[0], relux_max_limit);
in0[1] = do_activation(in0[1], relux_max_limit);
in1[0] = do_activation(in1[0], relux_max_limit);
in1[1] = do_activation(in1[1], relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
in0[0] = do_activation(in0[0], relux_max_limit, leakyrelu_coefficient);
in0[1] = do_activation(in0[1], relux_max_limit, leakyrelu_coefficient);
in1[0] = do_activation(in1[0], relux_max_limit, leakyrelu_coefficient);
in1[1] = do_activation(in1[1], relux_max_limit, leakyrelu_coefficient);
#endif
WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]);
......@@ -395,7 +396,8 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
__private const int out_width,
__private const int round_hw,
__private const int round_w,
__private const float relux_max_limit) {
__private const float relux_max_limit,
__private const float leakyrelu_coefficient) {
const int width_idx = get_global_id(0);
const int height_idx = get_global_id(1);
......@@ -515,23 +517,23 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
out3[3] += bias_value;
#endif
#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0[0] = do_activation(out0[0], relux_max_limit);
out0[1] = do_activation(out0[1], relux_max_limit);
out0[2] = do_activation(out0[2], relux_max_limit);
out0[3] = do_activation(out0[3], relux_max_limit);
out1[0] = do_activation(out1[0], relux_max_limit);
out1[1] = do_activation(out1[1], relux_max_limit);
out1[2] = do_activation(out1[2], relux_max_limit);
out1[3] = do_activation(out1[3], relux_max_limit);
out2[0] = do_activation(out2[0], relux_max_limit);
out2[1] = do_activation(out2[1], relux_max_limit);
out2[2] = do_activation(out2[2], relux_max_limit);
out2[3] = do_activation(out2[3], relux_max_limit);
out3[0] = do_activation(out3[0], relux_max_limit);
out3[1] = do_activation(out3[1], relux_max_limit);
out3[2] = do_activation(out3[2], relux_max_limit);
out3[3] = do_activation(out3[3], relux_max_limit);
#if defined(USE_RELU) || defined(USE_LEAKYRELU) || defined(USE_RELUX) || defined(USE_TANH) || defined(USE_SIGMOID)
out0[0] = do_activation(out0[0], relux_max_limit, leakyrelu_coefficient);
out0[1] = do_activation(out0[1], relux_max_limit, leakyrelu_coefficient);
out0[2] = do_activation(out0[2], relux_max_limit, leakyrelu_coefficient);
out0[3] = do_activation(out0[3], relux_max_limit, leakyrelu_coefficient);
out1[0] = do_activation(out1[0], relux_max_limit, leakyrelu_coefficient);
out1[1] = do_activation(out1[1], relux_max_limit, leakyrelu_coefficient);
out1[2] = do_activation(out1[2], relux_max_limit, leakyrelu_coefficient);
out1[3] = do_activation(out1[3], relux_max_limit, leakyrelu_coefficient);
out2[0] = do_activation(out2[0], relux_max_limit, leakyrelu_coefficient);
out2[1] = do_activation(out2[1], relux_max_limit, leakyrelu_coefficient);
out2[2] = do_activation(out2[2], relux_max_limit, leakyrelu_coefficient);
out2[3] = do_activation(out2[3], relux_max_limit, leakyrelu_coefficient);
out3[0] = do_activation(out3[0], relux_max_limit, leakyrelu_coefficient);
out3[1] = do_activation(out3[1], relux_max_limit, leakyrelu_coefficient);
out3[2] = do_activation(out3[2], relux_max_limit, leakyrelu_coefficient);
out3[3] = do_activation(out3[3], relux_max_limit, leakyrelu_coefficient);
#endif
const int num = min(4, out_width - out_width_idx);
......@@ -556,4 +558,4 @@ __kernel void winograd_inverse_transform_4x4(OUT_OF_RANGE_PARAMS
for (int i = 0; i < num; ++i) {
WRITE_IMAGET(output, (int2)(coord_x + i, coord_y + 3), out3[i]);
}
}
\ No newline at end of file
}
......@@ -45,6 +45,7 @@ class OpenCLConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int winograd_blk_size,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLConv2dKernel);
......
......@@ -36,6 +36,7 @@ class OpenCLDeconv2dKernel {
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDeconv2dKernel);
......
......@@ -38,6 +38,7 @@ class OpenCLDepthwiseConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseConv2dKernel);
};
......
......@@ -39,6 +39,7 @@ class OpenCLDepthwiseDeconv2dKernel {
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector <index_t> &output_shape,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLDepthwiseDeconv2dKernel);
......
......@@ -35,6 +35,7 @@ class OpenCLFullyConnectedKernel {
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) = 0;
MACE_EMPTY_VIRTUAL_DESTRUCTOR(OpenCLFullyConnectedKernel);
};
......
......@@ -35,8 +35,10 @@ template <typename T>
class ActivationKernel : public OpenCLActivationKernel {
public:
ActivationKernel(ActivationType type,
T relux_max_limit)
: activation_(type), relux_max_limit_(relux_max_limit) {}
T relux_max_limit,
T leakyrelu_coefficient)
: activation_(type), relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
MaceStatus Compute(
OpContext *context,
......@@ -47,6 +49,7 @@ class ActivationKernel : public OpenCLActivationKernel {
private:
ActivationType activation_;
T relux_max_limit_;
T leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
......@@ -128,6 +131,7 @@ MaceStatus ActivationKernel<T>::Compute(
kernel_.setArg(idx++, *(alpha->opencl_image()));
}
kernel_.setArg(idx++, static_cast<float>(relux_max_limit_));
kernel_.setArg(idx++, static_cast<float>(leakyrelu_coefficient_));
kernel_.setArg(idx++, *(output->opencl_image()));
input_shape_ = input->shape();
......
......@@ -37,7 +37,8 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
BatchNormKernel(
const float epsilon,
const ActivationType activation,
const float relux_max_limit);
const float relux_max_limit,
const float leakyrelu_coefficient);
MaceStatus Compute(OpContext *context,
const Tensor *input,
const Tensor *scale,
......@@ -50,6 +51,7 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
const float epsilon_;
const ActivationType activation_;
const float relux_max_limit_;
const float leakyrelu_coefficient_;
cl::Kernel kernel_;
uint32_t kwg_size_;
std::vector<index_t> input_shape_;
......@@ -58,10 +60,12 @@ class BatchNormKernel : public OpenCLBatchNormKernel {
template <typename T>
BatchNormKernel<T>::BatchNormKernel(const float epsilon,
const ActivationType activation,
const float relux_max_limit)
const float relux_max_limit,
const float leakyrelu_coefficient)
: epsilon_(epsilon),
activation_(activation),
relux_max_limit_(relux_max_limit) {}
relux_max_limit_(relux_max_limit),
leakyrelu_coefficient_(leakyrelu_coefficient) {}
template <typename T>
MaceStatus BatchNormKernel<T>::Compute(
......@@ -115,6 +119,9 @@ MaceStatus BatchNormKernel<T>::Compute(
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation_;
}
......@@ -140,6 +147,7 @@ MaceStatus BatchNormKernel<T>::Compute(
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit_);
kernel_.setArg(idx++, leakyrelu_coefficient_);
input_shape_ = input->shape();
}
......
......@@ -38,6 +38,7 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -53,6 +54,7 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -68,6 +70,7 @@ extern MaceStatus Conv2d(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -81,6 +84,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const int *padding,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size,
std::vector<index_t> *prev_input_shape,
......@@ -109,6 +113,7 @@ class Conv2dKernel : public OpenCLConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) override;
......@@ -169,6 +174,7 @@ MaceStatus Conv2dKernel<T>::Compute(
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const int wino_blk_size,
Tensor *output) {
index_t kernel_h = filter->dim(2);
......@@ -217,6 +223,7 @@ MaceStatus Conv2dKernel<T>::Compute(
paddings.data(),
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
wino_blk_size,
&input_shape_,
......@@ -235,6 +242,7 @@ MaceStatus Conv2dKernel<T>::Compute(
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
......@@ -252,6 +260,7 @@ MaceStatus Conv2dKernel<T>::Compute(
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
......@@ -269,6 +278,7 @@ MaceStatus Conv2dKernel<T>::Compute(
dilations,
activation,
relux_max_limit,
leakyrelu_coefficient,
DataTypeToEnum<T>::value,
&input_shape_,
output,
......
......@@ -76,6 +76,7 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -125,6 +126,9 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -154,6 +158,7 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
kernel->setArg(idx++, *(output->opencl_image()));
// FIXME handle flexible data type: half not supported
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, static_cast<int>(input_height));
kernel->setArg(idx++, static_cast<int>(input_width));
kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
......
......@@ -69,6 +69,7 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -110,6 +111,9 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -138,6 +142,7 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
}
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, static_cast<int>(input->dim(1)));
kernel->setArg(idx++, static_cast<int>(input->dim(2)));
kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
......
......@@ -77,6 +77,7 @@ extern MaceStatus Conv2d(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -118,6 +119,9 @@ extern MaceStatus Conv2d(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -146,6 +150,7 @@ extern MaceStatus Conv2d(OpContext *context,
}
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(1)));
kernel->setArg(idx++, static_cast<uint32_t>(input->dim(2)));
kernel->setArg(idx++, static_cast<uint32_t>(input_channel_blocks));
......
......@@ -42,6 +42,7 @@ class Deconv2dKernel : public OpenCLDeconv2dKernel {
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) override;
......@@ -61,6 +62,7 @@ MaceStatus Deconv2dKernel<T>::Compute(
const int *padding_data,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
std::vector<size_t> output_image_shape;
......@@ -119,6 +121,9 @@ MaceStatus Deconv2dKernel<T>::Compute(
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -146,6 +151,7 @@ MaceStatus Deconv2dKernel<T>::Compute(
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(3)));
......
......@@ -73,6 +73,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -126,6 +127,9 @@ MaceStatus DepthwiseConv2d(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -159,6 +163,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
}
kernel->setArg(idx++, *(output->opencl_image()));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
kernel->setArg(idx++, static_cast<int16_t>(input_height));
kernel->setArg(idx++, static_cast<int16_t>(input_width));
kernel->setArg(idx++, static_cast<int16_t>(input_channel_blocks));
......
......@@ -39,6 +39,7 @@ MaceStatus DepthwiseConv2d(OpContext *context,
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
std::vector<index_t> *prev_input_shape,
Tensor *output,
......@@ -60,6 +61,7 @@ class DepthwiseConv2dKernel : public OpenCLDepthwiseConv2dKernel {
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) override;
private:
......@@ -80,6 +82,7 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
const int *dilations,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
index_t kernel_h = filter->dim(2);
index_t kernel_w = filter->dim(3);
......@@ -118,8 +121,8 @@ MaceStatus DepthwiseConv2dKernel<T>::Compute(
return depthwise::DepthwiseConv2d(
context, &kernel_, input, filter, bias, strides[0], paddings.data(),
dilations, activation, relux_max_limit, DataTypeToEnum<T>::value,
&input_shape_, output, &kwg_size_);
dilations, activation, relux_max_limit, leakyrelu_coefficient,
DataTypeToEnum<T>::value, &input_shape_, output, &kwg_size_);
}
} // namespace image
......
......@@ -43,6 +43,7 @@ class DepthwiseDeconv2dKernel : public OpenCLDepthwiseDeconv2dKernel {
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) override;
......@@ -63,6 +64,7 @@ MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
const int group,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const std::vector<index_t> &output_shape,
Tensor *output) {
const index_t batch = output_shape[0];
......@@ -125,6 +127,9 @@ MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -152,6 +157,7 @@ MaceStatus DepthwiseDeconv2dKernel<T>::Compute(
}
kernel_.setArg(idx++, *(output->opencl_image()));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(1)));
kernel_.setArg(idx++, static_cast<int32_t>(input->dim(2)));
kernel_.setArg(idx++, static_cast<int32_t>(height));
......
......@@ -40,6 +40,7 @@ class FullyConnectedKernel : public OpenCLFullyConnectedKernel {
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) override;
private:
......@@ -57,6 +58,7 @@ MaceStatus FullyConnectedKernel<T>::Compute(
const Tensor *bias,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
Tensor *output) {
std::vector<index_t> output_shape = {input->dim(0), 1, 1, weight->dim(0)};
std::vector<size_t> output_image_shape;
......@@ -98,6 +100,9 @@ MaceStatus FullyConnectedKernel<T>::Compute(
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -148,6 +153,7 @@ MaceStatus FullyConnectedKernel<T>::Compute(
kernel_.setArg(idx++, static_cast<int>(RoundUpDiv4(input->dim(3))));
kernel_.setArg(idx++, static_cast<int>(output_blocks));
kernel_.setArg(idx++, relux_max_limit);
kernel_.setArg(idx++, leakyrelu_coefficient);
input_shape_ = input->shape();
}
......
......@@ -115,6 +115,7 @@ MaceStatus WinogradOutputTransform(OpContext *context,
const int wino_blk_size,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const bool input_changed,
Tensor *output_tensor,
uint32_t *kwg_size,
......@@ -164,6 +165,9 @@ MaceStatus WinogradOutputTransform(OpContext *context,
case SIGMOID:
built_options.emplace("-DUSE_SIGMOID");
break;
case LEAKYRELU:
built_options.emplace("-DUSE_LEAKYRELU");
break;
default:
LOG(FATAL) << "Unknown activation type: " << activation;
}
......@@ -199,6 +203,7 @@ MaceStatus WinogradOutputTransform(OpContext *context,
kernel->setArg(idx++, static_cast<uint32_t>(round_h * round_w));
kernel->setArg(idx++, static_cast<uint32_t>(round_w));
kernel->setArg(idx++, relux_max_limit);
kernel->setArg(idx++, leakyrelu_coefficient);
}
const std::vector<uint32_t> lws = {*kwg_size / 8, 8, 0};
std::string tuning_key =
......@@ -222,6 +227,7 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
const int *paddings,
const ActivationType activation,
const float relux_max_limit,
const float leakyrelu_coefficient,
const DataType dt,
const int wino_blk_size,
std::vector<index_t> *prev_input_shape,
......@@ -338,7 +344,8 @@ extern MaceStatus WinogradConv2dK3x3S1(OpContext *context,
MACE_RETURN_IF_ERROR(WinogradOutputTransform(
context, kernels[2], mm_output.get(), bias,
dt, round_h, round_w, wino_blk_size, activation, relux_max_limit,
input_changed, output, kwg_size[2], &t_output_future))
leakyrelu_coefficient, input_changed, output, kwg_size[2],
&t_output_future))
MergeMultipleFutureWaitFn({t_input_future, mm_future, t_output_future},
context->future());
......
......@@ -168,6 +168,7 @@ class MaceKeyword(object):
mace_element_type_str = 'type'
mace_activation_type_str = 'activation'
mace_activation_max_limit_str = 'max_limit'
mace_activation_leakyrelu_coefficient_str = 'leakyrelu_coefficient'
mace_resize_size_str = 'size'
mace_batch_to_space_crops_str = 'crops'
mace_paddings_str = 'paddings'
......
......@@ -493,6 +493,14 @@ class CaffeConverter(base_converter.ConverterInterface):
mace_pb2.DT_FLOAT, alpha_data)
op.input.extend([alpha_tensor_name])
negative_slope = caffe_op.layer.relu_param.negative_slope
if caffe_op.type == 'ReLU' and negative_slope != 0:
param_arg = op.arg.add()
param_arg.name = MaceKeyword.mace_activation_leakyrelu_coefficient_str # noqa
param_arg.f = caffe_op.layer.relu_param.negative_slope
type_arg.s = six.b(ActivationType.LEAKYRELU.name)
def convert_folded_batchnorm(self, caffe_op):
op = self.convert_general_op(caffe_op)
op.type = MaceOp.BatchNorm.name
......
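
As an illustration only (the layer and blob names here are hypothetical, not part of this change), a Caffe ReLU layer with a non-zero `negative_slope` that the converter now maps to MACE's `LEAKYRELU` activation might look like:

```
layer {
  name: "relu1"            # hypothetical layer name
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
  relu_param {
    negative_slope: 0.1    # copied into the op's leakyrelu_coefficient arg
  }
}
```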
......@@ -286,10 +286,10 @@ class OnnxConverter(base_converter.ConverterInterface):
activation_type = {
OnnxOpType.Relu.name: ActivationType.RELU,
OnnxOpType.LeakyRelu.name: ActivationType.LEAKYRELU,
OnnxOpType.PRelu.name: ActivationType.PRELU,
OnnxOpType.Tanh.name: ActivationType.TANH,
OnnxOpType.Sigmoid.name: ActivationType.SIGMOID,
OnnxOpType.LeakyRelu.name: ActivationType.LEAKYRELU,
}
def __init__(self, option, src_model_file):
......
......@@ -898,7 +898,9 @@ class Transformer(base_converter.ConverterInterface):
op.output[0] = consumer_op.output[0]
for arg in consumer_op.arg:
if arg.name == MaceKeyword.mace_activation_type_str \
or arg.name == MaceKeyword.mace_activation_max_limit_str: # noqa
or arg.name == \
MaceKeyword.mace_activation_max_limit_str \
or arg.name == MaceKeyword.mace_activation_leakyrelu_coefficient_str: # noqa
op.arg.extend([arg])
self.replace_quantize_info(op, consumer_op)
......