diff --git a/mace/core/operator.cc b/mace/core/operator.cc
index 94e4f22fc1fac73400b7b0fec0f4d466d2731113..7c4d922ba8065b31ca84812d07de7ad0a771a792 100644
--- a/mace/core/operator.cc
+++ b/mace/core/operator.cc
@@ -77,7 +77,6 @@ extern void Register_Pooling(OperatorRegistry *op_registry);
 extern void Register_ResizeBilinear(OperatorRegistry *op_registry);
 extern void Register_Softmax(OperatorRegistry *op_registry);
 extern void Register_SpaceToBatchND(OperatorRegistry *op_registry);
-extern void Register_FoldedBatchNorm(OperatorRegistry *op_registry);
 extern void Register_GEMM(OperatorRegistry *op_registry);
 extern void Register_WinogradTransform(OperatorRegistry *op_registry);
 extern void Register_WinogradInverseTransform(OperatorRegistry *op_registry);
@@ -101,7 +100,6 @@ OperatorRegistry::OperatorRegistry() {
   Register_ResizeBilinear(this);
   Register_Softmax(this);
   Register_SpaceToBatchND(this);
-  Register_FoldedBatchNorm(this);
   Register_GEMM(this);
   Register_WinogradTransform(this);
   Register_WinogradInverseTransform(this);
diff --git a/mace/core/registry.h b/mace/core/registry.h
index 5c82ef2e4b60ce3645e6c5708a2b7442f9e8a85e..5a233bcd88815a12e533049dae552b4b93434d9c 100644
--- a/mace/core/registry.h
+++ b/mace/core/registry.h
@@ -19,7 +19,7 @@ class Registry {
   void Register(const SrcType &key, Creator creator) {
     VLOG(2) << "Registering: " << key;
     std::lock_guard<std::mutex> lock(register_mutex_);
-    MACE_CHECK(registry_.count(key) == 0, "Key already registered.");
+    MACE_CHECK(registry_.count(key) == 0, "Key already registered: ", key);
     registry_[key] = creator;
   }
diff --git a/mace/kernels/opencl/cl/winograd_transform.cl b/mace/kernels/opencl/cl/winograd_transform.cl
index e5f43411434b05c0c67717dac209ab3b429f3cac..daecd39f9d5e4e45f835166e86b1daba8428574b 100644
--- a/mace/kernels/opencl/cl/winograd_transform.cl
+++ b/mace/kernels/opencl/cl/winograd_transform.cl
@@ -107,17 +107,35 @@ __kernel void winograd_transform_2x2(__read_only image2d_t input,
 }
 
 __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
+#ifdef BIAS
+                                             __read_only image2d_t bias, /* cout%4 * cout/4 */
+#endif
                                              __write_only image2d_t output,
                                              __private const int out_height,
                                              __private const int out_width,
                                              __private const int round_hw,
-                                             __private const int round_w) {
+                                             __private const int round_w,
+                                             __private const DATA_TYPE relux_max_limit,
+                                             __private const DATA_TYPE prelu_alpha) {
   const int width_idx = get_global_id(0);
   const int height_idx = get_global_id(1);
   const int out_channel = get_global_size(1);
 
   int width = width_idx;
   int height = height_idx;
 
+  const int batch = width_idx / round_hw;
+  int t = width_idx % round_hw;
+  const int out_height_idx = (t / round_w) << 1;
+  const int out_width_idx = (t % round_w) << 1;
+  const int out_chan_idx = height_idx;
+  const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
+  const int coord_y = mad24(batch, out_height, out_height_idx);
+
+#ifdef BIAS
+  DATA_TYPE4 bias_value =
+      READ_IMAGET(bias, SAMPLER, (int2)(out_chan_idx, 0));
+#endif
+
   DATA_TYPE4 in0[4], in1[4], in2[4], in3[4];
 #pragma unroll
@@ -157,13 +175,20 @@ __kernel void winograd_inverse_transform_2x2(__read_only image2d_t input,
   in1[0] = in1[0] + in1[1] + in1[2];
   in1[1] = in1[1] - in1[2] - in1[3];
 
-  const int batch = width_idx / round_hw;
-  int t = width_idx % round_hw;
-  const int out_height_idx = (t / round_w) << 1;
-  const int out_width_idx = (t % round_w) << 1;
-  const int out_chan_idx = height_idx;
-  const int coord_x = mad24(out_chan_idx, out_width, out_width_idx);
-  const int coord_y = mad24(batch, out_height, out_height_idx);
+#ifdef BIAS
+  in0[0] += bias_value;
+  in0[1] += bias_value;
+  in1[0] += bias_value;
+  in1[1] += bias_value;
+#endif
+
+
+#if defined(USE_RELU) || defined(USE_RELUX) || defined(USE_PRELU) || defined(USE_TANH) || defined(USE_SIGMOID)
+  in0[0] = do_activation(in0[0], relux_max_limit, prelu_alpha);
+  in0[1] = do_activation(in0[1], relux_max_limit, prelu_alpha);
+  in1[0] = do_activation(in1[0], relux_max_limit, prelu_alpha);
+  in1[1] = do_activation(in1[1], relux_max_limit, prelu_alpha);
+#endif
 
   WRITE_IMAGET(output, (int2)(coord_x, coord_y), in0[0]);
diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc
index 78ce863a92124ead787fe9bf57c27b551823248f..10c31c7fe7615c7b43e07e07f138fbf30106c214 100644
--- a/mace/kernels/opencl/winograd_transform.cc
+++ b/mace/kernels/opencl/winograd_transform.cc
@@ -109,6 +109,7 @@ void WinogradTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *i
 template <typename T>
 void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input_tensor,
+                                                                        const Tensor *bias,
                                                                         Tensor *output_tensor,
                                                                         StatsFuture *future) {
   std::vector<index_t> output_shape = {batch_, height_, width_, input_tensor->dim(1)};
@@ -121,10 +122,29 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
   built_options.emplace("-Dwinograd_inverse_transform_2x2=" + obfuscated_kernel_name);
   built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(DataTypeToEnum<T>::value));
   built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum<T>::value));
-  if ((input_tensor->dim(1) % 4 == 0 || input_tensor->dim(0) == 1) &&
-      input_tensor->dim(2) % 4 == 0) {
-    built_options.emplace("-DDIVISIBLE_FOUR");
+  built_options.emplace(bias != nullptr ? "-DBIAS" : "");
+  switch (activation_) {
+    case NOOP:
+      break;
+    case RELU:
+      built_options.emplace("-DUSE_RELU");
+      break;
+    case RELUX:
+      built_options.emplace("-DUSE_RELUX");
+      break;
+    case PRELU:
+      built_options.emplace("-DUSE_PRELU");
+      break;
+    case TANH:
+      built_options.emplace("-DUSE_TANH");
+      break;
+    case SIGMOID:
+      built_options.emplace("-DUSE_SIGMOID");
+      break;
+    default:
+      LOG(FATAL) << "Unknown activation type: " << activation_;
   }
+
   auto runtime = OpenCLRuntime::Global();
   auto wino_kernel = runtime->BuildKernel("winograd_transform",
                                           obfuscated_kernel_name,
@@ -134,11 +154,16 @@ void WinogradInverseTransformFunctor<DeviceType::OPENCL, T>::operator()(const Te
   const uint32_t round_w = (width_ + 1) / 2;
   uint32_t idx = 0;
   wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input_tensor->buffer())));
+  if (bias != nullptr) {
+    wino_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(bias->buffer())));
+  }
   wino_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output_tensor->buffer())));
   wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[1]));
   wino_kernel.setArg(idx++, static_cast<uint32_t>(output_shape[2]));
   wino_kernel.setArg(idx++, static_cast<uint32_t>(round_h * round_w));
   wino_kernel.setArg(idx++, static_cast<uint32_t>(round_w));
+  wino_kernel.setArg(idx++, relux_max_limit_);
+  wino_kernel.setArg(idx++, prelu_alpha_);
 
   const size_t gws[2] = {static_cast<size_t>(input_tensor->dim(2)),
                          static_cast<size_t>(RoundUpDiv4(input_tensor->dim(1)))};
diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h
index 62284a074048e4ffa406e0d31b9fc8972bc4fe70..a71bda24b120f3eab77171dd2836c606151b6486 100644
--- a/mace/kernels/winograd_transform.h
+++ b/mace/kernels/winograd_transform.h
@@ -8,6 +8,7 @@
 #include "mace/core/future.h"
 #include "mace/core/tensor.h"
 #include "mace/kernels/conv_pool_2d_util.h"
+#include "mace/kernels/activation.h"
 
 namespace mace {
 namespace kernels {
@@ -47,22 +48,37 @@ struct WinogradTransformFunctor<DeviceType::OPENCL, T> : WinogradTransformFuncto
 struct WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctorBase(const int batch,
                                       const int height,
-                                      const int width)
-      : batch_(batch), height_(height), width_(width) {}
+                                      const int width,
+                                      const ActivationType activation,
+                                      const float relux_max_limit,
+                                      const float prelu_alpha)
+      : batch_(batch),
+        height_(height),
+        width_(width),
+        activation_(activation),
+        relux_max_limit_(relux_max_limit),
+        prelu_alpha_(prelu_alpha) {}
 
   const int batch_;
   const int height_;
   const int width_;
+  const ActivationType activation_;
+  const float relux_max_limit_;
+  const float prelu_alpha_;
 };
 
 template <DeviceType D, typename T>
 struct WinogradInverseTransformFunctor : WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctor(const int batch,
                                   const int height,
-                                  const int width)
-      : WinogradInverseTransformFunctorBase(batch, height, width) {}
+                                  const int width,
+                                  const ActivationType activation,
+                                  const float relux_max_limit,
+                                  const float prelu_alpha)
+      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}
 
   void operator()(const Tensor *input,
+                  const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future) {
     MACE_NOT_IMPLEMENTED;
@@ -74,10 +90,14 @@ template <typename T>
 struct WinogradInverseTransformFunctor<DeviceType::OPENCL, T> : WinogradInverseTransformFunctorBase {
   WinogradInverseTransformFunctor(const int batch,
                                   const int height,
-                                  const int width)
-      : WinogradInverseTransformFunctorBase(batch, height, width) {}
+                                  const int width,
+                                  const ActivationType activation,
+                                  const float relux_max_limit,
+                                  const float prelu_alpha)
+      : WinogradInverseTransformFunctorBase(batch, height, width, activation, relux_max_limit, prelu_alpha) {}
 
   void operator()(const Tensor *input,
+                  const Tensor *bias,
                   Tensor *output,
                   StatsFuture *future);
 };
diff --git a/mace/ops/activation_benchmark.cc b/mace/ops/activation_benchmark.cc
index 63d0cf7fa3dd6545d98db7a3c834ac065268eead..8010bc24dea8effe2750826e9d1c2bc8bb99fe9e 100644
--- a/mace/ops/activation_benchmark.cc
+++ b/mace/ops/activation_benchmark.cc
@@ -20,7 +20,7 @@ static void ReluBenchmark(
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT);
+                        kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluBM")
         .Input("InputImage")
@@ -79,7 +79,7 @@ static void ReluxBenchmark(
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT);
+                        kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxBM")
         .Input("InputImage")
@@ -140,7 +140,7 @@ static void PreluBenchmark(
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT);
+                        kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "PreluBM")
         .Input("InputImage")
@@ -201,7 +201,7 @@ static void TanhBenchmark(
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT);
+                        kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "TanhBM")
         .Input("InputImage")
@@ -260,7 +260,7 @@ static void SigmoidBenchmark(
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, T>(net, "Input", "InputImage",
-                        kernels::BufferType::IN_OUT);
+                        kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "SigmoidBM")
         .Input("InputImage")
diff --git a/mace/ops/activation_test.cc b/mace/ops/activation_test.cc
index e99579ab4bd7be3bc9f4af17351174284ef53acd..2fd1078c88ef3151268c2ff548a281bfb1bf3b3e 100644
--- a/mace/ops/activation_test.cc
+++ b/mace/ops/activation_test.cc
@@ -20,7 +20,7 @@ void TestSimpleRelu() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluTest")
         .Input("InputImage")
@@ -33,7 +33,7 @@ void TestSimpleRelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluTest")
         .Input("Input")
@@ -70,7 +70,7 @@ void TestUnalignedSimpleRelu() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluTest")
         .Input("InputImage")
@@ -83,7 +83,7 @@ void TestUnalignedSimpleRelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluTest")
        .Input("Input")
@@ -125,7 +125,7 @@ void TestSimpleRelux() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxTest")
         .Input("InputImage")
@@ -139,7 +139,7 @@ void TestSimpleRelux() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluxTest")
         .Input("Input")
@@ -179,7 +179,7 @@ void TestSimpleReluRelux() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "ReluxTest")
         .Input("InputImage")
@@ -193,7 +193,7 @@ void TestSimpleReluRelux() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "ReluxTest")
         .Input("Input")
@@ -237,7 +237,7 @@ void TestSimplePrelu() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "PreluTest")
         .Input("InputImage")
@@ -251,7 +251,7 @@ void TestSimplePrelu() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "PreluTest")
         .Input("Input")
@@ -293,7 +293,7 @@ void TestSimpleTanh() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "TanhTest")
         .Input("InputImage")
@@ -306,7 +306,7 @@ void TestSimpleTanh() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "TanhTest")
         .Input("Input")
@@ -348,7 +348,7 @@ void TestSimpleSigmoid() {
   if (D == DeviceType::OPENCL) {
     BufferToImage<D, float>(net, "Input", "InputImage",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
 
     OpDefBuilder("Activation", "SigmoidTest")
         .Input("InputImage")
@@ -361,7 +361,7 @@ void TestSimpleSigmoid() {
 
     // Transfer output
     ImageToBuffer<D, float>(net, "OutputImage", "Output",
-                            kernels::BufferType::IN_OUT);
+                            kernels::BufferType::IN_OUT_CHANNEL);
   } else {
     OpDefBuilder("Activation", "SigmoidTest")
         .Input("Input")
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index 713e08ebb48d7455881209122a601b6b8e7d7c3a..b02eb17e63d5e1e3cf126b6be0284bdb9f73d954 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -96,17 +96,18 @@ static void Conv2d(int iters,
   BM_CONV_2D_MACRO(N, C, H, W, KH, KW, S, P, OC, TYPE, OPENCL);
 
 // ICNet
-//BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
+BM_CONV_2D(1, 512, 15, 15, 1, 1, 1, VALID, 1024, half);
 //// SNPE GPU ExecutionDuration = 448us, % ALU Utilization = 105
-//BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
+BM_CONV_2D(1, 64, 60, 60, 1, 1, 1, VALID, 128, half);
 //// SNPE GPU ExecutionDuration = 258us, % ALU Utilization = 108
-//BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
-//
-//BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
+BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
+
+BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
 //// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
-//BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
-//BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
-//BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
+BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
+BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
+BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
+
 BM_CONV_2D(1, 128, 16, 16, 3, 3, 1, VALID, 32, half);
 BM_CONV_2D(1, 128, 64, 64, 3, 3, 1, VALID, 32, half);
 BM_CONV_2D(1, 128, 128, 128, 3, 3, 1, VALID, 32, half);
diff --git a/mace/ops/folded_batch_norm.cc b/mace/ops/folded_batch_norm.cc
index 5a04c48dd8f2000c9a33b175ec5c67f4c4aebe81..9915bee4128f1e3766a91070d1cae48e044f459f 100644
--- a/mace/ops/folded_batch_norm.cc
+++ b/mace/ops/folded_batch_norm.cc
@@ -7,10 +7,11 @@ namespace mace {
 
 void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::CPU)
-                        .TypeConstraint<float>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry,
+                    OpKeyBuilder("FoldedBatchNorm")
+                        .Device(DeviceType::CPU)
+                        .TypeConstraint<float>("T")
+                        .Build(),
                     FoldedBatchNormOp<DeviceType::CPU, float>);
 
 #if MACE_ENABLE_NEON
@@ -21,16 +22,18 @@ void Register_FoldedBatchNorm(OperatorRegistry *op_registry) {
                     FoldedBatchNormOp<DeviceType::NEON, float>);
 #endif  // MACE_ENABLE_NEON
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::OPENCL)
-                        .TypeConstraint<float>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry,
+                    OpKeyBuilder("FoldedBatchNorm")
+                        .Device(DeviceType::OPENCL)
+                        .TypeConstraint<float>("T")
+                        .Build(),
                     FoldedBatchNormOp<DeviceType::OPENCL, float>);
 
-  REGISTER_OPERATOR(op_registry, OpKeyBuilder("FoldedBatchNorm")
-                        .Device(DeviceType::OPENCL)
-                        .TypeConstraint<half>("T")
-                        .Build(),
+  REGISTER_OPERATOR(op_registry,
+                    OpKeyBuilder("FoldedBatchNorm")
+                        .Device(DeviceType::OPENCL)
+                        .TypeConstraint<half>("T")
+                        .Build(),
                     FoldedBatchNormOp<DeviceType::OPENCL, half>);
 }
diff --git a/mace/ops/winograd_convolution_test.cc b/mace/ops/winograd_convolution_test.cc
index 3aa058a560742f0eb27c792541e55a8f8dc03f8a..d5e54a9746186cf64a9164d90b2e8d863af87a42 100644
--- a/mace/ops/winograd_convolution_test.cc
+++ b/mace/ops/winograd_convolution_test.cc
@@ -11,7 +11,6 @@ namespace mace {
 
 class WinogradConvlutionTest : public OpsTestBase {};
 
-
 void TransposeFilter(const std::vector<float> &input,
                      const std::vector<index_t> &input_shape,
                      std::vector<float> &output) {
@@ -48,14 +47,18 @@ void WinogradConvolution(const index_t batch,
   GenerateRandomRealTypeData(filter_shape, filter_data);
   net.AddRandomInput<D, float>("Input", {batch, height, width, in_channels});
   net.AddInputFromArray<D, float>("Filter", filter_shape, filter_data);
+  net.AddRandomInput<D, float>("Bias", {out_channels});
 
   BufferToImage<D, T>(net, "Input", "InputImage",
                       kernels::BufferType::IN_OUT_CHANNEL);
   BufferToImage<D, T>(net, "Filter", "FilterImage",
                       kernels::BufferType::FILTER);
+  BufferToImage<D, T>(net, "Bias", "BiasImage",
+                      kernels::BufferType::ARGUMENT);
 
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputImage")
       .Input("FilterImage")
+      .Input("BiasImage")
       .Output("OutputImage")
       .AddIntsArg("strides", {1, 1})
       .AddIntArg("padding", padding)
@@ -102,6 +105,7 @@ void WinogradConvolution(const index_t batch,
   // Inverse transform
   OpDefBuilder("WinogradInverseTransform", "WinogradInverseTransformTest")
       .Input("WinoGemm")
+      .Input("BiasImage")
       .AddIntArg("batch", batch)
       .AddIntArg("height", output_shape[1])
       .AddIntArg("width", output_shape[2])
@@ -113,7 +117,7 @@ void WinogradConvolution(const index_t batch,
   net.Sync();
 
   ImageToBuffer<D, T>(net, "WinoOutputImage", "WinoOutput",
-                     kernels::BufferType::IN_OUT_CHANNEL);
+                      kernels::BufferType::IN_OUT_CHANNEL);
   if (DataTypeToEnum<T>::value == DataType::DT_HALF) {
     ExpectTensorNear<float>(expected, *net.GetOutput("WinoOutput"), 1e-1);
   } else {
@@ -121,7 +125,6 @@ void WinogradConvolution(const index_t batch,
   }
 }
 
-
 TEST_F(WinogradConvlutionTest, AlignedConvolution) {
   WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::VALID);
   WinogradConvolution<DeviceType::OPENCL, float>(1, 32, 32, 32, 16, Padding::SAME);
diff --git a/mace/ops/winograd_inverse_transform.h b/mace/ops/winograd_inverse_transform.h
index c620246cd74b1c2d63488701cfd3e36151f5ed8b..4c20769f1fd461f393c1c57e58bc5f089197ed7c 100644
--- a/mace/ops/winograd_inverse_transform.h
+++ b/mace/ops/winograd_inverse_transform.h
@@ -9,6 +9,7 @@
 
 #include "mace/core/operator.h"
 #include "mace/kernels/winograd_transform.h"
+#include "mace/kernels/activation.h"
 
 namespace mace {
 
@@ -19,13 +20,18 @@ class WinogradInverseTransformOp : public Operator<D, T> {
       : Operator<D, T>(op_def, ws),
        functor_(OperatorBase::GetSingleArgument<int>("batch", 1),
                 OperatorBase::GetSingleArgument<int>("height", 0),
-                 OperatorBase::GetSingleArgument<int>("width", 0)) {}
+                 OperatorBase::GetSingleArgument<int>("width", 0),
+                 kernels::StringToActivationType(
+                     OperatorBase::GetSingleArgument<std::string>("activation",
+                                                                  "NOOP")),
+                 OperatorBase::GetSingleArgument<float>("max_limit", 0.0f),
+                 OperatorBase::GetSingleArgument<float>("alpha", 0.0f)) {}
 
   bool Run(StatsFuture *future) override {
     const Tensor *input_tensor = this->Input(INPUT);
+    const Tensor *bias = this->InputSize() == 2 ? this->Input(BIAS) : nullptr;
     Tensor *output_tensor = this->Output(OUTPUT);
-
-    functor_(input_tensor, output_tensor, future);
+    functor_(input_tensor, bias, output_tensor, future);
     return true;
   }
 
@@ -33,7 +39,7 @@ class WinogradInverseTransformOp : public Operator<D, T> {
   kernels::WinogradInverseTransformFunctor<D, T> functor_;
 
  protected:
-  OP_INPUT_TAGS(INPUT);
+  OP_INPUT_TAGS(INPUT, BIAS);
   OP_OUTPUT_TAGS(OUTPUT);
 };
diff --git a/mace/proto/BUILD b/mace/proto/BUILD
index 5222b06bda6e1681b15ac7f60317376c5d34fa3d..8649197b94508615dd395a991bccfe5205042804 100644
--- a/mace/proto/BUILD
+++ b/mace/proto/BUILD
@@ -10,15 +10,6 @@ licenses(["notice"])  # Apache 2.0
 
 load("@com_google_protobuf//:protobuf.bzl", "py_proto_library")
 
-py_proto_library(
-    name = "mace_py",
-    srcs = ["mace.proto"],
-    default_runtime = "@com_google_protobuf//:protobuf_python",
-    protoc = "@com_google_protobuf//:protoc",
-    srcs_version = "PY2AND3",
-    deps = ["@com_google_protobuf//:protobuf_python"],
-)
-
 py_proto_library(
     name = "caffe_py",
     srcs = ["caffe.proto"],