diff --git a/mace/kernels/arm/deconv_2d_neon.h b/mace/kernels/arm/deconv_2d_neon.h
index 87b86b066bb1274ca3a3350a85b4d409880d43d6..1cddbf1a264c161bb767eddc4f1a7b5f4192e33d 100644
--- a/mace/kernels/arm/deconv_2d_neon.h
+++ b/mace/kernels/arm/deconv_2d_neon.h
@@ -26,28 +26,24 @@ namespace kernels {
 
 void Deconv2dNeonK3x3S1(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output);
 
 void Deconv2dNeonK3x3S2(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output);
 
 void Deconv2dNeonK4x4S1(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output);
 
 void Deconv2dNeonK4x4S2(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output);
diff --git a/mace/kernels/arm/deconv_2d_neon_3x3.cc b/mace/kernels/arm/deconv_2d_neon_3x3.cc
index d4d7d0cdffe767ca8dd5bf8092bbce57b504d8f5..c8f5006bbb66fb1a56e7a706fca5208de51c0385 100644
--- a/mace/kernels/arm/deconv_2d_neon_3x3.cc
+++ b/mace/kernels/arm/deconv_2d_neon_3x3.cc
@@ -20,7 +20,6 @@ namespace kernels {
 
 void Deconv2dNeonK3x3S1(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output) {
@@ -40,12 +39,6 @@ void Deconv2dNeonK3x3S1(const float *input,
       if (oc + 1 < outch) {
         float *out_base0 = output + (b * outch + oc) * out_img_size;
         float *out_base1 = out_base0 + out_img_size;
-
-        const float bias_value0 = bias ? bias[oc] : 0.f;
-        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
-        std::fill_n(out_base0, out_img_size, bias_value0);
-        std::fill_n(out_base1, out_img_size, bias_value1);
-
         for (index_t ic = 0; ic < inch; ++ic) {
           const float *input_base = input + (b * inch + ic) * h * w;
           const float *kernel_base0 = filter + (oc * inch + ic) * 9;
@@ -197,8 +190,6 @@ void Deconv2dNeonK3x3S1(const float *input,
         }
       } else {
         float *out_base0 = output + (b * outch + oc) * outh * outw;
-        const float bias_value0 = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base0, outh * outw, bias_value0);
         for (index_t ic = 0; ic < inch; ++ic) {
           const float *input_base = input + (b * inch + ic) * h * w;
           const float *kernel_base0 = filter + (oc * inch + ic) * 9;
@@ -290,7 +281,6 @@ void Deconv2dNeonK3x3S1(const float *input,
 
 void Deconv2dNeonK3x3S2(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output) {
@@ -303,15 +293,11 @@ void Deconv2dNeonK3x3S2(const float *input,
   const index_t outw = out_shape[3];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for collapse(2)
+#pragma omp parallel for collapse(3)
   for (index_t b = 0; b < out_shape[0]; ++b) {
     for (index_t oc = 0; oc < outch; ++oc) {
-      float *out_base = output + (b * outch + oc) * out_img_size;
-
-      const float bias_value = bias ? bias[oc] : 0.f;
-      std::fill_n(out_base, out_img_size, bias_value);
-
       for (index_t ic = 0; ic < inch; ++ic) {
+        float *out_base = output + (b * outch + oc) * out_img_size;
         const float *input_base = input + (b * inch + ic) * h * w;
         const float *kernel_base = filter + (oc * inch + ic) * 9;
         const float *in = input_base;
diff --git a/mace/kernels/arm/deconv_2d_neon_4x4.cc b/mace/kernels/arm/deconv_2d_neon_4x4.cc
index 719a17e34fb1fce03276c59fa26a9f0476f3c75b..dd371ada1d223cdad6928d3e6cde6cf152ad2225 100644
--- a/mace/kernels/arm/deconv_2d_neon_4x4.cc
+++ b/mace/kernels/arm/deconv_2d_neon_4x4.cc
@@ -20,7 +20,6 @@ namespace kernels {
 
 void Deconv2dNeonK4x4S1(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output) {
@@ -32,16 +31,12 @@ void Deconv2dNeonK4x4S1(const float *input,
   const index_t outw = out_shape[3];
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
-#pragma omp parallel for
+#pragma omp parallel for collapse(2)
   for (int b = 0; b < out_shape[0]; ++b) {
     for (int oc = 0; oc < outch; oc += 2) {
       if (oc + 1 < outch) {
         float *out_base = output + (b * outch + oc) * out_img_size;
         float *out_base1 = out_base + out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
-        const float bias_value1 = bias ? bias[oc + 1] : 0.f;
-        std::fill_n(out_base1, out_img_size, bias_value1);
         for (int q = 0; q < inch; q++) {
           const float *input_base = input + (b * inch + q) * h * w;
           const float *in = input_base;
@@ -257,8 +252,6 @@ void Deconv2dNeonK4x4S1(const float *input,
         }
       } else {
         float *out_base = output + (b * outch + oc) * out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
         for (int q = 0; q < inch; q++) {
           const float *input_base = input + (b * inch + q) * h * w;
           const float *kernel_base = filter + (oc * inch + q) * 16;
@@ -381,7 +374,6 @@ void Deconv2dNeonK4x4S1(const float *input,
 
 void Deconv2dNeonK4x4S2(const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *out_shape,
                         float *output) {
@@ -394,14 +386,11 @@ void Deconv2dNeonK4x4S2(const float *input,
   const index_t outch = out_shape[1];
   const index_t out_img_size = outh * outw;
 
-#pragma omp parallel for
+#pragma omp parallel for collapse(3)
   for (int b = 0; b < out_shape[0]; ++b) {
     for (int p = 0; p < outch; p++) {
-      float *out_base = output + (b * outch + p) * out_img_size;
-      const float bias_value = bias ? bias[p] : 0.f;
-      std::fill_n(out_base, outh * outw, bias_value);
-
       for (int q = 0; q < inch; q++) {
+        float *out_base = output + (b * outch + p) * out_img_size;
         const float *input_base = input + (b * inch + q) * h * w;
         const float *kernel_base = filter + (p * inch + q) * 16;
         const float *in = input_base;
diff --git a/mace/kernels/deconv_2d.h b/mace/kernels/deconv_2d.h
index 3656fff8855dbd6001bf9c018c5d3b029acf982a..ab32679aaea4824152395d08d7ee13b26cf5d286 100644
--- a/mace/kernels/deconv_2d.h
+++ b/mace/kernels/deconv_2d.h
@@ -184,7 +184,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
 
   void Deconv2dGeneral(const float *input,
                        const float *filter,
-                       const float *bias,
                        const index_t kernel_h,
                        const index_t kernel_w,
                        const int *strides,
@@ -206,23 +205,25 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
       }
     }
 
-#pragma omp parallel for
-    for (int b = 0; b < in_shape[0]; ++b) {
-      for (int oc = 0; oc < out_shape[1]; ++oc) {
-        float *out_base =
-            output + (b * out_shape[1] + oc) * out_img_size;
-        const float bias_value = bias ? bias[oc] : 0.f;
-        std::fill_n(out_base, out_img_size, bias_value);
+    const index_t batch = in_shape[0];
+    const index_t out_channels = out_shape[1];
+    const index_t in_channels = in_shape[1];
+
+#pragma omp parallel for collapse(4)
+    for (int b = 0; b < batch; ++b) {
+      for (int oc = 0; oc < out_channels; ++oc) {
         for (int i = 0; i < in_height; ++i) {
           for (int j = 0; j < in_width; ++j) {
+            float *out_base =
+                output + (b * out_channels + oc) * out_img_size;
             const index_t out_offset =
                 i * strides[0] * out_width + j * strides[1];
-            for (int ic = 0; ic < in_shape[1]; ++ic) {
+            for (int ic = 0; ic < in_channels; ++ic) {
               const index_t input_idx =
-                  (b * in_shape[1] + ic) * in_img_size + i * in_width + j;
+                  (b * in_channels + ic) * in_img_size + i * in_width + j;
               const float val = input[input_idx];
               const index_t kernel_offset =
-                  (oc * in_shape[1] + ic) * kernel_size;
+                  (oc * in_channels + ic) * kernel_size;
               for (int k = 0; k < kernel_size; ++k) {
                 const index_t out_idx = out_offset + index_map[k];
                 const index_t kernel_idx = kernel_offset + k;
@@ -248,7 +249,7 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
 
     const index_t out_height = out_shape[2];
     const index_t out_width = out_shape[3];
-#pragma omp parallel for
+#pragma omp parallel for collapse(3)
     for (int i = 0; i < batch; ++i) {
       for (int j = 0; j < channel; ++j) {
         for (int k = 0; k < out_height; ++k) {
@@ -324,7 +325,6 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
                "Input/Output batch size mismatch");
     std::function<void(const float *input,
                        const float *filter,
-                       const float *bias,
                        const index_t *in_shape,
                        const index_t *out_shape,
                        float *output)> deconv_func;
@@ -354,6 +354,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     scratch->Rewind();
     scratch->GrowSize(padded_out_size);
     Tensor padded_out(scratch->Scratch(padded_out_size), DT_FLOAT);
+    padded_out.Reshape(padded_out_shape);
+    padded_out.Clear();
     auto *padded_out_data = padded_out.mutable_data<float>();
 
     bool use_neon_3x3_s1 = kernel_h == kernel_w && kernel_h == 3 &&
@@ -369,13 +371,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     if (use_neon_3x3_s1) {
       deconv_func = [=](const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dNeonK3x3S1(input,
                            filter,
-                           bias,
                            in_shape,
                            padded_out_shape,
                            padded_output);
@@ -383,13 +383,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     } else if (use_neon_3x3_s2) {
       deconv_func = [=](const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dNeonK3x3S2(input,
                            filter,
-                           bias,
                            in_shape,
                            padded_out_shape,
                            padded_output);
@@ -397,13 +395,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     } else if (use_neon_4x4_s1) {
       deconv_func = [=](const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dNeonK4x4S1(input,
                            filter,
-                           bias,
                            in_shape,
                            padded_out_shape,
                            padded_output);
@@ -411,13 +407,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     } else if (use_neon_4x4_s2) {
       deconv_func = [=](const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dNeonK4x4S2(input,
                            filter,
-                           bias,
                            in_shape,
                            padded_out_shape,
                            padded_output);
@@ -425,13 +419,11 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     } else {
       deconv_func = [=](const float *input,
                         const float *filter,
-                        const float *bias,
                         const index_t *in_shape,
                         const index_t *padded_out_shape,
                         float *padded_output) {
         Deconv2dGeneral(input,
                         filter,
-                        bias,
                         kernel_h,
                         kernel_w,
                         strides_.data(),
@@ -444,9 +436,24 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
     bool no_pad =
         padded_out_h == output_shape[2] && padded_out_w == output_shape[3];
     float *out_data = no_pad ? output_data : padded_out_data;
+
+    if (bias_data != nullptr) {
+      const index_t batch = output_shape[0];
+      const index_t channels = output_shape[1];
+      const index_t img_size = output_shape[2] * output_shape[3];
+#pragma omp parallel for collapse(3)
+      for (index_t b = 0; b < batch; ++b) {
+        for (index_t c = 0; c < channels; ++c) {
+          for (index_t i = 0; i < img_size; ++i) {
+            output_data[(b * channels + c) * img_size + i] +=
+                bias_data[c];
+          }
+        }
+      }
+    }
+
     deconv_func(input_data,
                 filter_data,
-                bias_data,
                 in_shape,
                 padded_out_shape.data(),
                 out_data);
@@ -459,6 +466,8 @@ struct Deconv2dFunctor<DeviceType::CPU, float>: Deconv2dFunctorBase {
                  output_data);
     }
 
+
+
     DoActivation<float>(output_data,
                  output_data,
                  output->size(),
diff --git a/mace/ops/deconv_2d_benchmark.cc b/mace/ops/deconv_2d_benchmark.cc
index 9835bc4d1f9c2f57903980d31415317657abff82..269d90865fc091032f0a9f1f3316653379c8827a 100644
--- a/mace/ops/deconv_2d_benchmark.cc
+++ b/mace/ops/deconv_2d_benchmark.cc
@@ -120,15 +120,12 @@ static void Deconv2d(int iters,
   MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, float, GPU); \
   MACE_BM_DECONV_2D_MACRO(N, C, H, W, KH, KW, S, OH, OW, P, OC, half, GPU);
 
-MACE_BM_DECONV_2D(1, 128, 15, 15, 1, 1, 1, 15, 15, VALID, 256);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 1, 1, 1, 60, 60, VALID, 128);
 
 MACE_BM_DECONV_2D(1, 128, 60, 60, 3, 3, 1, 62, 62, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 3, 3, 1, 60, 60, SAME, 32);
 
-MACE_BM_DECONV_2D(1, 128, 60, 60, 4, 4, 1, 63, 63, VALID, 128);
 MACE_BM_DECONV_2D(1, 32, 60, 60, 4, 4, 1, 60, 60, SAME, 32);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 448, 448, SAME, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 4, 4, 2, 450, 450, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 512, 512, 7, 7, 2, 1023, 1023, SAME, 32);
 MACE_BM_DECONV_2D(1, 128, 16, 16, 5, 5, 1, 20, 20, VALID, 32);
@@ -138,10 +135,10 @@ MACE_BM_DECONV_2D(1, 3, 480, 480, 1, 1, 1, 480, 480, VALID, 3);
 
 MACE_BM_DECONV_2D(1, 64, 32, 32, 1, 1, 1, 32, 32, VALID, 128);
 MACE_BM_DECONV_2D(1, 64, 33, 32, 3, 3, 2, 65, 63, SAME, 128);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 447, 447, SAME, 32);
-MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 449, 449, VALID, 32);
 MACE_BM_DECONV_2D(1, 3, 224, 224, 3, 3, 2, 448, 448, SAME, 32);
 
+MACE_BM_DECONV_2D(1, 32, 1014, 762, 9, 9, 2, 2035, 1531, VALID, 1);
+
 }  // namespace test
 }  // namespace ops
 }  // namespace mace