diff --git a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
index a617e7c7f5f534d8bb765529c28524c1807b96ea..0a12bcc484a56e55132a1f240247bb784f047b76 100644
--- a/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
+++ b/mace/core/runtime/hexagon/hexagon_dsp_wrapper.cc
@@ -432,7 +432,7 @@ bool HexagonDSPWrapper::ExecuteGraph(const Tensor &input_tensor,
   }
   MACE_CHECK(output_bytes == output_tensor->raw_size(),
              "wrong output bytes inferred.");
-  return res == 0;
+  return true;
 }
 
 bool HexagonDSPWrapper::ExecuteGraphNew(
@@ -495,6 +495,7 @@ bool HexagonDSPWrapper::ExecuteGraphNew(
                                    num_inputs * kNumMetaData,
                                    outputs.data(),
                                    num_outputs * kNumMetaData);
+  MACE_CHECK(res == 0, "execute error");
 
   // handle hexagon output
   for (size_t i = 0; i < num_outputs; ++i) {
@@ -504,12 +505,12 @@ bool HexagonDSPWrapper::ExecuteGraphNew(
                                         outputs[index].depth};
     MACE_CHECK(output_shape.size() == output_info_[i].shape.size(),
                output_shape.size(), " vs ", output_info_[i].shape.size(),
-               "wrong output shape inferred");
+               " wrong output shape inferred");
     for (size_t j = 0; j < output_shape.size(); ++j) {
       MACE_CHECK(static_cast<index_t>(output_shape[j])
                      == output_info_[i].shape[j],
                  output_shape[j], " vs ", output_info_[i].shape[j],
-                 "wrong output shape inferred");
+                 " wrong output shape[", j, "] inferred");
     }
     auto output_tensor = output_tensors->at(output_info_[i].name);
     MACE_CHECK(static_cast<index_t>(outputs[index].data_valid_len)
@@ -518,7 +519,7 @@ bool HexagonDSPWrapper::ExecuteGraphNew(
                " wrong output bytes inferred.");
   }
 
-  return res == 0;
+  return true;
 }
 
 }  // namespace mace
diff --git a/mace/ops/common/conv_pool_2d_util.cc b/mace/ops/common/conv_pool_2d_util.cc
index 4398888174675cb202cccefcf4cb374b97925aca..072efaa7db51f707bd08aa877f7cdd3b15028964 100644
--- a/mace/ops/common/conv_pool_2d_util.cc
+++ b/mace/ops/common/conv_pool_2d_util.cc
@@ -76,7 +76,8 @@ void CalcPaddingAndOutputSize(const index_t *input_shape,
       output_height = (input_height - k_extent_height) / strides[0] + 1;
       output_width = (input_width - k_extent_width) / strides[1] + 1;
       break;
-    case SAME:output_height = (input_height - 1) / strides[0] + 1;
+    case SAME:
+      output_height = (input_height - 1) / strides[0] + 1;
       output_width = (input_width - 1) / strides[1] + 1;
       break;
     case FULL:
diff --git a/mace/ops/opencl/cl/conv_2d.cl b/mace/ops/opencl/cl/conv_2d.cl
index 4a3d9e0dd894423b44cfa4335e05f95b0c4ed21f..2bf4572b85184b0056d7a9d113953d13b43475e0 100644
--- a/mace/ops/opencl/cl/conv_2d.cl
+++ b/mace/ops/opencl/cl/conv_2d.cl
@@ -17,7 +17,8 @@ __kernel void conv_2d(OUT_OF_RANGE_PARAMS
                       __private const int out_width,
                       __private const int filter_height,
                       __private const int filter_width,
-                      __private const int stride,
+                      __private const int stride_h,
+                      __private const int stride_w,
                       __private const int padding_top,
                       __private const int padding_left,
                       __private const int dilation_h,
@@ -47,12 +48,12 @@ __kernel void conv_2d(OUT_OF_RANGE_PARAMS
   DATA_TYPE4 out3 = 0;
 #endif
 
-  int in_width_stride = mul24(out_w_blks, stride);
-  int in_width0 = mad24(out_w_blk, stride, -padding_left);
+  int in_width_stride = mul24(out_w_blks, stride_w);
+  int in_width0 = mad24(out_w_blk, stride_w, -padding_left);
   int in_width1 = in_width0 + in_width_stride;
   int in_width2 = in_width1 + in_width_stride;
   int in_width3 = in_width2 + in_width_stride;
-  const int height_start = mad24((out_hb % out_height), stride, -padding_top);
+  const int height_start = mad24((out_hb % out_height), stride_h, -padding_top);
   int in_height_gap = select(
       0,
       (-height_start + dilation_h - 1) / dilation_h,
diff --git a/mace/ops/opencl/cl/conv_2d_1x1.cl b/mace/ops/opencl/cl/conv_2d_1x1.cl
index d0dc2e15f02877a25c8f0fa6a868a034d59513b4..1d146028133118015565259b7296455490cd6434 100644
--- a/mace/ops/opencl/cl/conv_2d_1x1.cl
+++ b/mace/ops/opencl/cl/conv_2d_1x1.cl
@@ -15,7 +15,8 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
                           __private const int in_ch_blks,
                           __private const int height,
                           __private const int width,
-                          __private const int stride) {
+                          __private const int stride_h,
+                          __private const int stride_w) {
   const int out_ch_blk = get_global_id(0);
   const int out_w_blk = get_global_id(1);
   const int out_hb = get_global_id(2);
@@ -41,14 +42,14 @@ __kernel void conv_2d_1x1(OUT_OF_RANGE_PARAMS
 #endif
 
   int4 w;
-  int in_width_stride = mul24(out_w_blks, stride);
-  w.x = mul24(out_w_blk, stride);
+  int in_width_stride = mul24(out_w_blks, stride_w);
+  w.x = mul24(out_w_blk, stride_w);
   w.y = w.x + in_width_stride;
   w.z = w.y + in_width_stride;
   w.w = w.z + in_width_stride;
   int batch = out_hb / height;
   int h_idx = out_hb - mul24(batch, height);
-  int out_hb_idx = mul24(h_idx, stride);
+  int out_hb_idx = mul24(h_idx, stride_h);
 
   w.x = select(w.x, INT_MIN, w.x >= in_width);
   w.y = select(w.y, INT_MIN, w.y >= in_width);
diff --git a/mace/ops/opencl/cl/conv_2d_3x3.cl b/mace/ops/opencl/cl/conv_2d_3x3.cl
index aeb8533290f1e3662ead4cf227e6315f967d3bc4..0c37cd8040cc8eec243d133fd9ce79bef344bf64 100644
--- a/mace/ops/opencl/cl/conv_2d_3x3.cl
+++ b/mace/ops/opencl/cl/conv_2d_3x3.cl
@@ -15,7 +15,8 @@ __kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
                           __private const int in_ch_blks,
                           __private const int out_height,
                           __private const int out_width,
-                          __private const int stride,
+                          __private const int stride_h,
+                          __private const int stride_w,
                           __private const int padding_top,
                           __private const int padding_left,
                           __private const int dilation_h,
@@ -47,13 +48,13 @@ __kernel void conv_2d_3x3(OUT_OF_RANGE_PARAMS
   DATA_TYPE4 out4 = 0;
 #endif
 
-  int in_width_stride = mul24(out_w_blks, stride);
-  int in_width0 = mad24(out_w_blk, stride, -padding_left);
+  int in_width_stride = mul24(out_w_blks, stride_w);
+  int in_width0 = mad24(out_w_blk, stride_w, -padding_left);
   int in_width1 = in_width0 + in_width_stride;
   int in_width2 = in_width1 + in_width_stride;
   int in_width3 = in_width2 + in_width_stride;
   int in_width4 = in_width3 + in_width_stride;
-  const int height_start = mad24((out_hb % out_height), stride, -padding_top);
+  const int height_start = mad24((out_hb % out_height), stride_h, -padding_top);
   int in_height_gap = select(
       0,
       (-height_start + dilation_h - 1) / dilation_h,
diff --git a/mace/ops/opencl/image/conv_2d.cc b/mace/ops/opencl/image/conv_2d.cc
index 20c101a2410eb11c1a29fbe7f9aa4cfefda9511f..60a9b15537356435a32b8ff5404091e18f471b43 100644
--- a/mace/ops/opencl/image/conv_2d.cc
+++ b/mace/ops/opencl/image/conv_2d.cc
@@ -73,8 +73,7 @@ MaceStatus Conv2dKernel::Compute(
     Tensor *output) {
   index_t kernel_h = filter->dim(2);
   index_t kernel_w = filter->dim(3);
-  if (strides[0] != strides[1] ||
-      (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1))) {
+  if (dilations[0] > 1 && (strides[0] > 1 || kernel_h == 1)) {
     LOG(WARNING) << "OpenCL conv2d kernel with "
                  << "filter" << kernel_h << "x" << kernel_w << ","
                  << " stride " << strides[0] << "x" << strides[1]
@@ -131,6 +130,7 @@ MaceStatus Conv2dKernel::Compute(
         filter,
         bias,
         strides[0],
+        strides[1],
         paddings.data(),
         dilations,
         activation,
@@ -148,6 +148,7 @@ MaceStatus Conv2dKernel::Compute(
         filter,
         bias,
         strides[0],
+        strides[1],
         paddings.data(),
         dilations,
         activation,
@@ -165,6 +166,7 @@ MaceStatus Conv2dKernel::Compute(
         filter,
         bias,
         strides[0],
+        strides[1],
         paddings.data(),
         dilations,
         activation,
diff --git a/mace/ops/opencl/image/conv_2d.h b/mace/ops/opencl/image/conv_2d.h
index 84fae55dff77afef2056f3f8e1628413a73e0bc2..a1ee3301b373d43980d38b1fd38fb7876c5c47d2 100644
--- a/mace/ops/opencl/image/conv_2d.h
+++ b/mace/ops/opencl/image/conv_2d.h
@@ -33,7 +33,8 @@ extern MaceStatus Conv2dK1x1(OpContext *context,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
-                             const int stride,
+                             const int stride_h,
+                             const int stride_w,
                              const int *padding,
                              const int *dilations,
                              const ActivationType activation,
@@ -48,7 +49,8 @@ extern MaceStatus Conv2dK3x3(OpContext *context,
                              const Tensor *input,
                              const Tensor *filter,
                              const Tensor *bias,
-                             const int stride,
+                             const int stride_h,
+                             const int stride_w,
                              const int *padding,
                              const int *dilations,
                              const ActivationType activation,
@@ -63,7 +65,8 @@ extern MaceStatus Conv2d(OpContext *context,
                          const Tensor *input,
                          const Tensor *filter,
                          const Tensor *bias,
-                         const int stride,
+                         const int stride_h,
+                         const int stride_w,
                          const int *padding,
                          const int *dilations,
                          const ActivationType activation,
diff --git a/mace/ops/opencl/image/conv_2d_1x1.cc b/mace/ops/opencl/image/conv_2d_1x1.cc
index 460d01323dccd584b880f2cdc27b5d2e4c2735fe..718240152f6a74e0835c75edbae7782a1fb8c23f 100644
--- a/mace/ops/opencl/image/conv_2d_1x1.cc
+++ b/mace/ops/opencl/image/conv_2d_1x1.cc
@@ -71,7 +71,8 @@ MaceStatus Conv2dK1x1(OpContext *context,
                       const Tensor *input,
                       const Tensor *filter,
                       const Tensor *bias,
-                      const int stride,
+                      const int stride_h,
+                      const int stride_w,
                       const int *padding,
                       const int *dilations,
                       const ActivationType activation,
@@ -170,7 +171,8 @@ MaceStatus Conv2dK1x1(OpContext *context,
     kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
     kernel->setArg(idx++, static_cast<int>(height));
     kernel->setArg(idx++, static_cast<int>(width));
-    kernel->setArg(idx++, stride);
+    kernel->setArg(idx++, stride_h);
+    kernel->setArg(idx++, stride_w);
 
     *prev_input_shape = input->shape();
   }
diff --git a/mace/ops/opencl/image/conv_2d_3x3.cc b/mace/ops/opencl/image/conv_2d_3x3.cc
index a3bd170f64079a5b4533dd2a4fb104dbee752cfd..d8a8b9cfbba611e5d0a320e8708f7f08c0a2b844 100644
--- a/mace/ops/opencl/image/conv_2d_3x3.cc
+++ b/mace/ops/opencl/image/conv_2d_3x3.cc
@@ -64,7 +64,8 @@ MaceStatus Conv2dK3x3(OpContext *context,
                       const Tensor *input,
                       const Tensor *filter,
                       const Tensor *bias,
-                      const int stride,
+                      const int stride_h,
+                      const int stride_w,
                       const int *padding,
                       const int *dilations,
                       const ActivationType activation,
@@ -154,7 +155,8 @@ MaceStatus Conv2dK3x3(OpContext *context,
     kernel->setArg(idx++, static_cast<int>(input_channel_blocks));
     kernel->setArg(idx++, static_cast<int>(height));
     kernel->setArg(idx++, static_cast<int>(width));
-    kernel->setArg(idx++, stride);
+    kernel->setArg(idx++, stride_h);
+    kernel->setArg(idx++, stride_w);
     kernel->setArg(idx++, padding[0] / 2);
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
diff --git a/mace/ops/opencl/image/conv_2d_general.cc b/mace/ops/opencl/image/conv_2d_general.cc
index e1979c03a715a8ec0a74bf26d35e3f34484d0c55..bf4baea72da0e0776eb3223e71650b475c994906 100644
--- a/mace/ops/opencl/image/conv_2d_general.cc
+++ b/mace/ops/opencl/image/conv_2d_general.cc
@@ -72,7 +72,8 @@ MaceStatus Conv2d(OpContext *context,
                   const Tensor *input,
                   const Tensor *filter,
                   const Tensor *bias,
-                  const int stride,
+                  const int stride_h,
+                  const int stride_w,
                   const int *padding,
                   const int *dilations,
                   const ActivationType activation,
@@ -164,7 +165,8 @@ MaceStatus Conv2d(OpContext *context,
     kernel->setArg(idx++, static_cast<int>(width));
     kernel->setArg(idx++, static_cast<int>(filter->dim(2)));
     kernel->setArg(idx++, static_cast<int>(filter->dim(3)));
-    kernel->setArg(idx++, static_cast<int>(stride));
+    kernel->setArg(idx++, static_cast<int>(stride_h));
+    kernel->setArg(idx++, static_cast<int>(stride_w));
     kernel->setArg(idx++, padding[0] / 2);
     kernel->setArg(idx++, padding[1] / 2);
     kernel->setArg(idx++, dilations[0]);
diff --git a/test/ccunit/mace/ops/conv_2d_test.cc b/test/ccunit/mace/ops/conv_2d_test.cc
index 3f97d0d2b70496962af3ac0d53501f6c8026df52..f9823265082b8e917551ba67923679110e9f5c8a 100644
--- a/test/ccunit/mace/ops/conv_2d_test.cc
+++ b/test/ccunit/mace/ops/conv_2d_test.cc
@@ -42,7 +42,8 @@ void TestNHWCSimple3x3VALID(int wino_blk_size = 0) {
   net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
   net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}, true);
 
   const std::vector<index_t> output_shape = {1, 1, 1, 1};
@@ -100,7 +101,8 @@ void TestNHWCSimple3x3SAME(int wino_blk_size = 0) {
   net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
   net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}, true);
 
   const std::vector<index_t> output_shape = {1, 3, 3, 1};
@@ -149,6 +151,65 @@
     ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
   }
 }
+
+template <DeviceType D, typename T>
+void TestNHWCSimple3x3NeqStride(int wino_blk_size = 0) {
+  OpsTestNet net;
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 3, 2},
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<D, float>(
+      "Filter", {1, 2, 3, 3},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
+  net.AddInputFromArray<D, float>("Bias", {1}, {0.1f}, true);
+  const std::vector<index_t> output_shape = {1, 3, 2, 1};
+
+  if (D == DeviceType::CPU) {
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("InputNCHW")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("OutputNCHW")
+        .AddIntsArg("strides", {1, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
+  } else if (D == DeviceType::GPU) {
+    OpDefBuilder("Conv2D", "Conv2dTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .OutputShape(output_shape)
+        .AddIntsArg("strides", {1, 2})
+        .AddIntArg("padding", Padding::SAME)
+        .AddIntsArg("dilations", {1, 1})
+        .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+        .AddIntArg("wino_block_size", wino_blk_size)
+        .Finalize(net.NewOperatorDef());
+
+    net.RunOp(D);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  auto expected = net.CreateTensor<float>(
+      output_shape, {8.1f, 8.1f, 12.1f, 12.1f, 8.1f, 8.1f});
+  if (DataTypeToEnum<T>::value == DataType::DT_FLOAT) {
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+  } else {
+    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-3, 1e-3);
+  }
+}
 }  // namespace
 
 TEST_F(Conv2dOpTest, CPUSimple) {
@@ -171,7 +232,12 @@ TEST_F(Conv2dOpTest, OPENCLSimpleWinograd) {
   TestNHWCSimple3x3VALID<DeviceType::GPU, float>(2);
   TestNHWCSimple3x3VALID<DeviceType::GPU, half>(2);
   // TODO(liutuo): the precision of the last value is not normal.
-//  TestNHWCSimple3x3SAME<DeviceType::GPU, half>(4);
+  // TestNHWCSimple3x3SAME<DeviceType::GPU, half>(4);
+}
+
+TEST_F(Conv2dOpTest, NotEqualStrideSimple) {
+  TestNHWCSimple3x3NeqStride<DeviceType::CPU, float>();
+  TestNHWCSimple3x3NeqStride<DeviceType::GPU, float>();
 }
 
 namespace {
@@ -186,7 +252,8 @@ void TestNHWCSimple3x3WithoutBias() {
   net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
 
   if (D == DeviceType::CPU) {
     net.TransformDataFormat<DeviceType::CPU, float>(
@@ -316,7 +383,8 @@ void TestFusedNHWCSimple3x3VALID(int wino_blk_size = 0) {
   net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
   net.AddInputFromArray<D, float>("Bias", {1}, {-0.1f}, true);
 
   const std::vector<index_t> output_shape = {1, 1, 1, 1};
@@ -372,7 +440,8 @@ void TestFusedNHWCSimple3x3WithoutBias(int wino_blk_size = 0) {
   net.AddInputFromArray<D, float>(
       "Filter", {1, 2, 3, 3},
       {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
-       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f}, true);
+       1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f},
+      true);
 
   const std::vector<index_t> output_shape = {1, 1, 1, 1};
   if (D == DeviceType::CPU) {
@@ -502,17 +571,87 @@ void TestConv1x1() {
   ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
 }
+
+template <DeviceType D>
+void TestConv1x1NeqStride() {
+  // Construct graph
+  OpsTestNet net;
+
+  // Add input data
+  net.AddInputFromArray<D, float>(
+      "Input", {1, 3, 10, 5},
+      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  net.AddInputFromArray<D, float>(
+      "Filter", {2, 5, 1, 1},
+      {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f}, true);
+  net.AddInputFromArray<D, float>("Bias", {2}, {0.1f, 0.2f}, true);
+
+  if (D == DeviceType::CPU) {
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("InputNCHW")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("OutputNCHW")
+        .AddIntsArg("strides", {1, 2})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+    net.TransformDataFormat<DeviceType::CPU, float>(
+        "OutputNCHW", DataFormat::NCHW, "Output", DataFormat::NHWC);
+  } else if (D == DeviceType::GPU) {
+    OpDefBuilder("Conv2D", "Conv2DTest")
+        .Input("Input")
+        .Input("Filter")
+        .Input("Bias")
+        .Output("Output")
+        .AddIntsArg("strides", {1, 2})
+        .AddIntArg("padding", Padding::VALID)
+        .AddIntsArg("dilations", {1, 1})
+        .Finalize(net.NewOperatorDef());
+    // Run
+    net.RunOp(D);
+  } else {
+    MACE_NOT_IMPLEMENTED;
+  }
+
+  // Check
+  auto expected = net.CreateTensor<float>(
+      {1, 3, 5, 2},
+      {5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
+       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f,
+       5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f, 5.1f, 10.2f});
+
+  ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);
+}
 }  // namespace
 
 TEST_F(Conv2dOpTest, CPUConv1x1) { TestConv1x1<DeviceType::CPU>(); }
 
 TEST_F(Conv2dOpTest, OPENCLConv1x1) { TestConv1x1<DeviceType::GPU>(); }
 
+TEST_F(Conv2dOpTest, CPUConv1x1NotEqualStride) {
+  TestConv1x1NeqStride<DeviceType::CPU>();
+}
+
+TEST_F(Conv2dOpTest, OPENCLConv1x1NotEqualStride) {
+  TestConv1x1NeqStride<DeviceType::GPU>();
+}
+
 namespace {
 template <DeviceType D, typename T>
-void TestComplexConvNxNS12(const std::vector<index_t> &shape,
-                           const int stride,
-                           const int wino_blk_size = 0) {
+void TestComplexConvNxN(const std::vector<index_t> &shape,
+                        const std::vector<int> strides,
+                        const int wino_blk_size = 0) {
   testing::internal::LogToStderr();
   auto func = [&](int kernel_h, int kernel_w, int stride_h, int stride_w,
                   Padding type) {
@@ -581,35 +720,40 @@ void TestComplexConvNxNS12(const std::vector<index_t> &shape,
   };
 
   for (int kernel_size : {1, 3, 5, 7}) {
-    func(kernel_size, kernel_size, stride, stride, VALID);
-    func(kernel_size, kernel_size, stride, stride, SAME);
+    func(kernel_size, kernel_size, strides[0], strides[1], VALID);
+    func(kernel_size, kernel_size, strides[0], strides[1], SAME);
   }
 }
 }  // namespace
 
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1);
-  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 2);
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 16, 16, 32}, {1, 1});
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 16, 16, 32}, {2, 2});
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedConvNxNWinograd) {
-  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1, 2);
-  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 16, 16, 32}, 1, 4);
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 16, 16, 32}, {1, 1}, 2);
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 16, 16, 32}, {1, 1}, 4);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS12) {
-  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1);
-  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 2);
+  TestComplexConvNxN<DeviceType::GPU, float>({17, 113, 5, 7}, {1, 1});
+  TestComplexConvNxN<DeviceType::GPU, float>({17, 113, 5, 7}, {2, 2});
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNWinograd) {
-  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1, 4);
-  TestComplexConvNxNS12<DeviceType::GPU, float>({17, 113, 5, 7}, 1, 2);
+  TestComplexConvNxN<DeviceType::GPU, float>({17, 113, 5, 7}, {1, 1}, 4);
+  TestComplexConvNxN<DeviceType::GPU, float>({17, 113, 5, 7}, {1, 1}, 2);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNS34) {
-  TestComplexConvNxNS12<DeviceType::GPU, float>({31, 113, 13, 17}, 3);
-  TestComplexConvNxNS12<DeviceType::GPU, float>({32, 32, 13, 17}, 4);
+  TestComplexConvNxN<DeviceType::GPU, float>({31, 113, 13, 17}, {3, 3});
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 32, 13, 17}, {4, 4});
+}
+
+TEST_F(Conv2dOpTest, OPENCLUnalignedConvNxNWithNotEqualStride) {
+  TestComplexConvNxN<DeviceType::GPU, float>({31, 113, 13, 17}, {1, 2});
+  TestComplexConvNxN<DeviceType::GPU, float>({32, 32, 13, 17}, {3, 4});
 }
 
 namespace {
@@ -646,9 +790,7 @@ void TestHalfComplexConvNxNS12(const std::vector<index_t> &input_shape,
   net.AddInputFromArray<DeviceType::GPU, float>(
       "Filter", {output_channels, input_channels, kernel_h, kernel_w},
       float_filter_data, true);
-  net.AddInputFromArray<DeviceType::GPU, float>("Bias",
-                                                {output_channels},
-                                                float_bias_data,
+  net.AddInputFromArray<DeviceType::GPU, float>("Bias", {output_channels}, float_bias_data,
                                                 true);
 
   net.TransformDataFormat<DeviceType::CPU, float>(
@@ -713,11 +855,11 @@ TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3S12) {
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv3x3Winograd) {
-  TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64},
-                            {1, 1}, 2);
-// TODO(liutuo) : the precision error is large.
-// TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64},
-//                           {1, 1}, 4);
+  TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64}, {1, 1},
+                            2);
+  // TODO(liutuo) : the precision error is large.
+  // TestHalfComplexConvNxNS12({32, 32}, {3, 3, 32, 64},
+  //                           {1, 1}, 4);
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfAlignedConv5x5S12) {
@@ -769,10 +911,10 @@ TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3S12) {
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfUnalignedConv3x3Winograd) {
   // TODO(liutuo) : the precision error is large.
-//  TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7},
-//                            {1, 1}, 4);
-  TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7},
-                            {1, 1}, 2);
+  // TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7},
+  //                           {1, 1}, 4);
+  TestHalfComplexConvNxNS12({107, 113}, {3, 3, 5, 7}, {1, 1},
+                            2);
 }
 
 TEST_F(Conv2dOpTest, OPENCLHalfConv5x5Dilation2) {
@@ -1045,10 +1187,10 @@ TEST_F(Conv2dOpTest, OPENCLAlignedPad2) {
 }
 
 TEST_F(Conv2dOpTest, OPENCLAlignedPad2Winograd) {
-  TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
-                                                  {2, 2}, 2);
-  TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16},
-                                                  {2, 2}, 4);
+  TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, {2, 2},
+                                                  2);
+  TestArbitraryPadConvNxN<DeviceType::GPU, float>({128, 128, 16, 16}, {2, 2},
+                                                  4);
 }
 
 TEST_F(Conv2dOpTest, OPENCLUnalignedPad4) {
@@ -1071,15 +1213,15 @@ void TestQuantSimple3x3() {
 
   // Add input data
   net.AddInputFromArray<DeviceType::CPU, uint8_t>(
-      "Filter", {1, 3, 3, 2},
-      {102, 150, 123, 135, 1, 216, 137, 47, 53, 75, 145, 130, 171, 62, 255,
-       122, 72, 211}, true, 0.0226, 127);
+      "Filter", {1, 3, 3, 2}, {102, 150, 123, 135, 1, 216, 137, 47, 53, 75, 145,
+                               130, 171, 62, 255, 122, 72, 211},
+      true, 0.0226, 127);
   net.AddInputFromArray<DeviceType::CPU, uint8_t>(
-      "Input", {1, 3, 3, 2},
-      {1, 75, 117, 161, 127, 119, 94, 151, 203, 151, 84, 61, 55, 142, 113, 139,
-       3, 255}, false, 0.0204, 93);
-  net.AddInputFromArray<DeviceType::CPU, int32_t>(
-      "Bias", {1}, {2}, true, 0.00046104, 0);
+      "Input", {1, 3, 3, 2}, {1, 75, 117, 161, 127, 119, 94, 151, 203, 151, 84,
+                              61, 55, 142, 113, 139, 3, 255},
+      false, 0.0204, 93);
+  net.AddInputFromArray<DeviceType::CPU, int32_t>("Bias", {1}, {2}, true,
+                                                  0.00046104, 0);
 
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
@@ -1113,17 +1255,15 @@ void TestQuant(const index_t batch,
                enum Padding padding_type,
                const std::vector<int> &strides) {
   OpsTestNet net;
-  net.AddRandomInput<DeviceType::CPU, float>("Input", {batch, in_height, in_width,
-                                                       in_channels});
-  net.AddRandomInput<DeviceType::CPU, float>("Filter", {out_channels, k_height, k_width,
-                                                        in_channels}, true);
+  net.AddRandomInput<DeviceType::CPU, float>("Input",
+                                             {batch, in_height, in_width, in_channels});
+  net.AddRandomInput<DeviceType::CPU, float>(
+      "Filter", {out_channels, k_height, k_width, in_channels}, true);
   net.AddRandomInput<DeviceType::CPU, float>("Bias", {out_channels}, true);
   net.TransformDataFormat<DeviceType::CPU, float>(
       "Input", DataFormat::NHWC, "InputNCHW", DataFormat::NCHW);
-  net.TransformFilterDataFormat<DeviceType::CPU, float>("Filter",
-                                                        DataFormat::OHWI,
-                                                        "FilterOIHW",
-                                                        DataFormat::OIHW);
+  net.TransformFilterDataFormat<DeviceType::CPU, float>(
+      "Filter", DataFormat::OHWI, "FilterOIHW", DataFormat::OIHW);
 
   OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("InputNCHW")
@@ -1172,10 +1312,10 @@
   auto bias_data = bias->data<float>();
   float bias_scale = q_input->scale() * q_filter->scale();
   std::vector<int32_t> q_bias(bias->size());
-  QuantizeUtil<float, int32_t>
-      quantize_util(OpTestContext::Get()->thread_pool());
-  quantize_util.QuantizeWithScaleAndZeropoint(
-      bias_data, bias->size(), bias_scale, 0, q_bias.data());
+  QuantizeUtil<float, int32_t> quantize_util(
+      OpTestContext::Get()->thread_pool());
+  quantize_util.QuantizeWithScaleAndZeropoint(bias_data, bias->size(),
+                                              bias_scale, 0, q_bias.data());
   net.AddInputFromArray<DeviceType::CPU, int32_t>(
       "QuantizedBias", {out_channels}, q_bias, true, bias_scale, 0);
diff --git a/tools/python/transform/base_converter.py b/tools/python/transform/base_converter.py
index 11a15b2a5ad0612cc8508f53bdb6ff5fa077cfb4..0d6a89debeacbe7b7b6986145480d98148ebcd35 100644
--- a/tools/python/transform/base_converter.py
+++ b/tools/python/transform/base_converter.py
@@ -180,7 +180,6 @@ MaceTransposableDataFormatOps = [MaceOp.Activation,
                                  MaceOp.Eltwise,
                                  MaceOp.Pad,
                                  MaceOp.Reduce,
-                                 MaceOp.Reshape,
                                  MaceOp.Softmax,
                                  MaceOp.Split,
                                  MaceOp.Squeeze,
diff --git a/tools/python/transform/hexagon_converter.py b/tools/python/transform/hexagon_converter.py
index 900c0ba1f3669d6c2a1bc4899362f963ddd2dae4..d8fcc1f77bbb442cc5b2d7bca9d471a5271b7c9b 100644
--- a/tools/python/transform/hexagon_converter.py
+++ b/tools/python/transform/hexagon_converter.py
@@ -121,9 +121,9 @@ class HexagonConverter(base_converter.ConverterInterface):
         # convert op node
         self.convert_ops()
 
-        self.convert_input_output_node()
+        model_inputs = self.convert_input_output_node()
 
-        self.add_node_id()
+        self.add_node_id(model_inputs)
 
         return self._model
 
@@ -235,8 +235,11 @@ class HexagonConverter(base_converter.ConverterInterface):
         for input_node in self._option.input_nodes.values():
             op_name = normalize_name(
                 MaceKeyword.mace_input_node_name + '_' + input_node.name)
-            op = first_quantize_input_op \
-                if op_name == first_quantize_input_op.name else ops[op_name]
+            if op_name == first_quantize_input_op.name:
+                op = first_quantize_input_op
+                quantize_input_op.name = MaceKeyword.mace_input_node_name
+            else:
+                op = ops[op_name]
             mace_check(op.type == HexagonOp.QuantizeINPUT_f_to_8.name,
                        "input node type is: %s" % op.type)
             quantize_input_op.output.extend(op.output)
@@ -276,7 +279,9 @@ class HexagonConverter(base_converter.ConverterInterface):
             dequantize_output_op.type = HexagonOp.OUTPUT.name
             del dequantize_output_op.input[1:]
 
-    def add_node_id(self):
+        return quantize_input_op.output
+
+    def add_node_id(self, model_inputs):
         node_id_counter = 0
         node_id_map = {}
         for tensor in self._model.tensors:
@@ -305,7 +310,11 @@ class HexagonConverter(base_converter.ConverterInterface):
                 node_id = node_id_map[tensor_name]
                 node_input = op.node_input.add()
                 node_input.node_id = node_id
-                node_input.output_port = int(port)
+                if tensor_name in model_inputs:
+                    for i in range(len(model_inputs)):
+                        if model_inputs[i] == tensor_name:
+                            port += i * 3
+                node_input.output_port = port
 
     def convert_ops(self):
         print("Convert mace graph to hexagon.")
diff --git a/tools/python/transform/tensorflow_converter.py b/tools/python/transform/tensorflow_converter.py
index 48b475bbf9f1abd35f25b4f9fa5e44e001cec5e3..38672a97e01b7d41b253744ad03046705506ae25 100644
--- a/tools/python/transform/tensorflow_converter.py
+++ b/tools/python/transform/tensorflow_converter.py
@@ -1075,6 +1075,7 @@ class TensorflowConverter(base_converter.ConverterInterface):
         if tf_op.type == TFOpType.FakeQuantWithMinMaxVars.name:
             self._skip_tensor.add(tf_op.inputs[1].name)
             self._skip_tensor.add(tf_op.inputs[2].name)
+            del op.input[1:]
 
     def convert_cumsum(self, tf_op):
         op = self.convert_general_op(tf_op)
diff --git a/tools/python/transform/transformer.py b/tools/python/transform/transformer.py
index 5344e8d5f73217465196526054b676d193940548..4c56b233f0f9395588d471168cdf4b350cc603c7 100644
--- a/tools/python/transform/transformer.py
+++ b/tools/python/transform/transformer.py
@@ -1734,16 +1734,18 @@ class Transformer(base_converter.ConverterInterface):
         for op in net.op:
             if op.type == 'FakeQuantWithMinMaxVars' or \
                     op.type == 'FakeQuantWithMinMaxArgs':
-                producer_op = self._producer[op.input[0]]
-                minval = ConverterUtil.get_arg(op, 'min').f
-                maxval = ConverterUtil.get_arg(op, 'max').f
-                quantize_info = \
-                    self.add_quantize_info(producer_op, minval, maxval)
-                self._quantize_activation_info[op.input[0]] = quantize_info
-                # for add -> fakequant pattern
-                self._quantize_activation_info[op.output[0]] = quantize_info
-
-                print(op.input[0], op.output[0])
+                if op.input[0] not in self._consts:
+                    producer_op = self._producer[op.input[0]]
+                    minval = ConverterUtil.get_arg(op, 'min').f
+                    maxval = ConverterUtil.get_arg(op, 'max').f
+                    quantize_info = \
+                        self.add_quantize_info(producer_op, minval, maxval)
+                    self._quantize_activation_info[op.input[0]] = quantize_info
+                    # for add -> fakequant pattern
+                    self._quantize_activation_info[op.output[0]] = \
+                        quantize_info
+
+                    print(op.input[0], op.output[0])
                 op.type = MaceOp.Identity.name
 
         return False
@@ -1850,6 +1852,8 @@ class Transformer(base_converter.ConverterInterface):
             quantize_info.scale = scale
             quantize_info.zero_point = zero
             self._quantize_activation_info[new_input_name] = quantize_info
+            input_op = self._producer[input_node.name]
+            input_op.quantize_info.extend([quantize_info])
 
         print("Add default quantize info for ops like Pooling, Softmax")
         for op in self._model.op:
@@ -1904,8 +1908,8 @@ class Transformer(base_converter.ConverterInterface):
             elif (op.type == MaceOp.Eltwise.name
                   and not op.quantize_info
                   and len(op.input) == 2
-                  and len(op.input[0]) not in self._consts
-                  and len(op.input[1]) not in self._consts):
+                  and op.input[0] not in self._consts
+                  and op.input[1] not in self._consts):
                 producer_op0 = self._producer[op.input[0]]
                 producer_op1 = self._producer[op.input[1]]
                 if ConverterUtil.get_arg(
diff --git a/tools/python/utils/util.py b/tools/python/utils/util.py
index d2e25a92c42a8c02a896faae37c1a2229f1d3136..3c8bc80c6d1d1f53ae7bf0fce68fc481b0f454fa 100644
--- a/tools/python/utils/util.py
+++ b/tools/python/utils/util.py
@@ -65,8 +65,8 @@ class MaceLogger:
               + CMDColors.ENDC)
 
     @staticmethod
-    def error(message):
-        print(CMDColors.RED + 'ERROR: ' + get_frame_info() + str(message)
+    def error(message, level=2):
+        print(CMDColors.RED + 'ERROR: ' + get_frame_info(level) + str(message)
               + CMDColors.ENDC)
         exit(1)
 
@@ -76,7 +76,7 @@ def mace_check(condition, message):
         for line in traceback.format_stack():
             print(line.strip())
 
-        MaceLogger.error(message)
+        MaceLogger.error(message, level=3)
 
 
 ################################