diff --git a/mace/core/logging.h b/mace/core/logging.h
index 6a7ea7934b89bbb492fcd4d41c9be61fabbfeda8..f4f427b3e65a98b83917366af4f2698e225b4d45 100644
--- a/mace/core/logging.h
+++ b/mace/core/logging.h
@@ -8,6 +8,7 @@
 #include <limits>
 #include <sstream>
 #include <string>
+#include <vector>
 
 #undef ERROR
 
@@ -41,7 +42,16 @@ template <typename... Args>
 string MakeString(const Args&... args) {
   std::stringstream ss;
   MakeStringInternal(ss, args...);
-  return string(ss.str());
+  return ss.str();
+}
+
+// Formats the elements of |args| joined by ", " (e.g. "1, 2, 3").
+// An empty vector yields an empty string; no trailing separator is emitted.
+template <typename T>
+string MakeString(const std::vector<T> &args) {
+  std::stringstream ss;
+  for (size_t i = 0; i < args.size(); ++i) ss << (i ? ", " : "") << args[i];
+  return ss.str();
 }
 
 // Specializations for already-a-string types.
diff --git a/mace/core/net.cc b/mace/core/net.cc
index c6c48ac6a35aea583535173e58dd3cadd646a50d..e6c20cda1217f060985e59b3f3eb7f02559d5774 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -35,6 +35,8 @@ bool SimpleNet::Run() {
       LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
       return false;
     }
+    VLOG(1) << "Op " << op->debug_def().name()
+            << " has shape: " << internal::MakeString(op->Output(0)->shape());
   }
   return true;
 }
diff --git a/mace/core/tensor.h b/mace/core/tensor.h
index 224c342e1ecfa70491d07615df7c3ac00ba0b056..26626a493ae903689e8fc1a89e0f871b403ace51 100644
--- a/mace/core/tensor.h
+++ b/mace/core/tensor.h
@@ -137,7 +137,7 @@ class Tensor {
     alloc_->CopyBytes(raw_mutable_data(), src, size);
   }
 
-  inline void DebugPrint() {
+  inline void DebugPrint() const {
     std::stringstream os;
     for (int i : shape_) {
       os << i << ", ";
diff --git a/mace/core/workspace.cc b/mace/core/workspace.cc
index 48fa0153307007f7e628b44f65e5459109f04e88..ecd5af3eb4f714021ddf2fc2f3abc64af583bc4a 100644
--- a/mace/core/workspace.cc
+++ b/mace/core/workspace.cc
@@ -53,6 +53,10 @@ Tensor* Workspace::GetTensor(const string& name) {
 void Workspace::LoadModelTensor(const NetDef& net_def, DeviceType type) {
   Serializer serializer;
   for (auto& tensor_proto : net_def.tensors()) {
+
+    VLOG(1) << "Load tensor: " << tensor_proto.name()
+            << " has shape: " << internal::MakeString(vector<index_t>(
+          tensor_proto.dims().begin(), tensor_proto.dims().end()));
     tensor_map_[tensor_proto.name()] =
         serializer.Deserialize(tensor_proto, type);
   }
diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index fbe7953aeb01397fa52bfef48d46cc3abee8b670..fa568684dd61b9789b44ed5a54d6cbd3ec3450db 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -12,19 +12,8 @@ namespace mace {
 namespace kernels {
 
 template<DeviceType D, typename T>
-class Conv2dFunctor {
- public:
-  Conv2dFunctor(const index_t *input_shape,
-                const index_t *filter_shape,
-                const int *strides,
-                const Padding padding,
-                const int *dilations) :
-      strides_(strides),
-      paddings_(2, 0),
-      dilations_(dilations) {
-    CalPaddingSize(input_shape, filter_shape, dilations_, strides_, padding, paddings_.data());
-  }
-
+struct Conv2dFunctor {
+  Conv2dFunctor() : strides_(nullptr), dilations_(nullptr) {}  // owner must set both
   Conv2dFunctor(const int *strides,
                 const std::vector<int> &paddings,
                 const int *dilations) :
@@ -112,7 +101,6 @@ class Conv2dFunctor {
     }
   }
 
- private:
   const int *strides_;    // [stride_h, stride_w]
   std::vector<int> paddings_;   // [padding_h, padding_w]
   const int *dilations_;  // [dilation_h, dilation_w]
diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h
index c9be5c926bdcb9c910a268ed34cb8a6a8deba4ee..276287bb1c70655eaaf655b4276b6988a7a3a5fb 100644
--- a/mace/kernels/depthwise_conv2d.h
+++ b/mace/kernels/depthwise_conv2d.h
@@ -13,18 +13,8 @@ namespace mace {
 namespace kernels {
 
 template<DeviceType D, typename T>
-class DepthwiseConv2dFunctor {
- public:
-  DepthwiseConv2dFunctor(const index_t *input_shape,
-                         const index_t *filter_shape,
-                         const int *strides,
-                         const Padding padding,
-                         const int *dilations) :
-      strides_(strides),
-      paddings_(2, 0),
-      dilations_(dilations) {
-    CalPaddingSize(input_shape, filter_shape, dilations_, strides_, padding, paddings_.data());
-  }
+struct DepthwiseConv2dFunctor {
+  DepthwiseConv2dFunctor() : strides_(nullptr), dilations_(nullptr) {}  // owner must set both
   DepthwiseConv2dFunctor(const int *strides,
                          const std::vector<int> &paddings,
                          const int *dilations) :
@@ -39,7 +29,6 @@ class DepthwiseConv2dFunctor {
                   const T *bias, // c_out
                   T *output, // NCHW
                   const index_t *output_shape) {
-
     MACE_CHECK_NOTNULL(output);
 
     index_t batch = output_shape[0];
@@ -111,7 +100,7 @@ class DepthwiseConv2dFunctor {
       }
     }
   }
- private:
+
   const int *strides_; // [stride_h, stride_w]
   std::vector<int> paddings_;   // [padding_h, padding_w]
   const int *dilations_; // [dilation_h, dilation_w]
diff --git a/mace/kernels/global_avg_pooling.h b/mace/kernels/global_avg_pooling.h
index c339fd41ff3cca59b5af2dea59142bbd9d212ace..ed96c66b9440b21e04170714c97a1e57646158d6 100644
--- a/mace/kernels/global_avg_pooling.h
+++ b/mace/kernels/global_avg_pooling.h
@@ -11,10 +11,7 @@ namespace mace {
 namespace kernels {
 
 template <DeviceType D, typename T>
-class GlobalAvgPoolingFunctor {
- public:
-  GlobalAvgPoolingFunctor() {}
-
+struct GlobalAvgPoolingFunctor {
   void operator()(const T *input, const index_t *input_shape, T *output) {
     index_t batch = input_shape[0];
     index_t channels = input_shape[1];
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 7925a42595ebac8155b456da81dcb02166497fed..1e69cf90b87f9c1e859a7794bc3d3cc88c4d4ee8 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -18,8 +18,7 @@ enum PoolingType {
 namespace kernels {
 
 template <DeviceType D, typename T>
-class PoolingFunctor {
- public:
+struct PoolingFunctor {
   PoolingFunctor(const PoolingType pooling_type,
                  const int *kernels,
                  const int *strides,
@@ -114,7 +113,6 @@ class PoolingFunctor {
     }
   }
 
- private:
   const PoolingType pooling_type_;
   const int *kernels_;
   const int *strides_;
diff --git a/mace/ops/conv_2d.cc b/mace/ops/conv_2d.cc
index 90542e7be4ab829aefad0c451b9d520b1e2b8103..092a488cdf7c4d4a17e546564a4ddec5da5333f5 100644
--- a/mace/ops/conv_2d.cc
+++ b/mace/ops/conv_2d.cc
@@ -6,10 +6,10 @@
 
 namespace mace {
 
-REGISTER_CPU_OPERATOR(Conv2d, Conv2dOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(Conv2D, Conv2dOp<DeviceType::CPU, float>);
 
 #if __ARM_NEON
-REGISTER_NEON_OPERATOR(Conv2d, Conv2dOp<DeviceType::NEON, float>);
+REGISTER_NEON_OPERATOR(Conv2D, Conv2dOp<DeviceType::NEON, float>);
 #endif  // __ARM_NEON
 
 }  // namespace mace
diff --git a/mace/ops/conv_2d.h b/mace/ops/conv_2d.h
index 89b9140241f291a9a62c70d1bcea805775494fe0..d8603ef0849234a841f5f0ac3a760341f4688b69 100644
--- a/mace/ops/conv_2d.h
+++ b/mace/ops/conv_2d.h
@@ -17,12 +17,10 @@ template<DeviceType D, typename T>
 class Conv2dOp : public ConvPool2dOpBase<D, T> {
  public:
   Conv2dOp(const OperatorDef &op_def, Workspace *ws)
-      : ConvPool2dOpBase<D, T>(op_def, ws),
-        functor_(this->Input(INPUT)->shape().data(),
-                 this->Input(FILTER)->shape().data(),
-                 this->strides_.data(),
-                 this->padding_,
-                 this->dilations_.data()) {}
+      : ConvPool2dOpBase<D, T>(op_def, ws) {
+    functor_.strides_ = this->strides_.data();
+    functor_.dilations_ = this->dilations_.data();
+  }
 
   bool Run() override {
     const Tensor *input = this->Input(INPUT);
@@ -37,8 +35,13 @@ class Conv2dOp : public ConvPool2dOpBase<D, T> {
 
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
-    this->CalOutputSize(input->shape().data(), filter->shape().data(), output_shape.data());
+    kernels::CalcPaddingAndOutputSize(input->shape().data(),
+                                      filter->shape().data(),
+                                      this->dilations_.data(),
+                                      this->strides_.data(), this->padding_,
+                                      output_shape.data(), paddings.data());
     output->Resize(output_shape);
+    functor_.paddings_ = paddings;
 
     functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
              filter->shape().data(), bias_data, output->mutable_data<T>(),
diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc
index bbf6b608245a918c780e0b8d4ff7f375d7887f94..7356666bb2160f6c6f617899ca22c3b95daa32d8 100644
--- a/mace/ops/conv_2d_benchmark.cc
+++ b/mace/ops/conv_2d_benchmark.cc
@@ -25,7 +25,7 @@ static void Conv2d(int iters,
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("Conv2d", "Conv2dTest")
+  OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index c5b8751430bda21491212e9c867d28d092d785e2..4bc881398faf7c23340e0da0bd0f3c453d3de1f9 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -13,7 +13,7 @@ class Conv2dOpTest : public OpsTestBase {};
 TEST_F(Conv2dOpTest, Simple_VALID) {
   // Construct graph
   auto& net = test_net();
-  OpDefBuilder("Conv2d", "Conv2dTest")
+  OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
@@ -47,7 +47,7 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
 TEST_F(Conv2dOpTest, Simple_SAME) {
   // Construct graph
   auto& net = test_net();
-  OpDefBuilder("Conv2d", "Conv2dTest")
+  OpDefBuilder("Conv2D", "Conv2dTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
@@ -83,7 +83,7 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
 TEST_F(Conv2dOpTest, Combined) {
   // Construct graph
   auto& net = test_net();
-  OpDefBuilder("Conv2d", "Conv2dTest")
+  OpDefBuilder("Conv2D", "Conv2DTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
@@ -121,7 +121,7 @@ TEST_F(Conv2dOpTest, Combined) {
 TEST_F(Conv2dOpTest, Conv1x1) {
   // Construct graph
   auto& net = test_net();
-  OpDefBuilder("Conv2d", "Conv2dTest")
+  OpDefBuilder("Conv2D", "Conv2DTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
@@ -179,7 +179,7 @@ TEST_F(Conv2dOpTest, ConvNxNS12) {
     index_t output_channels = 1 + rand() % 10;
     // Construct graph
     auto& net = test_net();
-    OpDefBuilder("Conv2d", "Conv2dTest")
+    OpDefBuilder("Conv2D", "Conv2dTest")
         .Input("Input")
         .Input("Filter")
         .Input("Bias")
diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h
index 1b9e2340a567a854052bfa398315713a6f709660..c9ba9c2529d5195cb7a94028524349dd763e1d8a 100644
--- a/mace/ops/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
@@ -19,48 +19,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
         padding_(static_cast<Padding>(OperatorBase::GetSingleArgument<int>(
             "padding", static_cast<int>(SAME)))),
         dilations_(OperatorBase::GetRepeatedArgument<int>("dilations", {1, 1})) {}
-
-  void CalOutputSize(const index_t *input_shape,   // NCHW
-                     const index_t *filter_shape,  // OIHW
-                     index_t *output_shape) {
-
-    MACE_CHECK(dilations_[0] > 0 && dilations_[1] > 0,
-               "Invalid dilations, must >= 1");
-    MACE_CHECK((dilations_[0] == 1 || strides_[0] == 1) &&
-        (dilations_[1] == 1 || strides_[1] == 1),
-               "If dilations > 1, strides should be 1");
-    MACE_CHECK_NOTNULL(output_shape);
-    /*
-    * Convlution/pooling arithmetic:
-    * o = (i + 2 * p - k - (k - 1) * (d - 1)) / s + 1
-    * For details, see https://arxiv.org/pdf/1603.07285.pdf or
-    * http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
-    */
-
-    index_t output_height = 0, output_width = 0;
-
-    switch (padding_) {
-      case VALID:
-        output_height = (input_shape[2] - (filter_shape[2] - 1) * dilations_[0] - 1) / strides_[0] + 1;
-        output_width = (input_shape[3] - (filter_shape[3] - 1) * dilations_[1] - 1) / strides_[1] + 1;
-        break;
-      case SAME:
-        output_height = (input_shape[2] - 1) / strides_[0] + 1;
-        output_width = (input_shape[3] - 1) / strides_[1] + 1;
-        break;
-      case FULL:
-        output_height = (input_shape[2] + (filter_shape[2] - 1) * dilations_[0] - 1) / strides_[0] + 1;
-        output_width = (input_shape[3] + (filter_shape[3] - 1) * dilations_[1] - 1) / strides_[1] + 1;
-        break;
-      default:
-        MACE_CHECK(false, "Unsupported padding type: ", padding_);
-    }
-
-    output_shape[0] = input_shape[0];
-    output_shape[1] = filter_shape[0];
-    output_shape[2] = output_height;
-    output_shape[3] = output_width;
-  }
+  
  protected:
   std::vector<int> strides_;
   Padding padding_;
diff --git a/mace/ops/depthwise_conv2d.h b/mace/ops/depthwise_conv2d.h
index 9e5dc745212aa9d951243fb1dd2790acf7cf4148..b977115a14baafca0f58cf05891723e4de215995 100644
--- a/mace/ops/depthwise_conv2d.h
+++ b/mace/ops/depthwise_conv2d.h
@@ -18,10 +18,10 @@ template<DeviceType D, typename T>
 class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
  public:
   DepthwiseConv2dOp(const OperatorDef &op_def, Workspace *ws)
-      : ConvPool2dOpBase<D, T>(op_def, ws),
-        functor_(this->Input(INPUT)->shape().data(),
-                 this->Input(FILTER)->shape().data(),
-                 this->strides_.data(), this->padding_, this->dilations_.data()) {};
+      : ConvPool2dOpBase<D, T>(op_def, ws) {
+    functor_.strides_ = this->strides_.data();
+    functor_.dilations_ = this->dilations_.data();
+  }
 
   bool Run() override {
     const Tensor *input = this->Input(INPUT);
@@ -38,8 +38,14 @@ class DepthwiseConv2dOp : public ConvPool2dOpBase<D, T> {
     filter_shape[0] *= filter_shape[1];
     filter_shape[1] = 1;
     std::vector<index_t> output_shape(4);
-    this->CalOutputSize(input->shape().data(), filter_shape.data(), output_shape.data());
+    std::vector<int> paddings(2);
+    kernels::CalcPaddingAndOutputSize(input->shape().data(),
+                                      filter_shape.data(),
+                                      this->dilations_.data(),
+                                      this->strides_.data(), this->padding_,
+                                      output_shape.data(), paddings.data());
     output->Resize(output_shape);
+    functor_.paddings_ = paddings;
 
     functor_(input->data<T>(), input->shape().data(), filter->data<T>(),
              filter_shape.data(), bias_data, output->mutable_data<T>(),
diff --git a/mace/ops/depthwise_conv2d_test.cc b/mace/ops/depthwise_conv2d_test.cc
index b2afe81171b55c2d578caeec8e6a6930ccf241cc..24002b155aa488d36eb34ea007dd9d7d31f6fbe9 100644
--- a/mace/ops/depthwise_conv2d_test.cc
+++ b/mace/ops/depthwise_conv2d_test.cc
@@ -10,9 +10,10 @@ using namespace mace;
 class DepthwiseConv2dOpTest : public OpsTestBase {};
 
 TEST_F(DepthwiseConv2dOpTest, Simple_VALID) {
+  testing::internal::LogToStderr();
   // Construct graph
   auto& net = test_net();
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
@@ -35,7 +36,6 @@ TEST_F(DepthwiseConv2dOpTest, Simple_VALID) {
        3.0f, 7.0f, 11.0f, 15.0f,
        4.0f, 8.0f, 12.0f, 16.0f});
   net.AddInputFromArray<float>("Bias", {4}, {.1f, .2f, .3f, .4f});
-
   // Run
   net.RunOp();
 
@@ -61,7 +61,7 @@ TEST_F(DepthwiseConv2dOpTest, ConvNxNS12) {
     index_t multiplier = 3 + rand() % 10;
     // Construct graph
     auto& net = test_net();
-    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+    OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
         .Input("Input")
         .Input("Filter")
         .Input("Bias")
diff --git a/mace/ops/depthwise_conv_2d_benchmark.cc b/mace/ops/depthwise_conv_2d_benchmark.cc
index f535ea17273d028d01c4e56e8f7f32275c73eb33..5442444ec19f0825bb84d599a4b167eadbbe3d2e 100644
--- a/mace/ops/depthwise_conv_2d_benchmark.cc
+++ b/mace/ops/depthwise_conv_2d_benchmark.cc
@@ -25,7 +25,7 @@ static void DepthwiseConv2d(int iters,
   mace::testing::StopTiming();
 
   OpsTestNet net;
-  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2dTest")
+  OpDefBuilder("DepthwiseConv2d", "DepthwiseConv2DTest")
       .Input("Input")
       .Input("Filter")
       .Input("Bias")
diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h
index 3afb0f71a522a741ac62b9e6ca725efff127ad01..2a17b5d11efe2f66118e41892ce59e12914002af 100644
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -28,6 +28,7 @@ class PoolingOp : public ConvPool2dOpBase<D, T> {
     std::vector<index_t> output_shape(4);
     std::vector<int> paddings(2);
     std::vector<index_t> filter_shape(4);
+    // TODO(chenghui): filter shape is faked from the input dims; is this a hack?
     filter_shape[0] = input->shape()[1];
     filter_shape[1] = input->shape()[0];
     filter_shape[2] = kernels_[0];
diff --git a/mace/python/tools/tf_converter.py b/mace/python/tools/tf_converter.py
index 4797af7e2fd61f72439931dc68fdb459fe5f15da..302ef6aa45f71416d2180ab14bd5834a9ac3ce0b 100644
--- a/mace/python/tools/tf_converter.py
+++ b/mace/python/tools/tf_converter.py
@@ -23,6 +23,7 @@ def main(unused_args):
   with gfile.GFile(FLAGS.output, "wb") as f:
     f.write(output_graph_def.SerializeToString())
   with gfile.GFile(FLAGS.output + '_txt', "wb") as f:
+    output_graph_def.ClearField('tensors')
     f.write(str(output_graph_def))
 
 
diff --git a/mace/python/tools/tf_converter_lib.py b/mace/python/tools/tf_converter_lib.py
index e119d03d542ab02af6949018e80555fbb6236695..5e9acbfd639384bf55e473c2f7c8f7099fbe2916 100644
--- a/mace/python/tools/tf_converter_lib.py
+++ b/mace/python/tools/tf_converter_lib.py
@@ -1,5 +1,6 @@
 from mace.proto import mace_pb2
 import tensorflow as tf
+import numpy as np
 
 padding_mode = {
   'VALID': 0,
@@ -24,11 +25,20 @@ def convert_ops(unresolved_ops, net_def):
     tf_tensor = first_op.outputs[0].eval()
     tensor = net_def.tensors.add()
     tensor.name = first_op.outputs[0].name
-    tensor.dims.extend(tf_tensor.shape)
     # TODO: support other type than float
     tensor.data_type = mace_pb2.DT_FLOAT
+
+    shape = list(tf_tensor.shape)
+    consumers = first_op.outputs[0].consumers()
+    is_filter = (first_op.name.find('pointwise_kernel') != -1 or
+                 first_op.name.find('depthwise_kernel') != -1 or
+                 first_op.name.endswith(('weights', 'kernel')))
+    # Conv filters: TF stores [H, W, in, out/multiplier]; permute to [out, in, H, W].
+    if is_filter and consumers and consumers[0].type.find('Conv') != -1:
+      tf_tensor = np.transpose(tf_tensor, axes=(3, 2, 0, 1))
+      shape = [shape[3], shape[2], shape[0], shape[1]]
+    tensor.dims.extend(shape)
     tensor.float_data.extend(tf_tensor.astype(float).flat)
-  # net_def.tensors.extend([tensor])
   elif first_op.type == 'Conv2D' or first_op.type == 'DepthwiseConv2dNative':
     op_def = net_def.op.add()
     op_def.name = first_op.name
@@ -43,10 +53,12 @@ def convert_ops(unresolved_ops, net_def):
     padding_arg.i = padding_mode[first_op.get_attr('padding')]
     strides_arg = op_def.arg.add()
     strides_arg.name = 'strides'
-    strides_arg.ints.extend(first_op.get_attr('strides'))
+    strides_arg.ints.extend(first_op.get_attr('strides')[2:])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
     data_format_arg.s = first_op.get_attr('data_format')
+    if first_op.get_attr('data_format') != 'NCHW':
+      raise Exception('only support NCHW now')
 
     if ops_count >= 2 and unresolved_ops[1].type == 'BiasAdd':
       bias_add_op = unresolved_ops[1]
@@ -93,7 +105,7 @@ def convert_ops(unresolved_ops, net_def):
     op_def.type = first_op.type
     op_def.input.extend([input.name for input in first_op.inputs])
     op_def.output.extend([output.name for output in first_op.outputs])
-  elif first_op.type == 'AvgPool':
+  elif first_op.type == 'AvgPool' or first_op.type == 'MaxPool':
     op_def = net_def.op.add()
     op_def.name = first_op.name
     op_def.type = 'Pooling'
@@ -107,12 +119,15 @@ def convert_ops(unresolved_ops, net_def):
     padding_arg.i = padding_mode[first_op.get_attr('padding')]
     strides_arg = op_def.arg.add()
     strides_arg.name = 'strides'
-    strides_arg.ints.extend(first_op.get_attr('strides')[1:-1])
-    strides_arg.name = 'kernels'
-    strides_arg.ints.extend(first_op.get_attr('ksize')[1:-1])
+    strides_arg.ints.extend(first_op.get_attr('strides')[2:])
+    kernels_arg = op_def.arg.add()
+    kernels_arg.name = 'kernels'
+    kernels_arg.ints.extend(first_op.get_attr('ksize')[2:])
     data_format_arg = op_def.arg.add()
     data_format_arg.name = 'data_format'
     data_format_arg.s = first_op.get_attr('data_format')
+    if first_op.get_attr('data_format') != 'NCHW':
+      raise Exception('only support NCHW now')
   else:
     raise Exception('Unknown Op: ' + first_op.name)
     pass