diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h
index f20c427b26e8e2d3718bd6184087e5fc20f21157..2e1a145a3aec2c37e77eecb7f853db4fd99f75e0 100644
--- a/mace/kernels/conv_2d.h
+++ b/mace/kernels/conv_2d.h
@@ -22,7 +22,7 @@ class Conv2dFunctor {
 
     void operator()(const T* input, // NCHW
                     const index_t* input_shape,
-                    const T* filter, // kernel_h, kernel_w, c_in, c_out
+                    const T* filter, // c_out, c_in, kernel_h, kernel_w
                     const index_t* filter_shape,
                     const T* bias, // c_out
                     T* output, // NCHW
@@ -39,8 +39,8 @@ class Conv2dFunctor {
       index_t input_height   = input_shape[2];
       index_t input_width    = input_shape[3];
 
-      int kernel_h = filter_shape[0];
-      int kernel_w  = filter_shape[1];
+      index_t kernel_h = filter_shape[2];
+      index_t kernel_w  = filter_shape[3];
 
       int stride_h = strides_[0];
       int stride_w = strides_[1];
@@ -53,10 +53,12 @@ class Conv2dFunctor {
       // The left-upper most offset of the padded input
       int padded_h_start = 0 - paddings_[0] / 2;
       int padded_w_start = 0 - paddings_[1] / 2;
-      int padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
-      int padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
+      index_t padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
+      index_t padded_w_stop = input_width + paddings_[1] - paddings_[1] / 2;
 
-#pragma omp parallel for collpse(2)
+      index_t kernel_size = input_channels * kernel_h * kernel_w;  // filter elements per output channel
+
+#pragma omp parallel for collapse(2)
       for (int n = 0; n < batch; ++n) {
         for (int c = 0; c < channels; ++c) {
           for (int h = 0; h < height; ++h) {
@@ -65,17 +67,10 @@ class Conv2dFunctor {
                                c * height * width +
                                h * width + w;
               T sum = 0;
+              const T* filter_ptr = filter + c * kernel_size;  // first weight of output channel c
               for (int inc = 0; inc < input_channels; ++inc) {
                 for (int kh = 0; kh < kernel_h; ++kh) {
                   for (int kw = 0; kw < kernel_w; ++kw) {
-                    /*
-                     *  TODO The tensorflow filter order is HWCiCo.
-                     *  We should consider other order for different
-                     *  implementaion to optimize memory access.
-                     */
-                    int filter_offset = kh * kernel_w * input_channels * channels +
-                                        kw * input_channels * channels +
-                                        inc * channels + c;
 
                     int inh = padded_h_start + h * stride_h + dilation_h * kh;
                     int inw = padded_w_start + w * stride_w + dilation_w * kw;
@@ -94,8 +89,9 @@ class Conv2dFunctor {
                         n * input_channels * input_height * input_width +
                         inc * input_height * input_width +
                         inh * input_width + inw;
-                      sum += input[input_offset] * filter[filter_offset];
+                      sum += input[input_offset] * *filter_ptr;
                     }
+                    ++filter_ptr;  // advance once per (inc, kh, kw) iteration, outside the block holding the accumulate
                   }
                 }
                 output[offset] = sum + bias[c];
diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h
index 8a35919489cd124edc6919ad4fa998d5ee91b754..52956159dea0b25a63d0d90f02f6e2c2bf2672b7 100644
--- a/mace/kernels/pooling.h
+++ b/mace/kernels/pooling.h
@@ -56,10 +56,8 @@ public:
     // The left-upper most offset of the padded input
     int padded_h_start = 0 - paddings_[0] / 2;
     int padded_w_start = 0 - paddings_[1] / 2;
-    int padded_h_stop = input_height + paddings_[0] - paddings_[0] / 2;
-    int padded_w_stop = input_width + paddings_[1] - paddings_[0] / 2;
 
-#pragma omp parallel for collpse(2)
+#pragma omp parallel for collapse(2)
     for (int n = 0; n < batch; ++n) {
       for (int c = 0; c < channels; ++c) {
         index_t out_offset = n * channels * height * width +
diff --git a/mace/ops/conv_2d_test.cc b/mace/ops/conv_2d_test.cc
index 6ab3de8da6c349477341154f7f31043c28f276ca..c95f3dc6e4317c3596034bf2b65d9210258c1481 100644
--- a/mace/ops/conv_2d_test.cc
+++ b/mace/ops/conv_2d_test.cc
@@ -32,7 +32,7 @@ TEST_F(Conv2dOpTest, Simple_VALID) {
                      1, 1, 1,
                      1, 1, 1,
                      1, 1, 1});
-  AddInputFromArray<float>("Filter", {3, 3, 2, 1},
+  AddInputFromArray<float>("Filter", {1, 2, 3, 3},
                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
@@ -69,7 +69,7 @@ TEST_F(Conv2dOpTest, Simple_SAME) {
                      1, 1, 1,
                      1, 1, 1,
                      1, 1, 1});
-  AddInputFromArray<float>("Filter", {3, 3, 2, 1},
+  AddInputFromArray<float>("Filter", {1, 2, 3, 3},
                            {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,
                             1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f});
@@ -113,16 +113,11 @@ TEST_F(Conv2dOpTest, Combined) {
                      1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1});
-  AddInputFromArray<float>("Filter", {3, 3, 2, 2},
-                           {1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f,
-                            1.0f, 0.5f, 1.0f, 0.5f});
+  AddInputFromArray<float>("Filter", {2, 2, 3, 3},  // c_out, c_in, kernel_h, kernel_w
+                           {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,  // c_out 0, c_in 0
+                            1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f,  // c_out 0, c_in 1
+                            0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,  // c_out 1, c_in 0
+                            0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f});  // c_out 1, c_in 1
   AddInputFromArray<float>("Bias", {2}, {0.1f, 0.2f});
 
   // Run
diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h
index 36939e31c63cb116b776e4784f011672c57b3fa2..d95668ec44b4726ec987a8ad045c30d3e2af20d8 100644
--- a/mace/ops/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
@@ -27,7 +27,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
     dilations_(OperatorBase::GetRepeatedArgument<int>("dilations")) {}
 
   void CalcPaddingAndOutputSize(const index_t* input_shape,  // NCHW
-                                const index_t* filter_shape,  // HWIO
+                                const index_t* filter_shape,  // OIHW
                                 std::vector<index_t>* output_shape,
                                 std::vector<int>* padding_size) {
     MACE_CHECK(dilations_[0] > 0 && dilations_[1] > 0,
@@ -44,12 +44,12 @@ class ConvPool2dOpBase : public Operator<D, T> {
     *padding_size = {0, 0};
 
     index_t output_height, output_width;
-    index_t kernel_height = filter_shape[0];
-    index_t kernel_width = filter_shape[1];
-    index_t output_channels = filter_shape[3];
+    index_t kernel_height = filter_shape[2];
+    index_t kernel_width = filter_shape[3];
+    index_t output_channels = filter_shape[0];
 
-    int k_extent_height = (kernel_height - 1) * dilations_[0] + 1;
-    int k_extent_width = (kernel_width - 1) * dilations_[1] + 1;
+    index_t k_extent_height = (kernel_height - 1) * dilations_[0] + 1;
+    index_t k_extent_width = (kernel_width - 1) * dilations_[1] + 1;
 
     switch (padding_) {
       case VALID:
diff --git a/mace/ops/pooling.cc b/mace/ops/pooling.cc
index 915035c858bb66d5f1aa7f61f4717187b2f11b5e..58ced63e941e7d1f0f602d4de2191aa40fcb2d0e 100644
--- a/mace/ops/pooling.cc
+++ b/mace/ops/pooling.cc
@@ -4,8 +4,6 @@
 
 
 #include "mace/ops/pooling.h"
-#include "mace/proto/mace.pb.h"
-#include "mace/kernels/pooling.h"
 
 namespace mace {
 
diff --git a/mace/ops/pooling.h b/mace/ops/pooling.h
index bc62a075a3b6fe75b40caa57d31cb7f29a6ad9a7..042eb389c11d2d0c09c72530ead8f3b7c98da5ef 100644
--- a/mace/ops/pooling.h
+++ b/mace/ops/pooling.h
@@ -29,10 +29,10 @@ public:
     std::vector<index_t> output_shape;
     std::vector<int> paddings;
     std::vector<index_t> filter_shape = std::vector<index_t>(4);
-    filter_shape[0] = kernels_[0];
-    filter_shape[1] = kernels_[1];
-    filter_shape[2] = in_shape[0];
-    filter_shape[3] = in_shape[1];
+    filter_shape[0] = in_shape[1];  // O slot of the OIHW layout
+    filter_shape[1] = in_shape[0];  // I slot; NOTE(review): in_shape[0] looks like the batch dim if in_shape is NCHW - confirm intended
+    filter_shape[2] = kernels_[0];  // H slot
+    filter_shape[3] = kernels_[1];  // W slot
     this->CalcPaddingAndOutputSize(in_shape.data(), filter_shape.data(),
                                    &output_shape, &paddings);
     output->Resize(output_shape);
@@ -50,8 +50,8 @@ public:
   };
 
 protected:
-  PoolingType pooling_type_;
   std::vector<int> kernels_;
+  PoolingType pooling_type_;
 
   OP_INPUT_TAGS(INPUT);
   OP_OUTPUT_TAGS(OUTPUT);
diff --git a/mace/ops/pooling_test.cc b/mace/ops/pooling_test.cc
index b2484677a3b1b4bddc6ab4c72839a528eb0714fa..f56bff61c3620cdae9638c0d3f6166f5aed64aa4 100644
--- a/mace/ops/pooling_test.cc
+++ b/mace/ops/pooling_test.cc
@@ -5,7 +5,6 @@
 #include "gtest/gtest.h"
 
 #include "mace/core/operator.h"
-#include "mace/core/net.h"
 #include "mace/ops/ops_test_util.h"
 #include "mace/ops/conv_pool_2d_base.h"
 #include "mace/kernels/pooling.h"