diff --git a/mace/core/types.cc b/mace/core/types.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc37f6d2c210e2649f6736df097e26e283beb173
--- /dev/null
+++ b/mace/core/types.cc
@@ -0,0 +1,26 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/types.h"
+
+namespace mace {
+
+bool DataTypeCanUseMemcpy(DataType dt) {
+  switch (dt) {
+    case DT_FLOAT:
+    case DT_DOUBLE:
+    case DT_INT32:
+    case DT_INT64:
+    case DT_UINT16:
+    case DT_UINT8:
+    case DT_INT16:
+    case DT_INT8:
+    case DT_BOOL:
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/core/types.h b/mace/core/types.h
index 21c502cf756d8c9c1a15316e95ca14e5a8953e31..2f354a14e9e93e56f750ffa25ebac5607fe5a275 100644
--- a/mace/core/types.h
+++ b/mace/core/types.h
@@ -10,6 +10,8 @@
 
 namespace mace {
 
+bool DataTypeCanUseMemcpy(DataType dt);
+
 template <class T>
 struct IsValidDataType;
 
@@ -51,7 +53,6 @@ MATCH_TYPE_AND_ENUM(int64_t, DT_INT64);
 MATCH_TYPE_AND_ENUM(bool, DT_BOOL);
 
 static const int32_t kint32_tmax = ((int32_t)0x7FFFFFFF);
-
 }  // namespace mace
 
 #endif  // MACE_CORE_TYPES_H_
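DataTypeCanUseMemcpy reports whether a DataType maps onto a trivially copyable C++ type, so kernels can move whole blocks with a single memcpy instead of element-wise assignment. A minimal sketch of the intended call pattern, mirroring its use in mace/kernels/concat.h below (the CopyBlock helper is hypothetical, not part of this change):

    // Hypothetical helper: gate the memcpy fast path on the type trait.
    template <typename T>
    void CopyBlock(const T *src, T *dst, index_t count) {
      if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
        memcpy(dst, src, count * sizeof(T));  // trivially copyable: bulk copy
      } else {
        for (index_t i = 0; i < count; ++i) {
          dst[i] = src[i];  // non-POD element types need per-element copy
        }
      }
    }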
["*_test.cc", "*_benchmark.cc"], + exclude = [ + "*_test.cc", + "*_benchmark.cc", + ], ), hdrs = glob( ["*.h"], exclude = ["ops_test_util.h"], ), + copts = ["-std=c++11"], deps = [ "//mace/core", "//mace/kernels", "//mace/proto:cc_proto", ], - copts = ["-std=c++11"], alwayslink = 1, ) cc_test( name = "ops_test", + testonly = 1, srcs = glob( ["*_test.cc"], ), + copts = ["-std=c++11"], + linkopts = if_android(["-ldl"]), + linkstatic = 1, deps = [ ":ops", ":test", "@gtest//:gtest_main", ], +) + +cc_test( + name = "concat_test", + testonly = 1, + srcs = glob( + ["concat_test.cc"], + ), copts = ["-std=c++11"], linkopts = if_android(["-ldl"]), linkstatic = 1, - testonly = 1, + deps = [ + ":ops", + ":test", + "@gtest//:gtest_main", + ], ) cc_test( name = "ops_benchmark", + testonly = 1, srcs = glob(["*_benchmark.cc"]), + copts = ["-std=c++11"], + linkopts = if_android(["-ldl"]), + linkstatic = 1, deps = [ ":ops", ":test", - "//mace/core:core", + "//mace/core", "//mace/core:test_benchmark_main", ], - copts = ['-std=c++11'], - linkopts = if_android(["-ldl"]), - linkstatic = 1, - testonly = 1, ) diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc new file mode 100644 index 0000000000000000000000000000000000000000..ec47971b72babc3c50b2ec78d1a8554f8c7deb38 --- /dev/null +++ b/mace/ops/concat.cc @@ -0,0 +1,11 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#include "mace/ops/concat.h" + +namespace mace { + +REGISTER_CPU_OPERATOR(Concat, ConcatOp); + +} // namespace mace diff --git a/mace/ops/concat.h b/mace/ops/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..28a97ac83c15f2ab9221801e2d9e38b168942311 --- /dev/null +++ b/mace/ops/concat.h @@ -0,0 +1,68 @@ +// +// Copyright (c) 2017 XiaoMi All rights reserved. +// + +#ifndef MACE_OPS_CONCAT_H_ +#define MACE_OPS_CONCAT_H_ + +#include "mace/proto/mace.pb.h" +#include "mace/core/operator.h" +#include "mace/kernels/concat.h" +namespace mace { + +template +class ConcatOp : public Operator { + public: + ConcatOp(const OperatorDef &op_def, Workspace *ws) + : Operator(op_def, ws) {} + + bool Run() override { + int32_t values_count = this->InputSize() - 1; + const Tensor *input0 = this->Input(0); + const Tensor *axis_tensor = this->Input(values_count); + MACE_CHECK(axis_tensor->dim_size() == 0, + "axis should be a scalar integer, but got shape: ", + axis_tensor->dim_size()); + const int32_t concat_axis = *(axis_tensor->data()); + const int32_t input_dims = input0->dim_size(); + const int32_t axis = concat_axis < 0 ? 
diff --git a/mace/ops/BUILD b/mace/ops/BUILD
index eaaa11d3339c0d2a7a7c2f534d68d830605f3daf..3c66c65b71d9b8cbf076555e56256f9834f97a36 100644
--- a/mace/ops/BUILD
+++ b/mace/ops/BUILD
@@ -25,48 +25,67 @@ cc_library(
     name = "ops",
     srcs = glob(
         ["*.cc"],
-        exclude = ["*_test.cc", "*_benchmark.cc"],
+        exclude = [
+            "*_test.cc",
+            "*_benchmark.cc",
+        ],
     ),
     hdrs = glob(
         ["*.h"],
         exclude = ["ops_test_util.h"],
    ),
+    copts = ["-std=c++11"],
     deps = [
         "//mace/core",
         "//mace/kernels",
         "//mace/proto:cc_proto",
     ],
-    copts = ["-std=c++11"],
     alwayslink = 1,
 )
 
 cc_test(
     name = "ops_test",
+    testonly = 1,
     srcs = glob(
         ["*_test.cc"],
     ),
+    copts = ["-std=c++11"],
+    linkopts = if_android(["-ldl"]),
+    linkstatic = 1,
     deps = [
         ":ops",
         ":test",
         "@gtest//:gtest_main",
     ],
+)
+
+cc_test(
+    name = "concat_test",
+    testonly = 1,
+    srcs = glob(
+        ["concat_test.cc"],
+    ),
     copts = ["-std=c++11"],
     linkopts = if_android(["-ldl"]),
     linkstatic = 1,
-    testonly = 1,
+    deps = [
+        ":ops",
+        ":test",
+        "@gtest//:gtest_main",
+    ],
 )
 
 cc_test(
     name = "ops_benchmark",
+    testonly = 1,
     srcs = glob(["*_benchmark.cc"]),
+    copts = ["-std=c++11"],
+    linkopts = if_android(["-ldl"]),
+    linkstatic = 1,
     deps = [
         ":ops",
         ":test",
-        "//mace/core:core",
+        "//mace/core",
         "//mace/core:test_benchmark_main",
     ],
-    copts = ['-std=c++11'],
-    linkopts = if_android(["-ldl"]),
-    linkstatic = 1,
-    testonly = 1,
 )
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ec47971b72babc3c50b2ec78d1a8554f8c7deb38
--- /dev/null
+++ b/mace/ops/concat.cc
@@ -0,0 +1,11 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/concat.h"
+
+namespace mace {
+
+REGISTER_CPU_OPERATOR(Concat, ConcatOp<DeviceType::CPU, float>);
+
+}  // namespace mace
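REGISTER_CPU_OPERATOR binds the op type string "Concat" to the ConcatOp<DeviceType::CPU, float> instantiation, so graph nodes are resolved by name when a net is constructed. The tests and benchmark below exercise exactly this path; the core of it looks like:

    OpsTestNet net;
    OpDefBuilder("Concat", "ConcatExample")  // type string must match the registration
        .Input("Input0")
        .Input("Input1")
        .Input("Axis")                       // last input: scalar int32 concat axis
        .Output("Output")
        .Finalize(net.operator_def());
    net.RunOp();                             // dispatches to ConcatOp<DeviceType::CPU, float>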
diff --git a/mace/ops/concat.h b/mace/ops/concat.h
new file mode 100644
index 0000000000000000000000000000000000000000..28a97ac83c15f2ab9221801e2d9e38b168942311
--- /dev/null
+++ b/mace/ops/concat.h
@@ -0,0 +1,68 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_OPS_CONCAT_H_
+#define MACE_OPS_CONCAT_H_
+
+#include "mace/proto/mace.pb.h"
+#include "mace/core/operator.h"
+#include "mace/kernels/concat.h"
+namespace mace {
+
+template <DeviceType D, typename T>
+class ConcatOp : public Operator<D, T> {
+ public:
+  ConcatOp(const OperatorDef &op_def, Workspace *ws)
+      : Operator<D, T>(op_def, ws) {}
+
+  bool Run() override {
+    int32_t values_count = this->InputSize() - 1;
+    const Tensor *input0 = this->Input(0);
+    const Tensor *axis_tensor = this->Input(values_count);
+    MACE_CHECK(axis_tensor->dim_size() == 0,
+               "axis should be a scalar integer, but got rank: ",
+               axis_tensor->dim_size());
+    const int32_t concat_axis = *(axis_tensor->data<int32_t>());
+    const int32_t input_dims = input0->dim_size();
+    const int32_t axis =
+        concat_axis < 0 ? concat_axis + input_dims : concat_axis;
+    MACE_CHECK((0 <= axis && axis < input_dims),
+               "Expected concatenating axis in the range [", -input_dims, ", ",
+               input_dims, "], but got ", concat_axis);
+    std::vector<index_t> output_shape(input0->shape());
+    index_t inner_size = 1;
+    for (int i = 0; i < axis; ++i) {
+      inner_size *= output_shape[i];
+    }
+    std::vector<index_t> outer_sizes(values_count, 0);
+    std::vector<const T *> input_list(values_count, nullptr);
+    input_list[0] = input0->data<T>();
+    outer_sizes[0] = input0->size() / inner_size;
+    const Tensor *input = nullptr;
+    for (int i = 1; i < values_count; ++i) {
+      input = this->Input(i);
+      MACE_CHECK(input->dim_size() == input0->dim_size(),
+                 "Ranks of all input tensors must be the same.");
+      // Check every dimension except the concat axis itself.
+      for (int j = 0; j < input_dims; ++j) {
+        if (j == axis) { continue; }
+        MACE_CHECK(input->dim(j) == input0->dim(j),
+                   "Dimensions of inputs should be equal except along the concat axis.");
+      }
+      input_list[i] = input->data<T>();
+      outer_sizes[i] = input->size() / inner_size;
+      output_shape[axis] += input->dim(axis);
+    }
+
+    Tensor *output = this->Output(OUTPUT);
+    output->Resize(output_shape);
+
+    functor_(input_list, inner_size, outer_sizes.data(),
+             output->mutable_data<T>());
+    return true;
+  }
+
+ private:
+  kernels::ConcatFunctor<D, T> functor_;
+
+  OP_OUTPUT_TAGS(OUTPUT);
+};
+
+}  // namespace mace
+
+#endif  // MACE_OPS_CONCAT_H_
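For each input, inner_size multiplies the dimensions before the concat axis, and outer_sizes[i] = input->size() / inner_size covers everything from the axis onward. A worked example of this bookkeeping with assumed shapes {2, 3, 4} and {2, 5, 4} concatenated along axis 1:

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int64_t shape0[] = {2, 3, 4}, shape1[] = {2, 5, 4};
      const int axis = 1;
      int64_t inner_size = 1;  // product of dims before the axis
      for (int i = 0; i < axis; ++i) inner_size *= shape0[i];  // 2
      const int64_t size0 = 2 * 3 * 4, size1 = 2 * 5 * 4;      // 24, 40 elements
      std::printf("inner=%lld outer0=%lld outer1=%lld out_axis=%lld\n",
                  (long long)inner_size,
                  (long long)(size0 / inner_size),             // 12
                  (long long)(size1 / inner_size),             // 20
                  (long long)(shape0[axis] + shape1[axis]));   // 8 -> {2, 8, 4}
      return 0;
    }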
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
new file mode 100644
index 0000000000000000000000000000000000000000..58d9c8f309a464d44bdbaf0c2a770a6af2f02baf
--- /dev/null
+++ b/mace/ops/concat_benchmark.cc
@@ -0,0 +1,52 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/core/operator.h"
+#include "mace/core/testing/test_benchmark.h"
+#include "mace/ops/ops_test_util.h"
+
+namespace mace {
+template <DeviceType D, typename T>
+static void ConcatHelper(int iters, int concat_dim, int dim1) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+  OpDefBuilder("Concat", "ConcatBM")
+      .Input("Input0")
+      .Input("Input1")
+      .Input("Axis")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  // Add input data
+  const int kDim0 = 100;
+  net.AddRandomInput<T>("Input0", {kDim0, dim1});
+  net.AddRandomInput<T>("Input1", {kDim0, dim1});
+  net.AddInputFromArray<int32_t>("Axis", {}, {concat_dim});
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(D);
+  }
+  const int64_t tot = static_cast<int64_t>(iters) * kDim0 * dim1 * 2;
+  mace::testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(T));
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(D);
+  }
+}
+
+static void BM_ConcatDim0Float(int iters, int dim1) {
+  ConcatHelper<DeviceType::CPU, float>(iters, 0, dim1);
+}
+
+static void BM_ConcatDim1Float(int iters, int dim1) {
+  ConcatHelper<DeviceType::CPU, float>(iters, 1, dim1);
+}
+BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000);
+
+}  // namespace mace
\ No newline at end of file
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d376c62538ec7490d1db361c398f1078f4a70c5
--- /dev/null
+++ b/mace/ops/concat_test.cc
@@ -0,0 +1,142 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/ops/concat.h"
+#include "mace/ops/ops_test_util.h"
+#include "gmock/gmock.h"
+
+using namespace mace;
+
+class ConcatOpTest : public OpsTestBase {};
+
+TEST_F(ConcatOpTest, Simple_Horizon) {
+  // Construct graph
+  auto &net = test_net();
+  OpDefBuilder("Concat", "ConcatTest")
+      .Input("Input0")
+      .Input("Input1")
+      .Input("Axis")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  std::vector<index_t> input_shape = {4, 4};
+  std::vector<float> input0;
+  GenerateRandomRealTypeData(input_shape, input0);
+  std::vector<float> input1;
+  GenerateRandomRealTypeData(input_shape, input1);
+  // Add inputs
+  net.AddInputFromArray<float>("Input0", input_shape, input0);
+  net.AddInputFromArray<float>("Input1", input_shape, input1);
+  net.AddInputFromArray<int32_t>("Axis", {}, {0});
+
+  // Run
+  net.RunOp();
+
+  // Check
+  auto output = net.GetOutput("Output");
+
+  std::vector<index_t> expected_shape = {8, 4};
+  EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+
+  const float *output_ptr = output->data<float>();
+  for (auto f : input0) {
+    ASSERT_EQ(f, *output_ptr++);
+  }
+  for (auto f : input1) {
+    ASSERT_EQ(f, *output_ptr++);
+  }
+}
+
+TEST_F(ConcatOpTest, Simple_Vertical) {
+  // Construct graph
+  auto &net = test_net();
+  OpDefBuilder("Concat", "ConcatTest")
+      .Input("Input0")
+      .Input("Input1")
+      .Input("Axis")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  std::vector<index_t> input_shape = {4, 4};
+  std::vector<float> input0;
+  GenerateRandomRealTypeData(input_shape, input0);
+  std::vector<float> input1;
+  GenerateRandomRealTypeData(input_shape, input1);
+  // Add inputs
+  net.AddInputFromArray<float>("Input0", input_shape, input0);
+  net.AddInputFromArray<float>("Input1", input_shape, input1);
+  net.AddInputFromArray<int32_t>("Axis", {}, {1});
+
+  // Run
+  net.RunOp();
+
+  // Check
+  auto output = net.GetOutput("Output");
+
+  std::vector<index_t> expected_shape = {4, 8};
+  EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+
+  const float *output_ptr = output->data<float>();
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      ASSERT_EQ(input0[i * 4 + j], *output_ptr++);
+    }
+    for (int j = 0; j < 4; ++j) {
+      ASSERT_EQ(input1[i * 4 + j], *output_ptr++);
+    }
+  }
+}
+
+TEST_F(ConcatOpTest, Random) {
+  srand(time(nullptr));
+  int dim = 5;
+  int num_inputs = 2 + rand() % 10;
+  int axis = rand() % dim;
+  // Construct graph
+  auto &net = test_net();
+  auto builder = OpDefBuilder("Concat", "ConcatTest");
+  for (int i = 0; i < num_inputs; ++i) {
+    builder = builder.Input(("Input" + std::to_string(i)).c_str());
+  }
+  builder.Input("Axis")
+      .Output("Output")
+      .Finalize(net.operator_def());
+
+  std::vector<index_t> shape_data;
+  GenerateRandomIntTypeData<index_t>({dim}, shape_data, 1, dim);
+  std::vector<std::vector<index_t>> input_shapes(num_inputs, shape_data);
+  std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
+  std::vector<const float *> input_ptrs(num_inputs, nullptr);
+  index_t concat_axis_size = 0;
+  for (int i = 0; i < num_inputs; ++i) {
+    input_shapes[i][axis] = 1 + rand() % dim;
+    concat_axis_size += input_shapes[i][axis];
+    GenerateRandomRealTypeData(input_shapes[i], inputs[i]);
+    input_ptrs[i] = inputs[i].data();
+    net.AddInputFromArray<float>(("Input" + std::to_string(i)).c_str(),
+                                 input_shapes[i], inputs[i]);
+  }
+  net.AddInputFromArray<int32_t>("Axis", {}, {axis});
+
+  // Run
+  net.RunOp();
+
+  // Check
+  auto output = net.GetOutput("Output");
+
+  std::vector<index_t> expected_shape = input_shapes[0];
+  expected_shape[axis] = concat_axis_size;
+  EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+
+  const float *output_ptr = output->data<float>();
+  while (output_ptr != (output->data<float>() + output->size())) {
+    for (int i = 0; i < num_inputs; ++i) {
+      index_t num_elements =
+          std::accumulate(input_shapes[i].begin() + axis, input_shapes[i].end(),
+                          1, std::multiplies<index_t>());
+      for (int j = 0; j < num_elements; ++j) {
+        EXPECT_EQ(*input_ptrs[i]++, *output_ptr++);
+      }
+    }
+  }
+}
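The benchmark reports throughput rather than raw time: tot counts every element moved per iteration (two inputs of kDim0 x dim1 each), and timing is stopped during graph setup and warm-up so only the measured RunOp loop counts. Rough per-iteration volume for the largest argument, assuming float:

    // BM_ConcatDim1Float->Arg(100000): kDim0 * dim1 * 2 = 100 * 100000 * 2
    //   = 2.0e7 items per iteration
    //   = 2.0e7 * sizeof(float) = 8.0e7 bytes (~76 MiB) per iteration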
diff --git a/mace/ops/conv_pool_2d_base.h b/mace/ops/conv_pool_2d_base.h
index 9b1838a36c2ef4f68217616569edcee32d3a6f9e..1b9e2340a567a854052bfa398315713a6f709660 100644
--- a/mace/ops/conv_pool_2d_base.h
+++ b/mace/ops/conv_pool_2d_base.h
@@ -37,7 +37,7 @@ class ConvPool2dOpBase : public Operator<D, T> {
      * http://deeplearning.net/software/theano/tutorial/conv_arithmetic.html
      */
 
-    index_t output_height, output_width;
+    index_t output_height = 0, output_width = 0;
 
     switch (padding_) {
       case VALID:
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 151ea4580b02cb19047b5f949a3883232e6b4fc5..ed6516874a5076d5c2012d1a0ef50bf3dd4cabd1 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -43,7 +43,7 @@ class OpsTestNet {
  public:
   OpsTestNet() {}
 
-  template <typename T>
+  template<typename T>
   void AddInputFromArray(const char *name,
                          const std::vector<index_t> &shape,
                          const std::vector<T> &data) {
@@ -51,11 +51,11 @@ class OpsTestNet {
         ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
     input->Resize(shape);
     T *input_data = input->mutable_data<T>();
-    MACE_CHECK(input->size() == data.size());
+    MACE_CHECK(static_cast<size_t>(input->size()) == data.size());
     memcpy(input_data, data.data(), data.size() * sizeof(T));
   }
 
-  template <typename T>
+  template<typename T>
   void AddRepeatedInput(const char *name,
                         const std::vector<index_t> &shape,
                         const T data) {
@@ -67,7 +67,7 @@ class OpsTestNet {
     std::fill(input_data, input_data + input->size(), data);
   }
 
-  template <typename T>
+  template<typename T>
   void AddRandomInput(const char *name,
                       const std::vector<index_t> &shape,
                       bool positive = false) {
@@ -86,18 +86,6 @@ class OpsTestNet {
     });
   }
 
-  template <typename T>
-  void AddFixedInput(const char *name,
-                     const std::vector<index_t> &shape,
-                     T value) {
-    Tensor *input =
-        ws_.CreateTensor(name, cpu_allocator(), DataTypeToEnum<T>::v());
-    input->Resize(shape);
-    float *input_data = input->mutable_data<float>();
-
-    std::fill(input_data, input_data + input->size(), value);
-  }
-
   void AddIntArg(const char *name, const int value) {
     auto arg = op_def_.add_arg();
     arg->set_name(name);
@@ -186,7 +174,38 @@ class OpsTestBase : public ::testing::Test {
   OpsTestNet test_net_;
 };
 
-template <typename T>
+template<typename T>
+void GenerateRandomRealTypeData(const std::vector<index_t> &shape,
+                                std::vector<T> &res) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::normal_distribution<T> nd(0, 1);
+
+  index_t size = std::accumulate(shape.begin(), shape.end(), 1,
+                                 std::multiplies<index_t>());
+  res.resize(size);
+
+  std::generate(res.begin(), res.end(),
+                [&gen, &nd] { return nd(gen); });
+}
+
+template<typename T>
+void GenerateRandomIntTypeData(const std::vector<index_t> &shape,
+                               std::vector<T> &res,
+                               const T a = 0,
+                               const T b = std::numeric_limits<T>::max()) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_int_distribution<T> nd(a, b);
+
+  index_t size = std::accumulate(shape.begin(), shape.end(), 1,
+                                 std::multiplies<index_t>());
+  res.resize(size);
+
+  std::generate(res.begin(), res.end(),
+                [&gen, &nd] { return nd(gen); });
+}
+
+template<typename T>
 unique_ptr<Tensor> CreateTensor(const std::vector<index_t> &shape,
                                 const std::vector<T> &data) {
   unique_ptr<Tensor> res(new Tensor(cpu_allocator(), DataTypeToEnum<T>::v()));
@@ -219,23 +238,23 @@ inline std::string ShapeToString(const Tensor &x) {
   return std::string(stream.str());
 }
 
-template <typename T>
+template<typename T>
 struct is_floating_point_type {
   static const bool value = std::is_same<T, float>::value ||
                             std::is_same<T, double>::value;
 };
 
-template <typename T>
+template<typename T>
 inline void ExpectEqual(const T &a, const T &b) {
   EXPECT_EQ(a, b);
 }
 
-template <>
+template<>
 inline void ExpectEqual<float>(const float &a, const float &b) {
   EXPECT_FLOAT_EQ(a, b);
 }
 
-template <>
+template<>
 inline void ExpectEqual<double>(const double &a, const double &b) {
   EXPECT_DOUBLE_EQ(a, b);
 }
@@ -246,11 +265,11 @@ inline void AssertSameTypeDims(const Tensor &x, const Tensor &y) {
       << "y.shape [ " << ShapeToString(y) << "]";
 }
 
-template <typename T, bool is_fp = is_floating_point_type<T>::value>
+template<typename T, bool is_fp = is_floating_point_type<T>::value>
 struct Expector;
 
 // Partial specialization for float and double.
-template <typename T>
+template<typename T>
 struct Expector<T, true> {
   static void Equal(const T &a, const T &b) { ExpectEqual(a, b); }
@@ -276,7 +295,7 @@ struct Expector<T, true> {
   }
 };
 
-template <typename T>
+template<typename T>
 void ExpectTensorNear(const Tensor &x, const Tensor &y, const double abs_err) {
   static_assert(is_floating_point_type<T>::value,
                 "T is not a floating point type");
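is_floating_point_type drives the Expector dispatch: float and double comparisons go through EXPECT_FLOAT_EQ / EXPECT_DOUBLE_EQ, everything else through exact EXPECT_EQ, and ExpectTensorNear additionally takes an absolute tolerance. A typical call from a test body (tensor values and tolerance are illustrative):

    auto expected = CreateTensor<float>({1, 4}, {1.0f, 2.0f, 3.0f, 4.0f});
    ExpectTensorNear<float>(*expected, *net.GetOutput("Output"), 1e-5);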