diff --git a/micro/ops/concat.h b/micro/ops/concat.h
new file mode 100644
index 0000000000000000000000000000000000000000..be6678e5cbd6859b2d3d5f85acc5479b12bc5408
--- /dev/null
+++ b/micro/ops/concat.h
@@ -0,0 +1,85 @@
+// Copyright 2020 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MICRO_OPS_CONCAT_H_
+#define MICRO_OPS_CONCAT_H_
+
+#include "micro/base/utils.h"
+#include "micro/framework/operator.h"
+#include "micro/framework/scratch_buffer.h"
+
+namespace micro {
+namespace ops {
+
+// Concatenates all inputs along `axis`; a negative axis counts from the back.
+// Every input must match the output's rank and all dims other than `axis`.
+template <typename T>
+class ConcatOp : public framework::Operator {
+ public:
+  MaceStatus OnInit() { return MACE_SUCCESS; }
+
+  MaceStatus Run() {
+    const int32_t *output_dims = GetOutputShapeDims(0);
+    const int32_t output_dim_size = GetOutputShapeDimSize(0);
+    const int32_t inputs_count = GetInputSize();
+    MACE_ASSERT(inputs_count >= 1);
+    int32_t axis = GetArgByName("axis", static_cast<int32_t>(0));
+    axis = axis < 0 ? axis + output_dim_size : axis;
+    MACE_ASSERT(0 <= axis && axis < output_dim_size);
+    // Number of slices spanned by the dimensions in front of `axis`.
+    int32_t slice_count = 1;
+    for (int32_t i = 0; i < axis; ++i) {
+      slice_count *= output_dims[i];
+    }
+
+    // chunk_sizes[i]: elements input i contributes to each slice.
+    ScratchBuffer scratch_buffer(engine_config_);
+    int32_t *chunk_sizes = scratch_buffer.GetBuffer<int32_t>(inputs_count);
+    const T **input_ptrs = scratch_buffer.GetBuffer<const T *>(inputs_count);
+    int32_t axis_dim_total = 0;
+    for (int32_t i = 0; i < inputs_count; ++i) {
+      const int32_t *input_dims = GetInputShapeDims(i);
+      const int32_t input_dim_size = GetInputShapeDimSize(i);
+      MACE_ASSERT(output_dim_size == input_dim_size);
+      for (int32_t j = 0; j < output_dim_size; ++j) {
+        MACE_ASSERT(j == axis || input_dims[j] == output_dims[j]);
+      }
+      axis_dim_total += input_dims[axis];
+      chunk_sizes[i] = 1;
+      for (int32_t j = axis; j < input_dim_size; ++j) {
+        chunk_sizes[i] *= input_dims[j];
+      }
+      input_ptrs[i] = GetInputData<T>(i);
+    }
+    // Without this check the copy below could run past the output buffer.
+    MACE_ASSERT(axis_dim_total == output_dims[axis]);
+    static_cast<void>(axis_dim_total);  // Only read by the assert above.
+
+    // Input cursors are never rewound: slice s resumes where s - 1 ended.
+    T *output = GetOutputData<T>(0);
+    for (int32_t s = 0; s < slice_count; ++s) {
+      for (int32_t i = 0; i < inputs_count; ++i) {
+        for (int32_t k = 0; k < chunk_sizes[i]; ++k) {
+          *output++ = *input_ptrs[i]++;
+        }
+      }
+    }
+    return MACE_SUCCESS;
+  }
+};
+
+}  // namespace ops
+}  // namespace micro
+
+#endif  // MICRO_OPS_CONCAT_H_
diff --git a/micro/test/ccunit/micro/ops/concat_test.cc b/micro/test/ccunit/micro/ops/concat_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a82b72668a6728cf5e4352c432762dddb2b6222e
--- /dev/null
+++ b/micro/test/ccunit/micro/ops/concat_test.cc
@@ -0,0 +1,359 @@
+// Copyright 2018 The MACE Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "micro/ops/concat.h"
+#include "gtest/gtest.h"
+#include "micro/ops/gtest_utils.h"
+#include "micro/ops/substitute_op.h"
+#include "micro/ops/test_utils.h"
+
+namespace micro {
+namespace ops {
+namespace test {
+
+class ConcatOpTest : public ::testing::Test {};  // No shared fixture state.
+
+TEST_F(ConcatOpTest, TestValueTypeDouble) {
+  // Concatenates 2x3 and 2x2 double inputs on axis 1 into a 2x5 output.
+  // clang-format off
+  double in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  double in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  double ref[] = {
+    0, 1, 2, 6, 7,
+    3, 4, 5, 8, 9
+  };
+  int32_t ref_dims[] = {2, 5};
+  int32_t ref_rank = 2;
+  // clang-format on
+
+  double out[2 * 5] = {};
+  int32_t out_dims[] = {2, 5};
+  int32_t out_rank = 2;
+
+  ConcatOp<double> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<double>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestValueTypeFloat) {
+  // Concatenates 2x3 and 2x2 float inputs on axis 1 into a 2x5 output.
+  // clang-format off
+  float in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  float in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  float ref[] = {
+    0, 1, 2, 6, 7,
+    3, 4, 5, 8, 9
+  };
+  int32_t ref_dims[] = {2, 5};
+  int32_t ref_rank = 2;
+  // clang-format on
+
+  float out[2 * 5] = {};
+  int32_t out_dims[] = {2, 5};
+  int32_t out_rank = 2;
+
+  ConcatOp<float> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<float>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestInputOrder) {
+  // Inputs are registered as in1, in0, so in1's columns must come first.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  int32_t in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  int32_t ref[] = {
+    6, 7, 0, 1, 2,
+    8, 9, 3, 4, 5
+  };
+  int32_t ref_dims[] = {2, 5};
+  int32_t ref_rank = 2;
+  // clang-format on
+
+  int32_t out[2 * 5] = {};
+  int32_t out_dims[] = {2, 5};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestAxis1) {
+  // axis = 1: columns of the 2x2 input follow those of the 2x3 input.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  int32_t in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  int32_t ref[] = {
+    0, 1, 2, 6, 7,
+    3, 4, 5, 8, 9
+  };
+  int32_t ref_dims[] = {2, 5};
+  int32_t ref_rank = 2;
+  // clang-format on
+  int32_t out[2 * 5] = {};
+  int32_t out_dims[] = {2, 5};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestAxis0) {
+  // axis = 0: rows of the 2x2 input follow those of the 3x2 input.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1,
+    2, 3,
+    4, 5
+  };
+  int32_t in0_dims[] = {3, 2};
+  int32_t in0_rank = 2;
+
+  int32_t in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  int32_t ref[] = {
+    0, 1,
+    2, 3,
+    4, 5,
+    6, 7,
+    8, 9
+  };
+  int32_t ref_dims[] = {5, 2};
+  int32_t ref_rank = 2;
+  // clang-format on
+  int32_t out[5 * 2] = {};
+  int32_t out_dims[] = {5, 2};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 0);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestInputNumber1) {
+  // A single input must reach the output unchanged.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  int32_t ref[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t ref_dims[] = {2, 3};
+  int32_t ref_rank = 2;
+  // clang-format on
+
+  int32_t out[2 * 3] = {};
+  int32_t out_dims[] = {2, 3};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestInputNumber2) {
+  // Two inputs (2x3 and 2x2) joined on axis 1.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  int32_t in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  int32_t ref[] = {
+    0, 1, 2, 6, 7,
+    3, 4, 5, 8, 9
+  };
+  int32_t ref_dims[] = {2, 5};
+  int32_t ref_rank = 2;
+  // clang-format on
+  int32_t out[2 * 5] = {};
+  int32_t out_dims[] = {2, 5};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+TEST_F(ConcatOpTest, TestInputNumber3) {
+  // Three inputs on axis 1; in1 is registered twice.
+  // clang-format off
+  int32_t in0[] = {
+    0, 1, 2,
+    3, 4, 5
+  };
+  int32_t in0_dims[] = {2, 3};
+  int32_t in0_rank = 2;
+
+  int32_t in1[] = {
+    6, 7,
+    8, 9
+  };
+  int32_t in1_dims[] = {2, 2};
+  int32_t in1_rank = 2;
+
+  int32_t ref[] = {
+    0, 1, 2, 6, 7, 6, 7,
+    3, 4, 5, 8, 9, 8, 9
+  };
+  int32_t ref_dims[] = {2, 7};
+  int32_t ref_rank = 2;
+  // clang-format on
+
+  int32_t out[2 * 7] = {};
+  int32_t out_dims[] = {2, 7};
+  int32_t out_rank = 2;
+
+  ConcatOp<int32_t> concat_op;
+  framework::SubstituteOp substitute_op;
+  substitute_op.AddInput(in0, in0_dims, in0_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddInput(in1, in1_dims, in1_rank);
+  substitute_op.AddArg("axis", 1);
+  substitute_op.AddOutput(out, out_dims, out_rank);
+  concat_op.Init(NULL, reinterpret_cast<framework::OpContext *>(&substitute_op),
+                 NULL);
+  concat_op.Run();
+
+  ExpectTensorNear<int32_t>(out, out_dims, out_rank, ref, ref_dims, ref_rank);
+}
+
+}  // namespace test
+}  // namespace ops
+}  // namespace micro
diff --git a/tools/python/micro/micro_support_ops.py b/tools/python/micro/micro_support_ops.py
index 33e471bb8a9a595ce44eedf5ccd93d68a2024703..5f9bb5f7cfd3f048fd4140d893bd95844d0c10b3 100644
--- a/tools/python/micro/micro_support_ops.py
+++ b/tools/python/micro/micro_support_ops.py
@@ -96,6 +96,8 @@ McSupportedOps = [
                  mace_pb2.DT_FLOAT, DataFormat.NHWC),
     OpDescriptor('micro/ops/expand_dims.h', 'ExpandDimsOp',
                  MaceOp.ExpandDims.name, mace_pb2.DT_FLOAT, DataFormat.NHWC),
+    OpDescriptor('micro/ops/concat.h', 'ConcatOp<mifloat>', MaceOp.Concat.name,
+                 mace_pb2.DT_FLOAT, DataFormat.NHWC),
     OpDescriptor('micro/ops/nhwc/depthwise_conv_2d_ref.h',
                  'DepthwiseConv2dRefOp',
                  MaceOp.DepthwiseConv2d.name, mace_pb2.DT_FLOAT,
diff --git a/tools/python/micro/scratch_computer.py b/tools/python/micro/scratch_computer.py
index 347eaba4ef6137f210e31786ff6ade54f8b22534..3edaab3f3415be66643e46f6f5195db9bf8b5f99 100644
--- a/tools/python/micro/scratch_computer.py
+++ b/tools/python/micro/scratch_computer.py
@@ -41,6 +41,7 @@ class ScratchComputer:
             MaceOp.Shape: self.scratch_size_no_need,
             MaceOp.Reshape: self.scratch_size_no_need,
             MaceOp.ExpandDims: self.scratch_size_of_expand_dims,
+            MaceOp.Concat: self.scratch_size_of_concat,
             MaceOp.MatMul: self.scratch_size_of_matmul,
             MaceOp.Pooling: self.scratch_size_of_pooling,
             MaceOp.DepthwiseConv2d: self.scratch_size_of_depthwise_conv,
@@ -131,3 +132,7 @@ class ScratchComputer:
     def scratch_size_eltwise(self, op_def):
         input0_dims = self.get_op_input_dims(op_def, 0)
         return len(input0_dims) * self.get_data_bytes(mace_pb2.DT_INT32) * 3
+
+    def scratch_size_of_concat(self, op_def):
+        # On a 64bit operating system, one pointer data need 8 bytes
+        return len(op_def.input) * self.get_data_bytes(mace_pb2.DT_INT32) * 3