diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0ebefbab26ec8fdf316f852fbb7f6d9f3bbc48eb
--- /dev/null
+++ b/paddle/operators/concat_op.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/concat_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+// Operator definition for "concat". Only shape inference lives here; the
+// element copy is done by ConcatKernel.
+class ConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Derives the dims of "Out" from the inputs "X": they equal the dims of
+  // X[0], except along "axis" where the extents of all inputs are summed.
+  //
+  // Enforces that there is more than one input, that 0 <= axis < rank of
+  // X[0], that every input has that rank, and that all inputs agree on every
+  // extent other than the one along axis.
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto *out = ctx.Output<framework::Tensor>("Out");
+    size_t n = ins.size();
+
+    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+
+    auto out_dims = ins[0]->dims();
+    const int rank = static_cast<int>(out_dims.size());
+    // Keep axis signed and validate it. Casting it straight to size_t turned
+    // a negative axis into a huge index that never matched any dimension, so
+    // the axis extent was silently left unsummed.
+    const int axis = ctx.Attr<int>("axis");
+    PADDLE_ENFORCE_GE(axis, 0, "The axis should not be negative.");
+    PADDLE_ENFORCE_LT(axis, rank,
+                      "The axis should be less than the rank of the inputs.");
+
+    for (size_t i = 1; i < n; i++) {
+      const auto &in_dims = ins[i]->dims();
+      // Without this check a lower-rank input would be indexed past its end.
+      PADDLE_ENFORCE_EQ(static_cast<int>(in_dims.size()), rank,
+                        "Input tensors should have the same rank.");
+      for (int j = 0; j < rank; j++) {
+        if (j == axis) {
+          out_dims[axis] += in_dims[j];
+          continue;
+        }
+        PADDLE_ENFORCE_EQ(out_dims[j], in_dims[j],
+                          "Input tensors should have the same "
+                          "elements except the specify axis.");
+      }
+    }
+    out->Resize(out_dims);
+  }
+};
+
+// Proto/checker maker for the concat operator: declares the duplicable input
+// "X", the output "Out", the operator help text and the int attribute "axis"
+// (default 0).
+class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConcatOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // "X" is marked duplicable; ConcatOp and ConcatKernel read it through
+    // MultiInput as a list of tensors.
+    AddInput("X", "the input tensors of concat operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of concat operator.");
+    AddComment(R"DOC(
+            Join the input tensors along with the axis.
+            Examples:
+              Input[0] = [[1,2],[3,4]]
+              Input[1] = [[5,6]]
+              axis = 0
+              Output = [[1,2],
+                        [3,4],
+                        [5,6]]
+        )DOC");
+    // "axis" selects the dimension along which the inputs are joined.
+    AddAttr<int>("axis", "The axis which the inputs will be joined with.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the "concat" op with its operator class and its proto maker. As
+// the macro name says, no gradient operator is attached.
+REGISTER_OP_WITHOUT_GRADIENT(concat, ops::ConcatOp, ops::ConcatOpMaker)
+// Register the CPU kernel; it is instantiated for float elements only.
+REGISTER_OP_CPU_KERNEL(concat,
+                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
diff --git a/paddle/operators/concat_op.cu b/paddle/operators/concat_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..38fee7473dbb2ba97fe95b6632db7a1749cf3bbe
--- /dev/null
+++ b/paddle/operators/concat_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+// NOTE(review): EIGEN_USE_GPU is defined ahead of the include below,
+// presumably so that the Eigen headers it pulls in enable their GPU code
+// paths - confirm.
+#define EIGEN_USE_GPU
+#include "paddle/operators/concat_op.h"
+
+// This alias is not used yet: no kernel is registered in this file.
+namespace ops = paddle::operators;
+// TODO(Yancey1989) Add GPU kernel
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..f977054fdf8aa0164db726b94a21c57f770dd674
--- /dev/null
+++ b/paddle/operators/concat_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Kernel for the concat operator: copies each tensor of input "X" into
+// "Out", one after another along attribute "axis".
+//
+// Every tensor is viewed as `before` rows, where `before` is the product of
+// the extents in front of the axis and `after` the product of the extents
+// behind it, both taken from X[0]. A row of one input holds
+// (its axis extent * after) elements and is copied with a single memcpy.
+//
+// The output buffer is always requested for platform::CPUPlace, whatever
+// Place the template is instantiated with.
+template <typename Place, typename T>
+class ConcatKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    size_t n = ins.size();
+    size_t output_axis_dim = 0;
+    size_t before = 1, after = 1;
+    // Extent of the output along the axis: the sum over all inputs.
+    for (size_t i = 0; i < n; i++) {
+      output_axis_dim += ins[i]->dims()[axis];
+    }
+    auto& input_zero = ins[0];
+    for (int64_t i = 0; i < input_zero->dims().size(); i++) {
+      if (i == axis) {
+        continue;
+      }
+      if (i < axis) {
+        before *= input_zero->dims()[i];
+      } else {
+        after *= input_zero->dims()[i];
+      }
+    }
+
+    // Fetch the output buffer once. It was previously requested again for
+    // every copied row although it does not depend on either loop index.
+    T* out_data = out->mutable_data<T>(platform::CPUPlace());
+    const size_t out_row_size = output_axis_dim * after;
+    // Element offset, inside one output row, at which the current input's
+    // slice starts.
+    size_t output_offset = 0;
+    for (size_t i = 0; i < n; i++) {
+      auto& in = ins[i];
+      // Per-input values are the same for every row, so compute them once.
+      const size_t in_row_size = in->dims()[axis] * after;
+      const size_t len = in_row_size * sizeof(T);
+      const T* in_data = in->data<T>();
+      for (size_t j = 0; j < before; j++) {
+        memcpy(out_data + output_offset + out_row_size * j,
+               in_data + in_row_size * j, len);
+      }
+      output_offset += in_row_size;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index db701a2a309b794d8a4238f7176e3986236ed087..227b75aff86089d0b21bdae7e6e402292bce67d9 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -49,6 +49,7 @@ USE_OP(minus);
 USE_OP(cos_sim);
 USE_CPU_ONLY_OP(gather);
 USE_CPU_ONLY_OP(scatter);
+USE_CPU_ONLY_OP(concat);
 USE_OP(top_k);
 USE_OP(squared_l2_distance);
 USE_OP(sum);
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 4e91924a50cf6401d4002510e940ddc84edbe61a..9e665adad2d3ad91d183c6815fbd7135ac4e8965 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -43,7 +43,6 @@ class OpDescCreationMethod(object):
         if len(args) != 0:
             raise ValueError("Only keyword arguments are supported.")
         op_desc = framework_pb2.OpDesc()
-
         for input_parameter in self.__op_proto__.inputs:
             input_arguments = kwargs.get(input_parameter.name, [])
             if is_str(input_arguments):
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 2117fdf0d58520a008d2bd01d56d96dd248be025..2f6be105b6bce1200a29133f019523e6aee23895 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -35,4 +35,5 @@ py_test(test_lookup_table SRCS test_lookup_table.py)
 py_test(test_scale_and_identity_op SRCS test_scale_and_identity_op.py)
 py_test(test_sum_op SRCS test_sum_op.py)
 py_test(mnist SRCS mnist.py)
+py_test(test_concat_op SRCS test_concat_op.py)
 py_test(test_squared_l2_distance_op SRCS test_squared_l2_distance_op.py)
diff --git a/python/paddle/v2/framework/tests/gradient_checker.py b/python/paddle/v2/framework/tests/gradient_checker.py
index fdb06b7988935ebbe53f72f4eba89d75ac2502d4..51a98284bdd17ec2caeed53c231654b309514c8b 100644
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
@@ -11,11 +11,10 @@ __all__ = ['get_numeric_gradient']
 def create_op(op_type):
+    # Build an Operator of `op_type` in which every declared input and output
+    # slot is bound to a variable named after the slot itself.
     # TODO need to set attrs
     kwargs = dict()
-    for in_name in Operator.get_op_input_names(op_type):
+    # The name lists are unpacked as pairs; only the first element (the slot
+    # name) is used here and the second one is ignored.
+    for in_name, _ in Operator.get_op_input_names(op_type):
         kwargs[in_name] = in_name
-    for out_name in Operator.get_op_output_names(op_type):
+    for out_name, _ in Operator.get_op_output_names(op_type):
         kwargs[out_name] = out_name
-
     return Operator(op_type, **kwargs)
 
 
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
index 370f27eaf658dadbf7e82262c118140a10d15c41..54fe5da4401655ee50af08c18eaee7ad90c2fd8d 100644
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -27,17 +27,30 @@ class OpTestMeta(type):
                 places.append(core.GPUPlace(0))
 
             for place in places:
-                for in_name in Operator.get_op_input_names(self.type):
-                    if hasattr(self, "inputs") and in_name in self.inputs:
-                        kwargs[in_name] = in_name
-                        var = scope.new_var(in_name).get_tensor()
-                        arr = self.inputs[in_name]
-                        var.set_dims(arr.shape)
-                        var.set(arr, place)
+                for ins in Operator.get_op_input_names(self.type):
+                    in_name = ins[0]
+                    in_dup = ins[1]
+                    if hasattr(self, 'inputs') and in_name in self.inputs:
+                        kwargs[in_name] = []
+                        if in_dup:
+                            arrays = self.inputs[in_name]
+                            for index, arr in enumerate(arrays):
+                                var = scope.new_var(in_name + str(index))
+                                tensor = var.get_tensor()
+                                tensor.set_dims(arr.shape)
+                                tensor.set(arr, place)
+                                kwargs[in_name].append(in_name + str(index))
+                        else:
+                            kwargs[in_name] = in_name
+                            var = scope.new_var(in_name).get_tensor()
+                            arr = self.inputs[in_name]
+                            var.set_dims(arr.shape)
+                            var.set(arr, place)
                     else:
                         kwargs[in_name] = "@EMPTY@"
 
-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_output_names(
+                        self.type):
                     if not hasattr(self, "outputs"):
                         raise ValueError(
                             "The test op must set self.outputs dict.")
@@ -60,7 +73,8 @@ class OpTestMeta(type):
                 ctx = core.DeviceContext.create(place)
                 op.run(scope, ctx)
 
-                for out_name in Operator.get_op_output_names(self.type):
+                for out_name, out_dup in Operator.get_op_output_names(
+                        self.type):
                     actual = numpy.array(scope.find_var(out_name).get_tensor())
                     expect = self.outputs[out_name]
                     self.assertTrue(
diff --git a/python/paddle/v2/framework/tests/test_concat_op.py b/python/paddle/v2/framework/tests/test_concat_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bd4c309748369ed6823168f89a7490803c25dfd
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_concat_op.py
@@ -0,0 +1,22 @@
+import unittest
+import numpy as np
+from gradient_checker import GradientChecker, create_op
+from op_test_util import OpTestMeta
+
+
+class TestConcatOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "concat"
+        x0 = np.random.random((2, 3, 2, 5)).astype('float32')
+        x1 = np.random.random((2, 3, 3, 5)).astype('float32')
+        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        axis = 2
+        self.inputs = {'X': [x0, x1, x2]}
+        self.attrs = {'axis': axis}
+        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+
+
+if __name__ == '__main__':
+    unittest.main()