diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 488b291d6df1061f95cccd0f89a492046eb4aa08..dc86cffdb325c0308b63fd201837f32e6d11a40d 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -147,6 +147,7 @@ const std::map<std::string, std::string>
     {"depthwise_conv_3x3", "depthwise_conv_3x3.cl"},
     {"pooling", "pooling.cl"},
     {"relu", "relu.cl"},
+    {"concat", "concat.cl"},
     {"resize_bilinear", "resize_bilinear.cl"},
     {"space_to_batch", "space_to_batch.cl"},
     {"buffer_to_image", "buffer_to_image.cl"},
diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h
index 807fda0aa8c1ae76538529c61afdd7c7c01d3827..e70b4e73c977b9d8da0735784219739c5dbd468a 100644
--- a/mace/kernels/concat.h
+++ b/mace/kernels/concat.h
@@ -8,25 +8,64 @@
 #include "mace/core/common.h"
 #include "mace/core/types.h"
 #include "mace/proto/mace.pb.h"
+#include "mace/core/tensor.h"
+
 namespace mace {
 namespace kernels {
 
-template <DeviceType D, typename T>
-struct ConcatFunctor {
-  void operator()(std::vector<const T *> &input_list,
-                  const index_t inner_dim,
-                  const index_t *outer_dims,
-                  T *output) {
-    const size_t input_count = input_list.size();
-    for (int inner_idx = 0; inner_idx < inner_dim; ++inner_idx) {
-      for (size_t i = 0; i < input_count; ++i) {
+struct ConcatFunctorBase {
+  ConcatFunctorBase(const int32_t axis): axis_(axis){}
+
+  int32_t axis_;
+};
+
+template <DeviceType D, typename T>
+struct ConcatFunctor : ConcatFunctorBase {
+  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+
+  void operator()(const std::vector<const Tensor *> &input_list,
+                  Tensor *output) {
+    const Tensor *input0 = input_list.front();
+    const int inputs_count = input_list.size();
+
+    std::vector<index_t> output_shape(input0->shape());
+    index_t inner_size = 1;
+    for (int i = 0; i < axis_; ++i) {
+      inner_size *= output_shape[i];
+    }
+    std::vector<index_t> outer_sizes(inputs_count, 0);
+    outer_sizes[0] = input0->size() / inner_size;
+    for (int i = 1; i < inputs_count; ++i) {
+      const Tensor *input = input_list[i];
+      MACE_CHECK(input->dim_size() == input0->dim_size(),
+                 "Ranks of all input tensors must be same.");
+      for (int j = 0; j < input->dim_size(); ++j) {
+        if (j == axis_) {
+          continue;
+        }
+        MACE_CHECK(input->dim(j) == input0->dim(j),
+                   "Dimensions of inputs should equal except axis.");
+      }
+      outer_sizes[i] = input->size() / inner_size;
+      output_shape[axis_] += input->dim(axis_);
+    }
+    output->Resize(output_shape);
+
+    T *output_ptr = output->mutable_data<T>();
+
+    std::vector<const T *> input_ptrs(input_list.size(), nullptr);
+    for (size_t i = 0; i < inputs_count; ++i) {
+      input_ptrs[i] = input_list[i]->data<T>();
+    }
+    for (int inner_idx = 0; inner_idx < inner_size; ++inner_idx) {
+      for (size_t i = 0; i < inputs_count; ++i) {
         if (DataTypeCanUseMemcpy(DataTypeToEnum<T>::v())) {
-          memcpy(output, input_list[i], outer_dims[i] * sizeof(T));
-          output += outer_dims[i];
-          input_list[i] += outer_dims[i];
+          memcpy(output_ptr, input_ptrs[i], outer_sizes[i] * sizeof(T));
+          output_ptr += outer_sizes[i];
+          input_ptrs[i] += outer_sizes[i];
         } else {
-          for (index_t k = 0; k < outer_dims[i]; ++k) {
-            *output++ = *input_list[i]++;
+          for (index_t k = 0; k < outer_sizes[i]; ++k) {
+            *output_ptr++ = *input_ptrs[i]++;
           }
         }
       }
@@ -34,6 +73,15 @@ struct ConcatFunctor {
   }
 };
 
+template <typename T>
+struct ConcatFunctor<DeviceType::OPENCL, T> : ConcatFunctorBase{
+  ConcatFunctor(const int32_t axis): ConcatFunctorBase(axis){}
+
+  void operator()(const std::vector<const Tensor *> &input_list,
+                  Tensor *output);
+
+};
+
 }  // namepsace kernels
 }  // namespace mace
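The CPU path above folds the old (input_list, inner_dim, outer_dims, output) interface into the functor itself: inner_size is the product of the dimensions before the concat axis, outer_sizes[i] is the number of elements each input contributes per inner slice, and the copy loop interleaves one chunk per input for every inner index. A minimal standalone sketch of the same decomposition (illustrative only, not part of the patch; the helper name and shapes are made up):

    // Concatenate two row-major {rows, a_cols} and {rows, b_cols} buffers along
    // axis 1. Here inner_size == rows and outer_sizes == {a_cols, b_cols}.
    #include <cstring>
    #include <vector>

    std::vector<float> ConcatAxis1(const std::vector<float> &a, int a_cols,
                                   const std::vector<float> &b, int b_cols,
                                   int rows) {
      std::vector<float> out(rows * (a_cols + b_cols));
      float *dst = out.data();
      for (int r = 0; r < rows; ++r) {  // one iteration per inner index
        std::memcpy(dst, a.data() + r * a_cols, a_cols * sizeof(float));
        dst += a_cols;                  // outer_sizes[0]
        std::memcpy(dst, b.data() + r * b_cols, b_cols * sizeof(float));
        dst += b_cols;                  // outer_sizes[1]
      }
      return out;
    }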
diff --git a/mace/kernels/opencl/cl/concat.cl b/mace/kernels/opencl/cl/concat.cl
new file mode 100644
index 0000000000000000000000000000000000000000..5ae4fb04e141f6510529083b2fee9e8826494b6d
--- /dev/null
+++ b/mace/kernels/opencl/cl/concat.cl
@@ -0,0 +1,98 @@
+#include <common.h>
+
+DATA_TYPE4 stitch_vector(DATA_TYPE4 left,
+                         DATA_TYPE4 right,
+                         const int pos,
+                         const bool reversed) {
+  if (!reversed) {
+    switch (pos) {
+      case 1:return (DATA_TYPE4)(left.x, right.x, right.y, right.z);
+      case 2:return (DATA_TYPE4)(left.x, left.y, right.x, right.y);
+      case 3:return (DATA_TYPE4)(left.x, left.y, left.z, right.x);
+      default:return (DATA_TYPE4) 0;
+    }
+  } else {
+    switch (pos) {
+      case 1:return (DATA_TYPE4)(left.w, right.x, right.y, right.z);
+      case 2:return (DATA_TYPE4)(left.z, left.w, right.x, right.y);
+      case 3:return (DATA_TYPE4)(left.y, left.z, left.w, right.x);
+      default:return (DATA_TYPE4) 0;
+    }
+  }
+}
+
+// Supported data type: half/float
+__kernel void concat_channel(__read_only image2d_t input0,
+                             __read_only image2d_t input1,
+                             __private const int input0_chan,
+                             __write_only image2d_t output) {
+  const int chan_blk_idx = get_global_id(0);
+  const int width_idx = get_global_id(1);
+  const int width = get_global_size(1);
+  const int hb_idx = get_global_id(2);
+  const int input0_chan_blk = (input0_chan + 3) / 4;
+
+  DATA_TYPE4 data = 0;
+#ifdef DIVISIBLE_FOUR
+  if (chan_blk_idx + 1 <= input0_chan_blk) {
+    data = READ_IMAGET(input0,
+                       SAMPLER,
+                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+  } else {
+    data = READ_IMAGET(input1,
+                       SAMPLER,
+                       (int2)((chan_blk_idx - input0_chan_blk) * width + width_idx, hb_idx));
+  }
+#else
+  if (chan_blk_idx + 1 < input0_chan_blk) {
+    data = READ_IMAGET(input0,
+                       SAMPLER,
+                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+  } else if (chan_blk_idx >= input0_chan_blk) {
+    const int in_chan_idx = chan_blk_idx - input0_chan_blk;
+    DATA_TYPE4 data0 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)(in_chan_idx * width + width_idx, hb_idx));
+    DATA_TYPE4 data1 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)((in_chan_idx + 1) * width + width_idx, hb_idx));
+    data = stitch_vector(data0, data1, input0_chan % 4, true);
+  } else {
+    DATA_TYPE4 data0 = READ_IMAGET(input0,
+                                   SAMPLER,
+                                   (int2)(chan_blk_idx * width + width_idx, hb_idx));
+    DATA_TYPE4 data1 = READ_IMAGET(input1,
+                                   SAMPLER,
+                                   (int2)(width_idx, hb_idx));
+    data = stitch_vector(data0, data1, input0_chan % 4, false);
+  }
+#endif
+
+  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+}
+
+//__kernel void concat_width(__read_only image2d_t input0,
+//                           __read_only image2d_t input1,
+//                           __private const int input0_width,
+//                           __write_only image2d_t output) {
+//  const int chan_blk_idx = get_global_id(0);
+//  const int width_idx = get_global_id(1);
+//  const int width = get_global_size(1);
+//  const int hb_idx = get_global_id(2);
+//
+//  const sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+//
+//  DATA_TYPE4 data = 0;
+//  if (width_idx < input0_width) {
+//    data = READ_IMAGET(input0,
+//                       SAMPLER,
+//                       (int2)(chan_blk_idx * width + width_idx, hb_idx));
+//  } else {
+//    data = READ_IMAGET(input1,
+//                       SAMPLER,
+//                       (int2)(chan_blk_idx * width + (width_idx - input0_width), hb_idx));
+//  }
+//
+//  WRITE_IMAGET(output, (int2)(chan_blk_idx * width + width_idx, hb_idx), data);
+//}
+
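When input0's channel count is not a multiple of four, every output channel block past that boundary straddles two source blocks, and stitch_vector splices the two 4-wide vectors at offset input0_chan % 4 (reversed == true once the block lies entirely inside input1). A host-side analogue of that splice, illustrative only and not part of the patch:

    // With input0_chan = 6 (so pos == 2), the output block starting at channel 8
    // must carry input1 channels {2,3,4,5}; a reversed stitch of input1 blocks
    // {0,1,2,3} and {4,5,6,7} yields exactly that.
    #include <cassert>

    struct Vec4 { float x, y, z, w; };

    Vec4 Stitch(const Vec4 &left, const Vec4 &right, int pos, bool reversed) {
      if (!reversed) {
        switch (pos) {
          case 1: return {left.x, right.x, right.y, right.z};
          case 2: return {left.x, left.y, right.x, right.y};
          case 3: return {left.x, left.y, left.z, right.x};
          default: return {0, 0, 0, 0};
        }
      } else {
        switch (pos) {
          case 1: return {left.w, right.x, right.y, right.z};
          case 2: return {left.z, left.w, right.x, right.y};
          case 3: return {left.y, left.z, left.w, right.x};
          default: return {0, 0, 0, 0};
        }
      }
    }

    int main() {
      Vec4 blk0{0, 1, 2, 3}, blk1{4, 5, 6, 7};     // two adjacent input1 blocks
      Vec4 out = Stitch(blk0, blk1, 6 % 4, true);  // reversed stitch at pos 2
      assert(out.x == 2 && out.y == 3 && out.z == 4 && out.w == 5);
      return 0;
    }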
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
new file mode 100644
index 0000000000000000000000000000000000000000..80a23a44ccdbb50cd2be9976f2dfecf8d0576a91
--- /dev/null
+++ b/mace/kernels/opencl/concat.cc
@@ -0,0 +1,102 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#include "mace/kernels/concat.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/kernels/opencl/helper.h"
+#include "mace/utils/utils.h"
+
+namespace mace {
+namespace kernels {
+
+static void Concat2(const Tensor *input0,
+                    const Tensor *input1,
+                    const DataType dt,
+                    Tensor *output) {
+  const index_t batch = output->dim(0);
+  const index_t height = output->dim(1);
+  const index_t width = output->dim(2);
+  const index_t channel = output->dim(3);
+
+  const int channel_blk = RoundUpDiv4(channel);
+
+  auto runtime = OpenCLRuntime::Get();
+  std::set<std::string> built_options;
+  if (input0->dtype() == output->dtype()) {
+    built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt));
+  } else {
+    built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
+    built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
+  }
+  if (input0->dim(3) % 4 == 0) {
+    built_options.emplace("-DDIVISIBLE_FOUR");
+  }
+  auto concat_kernel = runtime->BuildKernel("concat", "concat_channel", built_options);
+
+  uint32_t idx = 0;
+  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input0->buffer())));
+  concat_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input1->buffer())));
+  concat_kernel.setArg(idx++, static_cast<int32_t>(input0->dim(3)));
+  concat_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
+
+  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
+
+  uint32_t lws[3];
+  lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
+  lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
+  lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
+
+  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+      concat_kernel, cl::NullRange,
+      cl::NDRange(static_cast<uint32_t>(channel_blk),
+                  static_cast<uint32_t>(width),
+                  static_cast<uint32_t>(height * batch)),
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
+  MACE_CHECK(error == CL_SUCCESS);
+}
+
+template <typename T>
+void ConcatFunctor<DeviceType::OPENCL, T>::operator()(const std::vector<const Tensor *> &input_list,
+                                                      Tensor *output) {
+  const int inputs_count = input_list.size();
+  MACE_CHECK(inputs_count == 2 && axis_ == 3)
+    << "Concat opencl kernel only support two elements with axis == 3";
+
+  const Tensor *input0 = input_list[0];
+
+  std::vector<index_t> output_shape(input0->shape());
+  for (int i = 1; i < inputs_count; ++i) {
+    const Tensor *input = input_list[i];
+    MACE_CHECK(input->dim_size() == input0->dim_size(),
+               "Ranks of all input tensors must be same.");
+    for (int j = 0; j < input->dim_size(); ++j) {
+      if (j == axis_) {
+        continue;
+      }
+      MACE_CHECK(input->dim(j) == input0->dim(j),
+                 "Dimensions of inputs should equal except axis.");
+    }
+    output_shape[axis_] += input->dim(axis_);
+  }
+  std::vector<size_t> image_shape;
+  CalImage2DShape(output_shape, BufferType::IN_OUT, image_shape);
+  output->ResizeImage(output_shape, image_shape);
+
+  switch (inputs_count) {
+    case 2:
+      Concat2(input_list[0], input_list[1], DataTypeToEnum<T>::value, output);
+      break;
+    default:MACE_NOT_IMPLEMENTED;
+  }
+};
+
+template
+struct ConcatFunctor<DeviceType::OPENCL, float>;
+template
+struct ConcatFunctor<DeviceType::OPENCL, half>;
+
+}  // namespace kernels
+}  // namespace mace
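Concat2 launches one work-item per (channel block, width, batch*height) position and clamps the local work-group size so its product never exceeds the kernel's reported maximum; each dimension consumes whatever the previous dimensions left of that budget. A standalone sketch of that clamping (illustrative only, not part of the patch; the function name is made up and every gws[i] and kwg_size are assumed to be at least 1):

    #include <algorithm>
    #include <cstdint>

    // Mirrors the lws computation in Concat2 above.
    void ClampLocalWorkGroup(const uint32_t gws[3], uint32_t kwg_size,
                             uint32_t lws[3]) {
      lws[0] = std::min(gws[0], kwg_size);                      // channel blocks
      lws[1] = std::min(gws[1], kwg_size / lws[0]);             // width
      lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1]));  // batch * height
    }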
diff --git a/mace/ops/concat.cc b/mace/ops/concat.cc
index df040904bff47587143f4580c07516444341a7b6..929da85f2f36c8c8cff02608523475e14f5cbc47 100644
--- a/mace/ops/concat.cc
+++ b/mace/ops/concat.cc
@@ -7,8 +7,20 @@
 namespace mace {
 
 REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
-                          .TypeConstraint<float>("T")
-                          .Build(),
+                        .TypeConstraint<float>("T")
+                        .Build(),
                       ConcatOp<DeviceType::CPU, float>);
+REGISTER_CPU_OPERATOR(OpKeyBuilder("Concat")
+                        .TypeConstraint<half>("T")
+                        .Build(),
+                      ConcatOp<DeviceType::CPU, half>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Concat")
+                           .TypeConstraint<float>("T")
+                           .Build(),
+                         ConcatOp<DeviceType::OPENCL, float>);
+REGISTER_OPENCL_OPERATOR(OpKeyBuilder("Concat")
+                           .TypeConstraint<half>("T")
+                           .Build(),
+                         ConcatOp<DeviceType::OPENCL, half>);
 
 }  // namespace mace
diff --git a/mace/ops/concat.h b/mace/ops/concat.h
index 2b82c0cb1647a0505b6125ef0b837d1136af53a1..77e430304c93341c176dd732c30559f0721e4f8a 100644
--- a/mace/ops/concat.h
+++ b/mace/ops/concat.h
@@ -14,53 +14,23 @@ template <DeviceType D, typename T>
 class ConcatOp : public Operator<D, T> {
  public:
   ConcatOp(const OperatorDef &op_def, Workspace *ws)
-      : Operator<D, T>(op_def, ws) {}
+      : Operator<D, T>(op_def, ws),
+        functor_(OperatorBase::GetSingleArgument<int>("axis", 3)){}
 
   bool Run() override {
-    int32_t values_count = this->InputSize() - 1;
-    const Tensor *input0 = this->Input(0);
-    const Tensor *axis_tensor = this->Input(values_count);
-    MACE_CHECK(axis_tensor->dim_size() == 0,
-               "axis should be a scalar integer, but got shape: ",
-               axis_tensor->dim_size());
-    const int32_t concat_axis = *(axis_tensor->data<int32_t>());
-    const int32_t input_dims = input0->dim_size();
+    MACE_CHECK(this->InputSize() >= 2) << "There must be at least two inputs to concat";
+    const std::vector<const Tensor *> input_list = this->Inputs();
+    const int32_t concat_axis = OperatorBase::GetSingleArgument<int>("axis", 3);
+    const int32_t input_dims = input_list[0]->dim_size();
     const int32_t axis =
         concat_axis < 0 ? concat_axis + input_dims : concat_axis;
     MACE_CHECK((0 <= axis && axis < input_dims),
                "Expected concatenating axis in the range [", -input_dims, ", ",
                input_dims, "], but got", concat_axis);
-    std::vector<index_t> output_shape(input0->shape());
-    index_t inner_size = 1;
-    for (int i = 0; i < axis; ++i) {
-      inner_size *= output_shape[i];
-    }
-    std::vector<index_t> outer_sizes(values_count, 0);
-    std::vector<const T *> input_list(values_count, nullptr);
-    input_list[0] = input0->data<T>();
-    outer_sizes[0] = input0->size() / inner_size;
-    const Tensor *input = nullptr;
-    for (int i = 1; i < values_count; ++i) {
-      input = this->Input(i);
-      MACE_CHECK(input->dim_size() == input0->dim_size(),
-                 "Ranks of all input tensors must be same.");
-      for (int j = 0; j < axis_tensor->dim_size(); ++j) {
-        if (j == axis) {
-          continue;
-        }
-        MACE_CHECK(input->dim(j) == input0->dim(j),
-                   "Dimensions of inputs should equal except axis.");
-      }
-      input_list[i] = input->data<T>();
-      outer_sizes[i] = input->size() / inner_size;
-      output_shape[axis] += input->dim(axis);
-    }
 
     Tensor *output = this->Output(OUTPUT);
-    output->Resize(output_shape);
-    functor_(input_list, inner_size, outer_sizes.data(),
-             output->mutable_data<T>());
+    functor_(input_list, output);
 
     return true;
   }
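With this change the operator reads the concat axis from an integer argument named "axis" (default 3) instead of a trailing scalar "Axis" input tensor, so op definitions are built as in the sketch below. Illustrative only, mirroring the updated tests and benchmarks; the wrapper function name is made up and the OpsTestNet/OpDefBuilder helpers and header path are assumed from the existing ops test utilities:

    #include "mace/ops/ops_test_util.h"  // assumed location of the test helpers

    // Hypothetical helper: builds a Concat op def on an existing test net.
    void BuildConcatDef(mace::OpsTestNet &net) {
      mace::OpDefBuilder("Concat", "ConcatExample")
          .Input("Input0")
          .Input("Input1")
          .AddIntArg("axis", 3)  // was: .Input("Axis") plus a scalar int tensor
          .Output("Output")
          .Finalize(net.NewOperatorDef());
    }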
diff --git a/mace/ops/concat_benchmark.cc b/mace/ops/concat_benchmark.cc
index 275886a6d345293b5a8a965ef442ea99932a8fba..0e4425251426e8056a2973352ded79cd2b4f88ef 100644
--- a/mace/ops/concat_benchmark.cc
+++ b/mace/ops/concat_benchmark.cc
@@ -15,7 +15,7 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   OpDefBuilder("Concat", "ConcatBM")
       .Input("Input0")
       .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", concat_dim)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -23,7 +23,6 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   const int kDim0 = 100;
   net.AddRandomInput<DeviceType::CPU, float>("Input0", {kDim0, dim1});
   net.AddRandomInput<DeviceType::CPU, float>("Input1", {kDim0, dim1});
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {concat_dim});
 
   // Warm-up
   for (int i = 0; i < 5; ++i) {
@@ -38,14 +37,65 @@ static void ConcatHelper(int iters, int concat_dim, int dim1) {
   }
 }
 
-static void BM_ConcatDim0Float(int iters, int dim1) {
+static void BM_CONCAT_Dim0Float(int iters, int dim1) {
   ConcatHelper(iters, 0, dim1);
 }
 
-static void BM_ConcatDim1Float(int iters, int dim1) {
+static void BM_CONCAT_Dim1Float(int iters, int dim1) {
   ConcatHelper(iters, 1, dim1);
 }
 
-BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000);
-BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_CONCAT_Dim0Float)->Arg(1000)->Arg(100000);
+BENCHMARK(BM_CONCAT_Dim1Float)->Arg(1000)->Arg(100000);
+
+template <typename T>
+static void OpenclConcatHelper(int iters,
+                               const std::vector<index_t> &shape0,
+                               const std::vector<index_t> &shape1,
+                               int concat_dim) {
+  mace::testing::StopTiming();
+
+  OpsTestNet net;
+
+  // Add input data
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input0", shape0);
+  net.AddRandomInput<DeviceType::OPENCL, float>("Input1", shape1);
+
+  BufferToImage<T>(net, "Input0", "InputImage0", kernels::BufferType::IN_OUT);
+  BufferToImage<T>(net, "Input1", "InputImage1", kernels::BufferType::IN_OUT);
+  OpDefBuilder("Concat", "ConcatBM")
+      .Input("InputImage0")
+      .Input("InputImage1")
+      .AddIntArg("axis", concat_dim)
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Warm-up
+  for (int i = 0; i < 5; ++i) {
+    net.RunOp(DeviceType::OPENCL);
+  }
+
+  const int64_t tot = static_cast<int64_t>(iters) *
+      (net.GetTensor("Input0")->size() + net.GetTensor("Input1")->size());
+  mace::testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(T));
+  mace::testing::StartTiming();
+  while (iters--) {
+    net.RunOp(DeviceType::OPENCL);
+  }
+}
+
+static void BM_CONCATOPENCLFloat(int iters, int dim1) {
+  std::vector<index_t> shape = {3, 32, 32, dim1};
+  OpenclConcatHelper<float>(iters, shape, shape, 3);
+}
+
+static void BM_CONCATOPENCLHalf(int iters, int dim1) {
+  std::vector<index_t> shape = {3, 32, 32, dim1};
+  OpenclConcatHelper<half>(iters, shape, shape, 3);
+}
+
+BENCHMARK(BM_CONCATOPENCLFloat)->Arg(32)->Arg(64)->Arg(128)->Arg(256);
+BENCHMARK(BM_CONCATOPENCLHalf)->Arg(32)->Arg(64)->Arg(128)->Arg(256);
 
 }  // namespace mace
\ No newline at end of file
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index 1d94d4679a5f9292b744444f1a24e3719f35f3c1..8a42899e49183dd35fcfaf804679b4807219688b 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -10,13 +10,13 @@ using namespace mace;
 
 class ConcatOpTest : public OpsTestBase {};
 
-TEST_F(ConcatOpTest, Simple_Horizon) {
+TEST_F(ConcatOpTest, CPUSimpleHorizon) {
   // Construct graph
   auto &net = test_net();
   OpDefBuilder("Concat", "ConcatTest")
       .Input("Input0")
      .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", 0)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -28,7 +28,6 @@ TEST_F(ConcatOpTest, Simple_Horizon) {
   // Add inputs
   net.AddInputFromArray<DeviceType::CPU, float>("Input0", input_shape, input0);
   net.AddInputFromArray<DeviceType::CPU, float>("Input1", input_shape, input1);
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {0});
 
   // Run
   net.RunOp();
@@ -48,13 +47,13 @@ TEST_F(ConcatOpTest, Simple_Horizon) {
   }
 }
 
-TEST_F(ConcatOpTest, Simple_Vertical) {
+TEST_F(ConcatOpTest, CPUSimpleVertical) {
   // Construct graph
   auto &net = test_net();
   OpDefBuilder("Concat", "ConcatTest")
       .Input("Input0")
       .Input("Input1")
-      .Input("Axis")
+      .AddIntArg("axis", 1)
       .Output("Output")
       .Finalize(net.NewOperatorDef());
 
@@ -66,7 +65,6 @@ TEST_F(ConcatOpTest, Simple_Vertical) {
   // Add inputs
   net.AddInputFromArray<DeviceType::CPU, float>("Input0", input_shape, input0);
   net.AddInputFromArray<DeviceType::CPU, float>("Input1", input_shape, input1);
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {1});
 
   // Run
   net.RunOp();
@@ -88,7 +86,7 @@ TEST_F(ConcatOpTest, Simple_Vertical) {
   }
 }
 
-TEST_F(ConcatOpTest, Random) {
+TEST_F(ConcatOpTest, CPURandom) {
   srand(time(nullptr));
   int dim = 5;
   int num_inputs = 2 + rand() % 10;
@@ -99,7 +97,7 @@ TEST_F(ConcatOpTest, Random) {
   for (int i = 0; i < num_inputs; ++i) {
     builder = builder.Input(("Input" + ToString(i)).c_str());
   }
-  builder.Input("Axis").Output("Output").Finalize(net.NewOperatorDef());
+  builder.AddIntArg("axis", axis).Output("Output").Finalize(net.NewOperatorDef());
 
   std::vector<index_t> shape_data;
   GenerateRandomIntTypeData<index_t>({dim}, shape_data, 1, dim);
@@ -115,7 +113,6 @@ TEST_F(ConcatOpTest, Random) {
     net.AddInputFromArray<DeviceType::CPU, float>(("Input" + ToString(i)).c_str(),
                                                   input_shapes[i], inputs[i]);
   }
-  net.AddInputFromArray<DeviceType::CPU, int32_t>("Axis", {}, {axis});
 
   // Run
   net.RunOp();
@@ -139,3 +136,87 @@ TEST_F(ConcatOpTest, Random) {
     }
   }
 }
+
+template <typename T>
+void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
+                      const int axis) {
+  srand(time(nullptr));
+  int num_inputs = 2;
+  int concat_axis_size = 0;
+  // Construct graph
+  OpsTestNet net;
+  for (int i = 0; i < num_inputs; ++i) {
+    const std::string input_name = ("Input" + ToString(i)).c_str();
+    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    concat_axis_size += shapes[i][axis];
+    net.AddRandomInput<DeviceType::OPENCL, float>(input_name,
+                                                  shapes[i]);
+    BufferToImage<T>(net, input_name, image_name, kernels::BufferType::IN_OUT);
+  }
+
+  auto builder = OpDefBuilder("Concat", "ConcatTest");
+  for (int i = 0; i < num_inputs; ++i) {
+    const std::string image_name = ("InputImage" + ToString(i)).c_str();
+    builder = builder.Input(image_name);
+  }
+  builder.AddIntArg("axis", axis)
+      .Output("OutputImage")
+      .AddIntArg("T", static_cast<int>(DataTypeToEnum<T>::value))
+      .Finalize(net.NewOperatorDef());
+
+  // Run
+  net.RunOp(DeviceType::OPENCL);
+
+  ImageToBuffer<float>(net, "OutputImage", "Output", kernels::BufferType::IN_OUT);
+
+  // Check
+  auto output = net.GetOutput("Output");
+
+  std::vector<index_t> expected_shape = shapes[0];
+  expected_shape[axis] = concat_axis_size;
+  EXPECT_THAT(output->shape(), ::testing::ContainerEq(expected_shape));
+
+  Tensor::MappingGuard output_mapper(output);
+  const float *output_ptr = output->data<float>();
+  int k = 0;
+  while (output_ptr != (output->data<float>() + output->size())) {
+    for (int i = 0; i < num_inputs; ++i) {
+      index_t num_elements =
+          std::accumulate(shapes[i].begin() + axis, shapes[i].end(),
+                          1, std::multiplies<index_t>());
+
+      const std::string input_name = ("Input" + ToString(i)).c_str();
+      const Tensor *input_tensor = net.GetTensor(input_name.data());
+      Tensor::MappingGuard input_guard(input_tensor);
+      const float *input_ptr = input_tensor->data<float>() + k * num_elements;
+      for (int j = 0; j < num_elements; ++j) {
+        EXPECT_NEAR(*(input_ptr + j), *output_ptr++, 1e-2) << "With index: " << i << ", " << j;
+      }
+    }
+    k++;
+  }
+}
+
+TEST_F(ConcatOpTest, OPENCLAligned) {
+  OpenclRandomTest<float>({
+                              {3, 32, 32, 32},
+                              {3, 32, 32, 64}
+                          },
+                          3);
+}
+
+TEST_F(ConcatOpTest, OPENCLHalfAligned) {
+  OpenclRandomTest<half>({
+                             {3, 32, 32, 32},
+                             {3, 32, 32, 64}
+                         },
+                         3);
+}
+
+TEST_F(ConcatOpTest, OPENCLUnAligned) {
+  OpenclRandomTest<float>({
+                              {3, 32, 32, 13},
+                              {3, 32, 32, 17}
+                          },
+                          3);
+}