diff --git a/README.md b/README.md
index 2572f25444dc4268e7a6a3f43cfdc1b38dae8e02..c3a30009825a7f8f9f5c4940a847fc88fe6a840e 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,8 @@
 
 开发文档主要是关于编译、运行等问题。做为开发者，它可以和贡献文档共同结合使用。
 * [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
-* [Android](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
+* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
 * [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
 * [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
 
diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..b07ee4d819b25ef77729ed868c54b19a3d8699ae
--- /dev/null
+++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
@@ -0,0 +1,154 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+/*
+
+__kernel void concatByC(__read_only image2d_t input_image1,
+                      __read_only image2d_t input_image2,
+                      __write_only image2d_t output_image,
+                      __private const int out_C,
+                      __private const int out_H,
+                      __private const int out_W,
+                      __private const int out_C_Start,
+                      __private const int in_W,
+                      __private const int in_H,
+                      __private const int int_C1,
+                      __private const int int_C2) {
+
+                       const int in_c = get_global_id(0);
+                       const int in_w = get_global_id(1);
+                       const int in_nh = get_global_id(2);
+                       int out_c1 = (out_C_Start)/4 + in_c;
+
+                       int out_c2 = out_c1 + 1;
+
+                       int2 output_pos1;
+                       int2 output_pos2;
+
+                       output_pos1.x = out_c1 * out_W + in_w;
+                       output_pos1.y = in_nh;
+
+                       output_pos2.x = out_c2 * out_W + in_w;
+                       output_pos2.y = in_nh;
+
+                       int2 input_pos1;
+                       if(in_c==0){
+                        input_pos1.x = ((in_C1-1)/4) * in_W + in_w;
+                       }else{
+                        input_pos1.x = (in_c - 1) * in_W + in_w;
+                       }
+
+                       input_pos1.y = in_nh;
+
+                       int2 input_pos2;
+                       input_pos2.x = in_c * in_W + in_w;
+                       input_pos2.y = in_nh;
+
+                       half4 output1;
+                       half4 output2;
+                       half4 input1;
+                       half4 input2;
+                       const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                   CLK_ADDRESS_CLAMP |
+                                                   CLK_FILTER_NEAREST;
+                       if(in_c==0){
+                       input1 = read_imageh(input_image1, sampler,input_pos1);
+
+                       }else {
+                       input1 = read_imageh(input_image2, sampler,input_pos1);
+                       }
+                       input2 = read_imageh(input_image2, sampler,input_pos2);
+                       output1 = input1;
+
+                       if(out_C_Start%4==0){
+                        output2 = input2;
+
+                       }else if(out_C_Start%4==1){
+                        output1.y = input2.x;
+                        output1.z = input2.y;
+                        output1.w = input2.z;
+                        output2.x = input2.w;
+                        output2.y = 0.0f;
+                        output2.z = 0.0f;
+                        output2.w = 0.0f;
+
+                       }else if(out_C_Start%4==2){
+                       output1.z = input2.x;
+                       output1.w = input2.y;
+                       output2.x = input2.z;
+                       output2.y = input2.w;
+                       output2.z = 0.0f;
+                       output2.w = 0.0f;
+
+                       }else if(out_C_Start%4==3){
+                       output1.w = input2.x;
+                       output2.x = input2.y;
+                       output2.y = input2.z;
+                       output2.z = input2.w;
+                       output2.w = 0.0f;
+                       }
+                       write_imageh(output_image, output_pos1, output1);
+                       write_imageh(output_image, output_pos2, output2);
+}
+
+__kernel void concatByW0(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_W) {
+
+                      const int in_c = get_global_id(0);
+                      const int in_w = get_global_id(1);
+                      const int in_nh = get_global_id(2);
+
+                      int2 input_pos = in_c * out_W + in_w;
+
+                      const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                CLK_ADDRESS_CLAMP |
+                                                CLK_FILTER_NEAREST;
+                      half4 input;
+                      input = read_imageh(input_image, sampler,input_pos);
+
+                      write_imageh(output_image, input_pos, input);
+
+}
+*/
+
+__kernel void concatByH(__read_only image2d_t input_image,
+                      __write_only image2d_t output_image,
+                      __private const int out_W,
+                      __private const int out_H_Start) {
+
+                      const int in_c = get_global_id(0);
+                      const int in_w = get_global_id(1);
+                      const int in_nh = get_global_id(2);
+
+                      int2 input_pos;
+                      input_pos.x = in_c * out_W + in_w;
+                      input_pos.y = in_nh;
+
+                      const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                                                CLK_ADDRESS_CLAMP |
+                                                CLK_FILTER_NEAREST;
+                      half4 input;
+                      input = read_imageh(input_image, sampler,input_pos);
+
+                      int2 output_pos;
+                      output_pos.x = input_pos.x;
+                      output_pos.y = out_H_Start + input_pos.y;
+
+                      write_imageh(output_image, output_pos, input);
+
+}
+
+
diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp
index 48fbd03315957f54873e5ee18bc95896ca306554..3deb31e7aa0c408cc2b87c523d324001f75ade88 100644
--- a/src/operators/kernel/cl/concat_kernel.cpp
+++ b/src/operators/kernel/cl/concat_kernel.cpp
@@ -21,11 +21,49 @@ namespace operators {
 
 template <>
 bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
+  if (param->Out()->dims().size() < 4) {
+    this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
+  }
   return true;
 }
 
 template <>
-void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {}
+void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
+  if (param.Out()->dims().size() < 4) {
+    auto kernel = this->cl_helper_.KernelAt(0);
+    auto inputs = param.Inputs();
+    auto *output_image = param.Out()->GetCLImage();
+    int out_W = 0;
+    if (param.Out()->dims().size() == 3) {
+      out_W = param.Out()->dims()[2];
+    } else if (param.Out()->dims().size() == 2) {
+      out_W = param.Out()->dims()[1];
+    }
+    int out_H_Start = 0;
+    for (int i = 0; i < inputs.size(); i++) {
+      auto input_image = inputs[i]->GetCLImage();
+      auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
+      cl_int status;
+      status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 2, sizeof(int), &out_W);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel, 3, sizeof(int), &out_H_Start);
+      CL_CHECK_ERRORS(status);
+      status = clEnqueueNDRangeKernel(
+          this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
+          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+      if (param.Out()->dims().size() == 3) {
+        out_H_Start += inputs[i]->dims()[1];
+      } else if (param.Out()->dims().size() == 2) {
+        out_H_Start += inputs[i]->dims()[0];
+      }
+    }
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle_mobile