diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index a65b1987cb547d7d02c454df1c6758e74037a1b6..6ac70eacaf9b5b1c9205f84ab3c7047e4b1bffc9 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -33,6 +33,7 @@ class ConcatKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     out->mutable_data<T>(place);
 
+    // TODO(zcd): Sometimes direct copies will be faster
     std::vector<framework::Tensor> inputs(ins.size());
     for (size_t j = 0; j < ins.size(); ++j) {
       inputs[j] = *ins[j];
@@ -51,6 +52,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
 
+    // TODO(zcd): Sometimes direct copies will be faster
     std::vector<framework::Tensor> outputs(outs.size());
     for (size_t j = 0; j < outs.size(); ++j) {
       outs[j]->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index 5c5c6489d601d63aa975c8811e3320c4c03922c2..b542143419e05e9baf29e9a2322447f32ddd9829 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -19,7 +19,8 @@ namespace operators {
 namespace math {
 
 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -27,12 +28,9 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = input.size();
-    std::vector<framework::DDim> origin_dim(num);
 
-    // get the matrix size
     int rows = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -40,7 +38,6 @@
     }
     int out_rows = rows, out_cols = 0;
 
-    // get input's cols
     std::vector<int64_t> input_cols(input.size());
     for (int i = 0; i < num; ++i) {
       int t_cols = input[i].numel() / rows;
@@ -64,18 +61,19 @@
   }
 };
 
+/*
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const int axis,
                   std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = outputs.size();
-    std::vector<framework::DDim> origin_dim(num);
 
-    // get the matrix size
     int input_rows = 1;
     auto dim_0 = outputs[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -83,7 +81,6 @@
     }
     int input_cols = 0;
 
-    // get outputs' cols
     std::vector<int64_t> output_cols(outputs.size());
     for (int i = 0; i < num; ++i) {
       int t_cols = outputs[i].numel() / input_rows;
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 8af7233426c89df3e2147b56592b856de803a8a6..5f64856a1a5b816afaeaed0e0d63b74ed5d6aa85 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 
@@ -19,16 +20,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-// TODO(zcd): This can be replaced by tensor,
-// if that, maybe we should add int8 to VarType::Type.
-// Or replaced by tensorArray.
-static constexpr int MaxSize = 8;
-template <typename T>
-struct CUDADeviceArray {
-  T data[MaxSize];
-  int size;
-};
-
 template <typename T>
 __device__ T upper_bound(const T* first, T count, T val) {
   const T* orig = first;
@@ -49,25 +40,24 @@
 }
 
 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const CUDADeviceArray<int> input_cols,
+__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound(input_cols.data, input_cols.size, tid_x) - 1;
+  int segment = upper_bound(input_cols, col_size, tid_x) - 1;
 
-  int curr_offset = input_cols.data[segment];
+  int curr_offset = input_cols[segment];
   int curr_segment = segment;
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
     T curr_col_offset;
-    while ((curr_col_offset = input_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
     }
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
-    const T* input_ptr = inputs.data[curr_segment];
+    T* input_ptr = inputs[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
       output[tid_y * output_cols + tid_x] =
@@ -76,41 +66,41 @@ __global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
 }
 
 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const int input_col, const int output_rows,
-                             const int output_cols, T* output) {
+__global__ void KernelConcat(T** inputs, const int input_col,
+                             const int output_rows, const int output_cols,
+                             T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   float inv_input_col = 1.0 / input_col;
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x * inv_input_col;
     int in_offset = tid_x - split * input_col;
-    const T* input_ptr = inputs.data[split];
+    T* input_ptr = inputs[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
       output[tid_y * output_cols + tid_x] =
           input_ptr[tid_y * input_col + in_offset];
+    }
   }
 }
 
 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col,
-                                 CUDADeviceArray<int> output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 const int input_col, const int* output_cols,
+                                 int col_size, T** outputs) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound(output_cols.data, output_cols.size, tid_x) - 1;
-  int curr_offset = output_cols.data[segment];
+  int segment = upper_bound(output_cols, col_size, tid_x) - 1;
+  int curr_offset = output_cols[segment];
   int curr_segment = segment;
   for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
     T curr_col_offset;
-    while ((curr_col_offset = output_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }
    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs.data[curr_segment];
+    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
@@ -121,13 +111,13 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
                                  const int input_col, const int output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 T** outputs) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   float inv_input_col = 1.0 / input_col;
   for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x * inv_input_col;
     int in_offset = tid_x - split * input_col;
-    T* output_ptr = outputs.data[split];
+    T* output_ptr = outputs[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
       output_ptr[tid_y * output_cols + in_offset] =
@@ -136,7 +126,8 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 }
 
 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -144,12 +135,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = input.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-    // get the matrix size
     int rows = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -157,25 +144,27 @@
     }
     int cols = input[0].numel() / rows;
     int out_rows = rows, out_cols = 0;
-    bool sameShape = true;
 
-    CUDADeviceArray<const T*> inputs_data;
-    CUDADeviceArray<int> inputs_cols;
-    inputs_data.size = num;
-    inputs_cols.size = num + 1;
-    inputs_cols.data[0] = 0;
-    // reshape to matrix
-    // check input shape is valid
+    paddle::framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> inputs_cols(num + 1);
+    inputs_cols[0] = 0;
+    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
+
+    bool sameShape = true;
     for (int i = 0; i < num; ++i) {
       int t_cols = input[i].numel() / rows;
       if (sameShape) {
         if (t_cols != cols) sameShape = false;
       }
       out_cols += t_cols;
-      inputs_cols.data[i + 1] = out_cols;
-      inputs_data.data[i] = input[i].data<T>();
+      inputs_cols[i + 1] = out_cols;
+      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
     }
 
+    T** ins_gpu =
+        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
+    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
+
     // computation
     // set the thread block and grid according to CurrentDeviceId
     const int kThreadsPerBlock = 1024;
@@ -198,27 +187,27 @@
 
     if (sameShape) {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, cols, out_rows, out_cols, output->data<T>());
     } else {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, inputs_cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
+          out_cols, output->data<T>());
     }
   }
 };
 
+/*
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const int axis,
                   std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = outputs.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-
-    // get the matrix size
     int input_row = 1;
     auto dim_0 = outputs[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -229,11 +218,10 @@
     int input_col = 0;
     bool sameShape = true;
 
-    CUDADeviceArray<T*> outputs_data;
-    CUDADeviceArray<int> outputs_cols;
-    outputs_data.size = num;
-    outputs_cols.size = num + 1;
-    outputs_cols.data[0] = 0;
+    paddle::framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> outputs_cols(num + 1);
+    outputs_cols[0] = 0;
+    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
 
     for (int i = 0; i < num; ++i) {
       int t_col = outputs[i].numel() / input_row;
@@ -241,12 +229,16 @@
         if (t_col != output_col_0) sameShape = false;
       }
       input_col += t_col;
-      outputs_cols.data[i + 1] = input_col;
-      outputs_data.data[i] = outputs[i].data<T>();
+      outputs_cols[i + 1] = input_col;
+      outputs_ptr[i] = outputs[i].data<T>();
     }
 
+    T** outs_gpu =
+        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
+    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
+
     // computation
-    const int kThreadsPerBlock = 256;
+    const int kThreadsPerBlock = 1024;
     int block_cols = std::min(input_col, kThreadsPerBlock);
     int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -257,10 +249,11 @@
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, output_col_0, outputs_data);
+          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
     } else {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, outputs_cols, outputs_data);
+          input.data<T>(), input_row, input_col, outs_col_gpu,
+          static_cast<int>(outputs_cols.size()), outs_gpu);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index bc878318883d197d17823d2e6862251f1b02e6b3..22147d79e4b1eeee76f7445dd963bf5062049a34 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -20,7 +20,16 @@ namespace operators {
 namespace math {
 
 /*
+ * \brief Concatenates the input tensors along the given axis.
+ * TODO(zcd): maybe it needs to be more detailed.
+ * Examples:
+ *     Input[0] = [[1,2],[3,4]]
+ *     Input[1] = [[5,6]]
+ *     axis = 0
  *
+ *     Output = [[1,2],
+ *               [3,4],
+ *               [5,6]]
  */
 template <typename DeviceContext, typename T>
 class ConcatFunctor {
@@ -30,6 +39,18 @@ class ConcatFunctor {
                   framework::Tensor* output);
 };
 
+/*
+ * \brief Splits the input tensor along the given axis into the output tensors.
+ * TODO(zcd): maybe it needs to be more detailed.
+ * Examples:
+ *     Input = [[1,2],
+ *              [3,4],
+ *              [5,6]]
+ *     axis = 0
+ *
+ *     Output[0] = [[1,2],[3,4]]
+ *     Output[1] = [[5,6]]
+ */
 template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
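Reviewer note (not part of the patch): below is a minimal sketch of how the `ConcatFunctor` documented in `concat.h` is driven on the CPU side, mirroring what `ConcatKernel` in `concat_op.h` does. The function name, tensor names, and shapes (`ConcatSketch`, `t_a`, `t_b`, `out`) are illustrative assumptions only, not part of this change.

```cpp
// Illustrative sketch only: exercises math::ConcatFunctor the same way
// ConcatKernel does. Tensor names and shapes are hypothetical.
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/device_context.h"

namespace f = paddle::framework;
namespace m = paddle::operators::math;

void ConcatSketch() {
  paddle::platform::CPUDeviceContext cpu_ctx;
  auto place = cpu_ctx.GetPlace();

  // Two inputs that agree on every dimension except the concat axis (axis 0).
  f::Tensor t_a, t_b, out;
  t_a.mutable_data<float>(f::make_ddim({2, 2}), place);
  t_b.mutable_data<float>(f::make_ddim({1, 2}), place);
  // ... fill t_a and t_b with data here ...

  // The output's axis-0 size is the sum of the inputs' axis-0 sizes.
  out.mutable_data<float>(f::make_ddim({3, 2}), place);

  std::vector<f::Tensor> inputs = {t_a, t_b};
  m::ConcatFunctor<paddle::platform::CPUDeviceContext, float> concat;
  concat(cpu_ctx, inputs, /*axis=*/0, &out);
}
```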
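A further note on the CUDA changes: the least obvious expression is `num * sizeof(T*) / 2`, which sizes the `framework::Vector` that carries the table of device pointers. Assuming the element type is `int16_t` (which the `/ 2` divisor suggests), each `T*` occupies `sizeof(T*) / sizeof(int16_t)` elements, and the buffer is reinterpreted as `T**` on both the host and the device side. Below is a standalone toy program illustrating just that packing rule, using `std::vector` instead of `framework::Vector` purely for illustration.

```cpp
// Toy illustration of packing an array of pointers into int16_t storage,
// mirroring the cast the patch performs on framework::Vector. Not Paddle code,
// and it relies on the same reinterpret_cast aliasing assumption as the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  float a = 1.0f, b = 2.0f;
  const int num = 2;
  // Same sizing rule as `num * sizeof(T*) / 2` with sizeof(int16_t) == 2:
  // each pointer spans sizeof(float*) / sizeof(int16_t) elements.
  std::vector<int16_t> storage(num * sizeof(float*) / sizeof(int16_t));
  float** ptrs = reinterpret_cast<float**>(storage.data());
  ptrs[0] = &a;
  ptrs[1] = &b;
  std::printf("%f %f\n", *ptrs[0], *ptrs[1]);  // prints 1.000000 2.000000
  return 0;
}
```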