follow comments and refine code

82bd82c1 · chengduoZH · 00e596ed · 82bd82c1 · 82bd82c1 · 82bd82c1
4 changed files
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -33,6 +33,7 @@ class ConcatKernel : public framework::OpKernel<T> {
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

+    // TODO(zcd): Sometimes direct copies will be faster
    std::vector<framework::Tensor> inputs(ins.size());
    for (size_t j = 0; j < ins.size(); ++j) {
      inputs[j] = *ins[j];
@@ -51,6 +52,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));

+    // TODO(zcd): Sometimes direct copies will be faster
    std::vector<framework::Tensor> outputs(outs.size());
    for (size_t j = 0; j < outs.size(); ++j) {
      outs[j]->mutable_data<T>(ctx.GetPlace());

--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -19,7 +19,8 @@ namespace operators {
 namespace math {

 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same number of dimensions, and the sizes of
+ * all dimensions except the axis dimension must be equal.
 */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -27,12 +28,9 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
  void operator()(const platform::CPUDeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
    int num = input.size();
-    std::vector<paddle::framework::DDim> origin_dim(num);

-    // get the matrix size
    int rows = 1;
    auto dim_0 = input[0].dims();
    for (int i = 0; i < axis; ++i) {
@@ -40,7 +38,6 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
    }
    int out_rows = rows, out_cols = 0;

-    // get input's cols
    std::vector<int64_t> input_cols(input.size());
    for (int i = 0; i < num; ++i) {
      int t_cols = input[i].numel() / rows;
@@ -64,18 +61,19 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
  }
 };

+/*
+ * All tensors must have the same number of dimensions, and the sizes of
+ * all dimensions except the axis dimension must be equal.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
 public:
  void operator()(const platform::CPUDeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
    int num = outputs.size();
-    std::vector<paddle::framework::DDim> origin_dim(num);

-    // get the matrix size
    int input_rows = 1;
    auto dim_0 = outputs[0].dims();
    for (int i = 0; i < axis; ++i) {
@@ -83,7 +81,6 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
    }
    int input_cols = 0;

-    // get outputs' cols
    std::vector<int64_t> output_cols(outputs.size());
    for (int i = 0; i < num; ++i) {
      int t_cols = outputs[i].numel() / input_rows;

--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/cuda_helper.h"

@@ -19,16 +20,6 @@ namespace paddle {
 namespace operators {
 namespace math {

-// TODO(zcd): This can be replaced by tensor,
-// if that, maybe we should add int8 to VarType::Type.
-// Or replaced by tensorArray.
-static constexpr int MaxSize = 8;
-template <typename T>
-struct CUDADeviceArray {
-  T data[MaxSize];
-  int size;
-};
-
 template <typename T>
 __device__ T upper_bound(const T* first, T count, T val) {
  const T* orig = first;
@@ -49,25 +40,24 @@ __device__ T upper_bound(const T* first, T count, T val) {
 }

 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const CUDADeviceArray<int> input_cols,
+__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                             const int output_rows, const int output_cols,
                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(input_cols.data, input_cols.size, tid_x) - 1;
+  int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;

-  int curr_offset = input_cols.data[segment];
+  int curr_offset = input_cols[segment];
  int curr_segment = segment;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
-    while ((curr_col_offset = input_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
-    const T* input_ptr = inputs.data[curr_segment];
+    T* input_ptr = inputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
      output[tid_y * output_cols + tid_x] =
@@ -76,41 +66,41 @@ __global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
 }

 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const int input_col, const int output_rows,
-                             const int output_cols, T* output) {
+__global__ void KernelConcat(T** inputs, const int input_col,
+                             const int output_rows, const int output_cols,
+                             T* output) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  float inv_input_col = 1.0 / input_col;
  for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
-    const T* input_ptr = inputs.data[split];
+    T* input_ptr = inputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
      output[tid_y * output_cols + tid_x] =
          input_ptr[tid_y * input_col + in_offset];
+    }
  }
 }

 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col,
-                                 CUDADeviceArray<int> output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 const int input_col, const int* output_cols,
+                                 int col_size, T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound<int>(output_cols.data, output_cols.size, tid_x) - 1;
-  int curr_offset = output_cols.data[segment];
+  int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
+  int curr_offset = output_cols[segment];
  int curr_segment = segment;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    T curr_col_offset;
-    while ((curr_col_offset = output_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }

    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs.data[curr_segment];
+    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
@@ -121,13 +111,13 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
                                 const int input_col, const int output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 T** outputs) {
  int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
  float inv_input_col = 1.0 / input_col;
  for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
    int split = tid_x * inv_input_col;
    int in_offset = tid_x - split * input_col;
-    T* output_ptr = outputs.data[split];
+    T* output_ptr = outputs[split];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * output_cols + in_offset] =
@@ -136,7 +126,8 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 }

 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same number of dimensions, and the sizes of
+ * all dimensions except the axis dimension must be equal.
 */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -144,12 +135,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
  void operator()(const platform::CUDADeviceContext& context,
                  const std::vector<framework::Tensor>& input, const int axis,
                  framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
    int num = input.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-    // get the matrix size
    int rows = 1;
    auto dim_0 = input[0].dims();
    for (int i = 0; i < axis; ++i) {
@@ -157,25 +144,27 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
    }
    int cols = input[0].numel() / rows;
    int out_rows = rows, out_cols = 0;
-    bool sameShape = true;

-    CUDADeviceArray<const T*> inputs_data;
-    CUDADeviceArray<int> inputs_cols;
-    inputs_data.size = num;
-    inputs_cols.size = num + 1;
-    inputs_cols.data[0] = 0;
-    // reshape to matrix
-    // check input shape is valid
+    paddle::framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> inputs_cols(num + 1);
+    inputs_cols[0] = 0;
+    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
+
+    bool sameShape = true;
    for (int i = 0; i < num; ++i) {
      int t_cols = input[i].numel() / rows;
      if (sameShape) {
        if (t_cols != cols) sameShape = false;
      }
      out_cols += t_cols;
-      inputs_cols.data[i + 1] = out_cols;
-      inputs_data.data[i] = input[i].data<T>();
+      inputs_cols[i + 1] = out_cols;
+      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
    }

+    T** ins_gpu =
+        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
+    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
+
    // computation
    // set the thread block and grid according to CurrentDeviceId
    const int kThreadsPerBlock = 1024;
@@ -198,27 +187,27 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {

    if (sameShape) {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, cols, out_rows, out_cols, output->data<T>());
    } else {
      KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, inputs_cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
+          out_cols, output->data<T>());
    }
  }
 };

+/*
+ * All tensors must have the same number of dimensions, and the sizes of
+ * all dimensions except the axis dimension must be equal.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
 public:
  void operator()(const platform::CUDADeviceContext& context,
                  const framework::Tensor& input, const int axis,
                  std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
    int num = outputs.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-
-    // get the matrix size
    int input_row = 1;
    auto dim_0 = outputs[0].dims();
    for (int i = 0; i < axis; ++i) {
@@ -229,11 +218,10 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
    int input_col = 0;
    bool sameShape = true;

-    CUDADeviceArray<T*> outputs_data;
-    CUDADeviceArray<int> outputs_cols;
-    outputs_data.size = num;
-    outputs_cols.size = num + 1;
-    outputs_cols.data[0] = 0;
+    paddle::framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> outputs_cols(num + 1);
+    outputs_cols[0] = 0;
+    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());

    for (int i = 0; i < num; ++i) {
      int t_col = outputs[i].numel() / input_row;
@@ -241,12 +229,16 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
        if (t_col != output_col_0) sameShape = false;
      }
      input_col += t_col;
-      outputs_cols.data[i + 1] = input_col;
-      outputs_data.data[i] = outputs[i].data<T>();
+      outputs_cols[i + 1] = input_col;
+      outputs_ptr[i] = outputs[i].data<T>();
    }

+    T** outs_gpu =
+        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
+    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
+
    // computation
-    const int kThreadsPerBlock = 256;
+    const int kThreadsPerBlock = 1024;
    int block_cols = std::min(input_col, kThreadsPerBlock);
    int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
    dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -257,10 +249,11 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {

    if (sameShape) {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, output_col_0, outputs_data);
+          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
    } else {
      KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, outputs_cols, outputs_data);
+          input.data<T>(), input_row, input_col, outs_col_gpu,
+          static_cast<int>(outputs_cols.size()), outs_gpu);
    }
  }
 };

--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -20,7 +20,16 @@ namespace operators {
 namespace math {

 /*
+ * \brief Concatenate the input tensors along the dimension axis.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input[0] = [[1,2],[3,4]]
+ *     Input[1] = [[5,6]]
+ *     axis = 0
 *
+ *     Output = [[1,2],
+ *               [3,4],
+ *               [5,6]]
 */
 template <typename DeviceContext, typename T>
 class ConcatFunctor {
@@ -30,6 +39,18 @@ class ConcatFunctor {
                  framework::Tensor* output);
 };

+/*
+ * \brief Split the input tensor along the dimension axis into the outputs.
+ *  TODO(zcd): maybe it needs to be more detailed.
+ *  Examples:
+ *     Input = [[1,2],
+ *              [3,4],
+ *              [5,6]]
+ *     axis = 0
+ *
+ *     Output[0] = [[1,2],[3,4]]
+ *     Output[1] = [[5,6]]
+ */
 template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
 public: