optimize sequence_mask. test=develop (#4120)

192be07b · Wilber · GitHub · 9e38adc8 · 192be07b · 192be07b
3 changed file
--- a/lite/kernels/cuda/sequence_mask_compute.cu
+++ b/lite/kernels/cuda/sequence_mask_compute.cu
@@ -37,6 +37,40 @@ __global__ void SequenceMaskKernel(T* dst,
  }
 }

+template <typename T>
+__global__ void VecMaxKernel(const T* in_data, T* out, const int count) {
+  extern __shared__ T cache[];
+
+  int i = blockDim.x * blockIdx.x + threadIdx.x;
+  int cache_index = threadIdx.x;
+  T tmp = -1;
+
+  while (i < count) {
+    if (in_data[i] > tmp) {
+      tmp = in_data[i];
+    }
+    i += blockDim.x * gridDim.x;
+  }
+  cache[cache_index] = tmp;
+
+  __syncthreads();
+
+  // perform parallel reduction, blockDim.x must be 2^n
+  int ib = blockDim.x / 2;
+  while (ib != 0) {
+    if (cache_index < ib && cache[cache_index + ib] > cache[cache_index]) {
+      cache[cache_index] = cache[cache_index + ib];
+    }
+
+    __syncthreads();
+
+    ib /= 2;
+  }
+  if (cache_index == 0) {
+    out[blockIdx.x] = cache[0];
+  }
+}
+
 template <typename T, PrecisionType Ptype>
 void SequenceMaskCompute<T, Ptype>::Run() {
  auto& param = this->template Param<param_t>();
@@ -57,11 +91,34 @@ void SequenceMaskCompute<T, Ptype>::Run() {
  }

  if (maxlen < 0) {
-    maxlen = static_cast<int>(
-        thrust::reduce(thrust::device_pointer_cast(x_data),
-                       thrust::device_pointer_cast(x_data) + x->numel(),
-                       static_cast<int64_t>(0),
-                       thrust::maximum<int64_t>()));
+    // choose algorithm according to magic_num.
+    const int magic_num = 256;
+    std::vector<int64_t> h_max_data;
+    if (x->numel() < magic_num) {
+      h_max_data.resize(x->numel());
+      TargetWrapperCuda::MemcpySync(h_max_data.data(),
+                                    x_data,
+                                    x->numel() * sizeof(int64_t),
+                                    IoDirection::DtoH);
+    } else {
+      const int threads = 256;
+      const int blocks = (x->numel() + threads - 1) / threads;
+      max_tensor_.Resize({blocks});
+      auto* max_data = max_tensor_.mutable_data<int64_t>(TARGET(kCUDA));
+      VecMaxKernel<
+          int64_t><<<blocks, threads, threads * sizeof(int64_t), stream>>>(
+          x_data, max_data, x->numel());
+      h_max_data.resize(blocks);
+      TargetWrapperCuda::MemcpyAsync(h_max_data.data(),
+                                     max_data,
+                                     sizeof(int64_t) * blocks,
+                                     IoDirection::DtoH,
+                                     stream);
+      TargetWrapperCuda::StreamSync(stream);
+    }
+    auto maxlen_iterator =
+        std::max_element(h_max_data.begin(), h_max_data.end());
+    maxlen = h_max_data[std::distance(h_max_data.begin(), maxlen_iterator)];
  }

  auto y_dim = x->dims().Vectorize();

--- a/lite/kernels/cuda/sequence_mask_compute.h
+++ b/lite/kernels/cuda/sequence_mask_compute.h
@@ -28,6 +28,9 @@ class SequenceMaskCompute : public KernelLite<TARGET(kCUDA), Ptype> {

  void Run() override;
  virtual ~SequenceMaskCompute() = default;
+
+ private:
+  lite::Tensor max_tensor_;
 };

 }  // namespace cuda

--- a/lite/operators/fc_op.cc
+++ b/lite/operators/fc_op.cc
@@ -50,9 +50,15 @@ bool FcOpLite::CheckShape() const {

 bool FcOpLite::InferShapeImpl() const {
  const auto& input_dims = param_.input->dims();
+  int64_t w_dims_1;
+  if (param_.w_dims.empty()) {
+    const auto& w_dims = param_.w->dims();
+    w_dims_1 = param_.padding_weights ? w_dims[1] - 4 : w_dims[1];
+  } else {
    const auto& w_dims = param_.w_dims;
+    w_dims_1 = param_.padding_weights ? w_dims[1] - 4 : w_dims[1];
+  }
  int in_num_col_dims = param_.in_num_col_dims;
-  int64_t w_dims_1 = param_.padding_weights ? w_dims[1] - 4 : w_dims[1];

  // Set output dims
  std::vector<DDim::value_type> output_dims(in_num_col_dims + 1);