Commit 9b84dc91 authored by Wilber, committed by GitHub

fix var_conv_2d to support cascading use. test=develop (#2766)

- Fix a bug that produced wrong results when var_conv_2d kernels are used in cascade
- Explicitly specify a lod level of 3 in the x86 var_conv_2d kernel
Parent 974c50db
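The underlying problem: with batch > 1, the batched conv computes over columns padded up to the longest sequence in the batch, and a following (cascaded) var_conv_2d then consumes that garbage. The fix zeroes the padded tail after every conv. A minimal host-side sketch of that logic (illustrative only; the helper name ZeroPaddedTail is hypothetical, the real fix is the CUDA kernel in the diff below):

#include <cstddef>
#include <cstdint>
#include <vector>

// Zero everything past each sequence's valid width in an NCHW buffer.
// offset holds the batch-level lod: sequence b spans [offset[b], offset[b+1]).
void ZeroPaddedTail(float* out, const std::vector<int64_t>& offset,
                    int channels, int height, int width) {
  for (size_t b = 0; b + 1 < offset.size(); ++b) {
    const int valid_w = static_cast<int>(offset[b + 1] - offset[b]);
    float* batch = out + b * channels * height * width;
    for (int c = 0; c < channels; ++c)
      for (int h = 0; h < height; ++h)
        for (int w = valid_w; w < width; ++w)  // padded tail only
          batch[(c * height + h) * width + w] = 0.f;
  }
}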
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <functional>
#include <memory>
#include <numeric>  // std::accumulate, used in VarConv2DCompute::Run below
#include <vector>
#include "lite/backends/cuda/math/gemm.h"
@@ -38,6 +39,32 @@ inline int ConvOutputSize(int input_size,
return output_size;
}
// Zero out the effects of padding so cascaded ops read clean data; supports batch > 1.
template <typename dtype>
__global__ void eliminate_pad_effect(dtype* src,
                                     const int64_t* offset,
                                     const int num_batch,
                                     const int batch_stride,
                                     const int num_channel,
                                     const int channel_stride,
                                     const int num_height,
                                     const int height_stride,
                                     const int num_width,
                                     const int width_stride,
                                     const int count) {
  const int thread_num = blockDim.x * gridDim.x;
  // Grid-stride loop over every element of the output tensor.
  for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < count;
       tid += thread_num) {
    const int batch_id = tid / batch_stride;  // which sequence in the batch
    const int width_id = tid % num_width;     // column in the innermost dim
    // Valid width of this sequence, from the batch-level lod offsets.
    const int cur_len = offset[batch_id + 1] - offset[batch_id];
    // Everything past the valid width is padding; zero it.
    if (width_id >= cur_len) {
      src[tid] = 0.f;
    }
  }
}
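// Worked example (illustrative values): for an output of shape [2, 3, 4, 5],
// batch_stride = 3*4*5 = 60 and num_width = 5. With offset = {0, 3, 5},
// batch 0 has cur_len = 3, so elements with tid % 5 in {3, 4} are zeroed;
// batch 1 has cur_len = 2, so width indices 2..4 are zeroed.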
void VarConv2DCompute::PrepareForRun() {
auto& context = this->ctx_->template As<CUDAContext>();
auto stream = context.exec_stream();
@@ -102,6 +129,46 @@ void VarConv2DCompute::Run() {
conv_param_.output->Resize({output_shape});
conv_impl_->create(conv_param_, &context);
conv_impl_->run(conv_param_);
  // A cascaded var_conv_2d consumes this output directly. For batch > 1 the
  // batched conv also computes over the padded columns, so zero that region
  // to keep the next op's input clean.
  float* out_data = param.Out->mutable_data<float>();
  // Copy the batch-level lod (sequence offsets) into an int64 host buffer.
  std::vector<int64_t> lod(param.X->lod()[0].size(), 0);
  for (size_t i = 0; i < param.X->lod()[0].size(); ++i) {
    lod[i] = param.X->lod()[0][i];
  }
  // Total element count and NCHW strides of the conv output.
  int count = std::accumulate(
      output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
  int width_stride = 1;
  int height_stride = output_shape[3];
  int channel_stride = output_shape[2] * output_shape[3];
  int batch_stride = output_shape[1] * output_shape[2] * output_shape[3];
  // One thread per element; the grid-stride loop covers any remainder.
  int threads = 512;
  int blocks = (count + threads - 1) / threads;
  // Upload the offsets on the exec stream, so the copy is ordered before the
  // kernel launch below.
  offset_.Resize({static_cast<int64_t>(lod.size())});
  int64_t* d_offset = offset_.mutable_data<int64_t>(TARGET(kCUDA));
  TargetWrapperCuda::MemcpyAsync(d_offset,
                                 lod.data(),
                                 sizeof(int64_t) * lod.size(),
                                 IoDirection::HtoD,
                                 stream);
  eliminate_pad_effect<float><<<blocks, threads, 0, stream>>>(out_data,
                                                              d_offset,
                                                              output_shape[0],
                                                              batch_stride,
                                                              output_shape[1],
                                                              channel_stride,
                                                              output_shape[2],
                                                              height_stride,
                                                              output_shape[3],
                                                              width_stride,
                                                              count);
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
}
} // namespace cuda
......
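For intuition, a standalone check of the kernel's behavior could look like the following (an illustrative sketch, not part of the commit; it assumes eliminate_pad_effect is visible in the same translation unit and a CUDA toolchain is available):

#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Output shape [2, 1, 1, 4]: two sequences padded to width 4,
  // with real lengths 3 and 2 (offset = {0, 3, 5}).
  std::vector<float> h_src(8, 1.f);
  std::vector<int64_t> h_off = {0, 3, 5};
  float* d_src = nullptr;
  int64_t* d_off = nullptr;
  cudaMalloc(&d_src, h_src.size() * sizeof(float));
  cudaMalloc(&d_off, h_off.size() * sizeof(int64_t));
  cudaMemcpy(d_src, h_src.data(), h_src.size() * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMemcpy(d_off, h_off.data(), h_off.size() * sizeof(int64_t),
             cudaMemcpyHostToDevice);
  // num_batch=2, batch_stride=4, num_channel=1, channel_stride=4,
  // num_height=1, height_stride=4, num_width=4, width_stride=1, count=8.
  eliminate_pad_effect<float><<<1, 32>>>(d_src, d_off, 2, 4, 1, 4, 1, 4, 4, 1, 8);
  cudaMemcpy(h_src.data(), d_src, h_src.size() * sizeof(float),
             cudaMemcpyDeviceToHost);
  for (float v : h_src) printf("%g ", v);  // expect: 1 1 1 0 1 1 0 0
  printf("\n");
  cudaFree(d_src);
  cudaFree(d_off);
  return 0;
}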
@@ -33,6 +33,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
private:
mutable operators::ConvParam conv_param_;
std::unique_ptr<lite::cuda::math::CudnnConv2D<PRECISION(kFloat)>> conv_impl_;
lite::Tensor offset_;  // device copy of the batch-level lod offsets
};
} // namespace cuda
......
@@ -44,6 +44,7 @@ class VarConv2DCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
// 2-D lod info.
// const auto& offset_x = in_col->lod()[0];
// const auto& offset_y = in_row->lod()[0];
CHECK_EQ(param.X->lod().size(), 3) << "The lod level of input X should be 3!";
const auto& offset_y = param.X->lod()[1];
const auto& offset_x = param.X->lod()[2];
......
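For reference, a lod level of 3 means X carries three levels of offsets: lod()[0] holds the batch-level sequence offsets (the level the CUDA kernel above reads), lod()[1] the row offsets (offset_y), and lod()[2] the column offsets (offset_x). A hypothetical setup (values illustrative; assumes the usual lite::Tensor::set_lod setter):

lite::Tensor x;
x.set_lod({{0, 3, 5},    // lod()[0]: batch-level sequence offsets
           {0, 4, 9},    // lod()[1]: row offsets, read as offset_y
           {0, 3, 5}});  // lod()[2]: column offsets, read as offset_x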