add cuda kernel:lookup table, test=develop (#2403)

add cuda kernel:lookup table

add cuda kernel:lookup table, test=develop (#2403)
add cuda kernel:lookup table
15eccb9e · Pei Yang · GitHub · d197de00 · 15eccb9e · 15eccb9e
4 changed file
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -22,6 +22,7 @@ add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_k
 add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps})
 add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps})
 add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps})
+add_kernel(lookup_table_compute_cuda CUDA extra SRCS lookup_table_compute.cu DEPS ${lite_kernel_deps})

 lite_cc_test(calib_compute_cuda_test SRCS calib_compute_cuda_test.cc DEPS calib_compute_cuda)
 nv_test(conv2d_cuda_test SRCS conv_compute_test.cc DEPS conv2d_cuda)
@@ -37,3 +38,6 @@ nv_test(softmax_compute_cuda_test SRCS softmax_compute_test.cc DEPS softmax_comp
 nv_test(mul_compute_cuda_test SRCS mul_compute_test.cc DEPS mul_compute_cuda)
 nv_test(dropout_compute_cuda_test SRCS dropout_compute_test.cc DEPS dropout_compute_cuda )
 nv_test(bilinear_interp_compute_cuda_test SRCS bilinear_interp_compute_test.cc DEPS bilinear_interp_compute_cuda)
+if(LITE_BUILD_EXTRA)
+    nv_test(lookup_table_compute_cuda_test SRCS lookup_table_compute_test.cc DEPS lookup_table_compute_cuda)
+endif()
--- a/lite/kernels/cuda/lookup_table_compute.cu
+++ b/lite/kernels/cuda/lookup_table_compute.cu
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/cuda/lookup_table_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+using Tensor = lite::Tensor;
+
+template <int BlockDimX, int BlockDimY, int GridDimX, bool PaddingFlag>
+__global__ void LookupTableKernel(float *output,
+                                  const float *table,
+                                  const int64_t *ids,
+                                  const int64_t N,
+                                  const int64_t K,
+                                  const int64_t D,
+                                  const int64_t padding_idx) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+
+  while (idy < K) {
+    int64_t id = ids[idy];
+    float *out = output + idy * D;
+    const float *tab = table + id * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      if (PaddingFlag) {
+        if (id == padding_idx)
+          out[i] = static_cast<float>(0);
+        else
+          out[i] = tab[i];
+      } else {
+        out[i] = tab[i];
+      }
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+void LookupTableCompute::Run() {
+  auto &param = this->Param<param_t>();
+  auto &ctx = this->ctx_->template As<CUDAContext>();
+  auto stream = ctx.exec_stream();
+  Tensor *w_t = param.W;
+  Tensor *ids_t = param.Ids;
+  Tensor *out_t = param.Out;
+  int64_t padding_idx = param.padding_idx;
+
+  size_t N = w_t->dims()[0];
+  size_t D = w_t->dims()[1];
+  size_t K = ids_t->numel();
+
+  auto *w = w_t->data<float>();
+  auto *ids = ids_t->data<int64_t>();
+  auto *out = out_t->mutable_data<float>(TARGET(kCUDA));
+
+  dim3 threads(128, 8);
+  dim3 grids(8, 1);
+
+  if (padding_idx == -1) {
+    LookupTableKernel<128, 8, 8, false><<<grids, threads, 0, stream>>>(
+        out, w, ids, N, K, D, padding_idx);
+  } else {
+    LookupTableKernel<128, 8, 8, true><<<grids, threads, 0, stream>>>(
+        out, w, ids, N, K, D, padding_idx);
+  }
+
+  cudaError_t error = cudaGetLastError();
+  if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
+}
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(lookup_table,
+                     kCUDA,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::cuda::LookupTableCompute,
+                     def)
+    .BindInput("W", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
+    .BindInput("Ids", {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kInt64))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFloat))})
+    .Finalize();
--- a/lite/kernels/cuda/lookup_table_compute.h
+++ b/lite/kernels/cuda/lookup_table_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/kernel.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+class LookupTableCompute
+    : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
+ public:
+  using param_t = operators::LookupTableParam;
+
+  void Run() override;
+  virtual ~LookupTableCompute() = default;
+};
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/cuda/lookup_table_compute_test.cc
+++ b/lite/kernels/cuda/lookup_table_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/cuda/lookup_table_compute.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+using Tensor = lite::Tensor;
+
+void LookupTableComputeRef(const operators::LookupTableParam& param) {
+  auto* ids_t = param.Ids;
+  auto* output_t = param.Out;
+  int64_t padding_idx = param.padding_idx;
+  auto* ids = ids_t->data<int64_t>();
+  int64_t ids_numel = ids_t->dims().production();
+
+  auto* table_t = param.W;
+  int64_t row_number = table_t->dims()[0];
+  int64_t row_width = table_t->dims()[1];
+
+  auto* table = table_t->data<float>();
+  auto* output = output_t->mutable_data<float>();
+  memset(output, 0, output_t->dims().production() * sizeof(float));
+  for (int64_t i = 0; i < ids_numel; ++i) {
+    if (padding_idx != -1 && ids[i] == padding_idx) {
+      memset(output + i * row_width, 0, row_width * sizeof(float));
+    } else {
+      CHECK_LT(ids[i], row_number);
+      CHECK_GE(ids[i], 0);
+      memcpy(output + i * row_width,
+             table + ids[i] * row_width,
+             row_width * sizeof(float));
+    }
+  }
+}
+
+TEST(lookup_table_cuda, retrieve_op) {
+  auto lookup_table =
+      KernelRegistry::Global().Create<TARGET(kCUDA), PRECISION(kFloat)>(
+          "lookup_table");
+  ASSERT_FALSE(lookup_table.empty());
+  ASSERT_TRUE(lookup_table.front());
+}
+
+TEST(lookup_table_cuda, init) {
+  LookupTableCompute lookup_table;
+  ASSERT_EQ(lookup_table.precision(), PRECISION(kFloat));
+  ASSERT_EQ(lookup_table.target(), TARGET(kCUDA));
+}
+
+TEST(lookup_table_cuda, compute) {
+  LookupTableCompute lookup_table;
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  auto& context = ctx->As<CUDAContext>();
+
+  operators::LookupTableParam param;
+
+  Tensor w, ids, out;
+  Tensor w_cpu, ids_cpu, out_cpu;
+  Tensor w_ref, ids_ref, out_ref;
+
+  int64_t padding_idx = 0;
+
+  int vocab_size = 128;
+  int emb_size = 64;
+  int ids_h = 50;
+  int ids_w = 30;
+
+  auto w_dim = DDim({vocab_size, emb_size});
+  auto ids_dim = DDim({ids_h, ids_w});
+  auto out_dim = DDim({ids_h, ids_w, emb_size});
+
+  int w_num = w_dim.production();
+  int ids_num = ids_dim.production();
+  int out_num = out_dim.production();
+
+  w.Resize(w_dim);
+  ids.Resize(ids_dim);
+  out.Resize(out_dim);
+  w_cpu.Resize(w_dim);
+  ids_cpu.Resize(ids_dim);
+  out_cpu.Resize(out_dim);
+  w_ref.Resize(w_dim);
+  ids_ref.Resize(ids_dim);
+  out_ref.Resize(out_dim);
+
+  auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
+  auto* w_cpu_data = w_cpu.mutable_data<float>();
+  auto* ids_cpu_data = ids_cpu.mutable_data<int64_t>();
+  auto* out_cpu_data = out_cpu.mutable_data<float>();
+  auto* w_ref_data = w_ref.mutable_data<float>();
+  auto* ids_ref_data = ids_ref.mutable_data<int64_t>();
+  auto* out_ref_data = out_ref.mutable_data<float>();
+
+  // generate test data
+  for (int i = 0; i < w_num; i++) {
+    w_cpu_data[i] = static_cast<float>(i + 1) / (w_num + 1);
+    w_ref_data[i] = static_cast<float>(i + 1) / (w_num + 1);
+  }
+  for (int i = 0; i < ids_num; i++) {
+    ids_cpu_data[i] = i % vocab_size;
+    ids_ref_data[i] = i % vocab_size;
+  }
+
+  w.Assign<float, lite::DDim, TARGET(kCUDA)>(w_cpu_data, w_dim);
+  ids.Assign<int64_t, lite::DDim, TARGET(kCUDA)>(ids_cpu_data, ids_dim);
+
+  param.W = &w;
+  param.Ids = &ids;
+  param.Out = &out;
+  param.padding_idx = padding_idx;
+  lookup_table.SetParam(param);
+
+  // run cuda kernel
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  context.SetExecStream(stream);
+
+  lookup_table.SetContext(std::move(ctx));
+  lookup_table.Launch();
+  cudaDeviceSynchronize();
+
+  CopySync<TARGET(kCUDA)>(
+      out_cpu_data, out_data, sizeof(float) * out.numel(), IoDirection::DtoH);
+
+  // run ref kernel
+  param.W = &w_ref;
+  param.Ids = &ids_ref;
+  param.Out = &out_ref;
+  LookupTableComputeRef(param);
+
+  for (int i = 0; i < out_num; i++) {
+    EXPECT_NEAR(out_cpu_data[i], out_ref_data[i], 1e-5);
+  }
+}
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(lookup_table, kCUDA, kFloat, kNCHW, def);