From de43e4796b8762202f170a072726ba8a9701d687 Mon Sep 17 00:00:00 2001
From: Wilber <jiweibo@baidu.com>
Date: Thu, 29 Aug 2019 21:24:46 +0800
Subject: [PATCH] Add yolo_box_cuda multiclass_nms_host kernel. (#1908)

* add yolo_box_compute cuda

* move multiclass_nms(arm) to host

* add lod in scale op

* add yolo_box_cuda cmake config

* modify shuffle_channel_fuse and transpose_softmax_transpose_fuse to support run ssd model. test=develop

* reshape and transpose op don't have xshape output.

* modify yolo_box_compute_cuda, use tensor to manage cuda memory test=develop

* add yolo_box use kernel test=develop
---
 lite/api/_paddle_use_kernels.h                |   3 +-
 lite/arm/math/CMakeLists.txt                  |   1 -
 lite/arm/math/funcs.h                         |   1 -
 lite/core/mir/fusion/shuffle_channel_fuser.cc |  29 +-
 .../transpose_softmax_transpose_fuser.cc      |  20 +-
 lite/kernels/arm/CMakeLists.txt               |   2 -
 lite/kernels/arm/multiclass_nms_compute.cc    | 110 --------
 lite/kernels/arm/scale_compute.cc             |   3 +
 lite/kernels/cuda/CMakeLists.txt              |   4 +-
 lite/kernels/cuda/yolo_box_compute.cu         | 225 +++++++++++++++
 .../cuda/yolo_box_compute.h}                  |  38 +--
 lite/kernels/cuda/yolo_box_compute_test.cc    | 258 ++++++++++++++++++
 lite/kernels/host/CMakeLists.txt              |   2 +
 .../host/multiclass_nms_compute.cc}           | 123 +++++++--
 .../{arm => host}/multiclass_nms_compute.h    |  10 +-
 .../multiclass_nms_compute_test.cc            |  32 +--
 lite/tests/kernels/CMakeLists.txt             |   1 -
 .../kernels/multiclass_nms_compute_test.cc    | 181 ------------
 lite/tools/ci_build.sh                        |  19 --
 19 files changed, 647 insertions(+), 415 deletions(-)
 delete mode 100644 lite/kernels/arm/multiclass_nms_compute.cc
 create mode 100644 lite/kernels/cuda/yolo_box_compute.cu
 rename lite/{arm/math/multiclass_nms.h => kernels/cuda/yolo_box_compute.h} (51%)
 create mode 100644 lite/kernels/cuda/yolo_box_compute_test.cc
 rename lite/{arm/math/multiclass_nms.cc => kernels/host/multiclass_nms_compute.cc} (74%)
 rename lite/kernels/{arm => host}/multiclass_nms_compute.h (85%)
 rename lite/kernels/{arm => host}/multiclass_nms_compute_test.cc (95%)
 delete mode 100644 lite/tests/kernels/multiclass_nms_compute_test.cc

diff --git a/lite/api/_paddle_use_kernels.h b/lite/api/_paddle_use_kernels.h
index fe52259237..d54caa83e1 100644
--- a/lite/api/_paddle_use_kernels.h
+++ b/lite/api/_paddle_use_kernels.h
@@ -31,6 +31,7 @@ USE_LITE_KERNEL(fetch, kFPGA, kFP16, kNHWC, def);
 // host kernels
 USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
 USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
+USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
 
 #ifdef LITE_WITH_ARM
 USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
@@ -92,7 +93,6 @@ USE_LITE_KERNEL(top_k, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(increment, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(write_to_array, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(read_from_array, kARM, kFloat, kNCHW, def);
-USE_LITE_KERNEL(multiclass_nms, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(reduce_max, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(sequence_expand, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(sequence_pool, kARM, kFloat, kNCHW, def);
@@ -155,6 +155,7 @@ USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
 USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, host_to_device);
 USE_LITE_KERNEL(io_copy_once, kCUDA, kAny, kAny, device_to_host);
 USE_LITE_KERNEL(leaky_relu, kCUDA, kFloat, kNCHW, def);
+USE_LITE_KERNEL(yolo_box, kCUDA, kFloat, kNCHW, def);
 #endif
 
 #ifdef LITE_WITH_OPENCL
diff --git a/lite/arm/math/CMakeLists.txt b/lite/arm/math/CMakeLists.txt
index e228259da3..f17928cc29 100644
--- a/lite/arm/math/CMakeLists.txt
+++ b/lite/arm/math/CMakeLists.txt
@@ -56,7 +56,6 @@ if (NOT HAS_ARM_MATH_LIB_DIR)
       elementwise.cc
       lrn.cc
       decode_bboxes.cc
-      multiclass_nms.cc
       concat.cc
       sgemv.cc
       type_trans.cc
diff --git a/lite/arm/math/funcs.h b/lite/arm/math/funcs.h
index 9b4a1ca972..18a9d95041 100644
--- a/lite/arm/math/funcs.h
+++ b/lite/arm/math/funcs.h
@@ -39,7 +39,6 @@
 #include "lite/arm/math/increment.h"
 #include "lite/arm/math/interpolate.h"
 #include "lite/arm/math/lrn.h"
-#include "lite/arm/math/multiclass_nms.h"
 #include "lite/arm/math/negative.h"
 #include "lite/arm/math/norm.h"
 #include "lite/arm/math/packed_sgemm.h"
diff --git a/lite/core/mir/fusion/shuffle_channel_fuser.cc b/lite/core/mir/fusion/shuffle_channel_fuser.cc
index 01a091577a..f0087f8991 100644
--- a/lite/core/mir/fusion/shuffle_channel_fuser.cc
+++ b/lite/core/mir/fusion/shuffle_channel_fuser.cc
@@ -28,12 +28,17 @@ void ShuffleChannelFuser::BuildPattern() {
   auto* y2 = VarNode("y2")->assert_is_op_output(transpose_type_, "Out");
   auto* out = VarNode("out")->assert_is_op_output(reshape_type_, "Out");
 
-  auto* xshape1 =
-      VarNode("xshape1")->assert_is_op_output(reshape_type_, "XShape");
-  auto* xshape2 =
-      VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
-  auto* xshape3 =
-      VarNode("xshape3")->assert_is_op_output(reshape_type_, "XShape");
+  PMNode* xshape1 = nullptr;
+  PMNode* xshape2 = nullptr;
+  PMNode* xshape3 = nullptr;
+  if (reshape_type_ == "reshape2") {
+    xshape1 = VarNode("xshape1")->assert_is_op_output(reshape_type_, "XShape");
+    xshape3 = VarNode("xshape3")->assert_is_op_output(reshape_type_, "XShape");
+  }
+  if (transpose_type_ == "transpose2") {
+    xshape2 =
+        VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
+  }
 
   auto* reshape1 = OpNode("reshape1", reshape_type_)
                        ->assert_op_attr_satisfied<std::vector<int>>(
@@ -54,16 +59,16 @@ void ShuffleChannelFuser::BuildPattern() {
 
   // create topology.
   *x1 >> *reshape1 >> *y1 >> *transpose >> *y2 >> *reshape2 >> *out;
-  *reshape1 >> *xshape1;
-  *transpose >> *xshape2;
-  *reshape2 >> *xshape3;
+  if (xshape1) *reshape1 >> *xshape1;
+  if (xshape2) *transpose >> *xshape2;
+  if (xshape3) *reshape2 >> *xshape3;
 
   // Some op specialities.
   y1->AsIntermediate();
   y2->AsIntermediate();
-  xshape1->AsIntermediate();
-  xshape2->AsIntermediate();
-  xshape3->AsIntermediate();
+  if (xshape1) xshape1->AsIntermediate();
+  if (xshape2) xshape2->AsIntermediate();
+  if (xshape3) xshape3->AsIntermediate();
   reshape1->AsIntermediate();
   transpose->AsIntermediate();
   reshape2->AsIntermediate();
diff --git a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
index 5e55999442..d578b725ec 100644
--- a/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
+++ b/lite/core/mir/fusion/transpose_softmax_transpose_fuser.cc
@@ -28,10 +28,14 @@ void TransposeSoftmaxTransposeFuser::BuildPattern() {
   auto* y2 = VarNode("y2")->assert_is_op_output(softmax_type_, "Out");
   auto* out = VarNode("out")->assert_is_op_output(transpose_type_, "Out");
 
-  auto* xshape1 =
-      VarNode("xshape1")->assert_is_op_output(transpose_type_, "XShape");
-  auto* xshape2 =
-      VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
+  PMNode* xshape1 = nullptr;
+  PMNode* xshape2 = nullptr;
+  if (transpose_type_ == "transpose2") {
+    xshape1 =
+        VarNode("xshape1")->assert_is_op_output(transpose_type_, "XShape");
+    xshape2 =
+        VarNode("xshape2")->assert_is_op_output(transpose_type_, "XShape");
+  }
 
   auto* transpose1 =
       OpNode("transpose1", transpose_type_)->assert_is_op(transpose_type_);
@@ -45,14 +49,14 @@ void TransposeSoftmaxTransposeFuser::BuildPattern() {
 
   // create topology.
   *x1 >> *transpose1 >> *y1 >> *softmax >> *y2 >> *transpose2 >> *out;
-  *transpose1 >> *xshape1;
-  *transpose2 >> *xshape2;
+  if (xshape1) *transpose1 >> *xshape1;
+  if (xshape2) *transpose2 >> *xshape2;
 
   // nodes to remove
   y1->AsIntermediate();
   y2->AsIntermediate();
-  xshape1->AsIntermediate();
-  xshape2->AsIntermediate();
+  if (xshape1) xshape1->AsIntermediate();
+  if (xshape2) xshape2->AsIntermediate();
   transpose1->AsIntermediate();
   softmax->AsIntermediate();
   transpose2->AsIntermediate();
diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 19d3f3fd76..9b6997f1ef 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -15,7 +15,6 @@ add_kernel(batch_norm_compute_arm ARM basic SRCS batch_norm_compute.cc DEPS ${li
 add_kernel(elementwise_compute_arm ARM basic SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(lrn_compute_arm ARM basic SRCS lrn_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(decode_bboxes_compute_arm ARM basic SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(multiclass_nms_compute_arm ARM basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(pool_compute_arm ARM basic SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(split_compute_arm ARM basic SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(concat_compute_arm ARM basic SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
@@ -80,7 +79,6 @@ lite_cc_test(test_batch_norm_compute_arm SRCS batch_norm_compute_test.cc DEPS ba
 lite_cc_test(test_elementwise_compute_arm SRCS elementwise_compute_test.cc DEPS elementwise_compute_arm)
 lite_cc_test(test_lrn_compute_arm SRCS lrn_compute_test.cc DEPS lrn_compute_arm)
 lite_cc_test(test_decode_bboxes_compute_arm SRCS decode_bboxes_compute_test.cc DEPS decode_bboxes_compute_arm)
-lite_cc_test(test_multiclass_nms_compute_arm SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_arm)
 lite_cc_test(test_pool_compute_arm SRCS pool_compute_test.cc DEPS pool_compute_arm)
 lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
diff --git a/lite/kernels/arm/multiclass_nms_compute.cc b/lite/kernels/arm/multiclass_nms_compute.cc
deleted file mode 100644
index c36a81d152..0000000000
--- a/lite/kernels/arm/multiclass_nms_compute.cc
+++ /dev/null
@@ -1,110 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/kernels/arm/multiclass_nms_compute.h"
-#include <string>
-#include "lite/arm/math/funcs.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace arm {
-
-void MulticlassNmsCompute::Run() {
-  auto& param = Param<operators::MulticlassNmsParam>();
-  // bbox shape : N, M, 4
-  // scores shape : N, C, M
-  const float* bbox_data = param.bbox_data->data<float>();
-  const float* conf_data = param.conf_data->data<float>();
-
-  CHECK_EQ(param.bbox_data->dims().production() % 4, 0);
-
-  std::vector<float> result;
-  int N = param.bbox_data->dims()[0];
-  int M = param.bbox_data->dims()[1];
-  std::vector<int> priors(N, M);
-  int class_num = param.conf_data->dims()[1];
-  int background_label = param.background_label;
-  int keep_top_k = param.keep_top_k;
-  int nms_top_k = param.nms_top_k;
-  float score_threshold = param.score_threshold;
-  float nms_threshold = param.nms_threshold;
-  float nms_eta = param.nms_eta;
-  bool share_location = param.share_location;
-
-  lite::arm::math::multiclass_nms(bbox_data,
-                                  conf_data,
-                                  &result,
-                                  priors,
-                                  class_num,
-                                  background_label,
-                                  keep_top_k,
-                                  nms_top_k,
-                                  score_threshold,
-                                  nms_threshold,
-                                  nms_eta,
-                                  share_location);
-  lite::LoD* lod = param.out->mutable_lod();
-  std::vector<uint64_t> lod_info;
-  lod_info.push_back(0);
-  std::vector<float> result_corrected;
-  int tmp_batch_id;
-  uint64_t num = 0;
-  for (int i = 0; i < result.size(); ++i) {
-    if (i == 0) {
-      tmp_batch_id = result[i];
-    }
-    if (i % 7 == 0) {
-      if (result[i] == tmp_batch_id) {
-        ++num;
-      } else {
-        lod_info.push_back(num);
-        ++num;
-        tmp_batch_id = result[i];
-      }
-    } else {
-      result_corrected.push_back(result[i]);
-    }
-  }
-  lod_info.push_back(num);
-  (*lod).push_back(lod_info);
-
-  if (result_corrected.empty()) {
-    (*lod).clear();
-    (*lod).push_back(std::vector<uint64_t>({0, 1}));
-    param.out->Resize({static_cast<int64_t>(1)});
-    param.out->mutable_data<float>()[0] = -1.;
-  } else {
-    param.out->Resize({static_cast<int64_t>(result_corrected.size() / 6), 6});
-    float* out = param.out->mutable_data<float>();
-    std::memcpy(
-        out, result_corrected.data(), sizeof(float) * result_corrected.size());
-  }
-}
-
-}  // namespace arm
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-REGISTER_LITE_KERNEL(multiclass_nms,
-                     kARM,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::arm::MulticlassNmsCompute,
-                     def)
-    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
-    .Finalize();
diff --git a/lite/kernels/arm/scale_compute.cc b/lite/kernels/arm/scale_compute.cc
index 42a5575cc3..07287111a1 100644
--- a/lite/kernels/arm/scale_compute.cc
+++ b/lite/kernels/arm/scale_compute.cc
@@ -32,6 +32,9 @@ void ScaleCompute::Run() {
     bias *= scale;
   }
   lite::arm::math::scale(x_data, output_data, x_dims.production(), scale, bias);
+  if (!param.x->lod().empty()) {
+    param.output->set_lod(param.x->lod());
+  }
 }
 
 }  // namespace arm
diff --git a/lite/kernels/cuda/CMakeLists.txt b/lite/kernels/cuda/CMakeLists.txt
index a9a85bab91..1a198c1dd5 100644
--- a/lite/kernels/cuda/CMakeLists.txt
+++ b/lite/kernels/cuda/CMakeLists.txt
@@ -7,13 +7,15 @@ message(STATUS "compile with lite CUDA kernels")
 nv_library(mul_compute_cuda SRCS mul_compute.cc DEPS ${lite_kernel_deps} context)
 lite_cc_library(io_copy_compute_cuda SRCS io_copy_compute.cc DEPS ${lite_kernel_deps})
 nv_library(leaky_relu_compute_cuda SRCS leaky_relu_compute.cu DEPS ${lite_kernel_deps})
-
 lite_cc_test(leaky_relu_compute_cuda_test SRCS leaky_relu_compute_test.cc DEPS leaky_relu_compute_cuda)
+nv_library(yolo_box_compute_cuda SRCS yolo_box_compute.cu DEPS ${lite_kernel_deps})
+lite_cc_test(yolo_box_compute_cuda_test SRCS yolo_box_compute_test.cc DEPS yolo_box_compute_cuda)
 
 set(cuda_kernels
 mul_compute_cuda 
 io_copy_compute_cuda
 leaky_relu_compute_cuda
+yolo_box_compute_cuda
 )
 
 set(cuda_kernels "${cuda_kernels}" CACHE GLOBAL "cuda kernels")
diff --git a/lite/kernels/cuda/yolo_box_compute.cu b/lite/kernels/cuda/yolo_box_compute.cu
new file mode 100644
index 0000000000..99fff9a709
--- /dev/null
+++ b/lite/kernels/cuda/yolo_box_compute.cu
@@ -0,0 +1,225 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/cuda/yolo_box_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+__host__ __device__ inline int GetEntryIndex(int batch,
+                                             int an_idx,
+                                             int hw_idx,
+                                             int an_num,
+                                             int an_stride,
+                                             int stride,
+                                             int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+template <typename T>
+__host__ __device__ inline T sigmoid(T x) {
+  return 1.0 / (1.0 + std::exp(-x));
+}
+
+template <typename T>
+__host__ __device__ inline void GetYoloBox(T* box,
+                                           const T* x,
+                                           const int* anchors,
+                                           int i,
+                                           int j,
+                                           int an_idx,
+                                           int grid_size,
+                                           int input_size,
+                                           int index,
+                                           int stride,
+                                           int img_height,
+                                           int img_width) {
+  box[0] = (i + sigmoid<T>(x[index])) * img_width / grid_size;
+  box[1] = (j + sigmoid<T>(x[index + stride])) * img_height / grid_size;
+  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
+           input_size;
+  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
+           img_height / input_size;
+}
+
+template <typename T>
+__host__ __device__ inline void CalcDetectionBox(T* boxes,
+                                                 T* box,
+                                                 const int box_idx,
+                                                 const int img_height,
+                                                 const int img_width) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<T>(0);
+  boxes[box_idx + 1] =
+      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<T>(0);
+  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                           ? boxes[box_idx + 2]
+                           : static_cast<T>(img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                           ? boxes[box_idx + 3]
+                           : static_cast<T>(img_height - 1);
+}
+
+template <typename T>
+__host__ __device__ inline void CalcLabelScore(T* scores,
+                                               const T* input,
+                                               const int label_idx,
+                                               const int score_idx,
+                                               const int class_num,
+                                               const T conf,
+                                               const int stride) {
+  for (int i = 0; i < class_num; ++i) {
+    scores[score_idx + i] = conf * sigmoid<T>(input[label_idx + i * stride]);
+  }
+}
+
+template <typename T>
+__global__ void KeYoloBoxFw(const T* input,
+                            const T* imgsize,
+                            T* boxes,
+                            T* scores,
+                            const float conf_thresh,
+                            const int* anchors,
+                            const int n,
+                            const int h,
+                            const int w,
+                            const int an_num,
+                            const int class_num,
+                            const int box_num,
+                            int input_size) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  T box[4];
+  for (; tid < n * box_num; tid += stride) {
+    int grid_num = h * w;
+    int i = tid / box_num;
+    int j = (tid % box_num) / grid_num;
+    int k = (tid % grid_num) / w;
+    int l = tid % w;
+
+    int an_stride = (5 + class_num) * grid_num;
+    int img_height = static_cast<int>(imgsize[2 * i]);
+    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+
+    int obj_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4);
+    T conf = sigmoid<T>(input[obj_idx]);
+    if (conf < conf_thresh) {
+      continue;
+    }
+
+    int box_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0);
+    GetYoloBox<T>(box,
+                  input,
+                  anchors,
+                  l,
+                  k,
+                  j,
+                  h,
+                  input_size,
+                  box_idx,
+                  grid_num,
+                  img_height,
+                  img_width);
+    box_idx = (i * box_num + j * grid_num + k * w + l) * 4;
+    CalcDetectionBox<T>(boxes, box, box_idx, img_height, img_width);
+
+    int label_idx =
+        GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5);
+    int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num;
+    CalcLabelScore<T>(
+        scores, input, label_idx, score_idx, class_num, conf, grid_num);
+  }
+}
+
+void YoloBoxCompute::Run() {
+  auto& param = this->Param<param_t>();
+  auto& ctx = this->ctx_->template As<CUDAContext>();
+  auto stream = ctx.exec_stream();
+
+  lite::Tensor* X = param.X;
+  lite::Tensor* ImgSize = param.ImgSize;
+  lite::Tensor* Boxes = param.Boxes;
+  lite::Tensor* Scores = param.Scores;
+  std::vector<int> anchors = param.anchors;
+  int class_num = param.class_num;
+  float conf_thresh = param.conf_thresh;
+  int downsample_ratio = param.downsample_ratio;
+
+  const float* input = X->data<float>();
+  const float* imgsize = ImgSize->data<float>();
+  float* boxes = Boxes->mutable_data<float>(TARGET(kCUDA));
+  float* scores = Scores->mutable_data<float>(TARGET(kCUDA));
+
+  const int n = X->dims()[0];
+  const int h = X->dims()[2];
+  const int w = X->dims()[3];
+  const int box_num = Boxes->dims()[1];
+  const int an_num = anchors.size() / 2;
+  int input_size = downsample_ratio * h;
+
+  anchors_.Resize(static_cast<int>({anchors.size()}));
+  int* d_anchors = anchors_.mutable_data<int>(TARGET(kCUDA));
+  CopySync<TARGET(kCUDA)>(d_anchors,
+                          anchors.data(),
+                          sizeof(int) * anchors.size(),
+                          IoDirection::HtoD);
+
+  int threads = 512;
+  int blocks = (n * box_num + threads - 1) / threads;
+  blocks = blocks > 8 ? 8 : blocks;
+
+  KeYoloBoxFw<float><<<blocks, threads, 0, stream>>>(input,
+                                                     imgsize,
+                                                     boxes,
+                                                     scores,
+                                                     conf_thresh,
+                                                     d_anchors,
+                                                     n,
+                                                     h,
+                                                     w,
+                                                     an_num,
+                                                     class_num,
+                                                     box_num,
+                                                     input_size);
+  cudaError_t error = cudaGetLastError();
+  if (error != cudaSuccess) LOG(INFO) << cudaGetErrorString(error);
+}
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(yolo_box,
+                     kCUDA,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::cuda::YoloBoxCompute,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .BindInput("ImgSize", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .BindOutput("Boxes", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .BindOutput("Scores", {LiteType::GetTensorTy(TARGET(kCUDA))})
+    .Finalize();
diff --git a/lite/arm/math/multiclass_nms.h b/lite/kernels/cuda/yolo_box_compute.h
similarity index 51%
rename from lite/arm/math/multiclass_nms.h
rename to lite/kernels/cuda/yolo_box_compute.h
index a5f39b6462..c8ea9d25b6 100644
--- a/lite/arm/math/multiclass_nms.h
+++ b/lite/kernels/cuda/yolo_box_compute.h
@@ -13,33 +13,25 @@
 // limitations under the License.
 
 #pragma once
-
-#include <algorithm>
-#include <map>
-#include <string>
-#include <utility>
-#include <vector>
+#include "lite/core/kernel.h"
 
 namespace paddle {
 namespace lite {
-namespace arm {
-namespace math {
+namespace kernels {
+namespace cuda {
+
+class YoloBoxCompute : public KernelLite<TARGET(kCUDA), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::YoloBoxParam;
+
+  void Run() override;
+  virtual ~YoloBoxCompute() = default;
 
-template <typename dtype>
-void multiclass_nms(const dtype* bbox_cpu_data,
-                    const dtype* conf_cpu_data,
-                    std::vector<dtype>* result,
-                    const std::vector<int>& priors,
-                    int class_num,
-                    int background_id,
-                    int keep_topk,
-                    int nms_topk,
-                    float conf_thresh,
-                    float nms_thresh,
-                    float nms_eta,
-                    bool share_location);
+ private:
+  lite::Tensor anchors_;
+};
 
-}  // namespace math
-}  // namespace arm
+}  // namespace cuda
+}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/cuda/yolo_box_compute_test.cc b/lite/kernels/cuda/yolo_box_compute_test.cc
new file mode 100644
index 0000000000..5cd9579383
--- /dev/null
+++ b/lite/kernels/cuda/yolo_box_compute_test.cc
@@ -0,0 +1,258 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/cuda/yolo_box_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <utility>
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace cuda {
+
+inline static float sigmoid(float x) { return 1.f / (1.f + expf(-x)); }
+
+inline static void get_yolo_box(float* box,
+                                const float* x,
+                                const int* anchors,
+                                int i,
+                                int j,
+                                int an_idx,
+                                int grid_size,
+                                int input_size,
+                                int index,
+                                int stride,
+                                int img_height,
+                                int img_width) {
+  box[0] = (i + sigmoid(x[index])) * img_width / grid_size;
+  box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size;
+  box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width /
+           input_size;
+  box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] *
+           img_height / input_size;
+}
+
+inline static int get_entry_index(int batch,
+                                  int an_idx,
+                                  int hw_idx,
+                                  int an_num,
+                                  int an_stride,
+                                  int stride,
+                                  int entry) {
+  return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx;
+}
+
+inline static void calc_detection_box(float* boxes,
+                                      float* box,
+                                      const int box_idx,
+                                      const int img_height,
+                                      const int img_width) {
+  boxes[box_idx] = box[0] - box[2] / 2;
+  boxes[box_idx + 1] = box[1] - box[3] / 2;
+  boxes[box_idx + 2] = box[0] + box[2] / 2;
+  boxes[box_idx + 3] = box[1] + box[3] / 2;
+
+  boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast<float>(0);
+  boxes[box_idx + 1] =
+      boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast<float>(0);
+  boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1
+                           ? boxes[box_idx + 2]
+                           : static_cast<float>(img_width - 1);
+  boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1
+                           ? boxes[box_idx + 3]
+                           : static_cast<float>(img_height - 1);
+}
+
+inline static void calc_label_score(float* scores,
+                                    const float* input,
+                                    const int label_idx,
+                                    const int score_idx,
+                                    const int class_num,
+                                    const float conf,
+                                    const int stride) {
+  for (int i = 0; i < class_num; i++) {
+    scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]);
+  }
+}
+
+template <typename T>
+static void YoloBoxRef(const T* input,
+                       const T* imgsize,
+                       T* boxes,
+                       T* scores,
+                       const float conf_thresh,
+                       const int* anchors,
+                       const int n,
+                       const int h,
+                       const int w,
+                       const int an_num,
+                       const int class_num,
+                       const int box_num,
+                       int input_size) {
+  const int stride = h * w;
+  const int an_stride = (class_num + 5) * stride;
+  float box[4];
+
+  for (int i = 0; i < n; i++) {
+    int img_height = static_cast<int>(imgsize[2 * i]);
+    int img_width = static_cast<int>(imgsize[2 * i + 1]);
+
+    for (int j = 0; j < an_num; j++) {
+      for (int k = 0; k < h; k++) {
+        for (int l = 0; l < w; l++) {
+          int obj_idx =
+              get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 4);
+          float conf = sigmoid(input[obj_idx]);
+          if (conf < conf_thresh) {
+            continue;
+          }
+
+          int box_idx =
+              get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 0);
+          get_yolo_box(box,
+                       input,
+                       anchors,
+                       l,
+                       k,
+                       j,
+                       h,
+                       input_size,
+                       box_idx,
+                       stride,
+                       img_height,
+                       img_width);
+          box_idx = (i * box_num + j * stride + k * w + l) * 4;
+          calc_detection_box(boxes, box, box_idx, img_height, img_width);
+
+          int label_idx =
+              get_entry_index(i, j, k * w + l, an_num, an_stride, stride, 5);
+          int score_idx = (i * box_num + j * stride + k * w + l) * class_num;
+          calc_label_score(
+              scores, input, label_idx, score_idx, class_num, conf, stride);
+        }
+      }
+    }
+  }
+}
+
+TEST(yolo_box, normal) {
+  YoloBoxCompute yolo_box_kernel;
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  auto& context = ctx->As<CUDAContext>();
+
+  operators::YoloBoxParam param;
+
+  lite::Tensor x, sz, x_cpu, sz_cpu;
+  lite::Tensor boxes, scores, boxes_cpu, scores_cpu;
+  lite::Tensor x_ref, sz_ref, boxes_ref, scores_ref;
+  int s = 3, cls = 4;
+  int n = 1, c = s * (5 + cls), h = 16, w = 16;
+  param.anchors = {2, 3, 4, 5, 8, 10};
+  param.downsample_ratio = 2;
+  param.conf_thresh = 0.5;
+  param.class_num = cls;
+  int m = h * w * param.anchors.size() / 2;
+
+  x.Resize({n, c, h, w});
+  sz.Resize({1, 2});
+  boxes.Resize({n, m, 4});
+  scores.Resize({n, cls, m});
+
+  x_cpu.Resize({n, c, h, w});
+  sz_cpu.Resize({1, 2});
+  boxes_cpu.Resize({n, m, 4});
+  scores_cpu.Resize({n, cls, m});
+
+  x_ref.Resize({n, c, h, w});
+  sz_ref.Resize({1, 2});
+  boxes_ref.Resize({n, m, 4});
+  scores_ref.Resize({n, cls, m});
+
+  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
+  auto* sz_data = sz.mutable_data<float>(TARGET(kCUDA));
+  auto* boxes_data = boxes.mutable_data<float>(TARGET(kCUDA));
+  auto* scores_data = scores.mutable_data<float>(TARGET(kCUDA));
+
+  float* x_cpu_data = x_cpu.mutable_data<float>();
+  float* sz_cpu_data = sz_cpu.mutable_data<float>();
+  float* boxes_cpu_data = boxes_cpu.mutable_data<float>();
+  float* scores_cpu_data = scores_cpu.mutable_data<float>();
+
+  float* x_ref_data = x_ref.mutable_data<float>();
+  float* sz_ref_data = sz_ref.mutable_data<float>();
+  float* boxes_ref_data = boxes_ref.mutable_data<float>();
+  float* scores_ref_data = scores_ref.mutable_data<float>();
+
+  for (int i = 0; i < x_cpu.numel(); ++i) {
+    x_cpu_data[i] = i - 5.0;
+    x_ref_data[i] = i - 5.0;
+  }
+  sz_cpu_data[0] = 16;
+  sz_cpu_data[1] = 32;
+  sz_ref_data[0] = 16;
+  sz_ref_data[1] = 32;
+
+  x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data, x_cpu.dims());
+  sz.Assign<float, lite::DDim, TARGET(kCUDA)>(sz_cpu_data, sz_cpu.dims());
+
+  param.X = &x;
+  param.ImgSize = &sz;
+  param.Boxes = &boxes;
+  param.Scores = &scores;
+  yolo_box_kernel.SetParam(param);
+
+  cudaStream_t stream;
+  cudaStreamCreate(&stream);
+  context.SetExecStream(stream);
+
+  yolo_box_kernel.SetContext(std::move(ctx));
+  yolo_box_kernel.Launch();
+  cudaDeviceSynchronize();
+
+  CopySync<TARGET(kCUDA)>(boxes_cpu_data,
+                          boxes_data,
+                          sizeof(float) * boxes.numel(),
+                          IoDirection::DtoH);
+  CopySync<TARGET(kCUDA)>(scores_cpu_data,
+                          scores_data,
+                          sizeof(float) * scores.numel(),
+                          IoDirection::DtoH);
+
+  YoloBoxRef<float>(x_ref_data,
+                    sz_ref_data,
+                    boxes_ref_data,
+                    scores_ref_data,
+                    param.conf_thresh,
+                    param.anchors.data(),
+                    n,
+                    h,
+                    w,
+                    param.anchors.size() / 2,
+                    cls,
+                    m,
+                    param.downsample_ratio * h);
+
+  for (int i = 0; i < boxes.numel(); i++) {
+    EXPECT_NEAR(boxes_cpu_data[i], boxes_ref_data[i], 1e-5);
+  }
+  for (int i = 0; i < scores.numel(); i++) {
+    EXPECT_NEAR(scores_cpu_data[i], scores_ref_data[i], 1e-5);
+  }
+}
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index abd96317cc..5bcaafeabb 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -3,5 +3,7 @@ message(STATUS "compile with lite host kernels")
 add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
+add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
 
 lite_cc_test(test_reshape_compute_host SRCS reshape_compute_test.cc DEPS reshape_compute_host any)
+lite_cc_test(test_multiclass_nms_compute_host SRCS multiclass_nms_compute_test.cc DEPS multiclass_nms_compute_host any)
diff --git a/lite/arm/math/multiclass_nms.cc b/lite/kernels/host/multiclass_nms_compute.cc
similarity index 74%
rename from lite/arm/math/multiclass_nms.cc
rename to lite/kernels/host/multiclass_nms_compute.cc
index 3baeb2d844..0d490d6011 100644
--- a/lite/arm/math/multiclass_nms.cc
+++ b/lite/kernels/host/multiclass_nms_compute.cc
@@ -12,13 +12,15 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/arm/math/multiclass_nms.h"
-#include "lite/arm/math/funcs.h"
+#include "lite/kernels/host/multiclass_nms_compute.h"
+#include <map>
+#include <utility>
+#include <vector>
 
 namespace paddle {
 namespace lite {
-namespace arm {
-namespace math {
+namespace kernels {
+namespace host {
 
 template <typename dtype>
 static bool sort_score_pair_descend(const std::pair<float, dtype>& pair1,
@@ -269,31 +271,92 @@ void multiclass_nms(const dtype* bbox_cpu_data,
   }
 }
 
-template float jaccard_overlap(const float* bbox1, const float* bbox2);
-
-template void apply_nms_fast(const float* bboxes,
-                             const float* scores,
-                             int num,
-                             float score_threshold,
-                             float nms_threshold,
-                             float eta,
-                             int top_k,
-                             std::vector<int>* indices);
-
-template void multiclass_nms(const float* bbox_cpu_data,
-                             const float* conf_cpu_data,
-                             std::vector<float>* result,
-                             const std::vector<int>& priors,
-                             int class_num,
-                             int background_id,
-                             int keep_topk,
-                             int nms_topk,
-                             float conf_thresh,
-                             float nms_thresh,
-                             float nms_eta,
-                             bool share_location);
-
-}  // namespace math
-}  // namespace arm
+void MulticlassNmsCompute::Run() {
+  auto& param = Param<operators::MulticlassNmsParam>();
+  // bbox shape : N, M, 4
+  // scores shape : N, C, M
+  const float* bbox_data = param.bbox_data->data<float>();
+  const float* conf_data = param.conf_data->data<float>();
+
+  CHECK_EQ(param.bbox_data->dims().production() % 4, 0);
+
+  std::vector<float> result;
+  int N = param.bbox_data->dims()[0];
+  int M = param.bbox_data->dims()[1];
+  std::vector<int> priors(N, M);
+  int class_num = param.conf_data->dims()[1];
+  int background_label = param.background_label;
+  int keep_top_k = param.keep_top_k;
+  int nms_top_k = param.nms_top_k;
+  float score_threshold = param.score_threshold;
+  float nms_threshold = param.nms_threshold;
+  float nms_eta = param.nms_eta;
+  bool share_location = param.share_location;
+
+  multiclass_nms(bbox_data,
+                 conf_data,
+                 &result,
+                 priors,
+                 class_num,
+                 background_label,
+                 keep_top_k,
+                 nms_top_k,
+                 score_threshold,
+                 nms_threshold,
+                 nms_eta,
+                 share_location);
+
+  lite::LoD lod;
+  std::vector<uint64_t> lod_info;
+  lod_info.push_back(0);
+  std::vector<float> result_corrected;
+  int tmp_batch_id;
+  uint64_t num = 0;
+  for (int i = 0; i < result.size(); ++i) {
+    if (i == 0) {
+      tmp_batch_id = result[i];
+    }
+    if (i % 7 == 0) {
+      if (result[i] == tmp_batch_id) {
+        ++num;
+      } else {
+        lod_info.push_back(num);
+        ++num;
+        tmp_batch_id = result[i];
+      }
+    } else {
+      result_corrected.push_back(result[i]);
+    }
+  }
+  lod_info.push_back(num);
+  lod.push_back(lod_info);
+  if (result_corrected.empty()) {
+    lod.clear();
+    lod.push_back(std::vector<uint64_t>({0, 1}));
+    param.out->Resize({static_cast<int64_t>(1)});
+    param.out->mutable_data<float>()[0] = -1.;
+    param.out->set_lod(lod);
+  } else {
+    param.out->Resize({static_cast<int64_t>(result_corrected.size() / 6), 6});
+    float* out = param.out->mutable_data<float>();
+    std::memcpy(
+        out, result_corrected.data(), sizeof(float) * result_corrected.size());
+    param.out->set_lod(lod);
+  }
+}
+
+}  // namespace host
+}  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
+
+REGISTER_LITE_KERNEL(multiclass_nms,
+                     kHost,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::host::MulticlassNmsCompute,
+                     def)
+    .BindInput("BBoxes", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Scores", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
+    .Finalize();
diff --git a/lite/kernels/arm/multiclass_nms_compute.h b/lite/kernels/host/multiclass_nms_compute.h
similarity index 85%
rename from lite/kernels/arm/multiclass_nms_compute.h
rename to lite/kernels/host/multiclass_nms_compute.h
index 6b77e216fa..9391a0b2b5 100644
--- a/lite/kernels/arm/multiclass_nms_compute.h
+++ b/lite/kernels/host/multiclass_nms_compute.h
@@ -13,26 +13,24 @@
 // limitations under the License.
 
 #pragma once
-#include <vector>
+#include <algorithm>
 #include "lite/core/kernel.h"
 #include "lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 class MulticlassNmsCompute
-    : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+    : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
  public:
-  using param_t = operators::MulticlassNmsParam;
-
   void Run() override;
 
   virtual ~MulticlassNmsCompute() = default;
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/arm/multiclass_nms_compute_test.cc b/lite/kernels/host/multiclass_nms_compute_test.cc
similarity index 95%
rename from lite/kernels/arm/multiclass_nms_compute_test.cc
rename to lite/kernels/host/multiclass_nms_compute_test.cc
index b0352f77c5..37c04bc290 100644
--- a/lite/kernels/arm/multiclass_nms_compute_test.cc
+++ b/lite/kernels/host/multiclass_nms_compute_test.cc
@@ -12,19 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/multiclass_nms_compute.h"
+#include "lite/kernels/host/multiclass_nms_compute.h"
 #include <gtest/gtest.h>
-#include <algorithm>
 #include <map>
-#include <string>
 #include <utility>
 #include <vector>
-#include "lite/core/op_registry.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
 template <typename dtype>
 static bool sort_score_pair_descend(const std::pair<float, dtype>& pair1,
@@ -279,21 +276,21 @@ void multiclass_nms_compute_ref(const operators::MulticlassNmsParam& param,
   }
 }
 
-TEST(multiclass_nms_arm, retrive_op) {
+TEST(multiclass_nms_host, init) {
+  MulticlassNmsCompute multiclass_nms;
+  ASSERT_EQ(multiclass_nms.precision(), PRECISION(kFloat));
+  ASSERT_EQ(multiclass_nms.target(), TARGET(kHost));
+}
+
+TEST(multiclass_nms_host, retrive_op) {
   auto multiclass_nms =
-      KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
+      KernelRegistry::Global().Create<TARGET(kHost), PRECISION(kFloat)>(
           "multiclass_nms");
   ASSERT_FALSE(multiclass_nms.empty());
   ASSERT_TRUE(multiclass_nms.front());
 }
 
-TEST(multiclass_nms_arm, init) {
-  MulticlassNmsCompute multiclass_nms;
-  ASSERT_EQ(multiclass_nms.precision(), PRECISION(kFloat));
-  ASSERT_EQ(multiclass_nms.target(), TARGET(kARM));
-}
-
-TEST(multiclass_nms_arm, compute) {
+TEST(multiclass_nms_host, compute) {
   MulticlassNmsCompute multiclass_nms;
   operators::MulticlassNmsParam param;
   lite::Tensor bbox, conf, out;
@@ -306,9 +303,6 @@ TEST(multiclass_nms_arm, compute) {
         DDim* bbox_dim;
         DDim* conf_dim;
         int M = priors[0];
-        // for (int i = 0; i < priors.size(); ++i) {
-        //  M += priors[i];
-        //}
         if (share_location) {
           bbox_dim = new DDim({N, M, 4});
         } else {
@@ -368,9 +362,9 @@ TEST(multiclass_nms_arm, compute) {
   }
 }
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
-USE_LITE_KERNEL(multiclass_nms, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt
index a78424dede..482d2d9520 100644
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -33,7 +33,6 @@ endif()
     lite_cc_test(test_sgemm SRCS test_sgemm.cc DEPS ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_pad2d_compute SRCS  pad2d_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_prior_box_compute SRCS  prior_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_multiclass_nms_compute SRCS  multiclass_nms_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
     lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${x86_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
diff --git a/lite/tests/kernels/multiclass_nms_compute_test.cc b/lite/tests/kernels/multiclass_nms_compute_test.cc
deleted file mode 100644
index 1658acd0c7..0000000000
--- a/lite/tests/kernels/multiclass_nms_compute_test.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <gtest/gtest.h>
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
-#include "lite/core/arena/framework.h"
-
-bool read_file(std::vector<float>* result, const char* file_name) {
-  std::ifstream infile(file_name);
-
-  if (!infile.good()) {
-    std::cout << "Cannot open " << file_name << std::endl;
-    return false;
-  }
-
-  LOG(INFO) << "found filename: " << file_name;
-  std::string line;
-
-  while (std::getline(infile, line)) {
-    (*result).push_back(static_cast<float>(atof(line.c_str())));
-  }
-
-  return true;
-}
-
-const char* bboxes_file = "multiclass_nms_bboxes_file.txt";
-const char* scores_file = "multiclass_nms_scores_file.txt";
-const char* out_file = "multiclass_nms_out_file.txt";
-
-namespace paddle {
-namespace lite {
-class MulticlassNmsComputeTester : public arena::TestCase {
- protected:
-  // common attributes for this op.
-  std::string bbox_ = "BBoxes";
-  std::string conf_ = "Scores";
-  std::string out_ = "Out";
-  std::vector<int> priors_;
-  int class_num_;
-  int background_id_;
-  int keep_topk_;
-  int nms_topk_;
-  float conf_thresh_;
-  float nms_thresh_;
-  float nms_eta_;
-  bool share_location_;
-  DDim bbox_dims_;
-  DDim conf_dims_;
-
- public:
-  MulticlassNmsComputeTester(const Place& place,
-                             const std::string& alias,
-                             std::vector<int> priors,
-                             int class_num,
-                             int background_id,
-                             int keep_topk,
-                             int nms_topk,
-                             float conf_thresh,
-                             float nms_thresh,
-                             float nms_eta,
-                             bool share_location,
-                             DDim bbox_dims,
-                             DDim conf_dims)
-      : TestCase(place, alias),
-        priors_(priors),
-        class_num_(class_num),
-        background_id_(background_id),
-        keep_topk_(keep_topk),
-        nms_topk_(nms_topk),
-        conf_thresh_(conf_thresh),
-        nms_thresh_(nms_thresh),
-        nms_eta_(nms_eta),
-        share_location_(share_location),
-        bbox_dims_(bbox_dims),
-        conf_dims_(conf_dims) {}
-
-  void RunBaseline(Scope* scope) override {
-    std::vector<float> vbbox;
-    std::vector<float> vscore;
-    std::vector<float> vout;
-
-    if (!read_file(&vout, out_file)) {
-      LOG(ERROR) << "load ground truth failed";
-      return;
-    }
-
-    auto* out = scope->NewTensor(out_);
-    CHECK(out);
-    out->Resize(DDim({static_cast<int64_t>(vout.size() / 6), 6}));
-    auto* out_data = out->mutable_data<float>();
-    memcpy(out_data, vout.data(), vout.size() * sizeof(float));
-    out->mutable_lod()->push_back(std::vector<uint64_t>({0, 10}));
-  }
-
-  void PrepareOpDesc(cpp::OpDesc* op_desc) {
-    op_desc->SetType("multiclass_nms");
-    op_desc->SetInput("BBoxes", {bbox_});
-    op_desc->SetInput("Scores", {conf_});
-    op_desc->SetOutput("Out", {out_});
-    op_desc->SetAttr("background_label", background_id_);
-    op_desc->SetAttr("keep_top_k", keep_topk_);
-    op_desc->SetAttr("nms_top_k", nms_topk_);
-    op_desc->SetAttr("score_threshold", conf_thresh_);
-    op_desc->SetAttr("nms_threshold", nms_thresh_);
-    op_desc->SetAttr("nms_eta", nms_eta_);
-    op_desc->SetAttr("share_location", share_location_);
-  }
-
-  void PrepareData() override {
-    std::vector<float> bbox_data;
-    std::vector<float> conf_data;
-
-    if (!read_file(&bbox_data, bboxes_file)) {
-      LOG(ERROR) << "load bbox file failed";
-      return;
-    }
-    if (!read_file(&conf_data, scores_file)) {
-      LOG(ERROR) << "load score file failed";
-      return;
-    }
-
-    SetCommonTensor(bbox_, bbox_dims_, bbox_data.data());
-    SetCommonTensor(conf_, conf_dims_, conf_data.data());
-  }
-};
-
-void test_multiclass_nms(Place place) {
-  int keep_top_k = 200;
-  int nms_top_k = 400;
-  float nms_eta = 1.;
-  float score_threshold = 0.009999999776482582;
-  int background_label = 0;
-  float nms_threshold = 0.44999998807907104;
-  int N = 1;
-  int M = 1917;
-  int class_num = 21;
-  bool share_location = true;
-  std::vector<int> priors(N, M);
-
-  std::unique_ptr<arena::TestCase> tester(
-      new MulticlassNmsComputeTester(place,
-                                     "def",
-                                     priors,
-                                     class_num,
-                                     background_label,
-                                     keep_top_k,
-                                     nms_top_k,
-                                     score_threshold,
-                                     nms_threshold,
-                                     nms_eta,
-                                     share_location,
-                                     DDim({N, M, 4}),
-                                     DDim({N, class_num, M})));
-  arena::Arena arena(std::move(tester), place, 2e-5);
-  arena.TestPrecision();
-}
-
-TEST(MulticlassNms, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_multiclass_nms(place);
-#endif
-}
-
-}  // namespace lite
-}  // namespace paddle
diff --git a/lite/tools/ci_build.sh b/lite/tools/ci_build.sh
index 80432f4378..433e08c251 100755
--- a/lite/tools/ci_build.sh
+++ b/lite/tools/ci_build.sh
@@ -524,22 +524,6 @@ function build_npu {
     fi
 }
 
-function __prepare_multiclass_nms_test_files {
-    local port=$1
-    local adb_work_dir="/data/local/tmp"
-
-    wget --no-check-certificate https://raw.githubusercontent.com/jiweibo/TestData/master/multiclass_nms_bboxes_file.txt \
-        -O lite/tests/kernels/multiclass_nms_bboxes_file.txt
-    wget --no-check-certificate https://raw.githubusercontent.com/jiweibo/TestData/master/multiclass_nms_scores_file.txt \
-        -O lite/tests/kernels/multiclass_nms_scores_file.txt
-    wget --no-check-certificate https://raw.githubusercontent.com/jiweibo/TestData/master/multiclass_nms_out_file.txt \
-        -O lite/tests/kernels/multiclass_nms_out_file.txt
-
-    adb -s emulator-${port} push lite/tests/kernels/multiclass_nms_bboxes_file.txt ${adb_work_dir}
-    adb -s emulator-${port} push lite/tests/kernels/multiclass_nms_scores_file.txt ${adb_work_dir}
-    adb -s emulator-${port} push lite/tests/kernels/multiclass_nms_out_file.txt ${adb_work_dir}
-}
-
 # $1: ARM_TARGET_OS in "android" , "armlinux"
 # $2: ARM_TARGET_ARCH_ABI in "armv8", "armv7" ,"armv7hf"
 # $3: ARM_TARGET_LANG in "gcc" "clang"
@@ -562,9 +546,6 @@ function test_arm {
         return 0
     fi
 
-    echo "prepare multiclass_nms_test files..."
-    __prepare_multiclass_nms_test_files $port
-
     # prepare for CXXApi test
     local adb="adb -s emulator-${port}"
     $adb shell mkdir -p /data/local/tmp/lite_naive_model_opt
-- 
GitLab