diff --git a/cmake/lite.cmake b/cmake/lite.cmake
index 9a633409cd4d1c5e650a4794fcf30b9154c8638a..66abeab483beacc3d466f626be3b0659516c4162 100644
--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -413,7 +413,9 @@ function(add_kernel TARGET device level)
   if ("${device}" STREQUAL "MLU")
     if (NOT LITE_WITH_MLU)
       foreach(src ${args_SRCS})
-        file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        if (NOT (src MATCHES ".*\\.o"))
+          file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+        endif()
       endforeach()
       return()
     endif()
@@ -446,7 +448,13 @@ function(add_kernel TARGET device level)
 
   # the source list will collect for paddle_use_kernel.h code generation.
   foreach(src ${args_SRCS})
-    file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    if (LITE_WITH_MLU)
+      if (NOT (src MATCHES ".*\\.o"))
+        file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+      endif()
+    else()
+      file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
+    endif()
   endforeach()
 
   lite_cc_library(${TARGET} SRCS ${args_SRCS}
diff --git a/lite/kernels/mlu/CMakeLists.txt b/lite/kernels/mlu/CMakeLists.txt
index 5557f86c589951b514b0d44f55c8be8f2a825e0f..eed70d7fa6c39c7de1e03f2fc03a824ab50c1f3a 100644
--- a/lite/kernels/mlu/CMakeLists.txt
+++ b/lite/kernels/mlu/CMakeLists.txt
@@ -7,5 +7,11 @@ add_kernel(subgraph_compute_mlu MLU basic SRCS subgraph_compute.cc DEPS ${lite_k
 add_kernel(io_copy_compute_mlu MLU basic SRCS io_copy_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
 add_kernel(calib_compute_mlu MLU basic SRCS calib_compute.cc DEPS ${lite_kernel_deps})
 # depend on transpose function in backend/x86/math/math_function
+add_kernel(roi_align_compute_mlu MLU extra SRCS roi_align_compute.cc mlu_kernel/roi_align_kernel.o DEPS ${lite_kernel_deps})
+
+if(LITE_BUILD_EXTRA)
+  lite_cc_test(test_roi_align_compute_mlu SRCS roi_align_compute_test.cc DEPS roi_align_compute_mlu)
+endif()
+
 add_kernel(layout_compute_mlu MLU basic SRCS layout_compute.cc DEPS ${lite_kernel_deps} ${math_function} ${target_wrapper_mlu})
 add_kernel(cast_compute_mlu MLU basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} ${target_wrapper_mlu})
diff --git a/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o b/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o
new file mode 100644
index 0000000000000000000000000000000000000000..d43056577e894203a0cfdd0f612478e22b7718e1
Binary files /dev/null and b/lite/kernels/mlu/mlu_kernel/roi_align_kernel.o differ
diff --git a/lite/kernels/mlu/roi_align_compute.cc b/lite/kernels/mlu/roi_align_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..52bfdfea15b60cd190ca74854fb838a75c03f921
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute.cc
@@ -0,0 +1,198 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
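+//
+// Host-side wrapper for the RoI Align MLU kernel: it converts the NCHW
+// float32 inputs into the NHWC float16 layout expected by the prebuilt BANG
+// kernel (mlu_kernel/roi_align_kernel.o), copies the data to the device,
+// launches the kernel, and converts the result back to NCHW float32.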
+
+#include "lite/kernels/mlu/roi_align_compute.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+void RoiAlignCompute::Run() {
+  auto& mlu_context = this->ctx_->template As<MLUContext>();
+  auto& exec_queue = mlu_context.exec_queue();
+  this->Run(exec_queue);
+}
+
+void RoiAlignCompute::Run(const cnrtQueue_t& exec_queue) {
+  auto& param = this->Param<param_t>();
+
+  auto* rois = param.ROIs;
+  auto rois_dims = rois->dims();
+  int rois_num = rois_dims[0];
+  if (rois_num == 0) {
+    return;
+  }
+
+  auto* in = param.X;
+  auto* out = param.Out;
+  float spatial_scale = param.spatial_scale;
+  int pooled_height = param.pooled_height;
+  int pooled_width = param.pooled_width;
+  int sampling_ratio = param.sampling_ratio;
+
+  half spatial_scale_half;
+  cnrtConvertFloatToHalf(&spatial_scale_half, spatial_scale);
+
+  auto in_dims = in->dims();
+  // int batch_size = in_dims[0];
+  int channels = in_dims[1];
+  int height = in_dims[2];
+  int width = in_dims[3];
+  auto out_dims = out->dims();
+
+  // map each RoI to the batch index of the image it belongs to, using the LoD
+  std::vector<int> roi_ind_vec(rois_num);
+  auto rois_lod = rois->lod().back();
+  for (int n = 0, rois_batch_size = rois_lod.size() - 1; n < rois_batch_size;
+       ++n) {
+    for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
+      roi_ind_vec[i] = n;
+    }
+  }
+
+  auto* input_data = in->data<float>();
+  auto* output_data = out->mutable_data<float>();
+  auto* rois_data = rois->data<float>();
+
+  std::vector<half> input_tmp_vec(in_dims.production());
+  std::vector<half> rois_tmp_vec(rois_dims.production());
+  std::vector<half> output_tmp_vec(out_dims.production());
+
+  // convert the input from NCHW float32 to NHWC float16, and the RoIs to
+  // float16
+  std::vector<int> nchw2nhwc_dimorder{0, 2, 3, 1};
+  std::vector<int> tmp_in_dims;
+  for (int i = 0; i < in_dims.size(); i++) {
+    tmp_in_dims.emplace_back(static_cast<int>(in_dims[i]));
+  }
+  cnrtTransOrderAndCast(const_cast<float*>(input_data),
+                        CNRT_FLOAT32,
+                        input_tmp_vec.data(),
+                        CNRT_FLOAT16,
+                        NULL,
+                        tmp_in_dims.size(),
+                        tmp_in_dims.data(),
+                        nchw2nhwc_dimorder.data());
+  cnrtCastDataType(const_cast<float*>(rois_data),
+                   CNRT_FLOAT32,
+                   rois_tmp_vec.data(),
+                   CNRT_FLOAT16,
+                   rois_dims.production(),
+                   NULL);
+
+  // copy the inputs to device memory
+  void *input_mlu_data = nullptr, *rois_mlu_data = nullptr,
+       *roi_batch_id_mlu_data = nullptr, *output_mlu_data = nullptr;
+  cnrtMalloc(&input_mlu_data,
+             input_tmp_vec.size() * sizeof(input_tmp_vec.front()));
+  cnrtMemcpy(input_mlu_data,
+             input_tmp_vec.data(),
+             input_tmp_vec.size() * sizeof(input_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+  cnrtMalloc(&rois_mlu_data,
+             rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()));
+  cnrtMemcpy(rois_mlu_data,
+             rois_tmp_vec.data(),
+             rois_tmp_vec.size() * sizeof(rois_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+  cnrtMalloc(&roi_batch_id_mlu_data,
+             roi_ind_vec.size() * sizeof(roi_ind_vec.front()));
+  cnrtMemcpy(roi_batch_id_mlu_data,
+             roi_ind_vec.data(),
+             roi_ind_vec.size() * sizeof(roi_ind_vec.front()),
+             CNRT_MEM_TRANS_DIR_HOST2DEV);
+
+  // malloc output memory on device
+  cnrtMalloc(&output_mlu_data,
+             output_tmp_vec.size() * sizeof(output_tmp_vec.front()));
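+
+  // NOTE: the arguments must be added in exactly the order declared by
+  // roi_align_kernel() in roi_align_kernel.h.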
+  // prepare kernel params
+  cnrtKernelParamsBuffer_t params;
+  cnrtGetKernelParamsBuffer(&params);
+  cnrtKernelParamsBufferAddParam(
+      params, &input_mlu_data, sizeof(input_mlu_data));
+  cnrtKernelParamsBufferAddParam(params, &rois_mlu_data, sizeof(rois_mlu_data));
+  cnrtKernelParamsBufferAddParam(
+      params, &roi_batch_id_mlu_data, sizeof(roi_batch_id_mlu_data));
+  cnrtKernelParamsBufferAddParam(
+      params, &output_mlu_data, sizeof(output_mlu_data));
+  cnrtKernelParamsBufferAddParam(params, &height, sizeof(height));
+  cnrtKernelParamsBufferAddParam(params, &width, sizeof(width));
+  cnrtKernelParamsBufferAddParam(params, &channels, sizeof(channels));
+  cnrtKernelParamsBufferAddParam(params, &pooled_height, sizeof(pooled_height));
+  cnrtKernelParamsBufferAddParam(params, &pooled_width, sizeof(pooled_width));
+  cnrtKernelParamsBufferAddParam(params, &rois_num, sizeof(rois_num));
+  cnrtKernelParamsBufferAddParam(
+      params, &spatial_scale_half, sizeof(spatial_scale_half));
+  cnrtKernelParamsBufferAddParam(
+      params, &sampling_ratio, sizeof(sampling_ratio));
+
+  cnrtDim3_t task_dims;
+  task_dims.x = 1, task_dims.y = 1, task_dims.z = 1;
+  cnrtFunctionType_t func_type = CNRT_FUNC_TYPE_BLOCK;
+
+  // invoke the kernel and wait for the queue to drain
+  CNRT_CALL(cnrtInvokeKernel_V2(reinterpret_cast<void*>(&roi_align_kernel),
+                                task_dims,
+                                params,
+                                func_type,
+                                exec_queue));
+  CNRT_CALL(cnrtSyncQueue(exec_queue));
+
+  // copy the result back and convert it from NHWC float16 to NCHW float32
+  cnrtMemcpy(output_tmp_vec.data(),
+             output_mlu_data,
+             output_tmp_vec.size() * sizeof(output_tmp_vec.front()),
+             CNRT_MEM_TRANS_DIR_DEV2HOST);
+  std::vector<int> tmp_out_dims;
+  for (int i = 0; i < out_dims.size(); i++) {
+    // out_dims = {N, C, H, W}, tmp_out_dims = {N, H, W, C}
+    tmp_out_dims.emplace_back(out_dims[nchw2nhwc_dimorder[i]]);
+  }
+  std::vector<int> nhwc2nchw_dimorder{0, 3, 1, 2};
+  cnrtTransOrderAndCast(output_tmp_vec.data(),
+                        CNRT_FLOAT16,
+                        output_data,
+                        CNRT_FLOAT32,
+                        NULL,
+                        tmp_out_dims.size(),
+                        tmp_out_dims.data(),
+                        nhwc2nchw_dimorder.data());
+
+  // release resources
+  cnrtDestroyKernelParamsBuffer(params);
+  cnrtFree(input_mlu_data);
+  cnrtFree(rois_mlu_data);
+  cnrtFree(roi_batch_id_mlu_data);
+  cnrtFree(output_mlu_data);
+}
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
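+// X, ROIs and Out are bound to kHost tensors because this kernel performs its
+// own host<->device copies and layout/precision conversions.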
+REGISTER_LITE_KERNEL(roi_align,
+                     kMLU,
+                     kFloat,
+                     kNCHW,
+                     paddle::lite::kernels::mlu::RoiAlignCompute,
+                     def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW))})
+    .BindInput("ROIs",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kFloat))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kNCHW))})
+    .Finalize();
diff --git a/lite/kernels/mlu/roi_align_compute.h b/lite/kernels/mlu/roi_align_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa571efee012d2bee291138653f079ec0028b175
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute.h
@@ -0,0 +1,49 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+
+#include "lite/core/kernel.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/type_system.h"
+#include "lite/kernels/mlu/bridges/utility.h"
+#include "lite/kernels/mlu/roi_align_kernel.h"
+#include "lite/operators/layout_op.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+class RoiAlignCompute
+    : public KernelLite<TARGET(kMLU), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
+ public:
+  using param_t = operators::RoiAlignParam;
+
+  void Run() override;
+  void Run(const cnrtQueue_t& exec_queue);
+
+  std::string doc() const override { return "Mlu roi align"; }
+
+  virtual ~RoiAlignCompute() = default;
+};
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/mlu/roi_align_compute_test.cc b/lite/kernels/mlu/roi_align_compute_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9cbcc74136d7d8c815e4b964f68e6b1d83377d98
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_compute_test.cc
@@ -0,0 +1,132 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/mlu/roi_align_compute.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace mlu {
+
+TEST(roi_align_mlu, retrive_op) {
+  auto roi_align =
+      KernelRegistry::Global().Create<TARGET(kMLU),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kNCHW)>("roi_align");
+  ASSERT_FALSE(roi_align.empty());
+  ASSERT_TRUE(roi_align.front());
+}
+
+TEST(roi_align_mlu, init) {
+  RoiAlignCompute roi_align;
+  ASSERT_EQ(roi_align.precision(), PRECISION(kFloat));
+  ASSERT_EQ(roi_align.target(), TARGET(kMLU));
+}
+
+TEST(roi_align_mlu, run_test) {
+  constexpr int ROI_SIZE = 4;
+
+  // image_height * spatial_scale == featuremap_height (likewise for width)
+  constexpr int batch_size = 2, channels = 3, featuremap_height = 9,
+                featuremap_width = 16, pooled_height = 2, pooled_width = 1,
+                num_rois = 3, sampling_rate = 2;
+  constexpr float spatial_scale = 0.5;
+
+  lite::Tensor x, rois, out;
+
+  x.Resize(
+      lite::DDim({batch_size, channels, featuremap_height, featuremap_width}));
+  rois.Resize(lite::DDim({num_rois, ROI_SIZE}));
+  // the LoD uses the offset representation: batch 0 owns RoIs [0, 1),
+  // batch 1 owns RoIs [1, num_rois)
+  rois.set_lod({{0, 1, num_rois}});
+  out.Resize(lite::DDim({num_rois, channels, pooled_height, pooled_width}));
+
+  auto x_data = x.mutable_data<float>();
+  auto rois_data = rois.mutable_data<float>();
+  auto out_data = out.mutable_data<float>();
+
+  // x = {0.0, 1.0, ...}, rois = {0.25, 1.25, ...}
+  std::iota(x_data, x_data + x.dims().production(), 0.0f);
+  std::iota(rois_data, rois_data + rois.dims().production(), 0.25f);
+  RoiAlignCompute roi_align_op;
+
+  operators::RoiAlignParam param;
+  param.X = &x;
+  param.ROIs = &rois;
+  param.Out = &out;
+  param.pooled_height = pooled_height;
+  param.pooled_width = pooled_width;
+  param.spatial_scale = spatial_scale;
+  param.sampling_ratio = sampling_rate;
+
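+  // Run(exec_queue) is called directly with a raw CNRT queue below, so no
+  // KernelContext needs to be attached for this test.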
+  // std::unique_ptr<KernelContext> ctx(new KernelContext);
+  // ctx->As<MLUContext>();
+  // roi_align_op.SetContext(std::move(ctx));
+
+  CNRT_CALL(cnrtInit(0));
+  // cnrtInvokeFuncParam_t forward_param;
+  // u32_t affinity = 1;
+  // int data_param = 1;
+  // forward_param.data_parallelism = &data_param;
+  // forward_param.affinity = &affinity;
+  // forward_param.end = CNRT_PARAM_END;
+  cnrtDev_t dev_handle;
+  CNRT_CALL(cnrtGetDeviceHandle(&dev_handle, 0));
+  CNRT_CALL(cnrtSetCurrentDevice(dev_handle));
+  cnrtQueue_t queue;
+  CNRT_CALL(cnrtCreateQueue(&queue));
+
+  roi_align_op.SetParam(param);
+  roi_align_op.Run(queue);
+
+  CNRT_CALL(cnrtDestroyQueue(queue));
+
+  std::vector<float> ref_results = {14.625,
+                                    22.625,
+                                    158.625,
+                                    166.625,
+                                    302.625,
+                                    310.625,
+
+                                    480.625,
+                                    488.625,
+                                    624.625,
+                                    632.625,
+                                    768.625,
+                                    776.625,
+
+                                    514.625,
+                                    522.625,
+                                    658.625,
+                                    666.625,
+                                    802.625,
+                                    810.625};
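+  // the MLU kernel computes in float16, so compare with a relative tolerance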
+  for (int i = 0; i < out.dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], ref_results[i], (4e-3f * ref_results[i]));
+  }
+}
+
+}  // namespace mlu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_KERNEL(roi_align, kMLU, kFloat, kNCHW, def);
diff --git a/lite/kernels/mlu/roi_align_kernel.h b/lite/kernels/mlu/roi_align_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..a2298a1f89c94f51367af1a473aff2734457958a
--- /dev/null
+++ b/lite/kernels/mlu/roi_align_kernel.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
+#define LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+typedef uint16_t half;
+
+/**
+ * @brief RoI Align converts regions of interest of varying sizes into
+ * fixed-size feature maps using bilinear interpolation. Each proposal region
+ * is divided into pooled_height x pooled_width equally sized bins whose
+ * positions stay fixed. Inside each bin, sampling_ratio points are sampled
+ * (if sampling_ratio <= 0, the sampling grid adapts to the bin size); each
+ * sampled point is computed by bilinear interpolation, and the average of the
+ * sampled points becomes the output value of that bin.
+ *
+ * @param[in] input: 4-D tensor of shape [N, H, W, C]; N is the batch size, C
+ * the number of input channels, H the feature height and W the feature width.
+ * Data type is float16.
+ * @param[in] rois: 2-D tensor of shape [num_rois, 4] holding the RoIs (regions
+ * of interest) to be pooled, e.g. [[x1, y1, x2, y2], ...], where (x1, y1) is
+ * the top-left corner and (x2, y2) the bottom-right corner. Data type is
+ * float16.
+ * @param[in] roi_ind: 1-D tensor of shape [num_rois] with values in
+ * [0, batch); roi_ind[i] is the index of the image the i-th RoI refers to.
+ * Data type is int.
+ * @param[out] output: 4-D tensor of shape [num_rois, pooled_height,
+ * pooled_width, C]. Data type is float16.
+ * @param[in] height: the height of the input feature map
+ * @param[in] width: the width of the input feature map
+ * @param[in] channels: the number of channels of the input
+ * @param[in] pooled_height: output height after pooling
+ * @param[in] pooled_width: output width after pooling
+ * @param[in] rois_num: the number of RoIs
+ * @param[in] spatial_scale: multiplicative scale factor that maps RoI
+ * coordinates from the image scale to the feature-map scale, i.e.
+ * image_height * spatial_scale == featuremap_height (likewise for width)
+ * @param[in] sampling_ratio: the number of sampling points per bin in each
+ * direction; if <= 0, it adapts to roi_width / pooled_width (and likewise for
+ * height)
+ * @retval void
+ */
+void roi_align_kernel(half *input,
+                      half *rois,
+                      int *roi_ind,
+                      half *output,
+                      const int height,
+                      const int width,
+                      const int channels,
+                      const int pooled_height,
+                      const int pooled_width,
+                      const int rois_num,
+                      const half spatial_scale,
+                      const int sampling_ratio);
+
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // LITE_KERNELS_MLU_ROI_ALIGN_KERNEL_H_
diff --git a/lite/kernels/x86/roi_align_compute.cc b/lite/kernels/x86/roi_align_compute.cc
index 26efd9160c59d0a45e53800d62e050bbfd941799..3c0614ebf4e1d888e836cc20a096a4981b280a94 100644
--- a/lite/kernels/x86/roi_align_compute.cc
+++ b/lite/kernels/x86/roi_align_compute.cc
@@ -13,9 +13,11 @@
 // limitations under the License.
 
 #include "lite/kernels/x86/roi_align_compute.h"
+
 #include
 #include
 #include
+
 #include "lite/core/op_registry.h"
 #include "lite/core/tensor.h"
 #include "lite/core/type_system.h"