diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1ba046ca6af1a95165a0bf78458a1be56e29c0e
--- /dev/null
+++ b/paddle/fluid/operators/roi_align_op_npu.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/roi_align_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+// NPU forward kernel of roi_align. RoisNum is cast to the kernel dtype T and
+// concatenated in front of ROIs; the resulting (N, 5) tensor is passed with X
+// to the NPU "ROIAlign" op. Only aligned == false is accepted.
+template <typename DeviceContext, typename T>
+class ROIAlignNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* X = ctx.Input<framework::Tensor>("X");              // (B, C, H, W)
+    auto* ROIs = ctx.Input<framework::Tensor>("ROIs");        // (N, 4)
+    auto* ROIsNum = ctx.Input<framework::Tensor>("RoisNum");  // [0 1 1 2 2 2]
+    auto* Out = ctx.Output<framework::Tensor>("Out");
+    // RoisNum is dereferenced below; report a clear error if it is missing.
+    PADDLE_ENFORCE_NOT_NULL(
+        ROIsNum, platform::errors::InvalidArgument(
+                     "ROIAlignNPU requires the RoisNum input"));
+    Out->mutable_data<T>(ctx.GetPlace());
+
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto sample_num = ctx.Attr<int>("sampling_ratio");
+    auto aligned = ctx.Attr<bool>("aligned");
+    auto roi_end_mode = 0;
+    PADDLE_ENFORCE_EQ(
+        aligned, false,
+        platform::errors::InvalidArgument(
+            "ROIAlignNPU only support Aligned attribute equaled to False"));
+
+    framework::NPUAttributeMap attr_roi = {{"spatial_scale", spatial_scale},
+                                           {"pooled_height", pooled_height},
+                                           {"pooled_width", pooled_width},
+                                           {"sample_num", sample_num},
+                                           {"roi_end_mode", roi_end_mode}};
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // Cast RoisNum into an (N, 1) column; the target dtype is read back from
+    // the column so it matches its T buffer (fixed FP32 broke double/int).
+    Tensor ROIsNum_fp(ROIs->type());
+    ROIsNum_fp.Resize(framework::make_ddim({ROIs->dims()[0], 1}));
+    ROIsNum_fp.mutable_data<T>(ctx.GetPlace());
+    int dtype = static_cast<int>(ConvertToNpuDtype(ROIsNum_fp.type()));
+    framework::NPUAttributeMap attr_cast = {{"dst_type", dtype}};
+
+    const auto& runner_c =
+        NpuOpRunner("Cast", {*ROIsNum}, {ROIsNum_fp}, attr_cast);
+    runner_c.Run(stream);
+
+    // Concatenate [RoisNum column | ROIs] along axis 1 into an (N, 5) tensor.
+    std::vector<paddle::framework::Tensor> x_list = {ROIsNum_fp, *ROIs};
+    Tensor ROIs_N5(ROIs->type());
+    ROIs_N5.Resize(framework::make_ddim({ROIs->dims()[0], 5}));
+    ROIs_N5.mutable_data<T>(ctx.GetPlace());
+
+    framework::NPUAttributeMap attr_concat = {
+        {"N", static_cast<int>(x_list.size())}, {"concat_dim", 1}};
+
+    NpuOpRunner runner0;
+    runner0.SetType("ConcatD")
+        .AddInputs(x_list)
+        .AddOutput(ROIs_N5)
+        .AddInputNames({"x0", "x1"})
+        .AddAttrs(attr_concat);
+    runner0.Run(stream);
+
+    const auto& runner =
+        NpuOpRunner("ROIAlign", {*X, ROIs_N5}, {*Out}, attr_roi);
+    runner.Run(stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias used by the registration
+REGISTER_OP_NPU_KERNEL(
+    roi_align,  // NOTE(review): confirm the NPU op supports double and int
+    ops::ROIAlignNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::ROIAlignNPUKernel<paddle::platform::NPUDeviceContext, double>,
+    ops::ROIAlignNPUKernel<paddle::platform::NPUDeviceContext, int>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2856886e08cc1ae0ca2da2dea31f073c24c2b
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_roi_align_op_npu.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+
+paddle.enable_static()
+np.random.seed(1243)  # fixed seed so the random input and ROIs are repeatable
+
+
+class TestROIAlignNPUOp(OpTest):
+    """Compares the roi_align NPU kernel with a NumPy reference result."""
+
+    def set_data(self):
+        """Builds the op inputs, attributes and expected output."""
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_align()
+
+        self.inputs = {
+            'X': self.x,
+            'ROIs': self.rois[:, 1:5],
+            'RoisNum': np.asarray(self.rois_lod[0]).astype('int32')
+        }
+
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width,
+            'sampling_ratio': self.sampling_ratio,
+            'aligned': self.aligned
+        }
+
+        self.outputs = {'Out': self.out_data}
+
+    def init_test_case(self):
+        """Sets the input shape, the op attributes and the random input X."""
+        self.batch_size = 3
+        self.channels = 3
+        self.height = 8
+        self.width = 6
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 2.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.sampling_ratio = 2
+        self.aligned = False
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def pre_calc(self, x_i, roi_xmin, roi_ymin, roi_bin_grid_h, roi_bin_grid_w,
+                 bin_size_h, bin_size_w):
+        """Gathers, for one ROI, the four neighbour pixels (bilinear_pos) and
+        the bilinear weights (bilinear_w) of each sampling point in each
+        output bin; out-of-range sampling points are skipped."""
+        count = roi_bin_grid_h * roi_bin_grid_w
+        bilinear_pos = np.zeros(
+            [self.channels, self.pooled_height, self.pooled_width, count, 4],
+            np.float32)
+        bilinear_w = np.zeros(
+            [self.pooled_height, self.pooled_width, count, 4], np.float32)
+        # ph spans pooled_height (rows); pw spans pooled_width (columns).
+        for ph in range(self.pooled_height):
+            for pw in range(self.pooled_width):
+                c = 0
+                for iy in range(roi_bin_grid_h):
+                    y = roi_ymin + ph * bin_size_h + (iy + 0.5) * \
+                        bin_size_h / roi_bin_grid_h
+                    for ix in range(roi_bin_grid_w):
+                        x = roi_xmin + pw * bin_size_w + (ix + 0.5) * \
+                            bin_size_w / roi_bin_grid_w
+                        if y < -1.0 or y > self.height or \
+                               x < -1.0 or x > self.width:
+                            continue
+                        if y <= 0:
+                            y = 0
+                        if x <= 0:
+                            x = 0
+                        y_low = int(y)
+                        x_low = int(x)
+                        if y_low >= self.height - 1:
+                            y = y_high = y_low = self.height - 1
+                        else:
+                            y_high = y_low + 1
+                        if x_low >= self.width - 1:
+                            x = x_high = x_low = self.width - 1
+                        else:
+                            x_high = x_low + 1
+                        ly = y - y_low
+                        lx = x - x_low
+                        hy = 1 - ly
+                        hx = 1 - lx
+                        bilinear_pos[:, ph, pw, c, 0] = x_i[:, y_low, x_low]
+                        bilinear_pos[:, ph, pw, c, 1] = x_i[:, y_low, x_high]
+                        bilinear_pos[:, ph, pw, c, 2] = x_i[:, y_high, x_low]
+                        bilinear_pos[:, ph, pw, c, 3] = x_i[:, y_high, x_high]
+                        bilinear_w[ph, pw, c, 0] = hy * hx
+                        bilinear_w[ph, pw, c, 1] = hy * lx
+                        bilinear_w[ph, pw, c, 2] = ly * hx
+                        bilinear_w[ph, pw, c, 3] = ly * lx
+                        c = c + 1
+        return bilinear_pos, bilinear_w
+
+    def calc_roi_align(self):
+        """Computes the expected output self.out_data with NumPy."""
+        self.out_data = np.zeros(
+            (self.rois_num, self.channels, self.pooled_height,
+             self.pooled_width)).astype('float32')
+
+        offset = 0.5 if self.aligned else 0.
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = int(roi[0])
+            x_i = self.x[roi_batch_id]
+            roi_xmin = roi[1] * self.spatial_scale - offset
+            roi_ymin = roi[2] * self.spatial_scale - offset
+            roi_xmax = roi[3] * self.spatial_scale - offset
+            roi_ymax = roi[4] * self.spatial_scale - offset
+
+            roi_width = roi_xmax - roi_xmin
+            roi_height = roi_ymax - roi_ymin
+            if not self.aligned:
+                roi_width = max(roi_width, 1)
+                roi_height = max(roi_height, 1)
+
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+            roi_bin_grid_h = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_height / self.pooled_height)
+            roi_bin_grid_w = self.sampling_ratio if self.sampling_ratio > 0 else \
+                                 math.ceil(roi_width / self.pooled_width)
+            bilinear_pos, bilinear_w = self.pre_calc(x_i, roi_xmin, roi_ymin,
+                                                     int(roi_bin_grid_h),
+                                                     int(roi_bin_grid_w),
+                                                     bin_size_h, bin_size_w)
+            for ch in range(self.channels):
+                align_per_bin = (bilinear_pos[ch] * bilinear_w).sum(axis=-1)
+                output_val = align_per_bin.mean(axis=-1)
+                self.out_data[i, ch, :, :] = output_val
+
+    def make_rois(self):
+        """Creates one random ROI [batch_id, x1, y1, x2, y2] per image."""
+        rois = []
+        self.rois_lod = [[]]
+        for bno in range(self.batch_size):
+            # rois_lod[0] records the batch index of every ROI.
+            self.rois_lod[0].append(bno)
+            x1 = np.random.randint(
+                0, self.width // self.spatial_scale - self.pooled_width)
+            y1 = np.random.randint(
+                0, self.height // self.spatial_scale - self.pooled_height)
+
+            x2 = np.random.randint(x1 + self.pooled_width,
+                                   self.width // self.spatial_scale)
+            y2 = np.random.randint(y1 + self.pooled_height,
+                                   self.height // self.spatial_scale)
+
+            roi = [bno, x1, y1, x2, y2]
+            rois.append(roi)
+
+        self.rois_num = len(rois)
+        self.rois = np.array(rois).astype("float32")
+
+    def setUp(self):
+        self.op_type = "roi_align"
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Out')
+
+
+class TestROIAlignOpWithMinusSample(TestROIAlignNPUOp):
+    """Same case as the parent test, but with sampling_ratio = -1."""
+
+    def init_test_case(self):
+        """Configures a 3x3x8x6 input and a non-positive sampling ratio."""
+        # Input layout is (n, c, h, w).
+        self.batch_size, self.channels = 3, 3
+        self.height, self.width = 8, 6
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        # 2x2 output bins; ROI coordinates are scaled by 0.5.
+        self.spatial_scale = 0.5
+        self.pooled_height = self.pooled_width = 2
+        self.sampling_ratio = -1
+        self.aligned = False
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+
+if __name__ == '__main__':
+    unittest.main()  # run the test cases above when executed as a script