diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 6e5ea3e8aa721844b9a1cd2a29d32d462c72fd94..c05c39e88d74adc5db2ef92662d65c95c954152c 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -45,6 +45,9 @@ if(WITH_XPU) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_xpu.cc) detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op_xpu.cc) +elseif(WITH_MLU) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc + iou_similarity_op_mlu.cc) elseif(WITH_ASCEND_CL) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) diff --git a/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d86a264b1152bbf835579cb9a88b74c547b99a3 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op_mlu.cc @@ -0,0 +1,227 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct IouFunction { + public: + explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + TransposeFromMLUTensor(ctx, axis, x, y, false /*need_reshape_or_alloc*/); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + MLUCnnlOpTensorDesc add_op_desc(CNNL_OP_TENSOR_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, add_op_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z), + ToCnnlDataType()); + } + + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + MLUCnnlOpTensorDesc sub_op_desc(CNNL_OP_TENSOR_SUB, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, sub_op_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z), + ToCnnlDataType()); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), z_desc.get(), GetBasePtr(z), + ToCnnlDataType()); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + MLUCnnlTensorDesc x_desc(*x); + 
MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + cnnlComputationPreference_t prefer = CNNL_COMPUTATION_FAST; + + MLUCnnl::DivNoNan(ctx, prefer, x_desc.get(), GetBasePtr(x), y_desc.get(), + GetBasePtr(y), z_desc.get(), GetBasePtr(z)); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + float alpha = 1.0; + float beta = scalar; + MLUCnnl::Transform(ctx, &alpha, &beta, x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y)); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + MLUCnnl::Maximum(ctx, x_desc.get(), GetBasePtr(x), y_desc.get(), + GetBasePtr(y), z_desc.get(), GetBasePtr(z)); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc z_desc(*z); + + MLUCnnl::Minimum(ctx, x_desc.get(), GetBasePtr(x), y_desc.get(), + GetBasePtr(y), z_desc.get(), GetBasePtr(z)); + } + + private: + platform::Place place; + const framework::ExecutionContext& ctx; +}; + +template +class IouSimilarityMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + bool normalized = ctx.Attr("box_normalized"); + auto* out = ctx.Output("Out"); + + auto _type = x->dtype(); + auto place = ctx.GetPlace(); + + IouFunction F(ctx); + + auto N = x->dims()[0]; + auto M = y->dims()[0]; + + out->mutable_data({N, M}, place); + Tensor xt(_type); + Tensor yt(_type); + xt.mutable_data({4, N}, place); + yt.mutable_data({4, M}, place); + std::vector vec_trans = {1, 0}; + F.Transpose(x, &xt, vec_trans); + F.Transpose(y, &yt, vec_trans); + Tensor xmin1 = xt.Slice(0, 1); + Tensor ymin1 = xt.Slice(1, 2); + Tensor 
xmax1 = xt.Slice(2, 3); + Tensor ymax1 = xt.Slice(3, 4); + Tensor xmin2 = yt.Slice(0, 1); + Tensor ymin2 = yt.Slice(1, 2); + Tensor xmax2 = yt.Slice(2, 3); + Tensor ymax2 = yt.Slice(3, 4); + xmin1.Resize({N, 1}); + ymin1.Resize({N, 1}); + xmax1.Resize({N, 1}); + ymax1.Resize({N, 1}); + xmin2.Resize({1, M}); + ymin2.Resize({1, M}); + xmax2.Resize({1, M}); + ymax2.Resize({1, M}); + + Tensor w1(_type); + Tensor h1(_type); + Tensor w2(_type); + Tensor h2(_type); + Tensor area1(_type); + Tensor area2(_type); + w1.mutable_data({N, 1}, place); + h1.mutable_data({N, 1}, place); + w2.mutable_data({1, M}, place); + h2.mutable_data({1, M}, place); + area1.mutable_data({N, 1}, place); + area2.mutable_data({1, M}, place); + F.Sub(&xmax1, &xmin1, &w1); + F.Sub(&ymax1, &ymin1, &h1); + F.Sub(&xmax2, &xmin2, &w2); + F.Sub(&ymax2, &ymin2, &h2); + if (!normalized) { + F.Adds(&w1, 1.0f, &w1); + F.Adds(&h1, 1.0f, &h1); + F.Adds(&w2, 1.0f, &w2); + F.Adds(&h2, 1.0f, &h2); + } + F.Mul(&w1, &h1, &area1); + F.Mul(&w2, &h2, &area2); + + Tensor inter_xmax(_type); + Tensor inter_ymax(_type); + Tensor inter_xmin(_type); + Tensor inter_ymin(_type); + inter_xmax.mutable_data({N, M}, place); + inter_ymax.mutable_data({N, M}, place); + inter_xmin.mutable_data({N, M}, place); + inter_ymin.mutable_data({N, M}, place); + F.Minimum(&xmax1, &xmax2, &inter_xmax); + F.Minimum(&ymax1, &ymax2, &inter_ymax); + F.Maximum(&xmin1, &xmin2, &inter_xmin); + F.Maximum(&ymin1, &ymin2, &inter_ymin); + + Tensor inter_w(_type); + Tensor inter_h(_type); + inter_w.mutable_data({N, M}, place); + inter_h.mutable_data({N, M}, place); + F.Sub(&inter_xmax, &inter_xmin, &inter_w); + F.Sub(&inter_ymax, &inter_ymin, &inter_h); + + if (!normalized) { + F.Adds(&inter_w, 1.0f, &inter_w); + F.Adds(&inter_h, 1.0f, &inter_h); + } + Tensor zeros(_type); + zeros.mutable_data({1}, place); + FillMLUTensorWithHostValue(ctx, static_cast(0), &zeros); + F.Maximum(&inter_w, &zeros, &inter_w); + F.Maximum(&inter_h, &zeros, &inter_h); + + 
F.Mul(&inter_w, &inter_h, out); + Tensor union_area(_type); + union_area.mutable_data({N, M}, place); + F.Add(&area1, &area2, &union_area); + F.Sub(&union_area, out, &union_area); + F.DivNoNan(out, &union_area, out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(iou_similarity, ops::IouSimilarityMLUKernel, + ops::IouSimilarityMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 8414a7921debc882451df4607382924463df9b2b..dd1ac8149380b89bedd61f62691d88d6924c4a97 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -2857,6 +2857,20 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { nullptr /*max_norm*/, nullptr /*norm_type*/, output_desc, output)); } +/* static */ void MLUCnnl::Transform(const ExecutionContext& ctx, + const void* alpha, const void* beta, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, + void* output) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + const cnnlPointerMode_t pointer_mode = CNNL_POINTER_MODE_HOST; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTransform_v2(handle, pointer_mode, alpha, + input_desc, input, beta, + output_desc, output)); +} + /* static */ void MLUCnnl::EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 6c5f716625c2d6fbe411556f59ae526629abc879..636618bf2d9cf906003f66453c7ff2e56784cc02 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -1289,6 +1289,12 @@ class MLUCnnl { const cnnlTensorDescriptor_t indices_desc, const int* indices, const cnnlTensorDescriptor_t output_desc, void* output); + static void 
Transform(const ExecutionContext& ctx, const void* alpha, + const void* beta, + const cnnlTensorDescriptor_t input_desc, + const void* input, + const cnnlTensorDescriptor_t output_desc, void* output); + static void EmbeddingBackward( const ExecutionContext& ctx, int padding_idx, bool scale_grad_by_freq, const cnnlTensorDescriptor_t indices_desc, const void* indices, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_iou_similarity_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_iou_similarity_op_mlu.py new file mode 100644 index 0000000000000000000000000000000000000000..4120cb8fbcc22db7011140a9bef29e38194e7e9d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_iou_similarity_op_mlu.py @@ -0,0 +1,131 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import numpy.random as random
import sys

sys.path.append("..")
import math
import paddle
from op_test import OpTest

paddle.enable_static()

np.random.seed(2022)


class TestMluIouSimilarityOp(OpTest):
    """Checks the MLU iou_similarity kernel against a NumPy reference."""

    def setUp(self):
        self.op_type = "iou_similarity"
        self.set_mlu()
        self.init_dtype()
        self.set_init_config()
        self.set_attrs()
        self.set_inputs()
        self.set_outputs()

    def set_mlu(self):
        self.__class__.use_mlu = True
        self.place = paddle.MLUPlace(0)

    def init_dtype(self):
        self.dtype = np.float32

    def set_init_config(self):
        # N boxes in X, M boxes in Y -> Out is [N, M].
        self.N = 2
        self.M = 3
        self.box_normalized = False
        self.use_lod = False

    def set_inputs(self):
        self.boxes1 = random.rand(self.N, 4).astype(self.dtype)
        self.boxes2 = random.rand(self.M, 4).astype(self.dtype)
        if self.use_lod:
            # One box per LoD "sentence" — exercises the LoDTensor input path.
            self.boxes1_lod = [[1 for _ in range(self.N)]]
            self.inputs = {
                'X': (self.boxes1, self.boxes1_lod),
                'Y': self.boxes2
            }
        else:
            self.inputs = {'X': self.boxes1, 'Y': self.boxes2}

    def set_attrs(self):
        self.attrs = {"box_normalized": self.box_normalized}

    def set_outputs(self):
        # Start from garbage so an op that writes nothing cannot pass.
        self.output = random.rand(self.N, self.M).astype(self.dtype)
        self._compute_iou()
        self.outputs = {'Out': self.output}

    def test_check_output(self):
        self.check_output_with_place(self.place)

    def _compute_iou(self):
        """NumPy reference: pairwise IoU of boxes1 x boxes2 into self.output."""
        for row in range(self.boxes1.shape[0]):
            for col in range(self.boxes2.shape[0]):
                xmin1, ymin1, xmax1, ymax1 = self.boxes1[row]
                xmin2, ymin2, xmax2, ymax2 = self.boxes2[col]
                if not self.box_normalized:
                    # Pixel coordinates: extents are inclusive (+1), matching
                    # the kernel's !box_normalized branch.
                    area1 = (ymax1 - ymin1 + 1) * (xmax1 - xmin1 + 1)
                    area2 = (ymax2 - ymin2 + 1) * (xmax2 - xmin2 + 1)
                else:
                    area1 = (ymax1 - ymin1) * (xmax1 - xmin1)
                    area2 = (ymax2 - ymin2) * (xmax2 - xmin2)

                inter_xmax = min(xmax1, xmax2)
                inter_ymax = min(ymax1, ymax2)
                inter_xmin = max(xmin1, xmin2)
                inter_ymin = max(ymin1, ymin2)
                inter_height = inter_ymax - inter_ymin
                inter_width = inter_xmax - inter_xmin
                if not self.box_normalized:
                    inter_height += 1
                    inter_width += 1
                # Disjoint boxes -> clamp intersection extents to zero.
                inter_height = max(inter_height, 0)
                inter_width = max(inter_width, 0)
                inter_area = inter_width * inter_height
                union_area = area1 + area2 - inter_area
                sim_score = inter_area / union_area
                self.output[row, col] = sim_score


class TestMluIouSimilarityOpWithLoD(TestMluIouSimilarityOp):

    def set_init_config(self):
        super(TestMluIouSimilarityOpWithLoD, self).set_init_config()
        self.box_normalized = True
        self.use_lod = True


class TestMluIouSimilarityOpWithBoxNormalized(TestMluIouSimilarityOp):

    def set_init_config(self):
        super(TestMluIouSimilarityOpWithBoxNormalized, self).set_init_config()
        self.box_normalized = True
        self.use_lod = True


# BUG FIX: this was declared with `def` instead of `class`, making it a plain
# function that unittest never collected — the fp16 path was silently untested.
class TestMluIouSimilarityOpFp16(TestMluIouSimilarityOp):

    def init_dtype(self):
        self.dtype = np.float16


if __name__ == '__main__':
    unittest.main()