diff --git a/paddle/fluid/operators/clip_by_norm_op_npu.cc b/paddle/fluid/operators/clip_by_norm_op_npu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e6b46421afa7268e026cfb79438f59a2c5457cdf
--- /dev/null
+++ b/paddle/fluid/operators/clip_by_norm_op_npu.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/clip_by_norm_op.h"
+#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class NPUClipByNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<float>("max_norm");
+    auto in_var = context.InputVar("X");
+
+    if (!(in_var->IsType<framework::LoDTensor>())) {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Invalid input variable type, only support LoDTensor "
+          "type, but got type is %s.",
+          framework::ToTypeName(in_var->Type())));
+    }
+
+    auto place = context.GetPlace();
+    auto& dev_ctx =
+        context.template device_context<paddle::platform::NPUDeviceContext>();
+    auto stream = dev_ctx.stream();
+
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(place);
+
+    PADDLE_ENFORCE_NOT_NULL(input,
+                            platform::errors::InvalidArgument(
+                                "Input(X) of ClipByNormOp should not be null. "
+                                "Please check if it is created correctly."));
+
+    Tensor square_sum(input->type());
+    square_sum.mutable_data<T>(framework::DDim({1}), place);
+    const auto& x_dims = input->dims();
+    std::vector<int> axis;
+    for (int i = 0; i < x_dims.size(); ++i) {
+      axis.push_back(i);
+    }
+    const auto& square_sum_runner =
+        NpuOpRunner("SquareSumV1", {*input}, {square_sum},
+                    {{"axis", axis}, {"keep_dims", false}});
+    square_sum_runner.Run(stream);
+
+    Tensor x_norm(input->type());
+    x_norm.mutable_data<T>(framework::DDim({1}), place);
+    const auto& x_norm_runner = NpuOpRunner("Sqrt", {square_sum}, {x_norm}, {});
+    x_norm_runner.Run(stream);
+
+    Tensor x_norm_t;
+    framework::TensorCopySync(x_norm, platform::CPUPlace(), &x_norm_t);
+    auto x_norm_v = static_cast<float>(*x_norm_t.data<T>());
+    if (x_norm_v <= max_norm) {
+      framework::TensorCopy(*input, place, dev_ctx, output);
+    } else {
+      auto epsilon = x_norm_v <= static_cast<float>(1e-30)
+                         ? static_cast<float>(1e-6)
+                         : static_cast<float>(0);
+      float scaling = max_norm / (x_norm_v + epsilon);
+      const auto& muls_runner =
+          NpuOpRunner("Muls", {*input}, {*output}, {{"value", scaling}});
+      muls_runner.Run(stream);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(
+    clip_by_norm,
+    ops::NPUClipByNormKernel<plat::NPUDeviceContext, float>,
+    ops::NPUClipByNormKernel<plat::NPUDeviceContext, plat::float16>);
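
Note on the kernel above: it computes a single global L2 norm (SquareSumV1 over all axes, then Sqrt), copies that scalar back to the host, and then either passes the input through unchanged (norm <= max_norm) or scales it by max_norm / (norm + epsilon) via Muls. A minimal NumPy sketch of those semantics, for illustration only (the helper name clip_by_norm_ref is hypothetical, not part of the patch):

import numpy as np

def clip_by_norm_ref(x, max_norm):
    # Global L2 norm over all axes, mirroring SquareSumV1 + Sqrt.
    norm = np.sqrt(np.sum(np.square(x)))
    if norm <= max_norm:
        return x
    # Same epsilon guard as the kernel's near-zero-norm branch.
    epsilon = 1e-6 if norm <= 1e-30 else 0.0
    return x * (max_norm / (norm + epsilon))

The unit test below checks the kernel against this same reference computation.
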
diff --git a/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
new file mode 100644
index 0000000000000000000000000000000000000000..d71fc142ade3a6f0d05996a778daf8793873634a
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_clip_by_norm_op_npu.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import Program, program_guard
+import sys
+sys.path.append("..")
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+class TestClipByNormOp(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.max_relative_error = 0.006
+        self.init_dtype()
+        self.initTestCase()
+        input = np.random.random(self.shape).astype(self.dtype)
+        input[np.abs(input) < self.max_relative_error] = 0.5
+        self.op_type = "clip_by_norm"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['max_norm'] = self.max_norm
+        norm = np.sqrt(np.sum(np.square(input)))
+        if norm > self.max_norm:
+            output = self.max_norm * input / norm
+        else:
+            output = input
+        self.outputs = {'Out': output}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1.0
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+
+class TestCase1(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestCase2(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestCase3(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+class TestClipByNormOpFp16(TestClipByNormOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-3)
+
+
+class TestClipByNormOpFp16Case1(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestClipByNormOpFp16Case2(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestClipByNormOpFp16Case3(TestClipByNormOpFp16):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+if __name__ == '__main__':
+    unittest.main()
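
Usage sketch (not part of the patch): under static graph mode the op registered above is reachable through the existing fluid.layers.clip_by_norm wrapper; the shape, max_norm value, and NPUPlace(0) device index below are illustrative assumptions.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
main_prog = fluid.Program()
startup_prog = fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    x = fluid.data(name='x', shape=[16, 16], dtype='float32')
    # Dispatches to NPUClipByNormKernel when run on an NPUPlace.
    out = fluid.layers.clip_by_norm(x=x, max_norm=0.1)

place = paddle.NPUPlace(0)  # assumes NPU device 0 is available
exe = fluid.Executor(place)
exe.run(startup_prog)
out_np, = exe.run(main_prog,
                  feed={'x': np.random.random((16, 16)).astype('float32')},
                  fetch_list=[out])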