diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index cf4b17747ad3b3981b446be1cd36bac8b47ff6e2..62cef5e2694c1c39515bd3ecc2d74a86d905d8a0 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -145,7 +145,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize) {
 
 ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
                                       const ir::LoweredFunc& old_func,
-                                      ir::Expr& body) {
+                                      ir::Expr& body) {  // NOLINT
   ir::ModuleExpr mod_expr(std::vector<ir::Expr>({body}));
   ir::IRSchedule ir_sch(mod_expr);
 
diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.h b/paddle/cinn/auto_schedule/analysis/analyze_ir.h
index 4e48be04ee5fc11c431cf9adbea45cc5d712a400..f59aa06b2dcaf6db2dc4c236c3dc13a0c16a6218 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.h
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.h
@@ -46,7 +46,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize);
  */
 ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
                                       const ir::LoweredFunc& old_func,
-                                      ir::Expr& body);
+                                      ir::Expr& body);  // NOLINT
 
 }  // namespace auto_schedule
 }  // namespace cinn
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
index 02090467049a08b3cf01b497bd53b358d0c45270..8a556dfdf488fbbec463841c654fe7aa8872f99c 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_inline.h
@@ -64,7 +64,7 @@ class AutoInline : public AutoGenRule {
                                         const std::string& block_name) override;
 
  private:
-  void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);  // NOLINT
 
  private:
   std::vector<ir::Expr> all_block_realizes_;
diff --git a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
index 378a67c1620d08d36dcc4c6b7caef3dc0c724d2a..cd26bc69b8531938589c4e699ac4ef78ffe131cd 100644
--- a/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
+++ b/paddle/cinn/auto_schedule/search_space/auto_gen_rule/multi_level_tiling.h
@@ -120,9 +120,12 @@ class MultiLevelTiling : public AutoGenRule {
   }
 
  private:
-  void ApplyTiling(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
-  void ApplyCacheRead(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
-  void ApplyCacheWrite(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void ApplyTiling(ir::IRSchedule* ir_schedule,
+                   ir::Expr& block_expr);  // NOLINT
+  void ApplyCacheRead(ir::IRSchedule* ir_schedule,
+                      ir::Expr& block_expr);  // NOLINT
+  void ApplyCacheWrite(ir::IRSchedule* ir_schedule,
+                       ir::Expr& block_expr);  // NOLINT
 
  private:
   std::vector<ir::Expr> all_block_realizes_;
diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc
index f988a03eb301b6a35974ae7ec046ff20dee4e677..cc658934dd4da4ddd36ad8ea1e9155e9f4daf592 100644
--- a/paddle/cinn/auto_schedule/task/task_optimizer.cc
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc
@@ -51,7 +51,7 @@ using cinn::hlir::op::ExternalApiRegistry;
 // *** update a scheduled function with several post-processors
 ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target,
                                     const ir::LoweredFunc& old_func,
-                                    ir::Expr& body);
+                                    ir::Expr& body);  // NOLINT
 // check whether a scheduled lowered function is valid
 bool PruneInvalid(const ir::LoweredFunc& lowered_func,
                   const common::Target& target);
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.cc b/paddle/cinn/backends/llvm/codegen_llvm.cc
index 5e49c36525b307c7ab4d78041a5a797ab308b32f..57dc97ecc94f9108a4dc62e04986d90a5ed62047 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.cc
+++ b/paddle/cinn/backends/llvm/codegen_llvm.cc
@@ -1038,13 +1038,13 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_Tensor_ *op) {
 
 template <typename T,
           std::enable_if_t<std::is_same<const ir::Expr &, T>::value, int> = 0>
-void appendBody(std::vector<Expr> &new_body, T &&v) {
+void appendBody(std::vector<Expr> &new_body, T &&v) {  // NOLINT
   new_body.push_back(v);
 }
 
 template <typename T,
           std::enable_if_t<!std::is_same<const ir::Expr &, T>::value, int> = 1>
-void appendBody(std::vector<Expr> &new_body, T &&v) {
+void appendBody(std::vector<Expr> &new_body, T &&v) {  // NOLINT
   new_body.insert(new_body.end(), v.begin(), v.end());
 }
 
diff --git a/paddle/cinn/backends/llvm/codegen_llvm.h b/paddle/cinn/backends/llvm/codegen_llvm.h
index facf13d05147b90795b5e1920fee951e0a4cbd02..bf5be73adcb11fc36bdfd006d546d4d95d851c36 100644
--- a/paddle/cinn/backends/llvm/codegen_llvm.h
+++ b/paddle/cinn/backends/llvm/codegen_llvm.h
@@ -98,7 +98,7 @@ class SymbolTable {
 };
 
 struct SymbolTableGuard {
-  explicit SymbolTableGuard(SymbolTable &symbol_table)
+  explicit SymbolTableGuard(SymbolTable &symbol_table)  // NOLINT
       : symbol_table_(symbol_table) {
     symbol_table.PushScope();
   }
diff --git a/paddle/cinn/common/arithmatic.cc b/paddle/cinn/common/arithmatic.cc
index 44ed4846e782eabcce2df48b40d6c3e41f07ad03..6dff837ca10173178cccab5a219b1cfbec9c04b8 100644
--- a/paddle/cinn/common/arithmatic.cc
+++ b/paddle/cinn/common/arithmatic.cc
@@ -180,7 +180,8 @@ class GiNaCToExprVisitor : public GiNaC::symbol::visitor,
   ir::Expr cur;
 
  public:
-  explicit GiNaCToExprVisitor(std::map<std::string, ir::Expr>& repr_to_expr)
+  explicit GiNaCToExprVisitor(
+      std::map<std::string, ir::Expr>& repr_to_expr)  // NOLINT
       : repr_to_expr(repr_to_expr) {}
 
   Expr operator()(GiNaC::ex ex) {
diff --git a/paddle/cinn/common/float16.h b/paddle/cinn/common/float16.h
index 3cdaa8b2f4c39c5fcee277b100b94c18ad62d9c9..d64731387d596aa668aa39294a28f6c1e21cea04 100644
--- a/paddle/cinn/common/float16.h
+++ b/paddle/cinn/common/float16.h
@@ -500,25 +500,25 @@ __host__ __device__ inline float16 operator-(const float16& a) {
 #endif
 }
 
-__host__ __device__ inline float16& operator+=(float16& a,
+__host__ __device__ inline float16& operator+=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }
 
-__host__ __device__ inline float16& operator-=(float16& a,
+__host__ __device__ inline float16& operator-=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }
 
-__host__ __device__ inline float16& operator*=(float16& a,
+__host__ __device__ inline float16& operator*=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }
 
-__host__ __device__ inline float16& operator/=(float16& a,
+__host__ __device__ inline float16& operator/=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a / b;
   return a;
diff --git a/paddle/cinn/frontend/computation.cc b/paddle/cinn/frontend/computation.cc
index 868dc50807e9e4e83fbdc4112d432664e9b83715..ab6ea29ee92434615ad4fa49e937ab8b7be22b2c 100644
--- a/paddle/cinn/frontend/computation.cc
+++ b/paddle/cinn/frontend/computation.cc
@@ -42,7 +42,7 @@ struct ComputationContext {
 
 std::shared_ptr<ComputationContext> CompileProgram(
     const Target &target,
-    Program &program,
+    Program &program,  // NOLINT
     const std::vector<Variable> &outputs,
     std::shared_ptr<hlir::framework::Scope> scope,
     const CinnComputation::CompileOptions &options,
diff --git a/paddle/cinn/frontend/computation.h b/paddle/cinn/frontend/computation.h
index 8b16f9483320f768bcfeaa65a3273198650444f0..7ad4e381b24f4a038328eda87c308dd021831987 100644
--- a/paddle/cinn/frontend/computation.h
+++ b/paddle/cinn/frontend/computation.h
@@ -59,7 +59,7 @@ class CinnComputation {
    */
   static std::shared_ptr<CinnComputation> BuildAndCompile(
       const Target &target,
-      NetBuilder &builder,
+      NetBuilder &builder,  // NOLINT
       const CompileOptions &options = DefaultCompileOptions(),
       const std::vector<Variable> &outputs = {},
       void *stream = nullptr);
@@ -77,7 +77,7 @@ class CinnComputation {
    */
   static std::shared_ptr<CinnComputation> Compile(
       const Target &target,
-      Program &program,
+      Program &program,  // NOLINT
       const CompileOptions &options = DefaultCompileOptions(),
       const std::vector<Variable> &outputs = {},
       void *stream = nullptr);
@@ -130,7 +130,9 @@ class CinnComputation {
    * @param data address of the memory buffer to store tensor's data
    * @param size size of the memory buffer
    */
-  void SetTensorData(hlir::framework::Tensor &t, void *data, size_t size);
+  void SetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);
 
   /**
    * set the data of a tensor (specified by it's name) from user specified
@@ -148,7 +150,9 @@ class CinnComputation {
    * @param data address of the memory buffer to store tensor's data
    * @param size size of the memory buffer
    */
-  void GetTensorData(hlir::framework::Tensor &t, void *data, size_t size);
+  void GetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);
   /**
    * copy the data of a tensor (specified by it's name) to user specified
    * buffer. if tensor is in NVGPU device memory, cudaMemcpy is used.
diff --git a/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc b/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc
index 51f80a25035d0bf94f5359b2906eadefcc207946..58eb8a01e39c8f70dd0cf8ac344163cd9f6ae94f 100644
--- a/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc
+++ b/paddle/cinn/frontend/pass/expand_zero_dim_pass_test.cc
@@ -33,7 +33,7 @@
 namespace cinn {
 namespace frontend {
 
-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
 
diff --git a/paddle/cinn/frontend/syntax.cc b/paddle/cinn/frontend/syntax.cc
index b2d4e9a54e9689283787f651a33dca160af588c7..9b534d8d146c015969d9929616c8af9c4b70faa8 100644
--- a/paddle/cinn/frontend/syntax.cc
+++ b/paddle/cinn/frontend/syntax.cc
@@ -296,12 +296,12 @@ std::tuple<std::unique_ptr<Program>,
            absl::flat_hash_map<std::string, Variable>,
            absl::flat_hash_map<std::string, std::string>,
            absl::flat_hash_set<std::string>>
-LoadPaddleProgram(
-    const std::string& model_dir,
-    Scope* scope,
-    std::unordered_map<std::string, std::vector<int>>& input_shape_map,
-    bool is_combined,
-    const common::Target& target) {
+LoadPaddleProgram(const std::string& model_dir,
+                  Scope* scope,
+                  std::unordered_map<std::string, std::vector<int>>&
+                      input_shape_map,  // NOLINT
+                  bool is_combined,
+                  const common::Target& target) {
   VLOG(1) << "Loading Paddle model from " << model_dir;
   PaddleModelToProgram paddle_to_program(scope, input_shape_map, target);
   return std::make_tuple(paddle_to_program(model_dir, is_combined),
diff --git a/paddle/cinn/frontend/syntax.h b/paddle/cinn/frontend/syntax.h
index 2d5b0223d58343c92ce7c0d1396abae6fdff852e..a405e22ddb565c2db5a967640e7ff592b48c8edf 100644
--- a/paddle/cinn/frontend/syntax.h
+++ b/paddle/cinn/frontend/syntax.h
@@ -528,12 +528,12 @@ std::tuple<std::unique_ptr<Program>,
            absl::flat_hash_map<std::string, Variable>,
            absl::flat_hash_map<std::string, std::string>,
            absl::flat_hash_set<std::string>>
-LoadPaddleProgram(
-    const std::string& model_dir,
-    hlir::framework::Scope* scope,
-    std::unordered_map<std::string, std::vector<int>>& input_shape_map,
-    bool is_combined,
-    const common::Target& target = common::DefaultHostTarget());
+LoadPaddleProgram(const std::string& model_dir,
+                  hlir::framework::Scope* scope,
+                  std::unordered_map<std::string, std::vector<int>>&
+                      input_shape_map,  // NOLINT
+                  bool is_combined,
+                  const common::Target& target = common::DefaultHostTarget());
 
 std::ostream& operator<<(std::ostream& os, const Variable& x);
 std::ostream& operator<<(std::ostream& os, const Instruction& instr);
diff --git a/paddle/cinn/hlir/framework/graph_compiler.cc b/paddle/cinn/hlir/framework/graph_compiler.cc
index 22c54891ee507ed06308f8d94b3ec01dfc6687f6..2d959353c52b8c32b90683f0bf46167ce9ccda42 100644
--- a/paddle/cinn/hlir/framework/graph_compiler.cc
+++ b/paddle/cinn/hlir/framework/graph_compiler.cc
@@ -1643,7 +1643,7 @@ std::shared_ptr<Scope> BuildScope(Target target,
 std::vector<ir::LoweredFunc> GetFuncFromImpl(
     const std::shared_ptr<OpImpl>& impl,
     const common::CINNValuePack& cinn_inputs,
-    std::vector<ir::Tensor>& all_arg_tensors,
+    std::vector<ir::Tensor>& all_arg_tensors,  // NOLINT
     const std::vector<std::string>& input_output_nodes,
     const std::string& node_id,
     const Target& target) {
diff --git a/paddle/cinn/hlir/framework/graph_compiler.h b/paddle/cinn/hlir/framework/graph_compiler.h
index ae482e1165ff9919aea2c7b20c6fe45e05f4854c..6e72b6b97bbdb1824ecd2b7a592af3f4c5c7070f 100644
--- a/paddle/cinn/hlir/framework/graph_compiler.h
+++ b/paddle/cinn/hlir/framework/graph_compiler.h
@@ -222,7 +222,7 @@ std::shared_ptr<Scope> BuildScope(Target target,
 std::vector<ir::LoweredFunc> GetFuncFromImpl(
     const std::shared_ptr<OpImpl>& impl,
     const common::CINNValuePack& cinn_inputs,
-    std::vector<ir::Tensor>& tensor_inputs,
+    std::vector<ir::Tensor>& tensor_inputs,  // NOLINT
     const std::vector<std::string>& input_output_nodes,
     const std::string& node_id,
     const Target& target);
diff --git a/paddle/cinn/hlir/framework/op_lowering.cc b/paddle/cinn/hlir/framework/op_lowering.cc
index 12ed8311afe19a626b629d494aba4c265e64dcb3..bf6099cc9a6bf4caacd2c6510a8376bb268b6328 100644
--- a/paddle/cinn/hlir/framework/op_lowering.cc
+++ b/paddle/cinn/hlir/framework/op_lowering.cc
@@ -45,7 +45,7 @@ OpLowerer::OpLowerer(
     const Target& target)
     : type_dict_(type_dict), shape_dict_(shape_dict), target_(target) {}
 
-std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {
+std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {  // NOLINT
   VLOG(3) << "Lowering Group : " << group->group_id
           << " , Op Pattern : " << group->op_pattern_kind;
   group->input_names.clear();
diff --git a/paddle/cinn/hlir/framework/op_lowering.h b/paddle/cinn/hlir/framework/op_lowering.h
old mode 100755
new mode 100644
index 97bdaeb485883daf52f2fe5ae5101ee07b293eb8..5e909d1196bbc7c21891ef2f3843214e0bdcec90
--- a/paddle/cinn/hlir/framework/op_lowering.h
+++ b/paddle/cinn/hlir/framework/op_lowering.h
@@ -52,8 +52,8 @@ class OpLowerer {
   OpLowerer(const absl::flat_hash_map<std::string, Type>&,
             const absl::flat_hash_map<std::string, shape_t>&,
             const Target&);
-  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);
-  std::vector<ir::LoweredFunc> LowerWithoutSchedule(GroupPtr& group);
+  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);                 // NOLINT
+  std::vector<ir::LoweredFunc> LowerWithoutSchedule(GroupPtr& group);  // NOLINT
 
  private:
   std::vector<ir::LoweredFunc> IRLowerOp(IRComputeFunction, GroupPtr&);
@@ -75,7 +75,7 @@ class OpLowerer {
   DEFINE_IR_COMPUTE(OutEWiseFusable);
 
   void IRSchedule(
-      ir::IRSchedule& ir_sch,
+      ir::IRSchedule& ir_sch,  // NOLINT
       const GroupPtr& group,
       const std::unordered_map<std::string, ir::Tensor>& tensor_map);
 
diff --git a/paddle/cinn/hlir/framework/op_lowering_test.cc b/paddle/cinn/hlir/framework/op_lowering_test.cc
index c5d615782c5308b54701df1557f557c9cd272167..3cd063a07e6346042c9554c227afb51d7d3796a7 100644
--- a/paddle/cinn/hlir/framework/op_lowering_test.cc
+++ b/paddle/cinn/hlir/framework/op_lowering_test.cc
@@ -32,7 +32,7 @@ namespace framework {
 using frontend::NetBuilder;
 using frontend::RunDecomposer;
 
-void CodeGen(ir::LoweredFunc& func) {
+void CodeGen(const ir::LoweredFunc& func) {
 #ifdef CINN_WITH_CUDA
   auto target = common::DefaultNVGPUTarget();
   Module::Builder builder("module_builder", target);
@@ -56,7 +56,7 @@ void CodeGen(ir::LoweredFunc& func) {
 #endif
 }
 
-void Compile(NetBuilder& net_builder) {
+void Compile(NetBuilder& net_builder) {  // NOLINT
   auto program = net_builder.Build();
   auto target = common::DefaultTarget();
   RunDecomposer(&program, target);
diff --git a/paddle/cinn/hlir/framework/op_lowering_util.cc b/paddle/cinn/hlir/framework/op_lowering_util.cc
index 24a90a6d138898af1f8dc90970a8667a409e267e..06ec4488203621f1b45ebaab3e415fd686e0134c 100644
--- a/paddle/cinn/hlir/framework/op_lowering_util.cc
+++ b/paddle/cinn/hlir/framework/op_lowering_util.cc
@@ -92,8 +92,8 @@ ir::Tensor GetTensor(
 
 std::vector<ir::Tensor> CollectInputTensor(
     const Node* node,
-    std::vector<ir::Tensor>& func_args,
-    std::unordered_map<std::string, ir::Tensor>& tensor_map,
+    std::vector<ir::Tensor>& func_args,                       // NOLINT
+    std::unordered_map<std::string, ir::Tensor>& tensor_map,  // NOLINT
     const absl::flat_hash_map<std::string, Type>& type_dict,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict) {
   std::vector<ir::Tensor> tensors;
@@ -543,7 +543,7 @@ bool WithoutLastDimInReduce(const std::vector<int>& shape,
   }
 }
 
-void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,
+void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,  // NOLINT
                            const std::string& block_name,
                            const std::vector<int>& axes,
                            const common::Target& target,
@@ -593,7 +593,7 @@ void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,
   }
 }
 
-void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,
+void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,  // NOLINT
                                  const std::string& block_name,
                                  const std::vector<int>& inshape,
                                  const std::vector<int>& axes,
@@ -707,7 +707,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,
   ir_sch.Reorder(block_name, new_order);
 }
 
-void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,
+void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
                               const std::string& block_name,
                               const std::vector<int>& inshape,
                               const std::vector<int>& axes,
@@ -974,7 +974,7 @@ Node* GetMasterToComputeAt(
 }
 
 void LoopAssignReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* reducer,
     const Target& target,
@@ -1217,8 +1217,8 @@ class RemoveExpr : public ir::IRMutator<> {
 };
 
 void MergeLoops(ir::Expr root,
-                std::vector<ir::Expr>& src,
-                std::vector<ir::Expr>& dst,
+                std::vector<ir::Expr>& src,  // NOLINT
+                std::vector<ir::Expr>& dst,  // NOLINT
                 int index) {
   if (index < 0) {
     return;
@@ -1247,7 +1247,7 @@ void MergeLoops(ir::Expr root,
 }
 
 void InsertSyncThread(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
     const std::unordered_map<std::string, ir::Tensor>& tensor_map) {
@@ -1318,7 +1318,7 @@ class InsertExpr : public ir::IRMutator<> {
 };
 
 void MergeReduceToReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* master,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
@@ -1506,7 +1506,7 @@ void MergeReduceToReduce(
 }
 
 void MergeReduceLoop(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
@@ -1611,7 +1611,7 @@ class FindExprInBlock : public ir::IRMutator<> {
 };
 
 void LoopComputeAt(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const GroupPtr& group,
@@ -1712,7 +1712,7 @@ std::unordered_set<Node*> GetMasters(
 }
 
 void SyncThreadWithShared(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const GroupPtr& group,
     const std::unordered_set<Node*>& nodes_inline,
     const std::unordered_set<Node*>& nodes_set,
diff --git a/paddle/cinn/hlir/framework/op_lowering_util.h b/paddle/cinn/hlir/framework/op_lowering_util.h
index 02741820db85ce26bb252dfb64a0ae30592ba430..504ee0600479d584f45ac3e35ebed394a691d133 100644
--- a/paddle/cinn/hlir/framework/op_lowering_util.h
+++ b/paddle/cinn/hlir/framework/op_lowering_util.h
@@ -31,8 +31,8 @@ ir::Tensor GetTensor(
 
 std::vector<ir::Tensor> CollectInputTensor(
     const Node* node,
-    std::vector<ir::Tensor>& func_args,
-    std::unordered_map<std::string, ir::Tensor>& tensor_map,
+    std::vector<ir::Tensor>& func_args,                       // NOLINT
+    std::unordered_map<std::string, ir::Tensor>& tensor_map,  // NOLINT
     const absl::flat_hash_map<std::string, Type>& type_dict,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict);
 
@@ -87,7 +87,7 @@ std::unordered_set<Node*> GetMasters(
     const std::unordered_set<Node*>& nodes_set);
 
 void LoopAssignReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* reducer,
     const Target& target,
@@ -95,7 +95,7 @@ void LoopAssignReduce(
     const absl::flat_hash_map<std::string, shape_t>& shape_dict);
 
 void LoopComputeAt(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const GroupPtr& group,
@@ -103,7 +103,7 @@ void LoopComputeAt(
     const std::unordered_map<std::string, ir::Tensor>& tensor_map);
 
 void SyncThreadWithShared(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const GroupPtr& group,
     const std::unordered_set<Node*>& nodes_inline,
     const std::unordered_set<Node*>& nodes_set,
diff --git a/paddle/cinn/hlir/framework/parallel_compiler.h b/paddle/cinn/hlir/framework/parallel_compiler.h
index 8638d07230386ffd7b3092c2b0460002dd1cb00d..45fc4fef77a8d2a363a0e6dd50b82bc5104986c6 100644
--- a/paddle/cinn/hlir/framework/parallel_compiler.h
+++ b/paddle/cinn/hlir/framework/parallel_compiler.h
@@ -36,8 +36,8 @@ class ParallelCompiler {
   };
 
  public:
-  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,
-                            std::shared_ptr<Graph>& graph,
+  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
+                            std::shared_ptr<Graph>& graph,  // NOLINT
                             const CompileOptions& option,
                             const common::Target& target)
       : scope_(scope), graph_(graph), option_(option), target_(target) {}
@@ -53,8 +53,8 @@ class ParallelCompiler {
   struct Task {
    public:
     Task(ParallelCompiler* p,
-         std::shared_ptr<Scope>& s,
-         std::shared_ptr<Graph>& g,
+         std::shared_ptr<Scope>& s,  // NOLINT
+         std::shared_ptr<Graph>& g,  // NOLINT
          const CompileOptions& cp,
          const Target& t)
         : compiler(p), scope(s), graph(g), options(cp), target(t) {}
diff --git a/paddle/cinn/hlir/op/op_broadcast_test.cc b/paddle/cinn/hlir/op/op_broadcast_test.cc
old mode 100755
new mode 100644
index 086cb43528aa6212fbd39c22083024e87989a20c..4af7108f383614726b957414ffac2c5bed054ed9
--- a/paddle/cinn/hlir/op/op_broadcast_test.cc
+++ b/paddle/cinn/hlir/op/op_broadcast_test.cc
@@ -258,7 +258,7 @@ TEST(Operator, Operator_BroadcastTo) {
 
 common::CINNValuePack GetComputeResult(
     const std::shared_ptr<OpImpl> &impl,
-    std::vector<common::CINNValue> &cinn_inputs,
+    std::vector<common::CINNValue> &cinn_inputs,  // NOLINT
     const std::string &output_name = "") {
   if (FLAGS_cinn_ir_schedule) {
     cinn_inputs.emplace_back(output_name);
diff --git a/paddle/cinn/hlir/op/op_nn_test.cc b/paddle/cinn/hlir/op/op_nn_test.cc
index b2dff5cfdb7ee956d20cb9cee15aa0188f477dbc..7f0b287579b010cfefccdd1f78c2d426c136cbb3 100644
--- a/paddle/cinn/hlir/op/op_nn_test.cc
+++ b/paddle/cinn/hlir/op/op_nn_test.cc
@@ -44,7 +44,7 @@ Module LowerToModule(const std::string test_name,
                      const std::shared_ptr<OpImpl> &impl,
                      std::vector<std::string> input_names,
                      const std::string &output_name,
-                     std::vector<ir::Tensor> &inputs,
+                     std::vector<ir::Tensor> &inputs,  // NOLINT
                      std::vector<common::CINNValue> cinn_inputs,
                      const Target &target) {
   Module::Builder builder("module", target);
diff --git a/paddle/cinn/hlir/pass/common_subexpression_elimination.cc b/paddle/cinn/hlir/pass/common_subexpression_elimination.cc
index 3c14e1d03b6804157e3fad5ba3e399adcd05fc21..e595783c7b11b8f2be3dfccfbc384c98d8111bb4 100644
--- a/paddle/cinn/hlir/pass/common_subexpression_elimination.cc
+++ b/paddle/cinn/hlir/pass/common_subexpression_elimination.cc
@@ -67,7 +67,9 @@ std::unordered_map<std::string, int> special_attrs = {
     {"axes", 2},
     {"perm", 2}};
 
-bool IsSameSubexpression(Node* op1, Node* op2, shape_dict_t& shape_dict) {
+bool IsSameSubexpression(Node* op1,
+                         Node* op2,
+                         shape_dict_t& shape_dict) {  // NOLINT
   // Get the input edges for op1 and op2 in order.
   auto op1_in_edges = op1->inlinks_in_order();
   auto op2_in_edges = op2->inlinks_in_order();
@@ -201,13 +203,14 @@ void RemoveNodes(framework::Graph* graph, GraphNode* node) {
   graph->DropNode(node);
 }
 
-void RemoveNodes(framework::Graph* graph, std::vector<Node*>& nodes) {
+void RemoveNodes(framework::Graph* graph, const std::vector<Node*>& nodes) {
   for (auto* node : nodes) {
     RemoveNodes(graph, node);
   }
 }
 
-void RemoveNodes(framework::Graph* graph, std::vector<NodeData*>& nodes_data) {
+void RemoveNodes(framework::Graph* graph,
+                 const std::vector<NodeData*>& nodes_data) {
   for (auto* data : nodes_data) {
     if (std::find(graph->outputs.begin(), graph->outputs.end(), data) !=
         graph->outputs.end()) {
diff --git a/paddle/cinn/hlir/pass/constant_folding_pass_test.cc b/paddle/cinn/hlir/pass/constant_folding_pass_test.cc
index 2698d035d30d92b7416fa35fa49a6992d2c9785e..7729f4a49f1583ed25f0bfbcce7f210bfcedde06 100644
--- a/paddle/cinn/hlir/pass/constant_folding_pass_test.cc
+++ b/paddle/cinn/hlir/pass/constant_folding_pass_test.cc
@@ -19,7 +19,7 @@
 namespace cinn {
 namespace frontend {
 
-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
 
@@ -36,7 +36,7 @@ std::unordered_map<std::string, std::vector<float>> GetInputRandom(
 }
 
 std::unordered_map<std::string, std::vector<float>> RunModelTest(
-    Program& program,
+    Program& program,  // NOLINT
     const std::vector<std::string>&& passes,
     const std::unordered_map<std::string, std::vector<float>>& input_data,
     const std::unordered_set<std::string>& fetch_ids) {
diff --git a/paddle/cinn/hlir/pass/dense_merge_pass_test.cc b/paddle/cinn/hlir/pass/dense_merge_pass_test.cc
index 05ee12558f7cac786ecdb352f609033f8902c0d2..07e16556729bdb3b9c75068ec08577f450aced9a 100644
--- a/paddle/cinn/hlir/pass/dense_merge_pass_test.cc
+++ b/paddle/cinn/hlir/pass/dense_merge_pass_test.cc
@@ -19,11 +19,11 @@
 namespace cinn {
 namespace frontend {
 
-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
 
-void RunModelTest(Program& program,
+void RunModelTest(Program& program,  // NOLINT
                   const std::vector<Variable>&& inputs,
                   const std::unordered_set<std::string>& fetch_ids) {
   // init input data.
diff --git a/paddle/cinn/hlir/pass/dot_merger.cc b/paddle/cinn/hlir/pass/dot_merger.cc
index 30cc12c083676e015fc6461eee93a53ecf675ff4..8638200180f66ef16eeea31375456b3921986467 100644
--- a/paddle/cinn/hlir/pass/dot_merger.cc
+++ b/paddle/cinn/hlir/pass/dot_merger.cc
@@ -130,7 +130,7 @@ class DotBuilder {
   const shape_dict_t& shape_dict() const { return shape_dict_; }
 
   // Currently the constructor of `NodeData` needs to pass in `Shared<Node>`.
-  NodeData* Var(common::Shared<Node>& producer) {
+  NodeData* Var(common::Shared<Node>& producer) {  // NOLINT
     auto* res = new NodeData(producer, 0, 0, node_name("var"), false);
     graph_->RegisterNode(producer->id(), res);
     graph_->RegisterNode(res->id(), producer.get());
diff --git a/paddle/cinn/hlir/pass/dot_merger_test.cc b/paddle/cinn/hlir/pass/dot_merger_test.cc
index 17258623f0d346f3375518e43a7e3d9734f9f611..77f02e709625aaa34beaaba9d874c229c589a6ca 100644
--- a/paddle/cinn/hlir/pass/dot_merger_test.cc
+++ b/paddle/cinn/hlir/pass/dot_merger_test.cc
@@ -19,11 +19,11 @@
 namespace cinn {
 namespace frontend {
 
-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
 
-void RunModelTest(Program& program,
+void RunModelTest(Program& program,  // NOLINT
                   const std::vector<Variable>&& inputs,
                   const std::unordered_set<std::string>& fetch_ids) {
   // init input data.
diff --git a/paddle/cinn/hlir/pass/infershape.cc b/paddle/cinn/hlir/pass/infershape.cc
old mode 100755
new mode 100644
index 87d8e263567c0156de6e4ec12e393774025cc5d8..b082c98a0fcf69b96ff26750a5dbaa650966162e
--- a/paddle/cinn/hlir/pass/infershape.cc
+++ b/paddle/cinn/hlir/pass/infershape.cc
@@ -38,8 +38,8 @@ using dtype_dict_t = absl::flat_hash_map<std::string, common::Type>;
 using shape_dict_t = absl::flat_hash_map<std::string, framework::shape_t>;
 
 void InferShape(Node* node,
-                dtype_dict_t& dtype_dict,
-                shape_dict_t& shape_dict) {
+                dtype_dict_t& dtype_dict,    // NOLINT
+                shape_dict_t& shape_dict) {  // NOLINT
   VLOG(3) << "Begin InferShape of node " << node->id();
   auto op_infershape = Operator::GetAttrs<infershape_t>("infershape");
   auto op_inferdtype = Operator::GetAttrs<inferdtype_t>("inferdtype");
diff --git a/paddle/cinn/hlir/pass/infershape.h b/paddle/cinn/hlir/pass/infershape.h
index 7ed0ca5fe78ae6263471bb4b0fee92180bf5a19a..db9b8e21e7020818f93c777378ae2e7e91c3511d 100644
--- a/paddle/cinn/hlir/pass/infershape.h
+++ b/paddle/cinn/hlir/pass/infershape.h
@@ -24,8 +24,9 @@ namespace pass {
 
 void InferShape(
     framework::Node* node,
-    absl::flat_hash_map<std::string, common::Type>& dtype_dict,
-    absl::flat_hash_map<std::string, framework::shape_t>& shape_dict);
+    absl::flat_hash_map<std::string, common::Type>& dtype_dict,  // NOLINT
+    absl::flat_hash_map<std::string, framework::shape_t>&
+        shape_dict);  // NOLINT
 
 }  // namespace pass
 }  // namespace hlir
diff --git a/paddle/cinn/hlir/pass/reduce_split_pass_test.cc b/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
index 688d6a0bd607ca8eff530ee78624ab7f05a1a910..5d9973209d0be8c3c396eddb1612862e7e6d3729 100644
--- a/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
+++ b/paddle/cinn/hlir/pass/reduce_split_pass_test.cc
@@ -20,7 +20,7 @@ namespace cinn {
 namespace frontend {
 
 std::unordered_map<std::string, std::vector<float>> RunModelTest(
-    Program& program,
+    Program& program,  // NOLINT
     const std::vector<std::string>&& passes,
     const std::unordered_map<std::string, std::vector<float>>& input_data,
     const std::unordered_set<std::string>& fetch_ids) {
diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.cc b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
index 784f7ff85d5aba990410b9cc26ac07d957561931..a97b248c1223b677e36f5689bc3cc3f7e5788270 100644
--- a/paddle/cinn/hlir/pe/ir_schedule_pe.cc
+++ b/paddle/cinn/hlir/pe/ir_schedule_pe.cc
@@ -39,7 +39,7 @@ namespace cinn {
 namespace hlir {
 namespace pe {
 
-void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
+void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                            const std::vector<int> &output_shape,
                            const common::Target &target) {
   VLOG(3) << "Before IRElementwiseSchedule, new ir is : "
@@ -67,7 +67,7 @@ void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
+void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<int> &output_shape,
                          const common::Target &target) {
   VLOG(3) << "Before IRInjectiveSchedule, new ir is : "
@@ -95,7 +95,7 @@ void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
+void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
                             const std::vector<int> &output_shape,
                             const common::Target &target,
                             bool vectorizable) {
@@ -132,7 +132,7 @@ void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,
+void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
                              const std::vector<int> &output_shape,
                              const common::Target &target) {
   VLOG(3) << "Begin IRCudaScheduleInjective ";
@@ -208,7 +208,7 @@ std::vector<common::CINNValue> IRCudaScheduleMatMul(
   return {common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
 }
 
-void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
+void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
                        const std::vector<int> &output_shape,
                        const common::Target &target) {
   auto all_blocks = ir_sch.GetAllBlocks();
@@ -221,7 +221,7 @@ void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
   ir_sch.Bind(loops[1], "threadIdx.x");
 }
 
-void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
+void IRMulScheduleCPU(ir::IRSchedule &ir_sch,  // NOLINT
                       const std::vector<int> &reduce_first_shape,
                       const common::Target &target) {
   ir_sch.MergeExprs();
@@ -238,7 +238,7 @@ void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
   }
 }
 
-void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
+void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<std::vector<int>> &output_shapes,
                          int axis,
                          const common::Target &target) {
@@ -334,7 +334,7 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                           ir::Tensor output,
                           int last_dimension_num,
                           const common::Target &target) {
@@ -390,7 +390,7 @@ void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
                                        ir::Tensor tmp_out,
                                        ir::Tensor out,
                                        const common::Target &target) {
@@ -478,7 +478,7 @@ void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                ir::Tensor reduce_tmp_out,
                                ir::Tensor tmp_out,
                                ir::Tensor out,
@@ -621,7 +621,7 @@ void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                       ir::Tensor reshape,
                                       ir::Tensor internal,
                                       ir::Tensor reduce_out,
@@ -880,7 +880,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
+void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                                  ir::Tensor reshape,
                                  ir::Tensor internal,
                                  ir::Tensor tmp_out,
@@ -991,7 +991,7 @@ void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
   // ir_sch.GetLoops(out->name)[0]);
 }
 
-void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {
+void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {  // NOLINT
   ir_sch.MergeExprs();
   auto all_blocks = ir_sch.GetAllBlocks();
   CHECK_EQ(all_blocks.size(), 3U);
@@ -1010,7 +1010,7 @@ void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {
   ir_sch.ComputeAt(all_blocks[1], loops[0]);
 }
 
-void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                        const common::Target &target,
                        int arg_pack_size) {
   VLOG(3) << "Before IRPoolScheduleGPU: "
@@ -1028,7 +1028,7 @@ void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
   VLOG(3) << "End IRPoolScheduleGPU: " << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                              const common::Target &target) {
   VLOG(3) << "Before IRGlobalPoolScheduleGPU: "
           << ir_sch.GetModule().GetExprs().at(0);
@@ -1071,7 +1071,7 @@ void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
+void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,  // NOLINT
                                  const std::vector<ir::Expr> &tensors) {
   if (tensors.size() == 3U) {
     CHECK(tensors[1].as_tensor());
@@ -1097,7 +1097,8 @@ void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target) {
+void IRCudaScheduleConv(ir::IRSchedule &ir_sch,  // NOLINT
+                        const common::Target &target) {
   VLOG(3) << "Begin IRCudaScheduleConv with expr: "
           << ir_sch.GetModule().GetExprs().at(0);
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
@@ -1238,10 +1239,10 @@ void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target) {
           << ir_sch.GetModule().GetExprs().at(0);
 }
 
-void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,
-                         ir::Tensor &input_pad,
-                         ir::Tensor &weights,
-                         ir::Tensor &output,
+void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor &input_pad,   // NOLINT
+                         ir::Tensor &weights,     // NOLINT
+                         ir::Tensor &output,      // NOLINT
                          const common::Target &target,
                          const std::string &key) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
diff --git a/paddle/cinn/hlir/pe/ir_schedule_pe.h b/paddle/cinn/hlir/pe/ir_schedule_pe.h
index e7839fcc1ae570b2101c7c8ff5d8de1776a655b0..82967f3d9f536a0015f92423b2191f01ff43da1a 100644
--- a/paddle/cinn/hlir/pe/ir_schedule_pe.h
+++ b/paddle/cinn/hlir/pe/ir_schedule_pe.h
@@ -31,20 +31,20 @@ namespace cinn {
 namespace hlir {
 namespace pe {
 
-void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
+void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                            const std::vector<int> &output_shape,
                            const common::Target &target);
 
-void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
+void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<int> &output_shape,
                          const common::Target &target);
 
-void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
+void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
                             const std::vector<int> &output_shape,
                             const common::Target &target,
                             bool vectorizable = true);
 
-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,
+void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
                              const std::vector<int> &output_shape,
                              const common::Target &target);
 
@@ -53,68 +53,69 @@ std::vector<common::CINNValue> IRCudaScheduleMatMul(
     const std::vector<int> &output_shape,
     const common::Target &target);
 
-void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
+void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
                        const std::vector<int> &output_shape,
                        const common::Target &target);
 
-void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
+void IRMulScheduleCPU(ir::IRSchedule &ir_sch,  // NOLINT
                       const std::vector<int> &reduce_first_shape,
                       const common::Target &target);
 
-void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
+void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<std::vector<int>> &output_shapes,
                          int axis,
                          const common::Target &target);
 
-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                           ir::Tensor out,
                           int last_dimension_num,
                           const common::Target &target);
 
-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                ir::Tensor reduce_tmp_out,
                                ir::Tensor tmp_out,
                                ir::Tensor out,
                                const common::Target &target);
 
-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
                                        ir::Tensor tmp_out,
                                        ir::Tensor out,
                                        const common::Target &target);
 
-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                       ir::Tensor reshape,
                                       ir::Tensor internal,
                                       ir::Tensor out,
                                       const common::Target &target);
 
-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
+void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                                  ir::Tensor reshape,
                                  ir::Tensor internal,
                                  ir::Tensor tmp_out,
                                  ir::Tensor out,
                                  const common::Target &target);
 
-void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1);
+void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1);  // NOLINT
 
-void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                        const common::Target &target,
                        int arg_pack_size = 3);
 
-void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
+void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,  // NOLINT
                                  const std::vector<ir::Expr> &tensors);
 
-void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                              const common::Target &target);
 
-void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,
-                         ir::Tensor &input_pad,
-                         ir::Tensor &weights,
-                         ir::Tensor &output,
+void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor &input_pad,   // NOLINT
+                         ir::Tensor &weights,     // NOLINT
+                         ir::Tensor &output,      // NOLINT
                          const common::Target &target,
                          const std::string &key);
 
-void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target);
+void IRCudaScheduleConv(ir::IRSchedule &ir_sch,  // NOLINT
+                        const common::Target &target);
 
 }  // namespace pe
 }  // namespace hlir
diff --git a/paddle/cinn/hlir/pe/nn_util.cc b/paddle/cinn/hlir/pe/nn_util.cc
index 30d316bcaf845d1c82c144e9da47864bb8467dd6..22ae26d03aea7a78e7824d129cc5fa87e71ace3e 100644
--- a/paddle/cinn/hlir/pe/nn_util.cc
+++ b/paddle/cinn/hlir/pe/nn_util.cc
@@ -456,8 +456,8 @@ int GetTailSize(const std::vector<int>& inshape, const std::vector<int>& axes) {
 
 std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
                                          const std::vector<int>& axes,
-                                         bool& inbound,
-                                         int& tail) {
+                                         bool& inbound,  // NOLINT
+                                         int& tail) {    // NOLINT
   // post parallel size
   int post_parallel_size = GetPostParallelSize(shape, axes);
   // the size to unfold las reduce axis
diff --git a/paddle/cinn/hlir/pe/nn_util.h b/paddle/cinn/hlir/pe/nn_util.h
index 7ea201e0461ab0655d90fadc4387a41238168d42..b5b35cb2e58d664c91ff1913c2afa736f3a59f59 100644
--- a/paddle/cinn/hlir/pe/nn_util.h
+++ b/paddle/cinn/hlir/pe/nn_util.h
@@ -41,8 +41,8 @@ std::vector<ir::Tensor> winograd_transform_matrices(const int& tile_size,
 
 std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
                                          const std::vector<int>& axes,
-                                         bool& inbound,
-                                         int& tail);
+                                         bool& inbound,  // NOLINT
+                                         int& tail);     // NOLINT
 
 }  // namespace pe
 }  // namespace hlir
diff --git a/paddle/cinn/hlir/pe/schedule.cc b/paddle/cinn/hlir/pe/schedule.cc
index 2547a43862a9f28f3cd13afb87336cb48358c863..45022f740d7eb44c796b9f3dcc29f7df3fe5fd70 100644
--- a/paddle/cinn/hlir/pe/schedule.cc
+++ b/paddle/cinn/hlir/pe/schedule.cc
@@ -624,7 +624,7 @@ void PoolScheduleCPU(poly::StageMap stages,
 }
 
 void PoolScheduleGPU(poly::StageMap stages,
-                     ir::Tensor &output,
+                     const ir::Tensor &output,
                      const common::Target &target) {
   CHECK_GE(stages[output]->axis_names().size(), 4);
   stages[output]->Fuse({0, 1, 2, 3});
@@ -866,7 +866,7 @@ void CreateX86SerialData(const std::string &file_name) {
 
 void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
                                    const ir::Tensor &res,
-                                   ir::Tensor &packed_out,
+                                   ir::Tensor &packed_out,  // NOLINT
                                    const ir::Tensor &input_pad,
                                    const ir::Tensor &weights_dilation,
                                    const ir::Tensor &data,
@@ -1017,7 +1017,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
 
 void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
                                           const ir::Tensor &res,
-                                          ir::Tensor &packed_out,
+                                          ir::Tensor &packed_out,  // NOLINT
                                           const ir::Tensor &input_pad,
                                           const ir::Tensor &weights_dilation,
                                           const ir::Tensor &data,
@@ -1139,7 +1139,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
 
 void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
                                       const ir::Tensor &res,
-                                      ir::Tensor &packed_out,
+                                      ir::Tensor &packed_out,  // NOLINT
                                       const ir::Tensor &input_pad,
                                       const ir::Tensor &weights_dilation,
                                       const ir::Tensor &data,
@@ -1244,7 +1244,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
 
 void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
                                const ir::Tensor &res,
-                               ir::Tensor &packed_out,
+                               ir::Tensor &packed_out,  // NOLINT
                                const ir::Tensor &input_pad,
                                const ir::Tensor &weights_dilation,
                                const ir::Tensor &data,
@@ -1377,7 +1377,7 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
 void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse(
     poly::StageMap stages,
     const ir::Tensor &res,
-    ir::Tensor &packed_out,
+    ir::Tensor &packed_out,  // NOLINT
     const ir::Tensor &input_pad,
     const ir::Tensor &weights_dilation,
     const ir::Tensor &data,
@@ -2300,7 +2300,7 @@ void SaveSerialData(
 }
 
 void CudaScheduleDepthwiseConv(poly::StageMap stages,
-                               ir::Tensor &output,
+                               ir::Tensor &output,  // NOLINT
                                const common::Target &target) {
   auto OL = stages[output]->CacheWrite("local", stages, output);
   stages[output]->Bind(0, "blockIdx.x");
@@ -2313,9 +2313,9 @@ void CudaScheduleDepthwiseConv(poly::StageMap stages,
 }
 
 void CudaScheduleConv(poly::StageMap stages,
-                      ir::Tensor &input_pad,
-                      ir::Tensor &weights,
-                      ir::Tensor &output,
+                      ir::Tensor &input_pad,  // NOLINT
+                      ir::Tensor &weights,    // NOLINT
+                      ir::Tensor &output,     // NOLINT
                       const common::Target &target) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
   int n = output->shape[0].as_int32();
@@ -2382,9 +2382,9 @@ void CudaScheduleConv(poly::StageMap stages,
 }
 
 void CudaScheduleConv2(poly::StageMap stages,
-                       ir::Tensor &input_pad,
-                       ir::Tensor &weights,
-                       ir::Tensor &output,
+                       ir::Tensor &input_pad,  // NOLINT
+                       ir::Tensor &weights,    // NOLINT
+                       ir::Tensor &output,     // NOLINT
                        const common::Target &target,
                        const std::string &key) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
@@ -2516,7 +2516,7 @@ void CudaScheduleConv2(poly::StageMap stages,
 }
 
 void CudaScheduleWinogradConv(poly::StageMap wino_stages,
-                              std::vector<ir::Tensor> &all_tensors,
+                              std::vector<ir::Tensor> &all_tensors,  // NOLINT
                               const common::Target &target) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
   auto &wino_weights_dilation = all_tensors[0];
diff --git a/paddle/cinn/hlir/pe/schedule.h b/paddle/cinn/hlir/pe/schedule.h
index c22c8fbb7a93a449c161c9c1c1bdaea15b6e04c6..cd3262cf3ad18e668cd677e584f9e658e846d9ce 100644
--- a/paddle/cinn/hlir/pe/schedule.h
+++ b/paddle/cinn/hlir/pe/schedule.h
@@ -124,7 +124,7 @@ void GetConv2d1x1Factors(absl::flat_hash_map<std::string, int> *factors,
 
 void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
                                const ir::Tensor &res,
-                               ir::Tensor &packed_out,
+                               ir::Tensor &packed_out,  // NOLINT
                                const ir::Tensor &input_pad,
                                const ir::Tensor &weights_dilation,
                                const ir::Tensor &data,
@@ -138,12 +138,12 @@ void PoolScheduleCPU(poly::StageMap stages,
                      const ir::Tensor &output,
                      const common::Target &target);
 void PoolScheduleGPU(poly::StageMap stages,
-                     ir::Tensor &output,
+                     const ir::Tensor &output,
                      const common::Target &target);
 
 void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
                                       const ir::Tensor &res,
-                                      ir::Tensor &packed_out,
+                                      ir::Tensor &packed_out,  // NOLINT
                                       const ir::Tensor &input_pad,
                                       const ir::Tensor &weights_dilation,
                                       const ir::Tensor &data,
@@ -151,7 +151,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
 
 void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
                                    const ir::Tensor &res,
-                                   ir::Tensor &packed_out,
+                                   ir::Tensor &packed_out,  // NOLINT
                                    const ir::Tensor &input_pad,
                                    const ir::Tensor &weights_dilation,
                                    const ir::Tensor &data,
@@ -161,7 +161,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
 
 void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
                                           const ir::Tensor &res,
-                                          ir::Tensor &packed_out,
+                                          ir::Tensor &packed_out,  // NOLINT
                                           const ir::Tensor &input_pad,
                                           const ir::Tensor &weights_dilation,
                                           const ir::Tensor &data,
@@ -170,7 +170,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
 void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse(
     poly::StageMap stages,
     const ir::Tensor &res,
-    ir::Tensor &packed_out,
+    ir::Tensor &packed_out,  // NOLINT
     const ir::Tensor &input_pad,
     const ir::Tensor &weights_dilation,
     const ir::Tensor &data,
@@ -218,23 +218,23 @@ void CudaTwoStepReduceSchedule(poly::StageMap stages,
                                const common::Target &target);
 
 void CudaScheduleDepthwiseConv(poly::StageMap stages,
-                               ir::Tensor &output,
+                               ir::Tensor &output,  // NOLINT
                                const common::Target &target);
 
 void CudaScheduleConv(poly::StageMap stages,
-                      ir::Tensor &input_pad,
-                      ir::Tensor &weights,
-                      ir::Tensor &output,
+                      ir::Tensor &input_pad,  // NOLINT
+                      ir::Tensor &weights,    // NOLINT
+                      ir::Tensor &output,     // NOLINT
                       const common::Target &target);
 
 void CudaScheduleWinogradConv(poly::StageMap wino_stages,
-                              std::vector<ir::Tensor> &all_tensors,
+                              std::vector<ir::Tensor> &all_tensors,  // NOLINT
                               const common::Target &target);
 
 void CudaScheduleConv2(poly::StageMap stages,
-                       ir::Tensor &input_pad,
-                       ir::Tensor &weights,
-                       ir::Tensor &output,
+                       ir::Tensor &input_pad,  // NOLINT
+                       ir::Tensor &weights,    // NOLINT
+                       ir::Tensor &output,     // NOLINT
                        const common::Target &target,
                        const std::string &key);
 
diff --git a/paddle/cinn/ir/ir_schedule.cc b/paddle/cinn/ir/ir_schedule.cc
index 3489b84a45e505f706c16af6aa15be579c1be59f..de48a7c28d8f149448cf4421da74ee62d5ba2644 100644
--- a/paddle/cinn/ir/ir_schedule.cc
+++ b/paddle/cinn/ir/ir_schedule.cc
@@ -96,7 +96,7 @@ class ScheduleImpl {
                   int write_buffer_index,
                   const std::string& memory_type);
   void SyncThreads(const Expr& ir_node, bool after_node = true);
-  void SetBuffer(Expr& block,
+  void SetBuffer(Expr& block,  // NOLINT
                  const std::string& memory_type,
                  bool fixed = false);
   Expr Reorder(const std::vector<Expr>& loops);
@@ -114,7 +114,7 @@ class ScheduleImpl {
   Expr Rfactor(const Expr& rf_loop, int rf_axis);
   Expr AddUnitLoop(const Expr& block) const;
   void Annotate(const Expr& block, const std::string& key, const attr_t& value);
-  void Unannotate(Expr& block, const std::string& key);
+  void Unannotate(Expr& block, const std::string& key);  // NOLINT
   void FlattenLoops(const std::vector<Expr>& loops,
                     const bool force_flat = false);
   void CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target);
diff --git a/paddle/cinn/ir/ir_schedule.h b/paddle/cinn/ir/ir_schedule.h
index d847e933eb54daaec1b2bc26cf1484a173b0ae04..2689eb48a27e5b0062e7cc9df611a332b2e370c3 100644
--- a/paddle/cinn/ir/ir_schedule.h
+++ b/paddle/cinn/ir/ir_schedule.h
@@ -247,9 +247,9 @@ class IRSchedule {
    * \param memory_type The memory type we want to set. Should be "local",
    * "shared" or "global".
    */
-  void SetBuffer(Expr& block,
+  void SetBuffer(Expr& block,  // NOLINT
                  const std::string& memory_type,
                  bool fixed = false);
 
   /**
    * \brief Reorder the loops in the order of vector.
@@ -391,7 +391,7 @@ class IRSchedule {
    * \param block The block to be unannotated
    * \param key The attribute key
    */
-  void Unannotate(Expr& block, const std::string& key);
+  void Unannotate(Expr& block, const std::string& key);  // NOLINT
 
   /*!
    * \brief flatten the loops in one dim.
@@ -620,7 +620,7 @@ class LeafBlockRemovalPlan : public ir::IRMutator<> {
 
 class ComputeInlineChecker : public ir::IRMutator<> {
  public:
-  ComputeInlineChecker(IRSchedule& schedule, Expr& block)
+  ComputeInlineChecker(IRSchedule& schedule, Expr& block)  // NOLINT
       : ir_schedule_(schedule), block_(block) {}
 
   bool Check();
diff --git a/paddle/cinn/ir/ir_schedule_util.cc b/paddle/cinn/ir/ir_schedule_util.cc
index 34e6cf6e7cd2385981ecbe76cc977acad1b59c13..4b7ca206487422a9ca8bc85d461489c97ac84752 100644
--- a/paddle/cinn/ir/ir_schedule_util.cc
+++ b/paddle/cinn/ir/ir_schedule_util.cc
@@ -503,7 +503,7 @@ Expr MakeCacheBlock(const std::vector<IterRange>& buffer_ranges,
   return block;
 }
 
-void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write) {
+void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write) {
   Expr find_tensor =
       is_write ? Expr(info->write_tensor) : Expr(info->read_tensor);
   auto find_produce_read =
@@ -651,7 +651,7 @@ Expr ConstructOtherStmtChain(const std::vector<Expr>& stmts,
 Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
                            const std::vector<Expr>& ordered_loops,
                            const std::set<Expr, CompExpr>& loop_set,
-                           std::vector<Expr>& if_nodes) {
+                           std::vector<Expr>& if_nodes) {  // NOLINT
   std::vector<std::set<std::string>> condition_vars;
   // In each IfThenElse node, find the vars its condition depends on.
   for (auto& if_expr : if_nodes) {
@@ -923,7 +923,7 @@ void CheckComputeAtValidation(const Expr& block,
   CHECK(find_block_in_loop.empty()) << "loop should not be block's ancestor!";
 }
 
-void InsertBlock(Expr& for_loop, const Expr& insertion, int index) {
+void InsertBlock(Expr& for_loop, const Expr& insertion, int index) {  // NOLINT
   CHECK(for_loop.As<ir::For>());
   CHECK(for_loop.As<ir::For>()->body.As<Block>());
   ir::Block* dst_block = for_loop.As<ir::For>()->body.As<Block>();
diff --git a/paddle/cinn/ir/ir_schedule_util.h b/paddle/cinn/ir/ir_schedule_util.h
index 0107054e413767e5dc7895bf624963947a238dce..762cd166d2004de9e2523c36ec252558968684c4 100644
--- a/paddle/cinn/ir/ir_schedule_util.h
+++ b/paddle/cinn/ir/ir_schedule_util.h
@@ -326,7 +326,7 @@ Expr MakeCacheBlock(const std::vector<IterRange>& buffer_ranges,
  * @param info The information of cache block.
  * @param is_write Are we inserting a write cache tensor or a read cache tensor.
  */
-void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write);
+void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write);
 
 /**
  * \brief Given a vector of For loops, return a set of them.
@@ -359,7 +359,7 @@ std::vector<Expr> GetLoopsInRange(const Expr& top, const Expr& bottom);
 Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
                            const std::vector<Expr>& ordered_loops,
                            const std::set<Expr, CompExpr>& loop_set,
-                           std::vector<Expr>& if_nodes);
+                           std::vector<Expr>& if_nodes);  // NOLINT
 
 /*!
  * \brief Find producers of block in root.
@@ -395,7 +395,9 @@ void CheckComputeAtValidation(const Expr& block,
  *        - `index = -1` means inserted into the tail
  *        - otherwise, it should be a index between [0, stmts size)
  */
-void InsertBlock(Expr& for_loop, const Expr& insertion, int index = 0);
+void InsertBlock(Expr& for_loop,  // NOLINT
+                 const Expr& insertion,
+                 int index = 0);
 
 /*!
  * \brief Make a union of two range. The detailed function is :
diff --git a/paddle/cinn/lang/lower_impl.cc b/paddle/cinn/lang/lower_impl.cc
index fcc6c12ce11ddd7c6d945c0645838c82ff583ee9..f68396a3a3ab107fa1fa170bc280d91339347cf7 100644
--- a/paddle/cinn/lang/lower_impl.cc
+++ b/paddle/cinn/lang/lower_impl.cc
@@ -49,7 +49,7 @@ void CheckNoIslCallRemains(Expr* expr) {
   }
 }
 
-void BindBuffer(StageMap& stages) {
+void BindBuffer(StageMap& stages) {  // NOLINT
   absl::flat_hash_map<std::string, ir::_Tensor_*> tensor_map;
   for (auto& stage : stages) {
     tensor_map[stage.second->tensor()->name] = stage.second->tensor();
@@ -71,13 +71,13 @@ void BindBuffer(StageMap& stages) {
   }
 }
 
-Expr LowerGroup(
-    const poly::ScheduleGroup& group,
-    const std::map<std::string, Expr>& tuple_to_expr,
-    std::map<std::string, ir::Tensor>* global_tensor_map,
-    std::unordered_map<std::string, std::vector<Expr>>& resized_buffer_cache,
-    StageMap stage_map,
-    ir::CudaAxisInfo* cuda_axis_info) {
+Expr LowerGroup(const poly::ScheduleGroup& group,
+                const std::map<std::string, Expr>& tuple_to_expr,
+                std::map<std::string, ir::Tensor>* global_tensor_map,
+                std::unordered_map<std::string, std::vector<Expr>>&
+                    resized_buffer_cache,  // NOLINT
+                StageMap stage_map,
+                ir::CudaAxisInfo* cuda_axis_info) {
   BindBuffer(stage_map);
   std::vector<poly::Stage*> stages;
   for (auto& node : group.nodes) {
diff --git a/paddle/cinn/lang/lower_impl.h b/paddle/cinn/lang/lower_impl.h
index 505e80ca6a49e7ed650522791113bc8b5be471f3..bc7494f57f6696388d2fb6b1c5808b9693c57303 100644
--- a/paddle/cinn/lang/lower_impl.h
+++ b/paddle/cinn/lang/lower_impl.h
@@ -69,7 +69,7 @@ void CheckNoIslCallRemains(const Expr* expr);
 Expr LowerGroup(const poly::ScheduleGroup& group,
                 const std::map<std::string, Expr>& tuple_to_expr,
                 std::map<std::string, Tensor>* global_tensor_map,
-                std::unordered_set<std::string>& resized_buffer,
+                std::unordered_set<std::string>& resized_buffer,  // NOLINT
                 StageMap stage_map,
                 ir::CudaAxisInfo* cuda_axis_info = nullptr);
 
diff --git a/paddle/cinn/optim/ir_simplify.cc b/paddle/cinn/optim/ir_simplify.cc
index 48645690f9de7ac42457cd471e398e817052574d..51915c925007e95901a672c32e7104c4bf062387 100644
--- a/paddle/cinn/optim/ir_simplify.cc
+++ b/paddle/cinn/optim/ir_simplify.cc
@@ -55,7 +55,8 @@ void PartialSimplify(
 //! Simplify the expression but Load.
 struct SimplifyButStoreLoadMutator : public ir::IRMutator<ir::Expr*> {
   common::cas_intervals_t& var_intervals;
-  explicit SimplifyButStoreLoadMutator(common::cas_intervals_t& var_intervals)
+  explicit SimplifyButStoreLoadMutator(
+      common::cas_intervals_t& var_intervals)  // NOLINT
       : var_intervals(var_intervals) {}
 
   void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
diff --git a/paddle/cinn/poly/poly_scheduler.cc b/paddle/cinn/poly/poly_scheduler.cc
old mode 100755
new mode 100644
index d3a34e4544507187bee03b12f723324faf573e5b..b916e5952ffe4b7ce928db900376dff5f50c37cb
--- a/paddle/cinn/poly/poly_scheduler.cc
+++ b/paddle/cinn/poly/poly_scheduler.cc
@@ -177,7 +177,7 @@ bool IsBetween(const common::GraphNode* x,
   return false;
 }
 
-std::vector<Group> TopoSortGroups(std::vector<Group>& groups) {
+std::vector<Group> TopoSortGroups(std::vector<Group>& groups) {  // NOLINT
   // collect indegree.
   absl::flat_hash_map<Group*, int> group_indegree;
   std::vector<Group*> start_groups;
diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc
index 5af88bf251ed8d59dc5ac5c2ea247c396cd03a18..149a285f77558fbd0cfe695b2c6ed2094395376c 100644
--- a/paddle/cinn/poly/stage.cc
+++ b/paddle/cinn/poly/stage.cc
@@ -40,7 +40,7 @@
 
 namespace cinn {
 namespace poly {
-void RemoveDuplicate(std::vector<std::vector<Expr>> &indices) {
+void RemoveDuplicate(std::vector<std::vector<Expr>> &indices) {  // NOLINT
   std::set<std::string> temp;
   for (int i = 0; i < indices.size(); i++) {
     std::string index_str = "";
@@ -309,7 +309,7 @@ int Minus(const Expr &a, const Expr &b) {
 }
 
 // Return the range = max - min among all indices[i][axis](i = 0,1,2,...)
-int GetRange(std::vector<std::vector<Expr>> &indices, int axis) {
+int GetRange(std::vector<std::vector<Expr>> &indices, int axis) {  // NOLINT
   Expr max_expr = indices[0][axis];
   Expr min_expr = indices[0][axis];
   for (auto i = 1; i < indices.size(); i++) {
@@ -1420,7 +1420,7 @@ struct CacheReplaceMutator : public ir::IRMutator<> {
 };
 }  // namespace
 
-void CacheReadWriteReplace(std::vector<ir::Tensor> &readers,
+void CacheReadWriteReplace(const std::vector<ir::Tensor> &readers,
                            ir::Tensor cache_tensor,
                            std::string origin_tensor_name) {
   for (auto k : readers) {
diff --git a/paddle/cinn/poly/stage.h b/paddle/cinn/poly/stage.h
old mode 100755
new mode 100644
index 869f8f038de5e9cc06ab9ec4fa95e3e24ae5762f..cf0586710bc9c733525765d955d151249de3a315
--- a/paddle/cinn/poly/stage.h
+++ b/paddle/cinn/poly/stage.h
@@ -270,7 +270,7 @@ class Stage : public Object {
    * @param readers the readers of the \p tensor
    */
   ir::Tensor CacheRead(const std::string& memory_type,
-                       std::vector<ir::Tensor>& readers,
+                       std::vector<ir::Tensor>& readers,  // NOLINT
                        poly::StageMap stages);
 
   /**
@@ -303,7 +303,8 @@ class Stage : public Object {
 
   void ShowISL() const;
 
-  void AddForLoopInTransform(std::vector<std::vector<Expr>>& indices);
+  void AddForLoopInTransform(
+      std::vector<std::vector<Expr>>& indices);  // NOLINT
   /**
    * Create a cache for write to the original tensor.
    * @param tensor the tensor to create the cache for.
@@ -312,7 +313,7 @@ class Stage : public Object {
    */
   ir::Tensor CacheWrite(const std::string& memory_type,
                         poly::StageMap stages,
-                        ir::Tensor& key_tensor);
+                        ir::Tensor& key_tensor);  // NOLINT
 
   /**
    * Generate the `syncthreads()` code to sync all threads on CUDA backends.
diff --git a/paddle/cinn/pybind/common.cc b/paddle/cinn/pybind/common.cc
index 994308433bc7f81632bbc5de1a6d4d88fc94e6ee..a39d51628df09002d3d18de7143be3286f3eb8a7 100644
--- a/paddle/cinn/pybind/common.cc
+++ b/paddle/cinn/pybind/common.cc
@@ -236,12 +236,12 @@ inline auto __binary_op_fn_dispatch(T1 x, T2 y, F fn, std::false_type) {
 
 template <typename T1, typename T2, typename F>
 inline void __binary_op_visitor_dispatch(
-    CINNValue &v, T1 lhs, T2 rhs, F fn, std::true_type) {
+    CINNValue &v, T1 lhs, T2 rhs, F fn, std::true_type) {  // NOLINT
   v = CINNValue();
 }
 template <typename T1, typename T2, typename F>
 inline void __binary_op_visitor_dispatch(
-    CINNValue &v, T1 lhs, T2 rhs, F fn, std::false_type) {
+    CINNValue &v, T1 lhs, T2 rhs, F fn, std::false_type) {  // NOLINT
   v.Set(fn(lhs, rhs));
 }
 
diff --git a/paddle/cinn/runtime/cuda/float16.h b/paddle/cinn/runtime/cuda/float16.h
index be847bfc2e7cd183ea663bd05486aea3df0c8ab7..cae59186dc83221993e5f720480087a90e647810 100644
--- a/paddle/cinn/runtime/cuda/float16.h
+++ b/paddle/cinn/runtime/cuda/float16.h
@@ -500,25 +500,25 @@ __host__ __device__ inline float16 operator-(const float16& a) {
 #endif
 }
 
-__host__ __device__ inline float16& operator+=(float16& a,
+__host__ __device__ inline float16& operator+=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }
 
-__host__ __device__ inline float16& operator-=(float16& a,
+__host__ __device__ inline float16& operator-=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }
 
-__host__ __device__ inline float16& operator*=(float16& a,
+__host__ __device__ inline float16& operator*=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }
 
-__host__ __device__ inline float16& operator/=(float16& a,
+__host__ __device__ inline float16& operator/=(float16& a,          // NOLINT
                                                const float16& b) {  // NOLINT
   a = a / b;
   return a;