Unverified commit 3e5018df, authored by Wang Xin, committed by GitHub

[CodeStyle][CINN] fix cpplint codestyle for [runtime/references] (#55068)

Parent 13c2342a
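This commit addresses cpplint's [runtime/references] check, which flags non-const reference parameters. Two fixes are applied throughout: parameters that are only read become const references, and parameters that are mutated in place on purpose keep the non-const reference with an explicit // NOLINT suppression on that line. A minimal sketch of the two patterns (GetSize mirrors a change in this commit; AppendOne is a hypothetical name used only for illustration):

#include <vector>

// Read-only parameter: cpplint is satisfied by a const reference.
int GetSize(const std::vector<int>& shape);  // was: std::vector<int>& shape

// Intentional in-place mutation: keep the reference, suppress the check.
// (AppendOne is hypothetical, not a function touched by this commit.)
void AppendOne(std::vector<int>& v) {  // NOLINT
  v.push_back(1);
}

The hunks below apply one of these two patterns file by file.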
...
@@ -145,7 +145,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize) {
 ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
                                       const ir::LoweredFunc& old_func,
-                                      ir::Expr& body) {
+                                      ir::Expr& body) {  // NOLINT
   ir::ModuleExpr mod_expr(std::vector<ir::Expr>({body}));
   ir::IRSchedule ir_sch(mod_expr);
...
...
@@ -46,7 +46,7 @@ bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize);
  */
 ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
                                       const ir::LoweredFunc& old_func,
-                                      ir::Expr& body);
+                                      ir::Expr& body);  // NOLINT

 }  // namespace auto_schedule
 }  // namespace cinn
...
@@ -64,7 +64,7 @@ class AutoInline : public AutoGenRule {
                      const std::string& block_name) override;

 private:
-  void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void Apply(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);  // NOLINT

 private:
  std::vector<ir::Expr> all_block_realizes_;
...
...
@@ -120,9 +120,12 @@ class MultiLevelTiling : public AutoGenRule {
   }

 private:
-  void ApplyTiling(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
-  void ApplyCacheRead(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
-  void ApplyCacheWrite(ir::IRSchedule* ir_schedule, ir::Expr& block_expr);
+  void ApplyTiling(ir::IRSchedule* ir_schedule,
+                   ir::Expr& block_expr);  // NOLINT
+  void ApplyCacheRead(ir::IRSchedule* ir_schedule,
+                      ir::Expr& block_expr);  // NOLINT
+  void ApplyCacheWrite(ir::IRSchedule* ir_schedule,
+                       ir::Expr& block_expr);  // NOLINT

 private:
  std::vector<ir::Expr> all_block_realizes_;
...
...
@@ -51,7 +51,7 @@ using cinn::hlir::op::ExternalApiRegistry;
 // *** update a scheduled function with several post-processors
 ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target,
                                     const ir::LoweredFunc& old_func,
-                                    ir::Expr& body);
+                                    ir::Expr& body);  // NOLINT
 // check whether a scheduled lowered function is valid
 bool PruneInvalid(const ir::LoweredFunc& lowered_func,
                   const common::Target& target);
...
...
@@ -1038,13 +1038,13 @@ llvm::Value *CodeGenLLVM::Visit(const ir::_Tensor_ *op) {
 template <typename T,
           std::enable_if_t<std::is_same<const ir::Expr &, T>::value, int> = 0>
-void appendBody(std::vector<Expr> &new_body, T &&v) {
+void appendBody(std::vector<Expr> &new_body, T &&v) {  // NOLINT
   new_body.push_back(v);
 }

 template <typename T,
           std::enable_if_t<!std::is_same<const ir::Expr &, T>::value, int> = 1>
-void appendBody(std::vector<Expr> &new_body, T &&v) {
+void appendBody(std::vector<Expr> &new_body, T &&v) {  // NOLINT
   new_body.insert(new_body.end(), v.begin(), v.end());
 }
...
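A note on the two appendBody overloads above: they dispatch via std::enable_if_t on whether the deduced forwarding-reference type is exactly const ir::Expr& (append a single expression) or anything else (splice in a whole range). A self-contained sketch of the same dispatch, with int standing in for ir::Expr so it compiles on its own:

#include <type_traits>
#include <vector>

// Selected only when the argument deduces to exactly const int&.
template <typename T,
          std::enable_if_t<std::is_same<const int&, T>::value, int> = 0>
void appendBody(std::vector<int>& new_body, T&& v) {  // NOLINT
  new_body.push_back(v);  // single element
}

// Selected for everything else; the argument is treated as a range.
template <typename T,
          std::enable_if_t<!std::is_same<const int&, T>::value, int> = 1>
void appendBody(std::vector<int>& new_body, T&& v) {  // NOLINT
  new_body.insert(new_body.end(), v.begin(), v.end());
}

int main() {
  std::vector<int> body;
  const int single = 1;
  appendBody(body, single);  // T = const int&        -> push_back overload
  std::vector<int> range = {2, 3};
  appendBody(body, range);   // T = std::vector<int>& -> insert overload
  return 0;
}

The NOLINT stays on the definition lines because new_body is genuinely an in/out parameter.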
...
@@ -98,7 +98,7 @@ class SymbolTable {
 };

 struct SymbolTableGuard {
-  explicit SymbolTableGuard(SymbolTable &symbol_table)
+  explicit SymbolTableGuard(SymbolTable &symbol_table)  // NOLINT
       : symbol_table_(symbol_table) {
     symbol_table.PushScope();
   }
...
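The constructor above keeps its non-const reference by design: SymbolTableGuard is an RAII helper that opens a scope on construction, so a const reference would defeat its purpose and suppression is the right fix. A hedged sketch of the pattern; the destructor's PopScope call is an assumption about the elided part of the class, not code shown in this hunk:

struct SymbolTable {
  void PushScope() { /* open a new innermost scope */ }
  void PopScope() { /* discard the innermost scope */ }
};

struct SymbolTableGuard {
  explicit SymbolTableGuard(SymbolTable& symbol_table)  // NOLINT
      : symbol_table_(symbol_table) {
    symbol_table_.PushScope();
  }
  // Assumed counterpart to the constructor (not shown in the hunk above).
  ~SymbolTableGuard() { symbol_table_.PopScope(); }

 private:
  SymbolTable& symbol_table_;
};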
...
@@ -180,7 +180,8 @@ class GiNaCToExprVisitor : public GiNaC::symbol::visitor,
   ir::Expr cur;

 public:
-  explicit GiNaCToExprVisitor(std::map<std::string, ir::Expr>& repr_to_expr)
+  explicit GiNaCToExprVisitor(
+      std::map<std::string, ir::Expr>& repr_to_expr)  // NOLINT
       : repr_to_expr(repr_to_expr) {}

   Expr operator()(GiNaC::ex ex) {
...
...
@@ -500,25 +500,25 @@ __host__ __device__ inline float16 operator-(const float16& a) {
 #endif
 }

-__host__ __device__ inline float16& operator+=(float16& a,
+__host__ __device__ inline float16& operator+=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }

-__host__ __device__ inline float16& operator-=(float16& a,
+__host__ __device__ inline float16& operator-=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }

-__host__ __device__ inline float16& operator*=(float16& a,
+__host__ __device__ inline float16& operator*=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }

-__host__ __device__ inline float16& operator/=(float16& a,
+__host__ __device__ inline float16& operator/=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a / b;
   return a;
...
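These four operators illustrate the case where a const reference is not an option: a compound assignment must write through its left operand and return it by reference so that calls can chain. A host-only sketch with a stand-in float16 (the real type packs a 16-bit storage format; this simplification is only for illustration):

struct float16 {
  float v;  // stand-in storage; the real float16 holds 16 bits
};

inline float16 operator+(const float16& a, const float16& b) {
  return {a.v + b.v};
}

// The left operand must be a mutable reference: the operator writes
// through it and returns it, e.g. so that (a += b) += c compiles.
inline float16& operator+=(float16& a, const float16& b) {  // NOLINT
  a = a + b;
  return a;
}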
...
@@ -42,7 +42,7 @@ struct ComputationContext {
 std::shared_ptr<ComputationContext> CompileProgram(
     const Target &target,
-    Program &program,
+    Program &program,  // NOLINT
     const std::vector<Variable> &outputs,
     std::shared_ptr<hlir::framework::Scope> scope,
     const CinnComputation::CompileOptions &options,
...
...
@@ -59,7 +59,7 @@ class CinnComputation {
    */
   static std::shared_ptr<CinnComputation> BuildAndCompile(
       const Target &target,
-      NetBuilder &builder,
+      NetBuilder &builder,  // NOLINT
       const CompileOptions &options = DefaultCompileOptions(),
       const std::vector<Variable> &outputs = {},
       void *stream = nullptr);
...
@@ -77,7 +77,7 @@ class CinnComputation {
    */
   static std::shared_ptr<CinnComputation> Compile(
       const Target &target,
-      Program &program,
+      Program &program,  // NOLINT
       const CompileOptions &options = DefaultCompileOptions(),
       const std::vector<Variable> &outputs = {},
       void *stream = nullptr);
...
@@ -130,7 +130,9 @@ class CinnComputation {
    * @param data address of the memory buffer to store tensor's data
    * @param size size of the memory buffer
    */
-  void SetTensorData(hlir::framework::Tensor &t, void *data, size_t size);
+  void SetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);

   /**
    * set the data of a tensor (specified by it's name) from user specified
...
@@ -148,7 +150,9 @@ class CinnComputation {
    * @param data address of the memory buffer to store tensor's data
    * @param size size of the memory buffer
    */
-  void GetTensorData(hlir::framework::Tensor &t, void *data, size_t size);
+  void GetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);

   /**
    * copy the data of a tensor (specified by it's name) to user specified
    * buffer. if tensor is in NVGPU device memory, cudaMemcpy is used.
...
...
@@ -33,7 +33,7 @@
 namespace cinn {
 namespace frontend {

-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
...
...
@@ -296,10 +296,10 @@ std::tuple<std::unique_ptr<Program>,
            absl::flat_hash_map<std::string, Variable>,
            absl::flat_hash_map<std::string, std::string>,
            absl::flat_hash_set<std::string>>
-LoadPaddleProgram(
-    const std::string& model_dir,
-    Scope* scope,
-    std::unordered_map<std::string, std::vector<int>>& input_shape_map,
-    bool is_combined,
-    const common::Target& target) {
+LoadPaddleProgram(const std::string& model_dir,
+                  Scope* scope,
+                  std::unordered_map<std::string, std::vector<int>>&
+                      input_shape_map,  // NOLINT
+                  bool is_combined,
+                  const common::Target& target) {
   VLOG(1) << "Loading Paddle model from " << model_dir;
...
...
@@ -528,10 +528,10 @@ std::tuple<std::unique_ptr<Program>,
            absl::flat_hash_map<std::string, Variable>,
            absl::flat_hash_map<std::string, std::string>,
            absl::flat_hash_set<std::string>>
-LoadPaddleProgram(
-    const std::string& model_dir,
-    hlir::framework::Scope* scope,
-    std::unordered_map<std::string, std::vector<int>>& input_shape_map,
-    bool is_combined,
-    const common::Target& target = common::DefaultHostTarget());
+LoadPaddleProgram(const std::string& model_dir,
+                  hlir::framework::Scope* scope,
+                  std::unordered_map<std::string, std::vector<int>>&
+                      input_shape_map,  // NOLINT
+                  bool is_combined,
+                  const common::Target& target = common::DefaultHostTarget());
...
...
@@ -1643,7 +1643,7 @@ std::shared_ptr<Scope> BuildScope(Target target,
 std::vector<ir::LoweredFunc> GetFuncFromImpl(
     const std::shared_ptr<OpImpl>& impl,
     const common::CINNValuePack& cinn_inputs,
-    std::vector<ir::Tensor>& all_arg_tensors,
+    std::vector<ir::Tensor>& all_arg_tensors,  // NOLINT
     const std::vector<std::string>& input_output_nodes,
     const std::string& node_id,
     const Target& target) {
...
...
@@ -222,7 +222,7 @@ std::shared_ptr<Scope> BuildScope(Target target,
 std::vector<ir::LoweredFunc> GetFuncFromImpl(
     const std::shared_ptr<OpImpl>& impl,
     const common::CINNValuePack& cinn_inputs,
-    std::vector<ir::Tensor>& tensor_inputs,
+    std::vector<ir::Tensor>& tensor_inputs,  // NOLINT
     const std::vector<std::string>& input_output_nodes,
     const std::string& node_id,
     const Target& target);
...
...
@@ -45,7 +45,7 @@ OpLowerer::OpLowerer(
     const Target& target)
     : type_dict_(type_dict), shape_dict_(shape_dict), target_(target) {}

-std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {
+std::vector<ir::LoweredFunc> OpLowerer::Lower(GroupPtr& group) {  // NOLINT
   VLOG(3) << "Lowering Group : " << group->group_id
           << " , Op Pattern : " << group->op_pattern_kind;
   group->input_names.clear();
...
...
@@ -52,8 +52,8 @@ class OpLowerer {
   OpLowerer(const absl::flat_hash_map<std::string, Type>&,
             const absl::flat_hash_map<std::string, shape_t>&,
             const Target&);
-  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);
-  std::vector<ir::LoweredFunc> LowerWithoutSchedule(GroupPtr& group);
+  std::vector<ir::LoweredFunc> Lower(GroupPtr& group);  // NOLINT
+  std::vector<ir::LoweredFunc> LowerWithoutSchedule(GroupPtr& group);  // NOLINT

 private:
  std::vector<ir::LoweredFunc> IRLowerOp(IRComputeFunction, GroupPtr&);
...
@@ -75,7 +75,7 @@ class OpLowerer {
   DEFINE_IR_COMPUTE(OutEWiseFusable);

   void IRSchedule(
-      ir::IRSchedule& ir_sch,
+      ir::IRSchedule& ir_sch,  // NOLINT
       const GroupPtr& group,
       const std::unordered_map<std::string, ir::Tensor>& tensor_map);
...
...
@@ -32,7 +32,7 @@ namespace framework {
 using frontend::NetBuilder;
 using frontend::RunDecomposer;

-void CodeGen(ir::LoweredFunc& func) {
+void CodeGen(const ir::LoweredFunc& func) {
 #ifdef CINN_WITH_CUDA
   auto target = common::DefaultNVGPUTarget();
   Module::Builder builder("module_builder", target);
...
@@ -56,7 +56,7 @@ void CodeGen(ir::LoweredFunc& func) {
 #endif
 }

-void Compile(NetBuilder& net_builder) {
+void Compile(NetBuilder& net_builder) {  // NOLINT
   auto program = net_builder.Build();
   auto target = common::DefaultTarget();
   RunDecomposer(&program, target);
...
...
@@ -92,8 +92,8 @@ ir::Tensor GetTensor(
 std::vector<ir::Tensor> CollectInputTensor(
     const Node* node,
-    std::vector<ir::Tensor>& func_args,
-    std::unordered_map<std::string, ir::Tensor>& tensor_map,
+    std::vector<ir::Tensor>& func_args,  // NOLINT
+    std::unordered_map<std::string, ir::Tensor>& tensor_map,  // NOLINT
     const absl::flat_hash_map<std::string, Type>& type_dict,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict) {
   std::vector<ir::Tensor> tensors;
...
@@ -543,7 +543,7 @@ bool WithoutLastDimInReduce(const std::vector<int>& shape,
   }
 }

-void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,
+void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,  // NOLINT
                            const std::string& block_name,
                            const std::vector<int>& axes,
                            const common::Target& target,
...
@@ -593,7 +593,7 @@ void LoopOrderAssignReduce(ir::IRSchedule& ir_sch,
   }
 }

-void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,
+void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,  // NOLINT
                                  const std::string& block_name,
                                  const std::vector<int>& inshape,
                                  const std::vector<int>& axes,
...
@@ -707,7 +707,7 @@ void LoopAssignReduceWithoutLast(ir::IRSchedule& ir_sch,
   ir_sch.Reorder(block_name, new_order);
 }

-void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,
+void LoopAssignReduceWithLast(ir::IRSchedule& ir_sch,  // NOLINT
                               const std::string& block_name,
                               const std::vector<int>& inshape,
                               const std::vector<int>& axes,
...
@@ -974,7 +974,7 @@ Node* GetMasterToComputeAt(
 }

 void LoopAssignReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* reducer,
     const Target& target,
...
@@ -1217,8 +1217,8 @@ class RemoveExpr : public ir::IRMutator<> {
 };

 void MergeLoops(ir::Expr root,
-                std::vector<ir::Expr>& src,
-                std::vector<ir::Expr>& dst,
+                std::vector<ir::Expr>& src,  // NOLINT
+                std::vector<ir::Expr>& dst,  // NOLINT
                 int index) {
   if (index < 0) {
     return;
...
@@ -1247,7 +1247,7 @@ void MergeLoops(ir::Expr root,
 }

 void InsertSyncThread(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
     const std::unordered_map<std::string, ir::Tensor>& tensor_map) {
...
@@ -1318,7 +1318,7 @@ class InsertExpr : public ir::IRMutator<> {
 };

 void MergeReduceToReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* master,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
...
@@ -1506,7 +1506,7 @@ void MergeReduceToReduce(
 }

 void MergeReduceLoop(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict,
...
@@ -1611,7 +1611,7 @@ class FindExprInBlock : public ir::IRMutator<> {
 };

 void LoopComputeAt(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const GroupPtr& group,
...
@@ -1712,7 +1712,7 @@ std::unordered_set<Node*> GetMasters(
 }

 void SyncThreadWithShared(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const GroupPtr& group,
     const std::unordered_set<Node*>& nodes_inline,
     const std::unordered_set<Node*>& nodes_set,
...
...
@@ -31,8 +31,8 @@ ir::Tensor GetTensor(
 std::vector<ir::Tensor> CollectInputTensor(
     const Node* node,
-    std::vector<ir::Tensor>& func_args,
-    std::unordered_map<std::string, ir::Tensor>& tensor_map,
+    std::vector<ir::Tensor>& func_args,  // NOLINT
+    std::unordered_map<std::string, ir::Tensor>& tensor_map,  // NOLINT
     const absl::flat_hash_map<std::string, Type>& type_dict,
     const absl::flat_hash_map<std::string, shape_t>& shape_dict);
...
@@ -87,7 +87,7 @@ std::unordered_set<Node*> GetMasters(
     const std::unordered_set<Node*>& nodes_set);

 void LoopAssignReduce(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const Node* node,
     const Node* reducer,
     const Target& target,
...
@@ -95,7 +95,7 @@ void LoopAssignReduce(
     const absl::flat_hash_map<std::string, shape_t>& shape_dict);

 void LoopComputeAt(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     Node* node,
     const Node* master,
     const GroupPtr& group,
...
@@ -103,7 +103,7 @@ void LoopComputeAt(
     const std::unordered_map<std::string, ir::Tensor>& tensor_map);

 void SyncThreadWithShared(
-    ir::IRSchedule& ir_sch,
+    ir::IRSchedule& ir_sch,  // NOLINT
     const GroupPtr& group,
     const std::unordered_set<Node*>& nodes_inline,
     const std::unordered_set<Node*>& nodes_set,
...
...
@@ -36,8 +36,8 @@ class ParallelCompiler {
   };

 public:
-  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,
-                            std::shared_ptr<Graph>& graph,
+  explicit ParallelCompiler(std::shared_ptr<Scope>& scope,  // NOLINT
+                            std::shared_ptr<Graph>& graph,  // NOLINT
                             const CompileOptions& option,
                             const common::Target& target)
       : scope_(scope), graph_(graph), option_(option), target_(target) {}
...
@@ -53,8 +53,8 @@ class ParallelCompiler {
   struct Task {
    public:
     Task(ParallelCompiler* p,
-         std::shared_ptr<Scope>& s,
-         std::shared_ptr<Graph>& g,
+         std::shared_ptr<Scope>& s,  // NOLINT
+         std::shared_ptr<Graph>& g,  // NOLINT
         const CompileOptions& cp,
         const Target& t)
        : compiler(p), scope(s), graph(g), options(cp), target(t) {}
...
...
@@ -258,7 +258,7 @@ TEST(Operator, Operator_BroadcastTo) {
 common::CINNValuePack GetComputeResult(
     const std::shared_ptr<OpImpl> &impl,
-    std::vector<common::CINNValue> &cinn_inputs,
+    std::vector<common::CINNValue> &cinn_inputs,  // NOLINT
     const std::string &output_name = "") {
   if (FLAGS_cinn_ir_schedule) {
     cinn_inputs.emplace_back(output_name);
...
...
@@ -44,7 +44,7 @@ Module LowerToModule(const std::string test_name,
                      const std::shared_ptr<OpImpl> &impl,
                      std::vector<std::string> input_names,
                      const std::string &output_name,
-                     std::vector<ir::Tensor> &inputs,
+                     std::vector<ir::Tensor> &inputs,  // NOLINT
                      std::vector<common::CINNValue> cinn_inputs,
                      const Target &target) {
   Module::Builder builder("module", target);
...
...
@@ -67,7 +67,9 @@ std::unordered_map<std::string, int> special_attrs = {
     {"axes", 2},
     {"perm", 2}};

-bool IsSameSubexpression(Node* op1, Node* op2, shape_dict_t& shape_dict) {
+bool IsSameSubexpression(Node* op1,
+                         Node* op2,
+                         shape_dict_t& shape_dict) {  // NOLINT
   // Get the input edges for op1 and op2 in order.
   auto op1_in_edges = op1->inlinks_in_order();
   auto op2_in_edges = op2->inlinks_in_order();
...
@@ -201,13 +203,14 @@ void RemoveNodes(framework::Graph* graph, GraphNode* node) {
   graph->DropNode(node);
 }

-void RemoveNodes(framework::Graph* graph, std::vector<Node*>& nodes) {
+void RemoveNodes(framework::Graph* graph, const std::vector<Node*>& nodes) {
   for (auto* node : nodes) {
     RemoveNodes(graph, node);
   }
 }

-void RemoveNodes(framework::Graph* graph, std::vector<NodeData*>& nodes_data) {
+void RemoveNodes(framework::Graph* graph,
+                 const std::vector<NodeData*>& nodes_data) {
   for (auto* data : nodes_data) {
     if (std::find(graph->outputs.begin(), graph->outputs.end(), data) !=
         graph->outputs.end()) {
...
...
@@ -19,7 +19,7 @@
 namespace cinn {
 namespace frontend {

-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }
...
@@ -36,7 +36,7 @@ std::unordered_map<std::string, std::vector<float>> GetInputRandom(
 }

 std::unordered_map<std::string, std::vector<float>> RunModelTest(
-    Program& program,
+    Program& program,  // NOLINT
     const std::vector<std::string>&& passes,
     const std::unordered_map<std::string, std::vector<float>>& input_data,
     const std::unordered_set<std::string>& fetch_ids) {
...
...
@@ -19,11 +19,11 @@
 namespace cinn {
 namespace frontend {

-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }

-void RunModelTest(Program& program,
+void RunModelTest(Program& program,  // NOLINT
                   const std::vector<Variable>&& inputs,
                   const std::unordered_set<std::string>& fetch_ids) {
   // init input data.
...
...
@@ -130,7 +130,7 @@ class DotBuilder {
   const shape_dict_t& shape_dict() const { return shape_dict_; }

   // Currently the constructor of `NodeData` needs to pass in `Shared<Node>`.
-  NodeData* Var(common::Shared<Node>& producer) {
+  NodeData* Var(common::Shared<Node>& producer) {  // NOLINT
     auto* res = new NodeData(producer, 0, 0, node_name("var"), false);
     graph_->RegisterNode(producer->id(), res);
     graph_->RegisterNode(res->id(), producer.get());
...
...
@@ -19,11 +19,11 @@
 namespace cinn {
 namespace frontend {

-int GetSize(std::vector<int>& shape) {
+int GetSize(const std::vector<int>& shape) {
   return std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
 }

-void RunModelTest(Program& program,
+void RunModelTest(Program& program,  // NOLINT
                   const std::vector<Variable>&& inputs,
                   const std::unordered_set<std::string>& fetch_ids) {
   // init input data.
...
...
@@ -38,8 +38,8 @@ using dtype_dict_t = absl::flat_hash_map<std::string, common::Type>;
 using shape_dict_t = absl::flat_hash_map<std::string, framework::shape_t>;

 void InferShape(Node* node,
-                dtype_dict_t& dtype_dict,
-                shape_dict_t& shape_dict) {
+                dtype_dict_t& dtype_dict,  // NOLINT
+                shape_dict_t& shape_dict) {  // NOLINT
   VLOG(3) << "Begin InferShape of node " << node->id();
   auto op_infershape = Operator::GetAttrs<infershape_t>("infershape");
   auto op_inferdtype = Operator::GetAttrs<inferdtype_t>("inferdtype");
...
...
@@ -24,8 +24,9 @@ namespace pass {
 void InferShape(
     framework::Node* node,
-    absl::flat_hash_map<std::string, common::Type>& dtype_dict,
-    absl::flat_hash_map<std::string, framework::shape_t>& shape_dict);
+    absl::flat_hash_map<std::string, common::Type>& dtype_dict,  // NOLINT
+    absl::flat_hash_map<std::string, framework::shape_t>&
+        shape_dict);  // NOLINT

 }  // namespace pass
 }  // namespace hlir
...
...
@@ -20,7 +20,7 @@ namespace cinn {
 namespace frontend {

 std::unordered_map<std::string, std::vector<float>> RunModelTest(
-    Program& program,
+    Program& program,  // NOLINT
     const std::vector<std::string>&& passes,
     const std::unordered_map<std::string, std::vector<float>>& input_data,
     const std::unordered_set<std::string>& fetch_ids) {
...
...
@@ -39,7 +39,7 @@ namespace cinn {
 namespace hlir {
 namespace pe {

-void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
+void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                            const std::vector<int> &output_shape,
                            const common::Target &target) {
   VLOG(3) << "Before IRElementwiseSchedule, new ir is : "
...
@@ -67,7 +67,7 @@ void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
+void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<int> &output_shape,
                          const common::Target &target) {
   VLOG(3) << "Before IRInjectiveSchedule, new ir is : "
...
@@ -95,7 +95,7 @@ void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
+void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
                             const std::vector<int> &output_shape,
                             const common::Target &target,
                             bool vectorizable) {
...
@@ -132,7 +132,7 @@ void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,
+void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
                              const std::vector<int> &output_shape,
                              const common::Target &target) {
   VLOG(3) << "Begin IRCudaScheduleInjective ";
...
@@ -208,7 +208,7 @@ std::vector<common::CINNValue> IRCudaScheduleMatMul(
   return {common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
 }

-void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
+void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
                        const std::vector<int> &output_shape,
                        const common::Target &target) {
   auto all_blocks = ir_sch.GetAllBlocks();
...
@@ -221,7 +221,7 @@ void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
   ir_sch.Bind(loops[1], "threadIdx.x");
 }

-void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
+void IRMulScheduleCPU(ir::IRSchedule &ir_sch,  // NOLINT
                       const std::vector<int> &reduce_first_shape,
                       const common::Target &target) {
   ir_sch.MergeExprs();
...
@@ -238,7 +238,7 @@ void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
   }
 }

-void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
+void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<std::vector<int>> &output_shapes,
                          int axis,
                          const common::Target &target) {
...
@@ -334,7 +334,7 @@ void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                           ir::Tensor output,
                           int last_dimension_num,
                           const common::Target &target) {
...
@@ -390,7 +390,7 @@ void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
                                        ir::Tensor tmp_out,
                                        ir::Tensor out,
                                        const common::Target &target) {
...
@@ -478,7 +478,7 @@ void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                ir::Tensor reduce_tmp_out,
                                ir::Tensor tmp_out,
                                ir::Tensor out,
...
@@ -621,7 +621,7 @@ void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                       ir::Tensor reshape,
                                       ir::Tensor internal,
                                       ir::Tensor reduce_out,
...
@@ -880,7 +880,7 @@ void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
+void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                                 ir::Tensor reshape,
                                 ir::Tensor internal,
                                 ir::Tensor tmp_out,
...
@@ -991,7 +991,7 @@ void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
   //                  ir_sch.GetLoops(out->name)[0]);
 }

-void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {
+void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {  // NOLINT
   ir_sch.MergeExprs();
   auto all_blocks = ir_sch.GetAllBlocks();
   CHECK_EQ(all_blocks.size(), 3U);
...
@@ -1010,7 +1010,7 @@ void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis) {
   ir_sch.ComputeAt(all_blocks[1], loops[0]);
 }

-void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                        const common::Target &target,
                        int arg_pack_size) {
   VLOG(3) << "Before IRPoolScheduleGPU: "
...
@@ -1028,7 +1028,7 @@ void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
   VLOG(3) << "End IRPoolScheduleGPU: " << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                              const common::Target &target) {
   VLOG(3) << "Before IRGlobalPoolScheduleGPU: "
           << ir_sch.GetModule().GetExprs().at(0);
...
@@ -1071,7 +1071,7 @@ void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
+void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,  // NOLINT
                                  const std::vector<ir::Expr> &tensors) {
   if (tensors.size() == 3U) {
     CHECK(tensors[1].as_tensor());
...
@@ -1097,7 +1097,8 @@ void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target) {
+void IRCudaScheduleConv(ir::IRSchedule &ir_sch,  // NOLINT
+                        const common::Target &target) {
   VLOG(3) << "Begin IRCudaScheduleConv with expr: "
           << ir_sch.GetModule().GetExprs().at(0);
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
...
@@ -1238,10 +1239,10 @@ void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target) {
           << ir_sch.GetModule().GetExprs().at(0);
 }

-void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,
-                         ir::Tensor &input_pad,
-                         ir::Tensor &weights,
-                         ir::Tensor &output,
+void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor &input_pad,  // NOLINT
+                         ir::Tensor &weights,  // NOLINT
+                         ir::Tensor &output,  // NOLINT
                          const common::Target &target,
                          const std::string &key) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
...
...
@@ -31,20 +31,20 @@ namespace cinn {
 namespace hlir {
 namespace pe {

-void IRElementwiseSchedule(ir::IRSchedule &ir_sch,
+void IRElementwiseSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                            const std::vector<int> &output_shape,
                            const common::Target &target);

-void IRInjectiveSchedule(ir::IRSchedule &ir_sch,
+void IRInjectiveSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<int> &output_shape,
                          const common::Target &target);

-void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,
+void IRScheduleInjectiveCPU(ir::IRSchedule &ir_sch,  // NOLINT
                             const std::vector<int> &output_shape,
                             const common::Target &target,
                             bool vectorizable = true);

-void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,
+void IRCudaScheduleInjective(ir::IRSchedule &ir_sch,  // NOLINT
                              const std::vector<int> &output_shape,
                              const common::Target &target);
...
@@ -53,68 +53,69 @@ std::vector<common::CINNValue> IRCudaScheduleMatMul(
     const std::vector<int> &output_shape,
     const common::Target &target);

-void IRCudaScheduleMul(ir::IRSchedule &ir_sch,
+void IRCudaScheduleMul(ir::IRSchedule &ir_sch,  // NOLINT
                        const std::vector<int> &output_shape,
                        const common::Target &target);

-void IRMulScheduleCPU(ir::IRSchedule &ir_sch,
+void IRMulScheduleCPU(ir::IRSchedule &ir_sch,  // NOLINT
                       const std::vector<int> &reduce_first_shape,
                       const common::Target &target);

-void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,
+void IRCudaSplitSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                          const std::vector<std::vector<int>> &output_shapes,
                          int axis,
                          const common::Target &target);

-void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                           ir::Tensor out,
                           int last_dimension_num,
                           const common::Target &target);

-void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                ir::Tensor reduce_tmp_out,
                                ir::Tensor tmp_out,
                                ir::Tensor out,
                                const common::Target &target);

-void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockReduceInternal(ir::IRSchedule &ir_sch,  // NOLINT
                                        ir::Tensor tmp_out,
                                        ir::Tensor out,
                                        const common::Target &target);

-void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,
+void IRCudaScheduleBlockShuffleReduce(ir::IRSchedule &ir_sch,  // NOLINT
                                       ir::Tensor reshape,
                                       ir::Tensor internal,
                                       ir::Tensor out,
                                       const common::Target &target);

-void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,
+void IRCudaTwoStepReduceSchedule(ir::IRSchedule &ir_sch,  // NOLINT
                                 ir::Tensor reshape,
                                 ir::Tensor internal,
                                 ir::Tensor tmp_out,
                                 ir::Tensor out,
                                 const common::Target &target);

-void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1);
+void IRSoftmaxScheduleCPU(ir::IRSchedule &ir_sch, int axis = -1);  // NOLINT

-void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                        const common::Target &target,
                        int arg_pack_size = 3);

-void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,
+void IRCudaScheduleDepthwiseConv(ir::IRSchedule &ir_sch,  // NOLINT
                                  const std::vector<ir::Expr> &tensors);

-void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,
+void IRGlobalPoolScheduleGPU(ir::IRSchedule &ir_sch,  // NOLINT
                              const common::Target &target);

-void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,
-                         ir::Tensor &input_pad,
-                         ir::Tensor &weights,
-                         ir::Tensor &output,
+void IRCudaScheduleConv2(ir::IRSchedule &ir_sch,  // NOLINT
+                         ir::Tensor &input_pad,  // NOLINT
+                         ir::Tensor &weights,  // NOLINT
+                         ir::Tensor &output,  // NOLINT
                          const common::Target &target,
                          const std::string &key);

-void IRCudaScheduleConv(ir::IRSchedule &ir_sch, const common::Target &target);
+void IRCudaScheduleConv(ir::IRSchedule &ir_sch,  // NOLINT
+                        const common::Target &target);

 }  // namespace pe
 }  // namespace hlir
...
...
@@ -456,8 +456,8 @@ int GetTailSize(const std::vector<int>& inshape, const std::vector<int>& axes) {
 std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
                                          const std::vector<int>& axes,
-                                         bool& inbound,
-                                         int& tail) {
+                                         bool& inbound,  // NOLINT
+                                         int& tail) {  // NOLINT
   // post parallel size
   int post_parallel_size = GetPostParallelSize(shape, axes);
   // the size to unfold las reduce axis
...
...
@@ -41,8 +41,8 @@ std::vector<ir::Tensor> winograd_transform_matrices(const int& tile_size,
 std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
                                          const std::vector<int>& axes,
-                                         bool& inbound,
-                                         int& tail);
+                                         bool& inbound,  // NOLINT
+                                         int& tail);  // NOLINT

 }  // namespace pe
 }  // namespace hlir
...
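GetFirstStepReduceShape is a case where the style guide's preferred fix would be output pointers rather than suppression; this commit opts for the less invasive NOLINT. The two spellings side by side (GetFirstStepReduceShapePtr is a hypothetical name, shown for comparison only):

#include <vector>

// As kept in this commit: reference out-parameters, warning suppressed.
std::vector<int> GetFirstStepReduceShape(const std::vector<int>& shape,
                                         const std::vector<int>& axes,
                                         bool& inbound,  // NOLINT
                                         int& tail);     // NOLINT

// The pointer alternative makes mutation visible at the call site:
//   GetFirstStepReduceShapePtr(shape, axes, &inbound, &tail);
std::vector<int> GetFirstStepReduceShapePtr(const std::vector<int>& shape,
                                            const std::vector<int>& axes,
                                            bool* inbound,
                                            int* tail);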
...
@@ -624,7 +624,7 @@ void PoolScheduleCPU(poly::StageMap stages,
 }

 void PoolScheduleGPU(poly::StageMap stages,
-                     ir::Tensor &output,
+                     const ir::Tensor &output,
                      const common::Target &target) {
   CHECK_GE(stages[output]->axis_names().size(), 4);
   stages[output]->Fuse({0, 1, 2, 3});
...
@@ -866,7 +866,7 @@ void CreateX86SerialData(const std::string &file_name) {
 void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
                                    const ir::Tensor &res,
-                                   ir::Tensor &packed_out,
+                                   ir::Tensor &packed_out,  // NOLINT
                                    const ir::Tensor &input_pad,
                                    const ir::Tensor &weights_dilation,
                                    const ir::Tensor &data,
...
@@ -1017,7 +1017,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
 void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
                                           const ir::Tensor &res,
-                                          ir::Tensor &packed_out,
+                                          ir::Tensor &packed_out,  // NOLINT
                                           const ir::Tensor &input_pad,
                                           const ir::Tensor &weights_dilation,
                                           const ir::Tensor &data,
...
@@ -1139,7 +1139,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
 void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
                                       const ir::Tensor &res,
-                                      ir::Tensor &packed_out,
+                                      ir::Tensor &packed_out,  // NOLINT
                                       const ir::Tensor &input_pad,
                                       const ir::Tensor &weights_dilation,
                                       const ir::Tensor &data,
...
@@ -1244,7 +1244,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
 void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
                                const ir::Tensor &res,
-                               ir::Tensor &packed_out,
+                               ir::Tensor &packed_out,  // NOLINT
                                const ir::Tensor &input_pad,
                                const ir::Tensor &weights_dilation,
                                const ir::Tensor &data,
...
@@ -1377,7 +1377,7 @@ void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
 void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse(
     poly::StageMap stages,
     const ir::Tensor &res,
-    ir::Tensor &packed_out,
+    ir::Tensor &packed_out,  // NOLINT
     const ir::Tensor &input_pad,
     const ir::Tensor &weights_dilation,
     const ir::Tensor &data,
...
@@ -2300,7 +2300,7 @@ void SaveSerialData(
 }

 void CudaScheduleDepthwiseConv(poly::StageMap stages,
-                               ir::Tensor &output,
+                               ir::Tensor &output,  // NOLINT
                                const common::Target &target) {
   auto OL = stages[output]->CacheWrite("local", stages, output);
   stages[output]->Bind(0, "blockIdx.x");
...
@@ -2313,9 +2313,9 @@ void CudaScheduleDepthwiseConv(poly::StageMap stages,
 }

 void CudaScheduleConv(poly::StageMap stages,
-                      ir::Tensor &input_pad,
-                      ir::Tensor &weights,
-                      ir::Tensor &output,
+                      ir::Tensor &input_pad,  // NOLINT
+                      ir::Tensor &weights,  // NOLINT
+                      ir::Tensor &output,  // NOLINT
                       const common::Target &target) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
   int n = output->shape[0].as_int32();
...
@@ -2382,9 +2382,9 @@ void CudaScheduleConv(poly::StageMap stages,
 }

 void CudaScheduleConv2(poly::StageMap stages,
-                       ir::Tensor &input_pad,
-                       ir::Tensor &weights,
-                       ir::Tensor &output,
+                       ir::Tensor &input_pad,  // NOLINT
+                       ir::Tensor &weights,  // NOLINT
+                       ir::Tensor &output,  // NOLINT
                        const common::Target &target,
                        const std::string &key) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
...
@@ -2516,7 +2516,7 @@ void CudaScheduleConv2(poly::StageMap stages,
 }

 void CudaScheduleWinogradConv(poly::StageMap wino_stages,
-                              std::vector<ir::Tensor> &all_tensors,
+                              std::vector<ir::Tensor> &all_tensors,  // NOLINT
                               const common::Target &target) {
   auto &res = ScheduleParam::get_cuda_instance().GetParam();
   auto &wino_weights_dilation = all_tensors[0];
...
...
@@ -124,7 +124,7 @@ void GetConv2d1x1Factors(absl::flat_hash_map<std::string, int> *factors,
 void Conv2d_NCHWc_Schedule_CPU(poly::StageMap stages,
                                const ir::Tensor &res,
-                               ir::Tensor &packed_out,
+                               ir::Tensor &packed_out,  // NOLINT
                                const ir::Tensor &input_pad,
                                const ir::Tensor &weights_dilation,
                                const ir::Tensor &data,
...
@@ -138,12 +138,12 @@ void PoolScheduleCPU(poly::StageMap stages,
                      const ir::Tensor &output,
                      const common::Target &target);

 void PoolScheduleGPU(poly::StageMap stages,
-                     ir::Tensor &output,
+                     const ir::Tensor &output,
                      const common::Target &target);

 void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
                                       const ir::Tensor &res,
-                                      ir::Tensor &packed_out,
+                                      ir::Tensor &packed_out,  // NOLINT
                                       const ir::Tensor &input_pad,
                                       const ir::Tensor &weights_dilation,
                                       const ir::Tensor &data,
...
@@ -151,7 +151,7 @@ void Conv2d_NCHWc_Schedule_CPU_Nofuse(poly::StageMap stages,
 void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
                                    const ir::Tensor &res,
-                                   ir::Tensor &packed_out,
+                                   ir::Tensor &packed_out,  // NOLINT
                                    const ir::Tensor &input_pad,
                                    const ir::Tensor &weights_dilation,
                                    const ir::Tensor &data,
...
@@ -161,7 +161,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU(poly::StageMap stages,
 void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
                                           const ir::Tensor &res,
-                                          ir::Tensor &packed_out,
+                                          ir::Tensor &packed_out,  // NOLINT
                                           const ir::Tensor &input_pad,
                                           const ir::Tensor &weights_dilation,
                                           const ir::Tensor &data,
...
@@ -170,7 +170,7 @@ void Conv2d_NCHWc_1X1_Schedule_CPU_Nofuse(poly::StageMap stages,
 void Depthwise_Conv2d_NCHWc_Schedule_CPU_Nofuse(
     poly::StageMap stages,
     const ir::Tensor &res,
-    ir::Tensor &packed_out,
+    ir::Tensor &packed_out,  // NOLINT
     const ir::Tensor &input_pad,
     const ir::Tensor &weights_dilation,
     const ir::Tensor &data,
...
@@ -218,23 +218,23 @@ void CudaTwoStepReduceSchedule(poly::StageMap stages,
                                const common::Target &target);

 void CudaScheduleDepthwiseConv(poly::StageMap stages,
-                               ir::Tensor &output,
+                               ir::Tensor &output,  // NOLINT
                                const common::Target &target);

 void CudaScheduleConv(poly::StageMap stages,
-                      ir::Tensor &input_pad,
-                      ir::Tensor &weights,
-                      ir::Tensor &output,
+                      ir::Tensor &input_pad,  // NOLINT
+                      ir::Tensor &weights,  // NOLINT
+                      ir::Tensor &output,  // NOLINT
                       const common::Target &target);

 void CudaScheduleWinogradConv(poly::StageMap wino_stages,
-                              std::vector<ir::Tensor> &all_tensors,
+                              std::vector<ir::Tensor> &all_tensors,  // NOLINT
                               const common::Target &target);

 void CudaScheduleConv2(poly::StageMap stages,
-                       ir::Tensor &input_pad,
-                       ir::Tensor &weights,
-                       ir::Tensor &output,
+                       ir::Tensor &input_pad,  // NOLINT
+                       ir::Tensor &weights,  // NOLINT
+                       ir::Tensor &output,  // NOLINT
                        const common::Target &target,
                        const std::string &key);
...
@@ -96,7 +96,7 @@ class ScheduleImpl {
                   int write_buffer_index,
                   const std::string& memory_type);
   void SyncThreads(const Expr& ir_node, bool after_node = true);
-  void SetBuffer(Expr& block,
+  void SetBuffer(Expr& block,  // NOLINT
                  const std::string& memory_type,
                  bool fixed = false);
   Expr Reorder(const std::vector<Expr>& loops);
@@ -114,7 +114,7 @@ class ScheduleImpl {
   Expr Rfactor(const Expr& rf_loop, int rf_axis);
   Expr AddUnitLoop(const Expr& block) const;
   void Annotate(const Expr& block, const std::string& key, const attr_t& value);
-  void Unannotate(Expr& block, const std::string& key);
+  void Unannotate(Expr& block, const std::string& key);  // NOLINT
   void FlattenLoops(const std::vector<Expr>& loops,
                     const bool force_flat = false);
   void CopyTransformAndLoopInfo(const Expr& block, const Expr& block_target);
......
@@ -247,9 +247,9 @@ class IRSchedule {
    * \param memory_type The memory type we want to set. Should be "local",
    * "shared" or "global".
    */
-  void SetBuffer(Expr& block,
+  void SetBuffer(Expr& block,  // NOLINT
                  const std::string& memory_type,
-                 bool fixed = false);
+                 bool fixed = false);  // NOLINT
 
   /**
    * \brief Reorder the loops in the order of vector.
@@ -391,7 +391,7 @@ class IRSchedule {
    * \param block The block to be unannotated
    * \param key The attribute key
    */
-  void Unannotate(Expr& block, const std::string& key);
+  void Unannotate(Expr& block, const std::string& key);  // NOLINT
 
   /*!
    * \brief flatten the loops in one dim.
@@ -620,7 +620,7 @@ class LeafBlockRemovalPlan : public ir::IRMutator<> {
 class ComputeInlineChecker : public ir::IRMutator<> {
  public:
-  ComputeInlineChecker(IRSchedule& schedule, Expr& block)
+  ComputeInlineChecker(IRSchedule& schedule, Expr& block)  // NOLINT
       : ir_schedule_(schedule), block_(block) {}
   bool Check();
......
@@ -503,7 +503,7 @@ Expr MakeCacheBlock(const std::vector<IterRange>& buffer_ranges,
   return block;
 }
 
-void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write) {
+void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write) {
   Expr find_tensor =
       is_write ? Expr(info->write_tensor) : Expr(info->read_tensor);
   auto find_produce_read =
@@ -651,7 +651,7 @@ Expr ConstructOtherStmtChain(const std::vector<Expr>& stmts,
 Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
                            const std::vector<Expr>& ordered_loops,
                            const std::set<Expr, CompExpr>& loop_set,
-                           std::vector<Expr>& if_nodes) {
+                           std::vector<Expr>& if_nodes) {  // NOLINT
   std::vector<std::set<std::string>> condition_vars;
   // In each IfThenElse node, find the vars its condition depends on.
   for (auto& if_expr : if_nodes) {
@@ -923,7 +923,7 @@ void CheckComputeAtValidation(const Expr& block,
   CHECK(find_block_in_loop.empty()) << "loop should not be block's ancestor!";
 }
 
-void InsertBlock(Expr& for_loop, const Expr& insertion, int index) {
+void InsertBlock(Expr& for_loop, const Expr& insertion, int index) {  // NOLINT
   CHECK(for_loop.As<ir::For>());
   CHECK(for_loop.As<ir::For>()->body.As<Block>());
   ir::Block* dst_block = for_loop.As<ir::For>()->body.As<Block>();
......
@@ -326,7 +326,7 @@ Expr MakeCacheBlock(const std::vector<IterRange>& buffer_ranges,
  * @param info The information of cache block.
  * @param is_write Are we inserting a write cache tensor or a read cache tensor.
  */
-void FindInsertionPoint(Expr& root, CacheBlockInfo* info, bool is_write);
+void FindInsertionPoint(const Expr& root, CacheBlockInfo* info, bool is_write);
 
 /**
  * \brief Given a vector of For loops, return a set of them.
@@ -359,7 +359,7 @@ std::vector<Expr> GetLoopsInRange(const Expr& top, const Expr& bottom);
 Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
                            const std::vector<Expr>& ordered_loops,
                            const std::set<Expr, CompExpr>& loop_set,
-                           std::vector<Expr>& if_nodes);
+                           std::vector<Expr>& if_nodes);  // NOLINT
 
 /*!
  * \brief Find producers of block in root.
@@ -395,7 +395,9 @@ void CheckComputeAtValidation(const Expr& block,
  * - `index = -1` means inserted into the tail
  * - otherwise, it should be a index between [0, stmts size)
  */
-void InsertBlock(Expr& for_loop, const Expr& insertion, int index = 0);
+void InsertBlock(Expr& for_loop,  // NOLINT
+                 const Expr& insertion,
+                 int index = 0);  // NOLINT
 
 /*!
  * \brief Make a union of two range. The detailed function is :
......
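Note the wrapped InsertBlock declaration just above: NOLINT comments are line-scoped, so once clang-format splits a signature across several lines, a suppression has to sit on the physical line cpplint complains about rather than once at the end of the declaration. A sketch of the same pattern with a hypothetical function, not taken from the patch:

struct Expr {};

// Single-line form: one trailing comment covers the whole declaration.
//   void ReplaceBody(Expr &dst, const Expr &src, int index = 0);  // NOLINT

// Wrapped form: the suppression rides on the flagged line itself.
void ReplaceBody(Expr &dst,  // NOLINT
                 const Expr &src,
                 int index = 0);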
@@ -49,7 +49,7 @@ void CheckNoIslCallRemains(Expr* expr) {
   }
 }
 
-void BindBuffer(StageMap& stages) {
+void BindBuffer(StageMap& stages) {  // NOLINT
   absl::flat_hash_map<std::string, ir::_Tensor_*> tensor_map;
   for (auto& stage : stages) {
     tensor_map[stage.second->tensor()->name] = stage.second->tensor();
@@ -71,11 +71,11 @@ void BindBuffer(StageMap& stages) {
   }
 }
 
-Expr LowerGroup(
-    const poly::ScheduleGroup& group,
-    const std::map<std::string, Expr>& tuple_to_expr,
-    std::map<std::string, ir::Tensor>* global_tensor_map,
-    std::unordered_map<std::string, std::vector<Expr>>& resized_buffer_cache,
-    StageMap stage_map,
-    ir::CudaAxisInfo* cuda_axis_info) {
+Expr LowerGroup(const poly::ScheduleGroup& group,
+                const std::map<std::string, Expr>& tuple_to_expr,
+                std::map<std::string, ir::Tensor>* global_tensor_map,
+                std::unordered_map<std::string, std::vector<Expr>>&
+                    resized_buffer_cache,  // NOLINT
+                StageMap stage_map,
+                ir::CudaAxisInfo* cuda_axis_info) {
   BindBuffer(stage_map);
......
@@ -69,7 +69,7 @@ void CheckNoIslCallRemains(const Expr* expr);
 Expr LowerGroup(const poly::ScheduleGroup& group,
                 const std::map<std::string, Expr>& tuple_to_expr,
                 std::map<std::string, Tensor>* global_tensor_map,
-                std::unordered_set<std::string>& resized_buffer,
+                std::unordered_set<std::string>& resized_buffer,  // NOLINT
                 StageMap stage_map,
                 ir::CudaAxisInfo* cuda_axis_info = nullptr);
......
@@ -55,7 +55,8 @@ void PartialSimplify(
 //! Simplify the expression but Load.
 struct SimplifyButStoreLoadMutator : public ir::IRMutator<ir::Expr*> {
   common::cas_intervals_t& var_intervals;
-  explicit SimplifyButStoreLoadMutator(common::cas_intervals_t& var_intervals)
+  explicit SimplifyButStoreLoadMutator(
+      common::cas_intervals_t& var_intervals)  // NOLINT
       : var_intervals(var_intervals) {}
 
   void operator()(Expr* x) { ir::IRMutator<ir::Expr*>::Visit(x, x); }
......
@@ -177,7 +177,7 @@ bool IsBetween(const common::GraphNode* x,
   return false;
 }
 
-std::vector<Group> TopoSortGroups(std::vector<Group>& groups) {
+std::vector<Group> TopoSortGroups(std::vector<Group>& groups) {  // NOLINT
   // collect indegree.
   absl::flat_hash_map<Group*, int> group_indegree;
   std::vector<Group*> start_groups;
......
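For background, the style rule behind runtime/references prefers that an argument a function will mutate be passed by pointer, making the write visible at each call site; the NOLINT route taken here is the lighter option when changing a signature would ripple through many callers. A hypothetical before/after, not taken from the patch:

#include <vector>

// Flagged form: callers cannot tell that 'groups' will be reordered.
void SortGroupsInPlace(std::vector<int> &groups);  // NOLINT(runtime/references)

// Style-preferred alternative: mutation is explicit at the call site,
// e.g. SortGroupsByPointer(&my_groups);
void SortGroupsByPointer(std::vector<int> *groups);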
@@ -40,7 +40,7 @@
 namespace cinn {
 namespace poly {
 
-void RemoveDuplicate(std::vector<std::vector<Expr>> &indices) {
+void RemoveDuplicate(std::vector<std::vector<Expr>> &indices) {  // NOLINT
   std::set<std::string> temp;
   for (int i = 0; i < indices.size(); i++) {
     std::string index_str = "";
@@ -309,7 +309,7 @@ int Minus(const Expr &a, const Expr &b) {
 }
 
 // Return the range = max - min among all indices[i][axis](i = 0,1,2,...)
-int GetRange(std::vector<std::vector<Expr>> &indices, int axis) {
+int GetRange(std::vector<std::vector<Expr>> &indices, int axis) {  // NOLINT
   Expr max_expr = indices[0][axis];
   Expr min_expr = indices[0][axis];
   for (auto i = 1; i < indices.size(); i++) {
@@ -1420,7 +1420,7 @@ struct CacheReplaceMutator : public ir::IRMutator<> {
 };
 }  // namespace
 
-void CacheReadWriteReplace(std::vector<ir::Tensor> &readers,
+void CacheReadWriteReplace(const std::vector<ir::Tensor> &readers,
                            ir::Tensor cache_tensor,
                            std::string origin_tensor_name) {
   for (auto k : readers) {
......
@@ -270,7 +270,7 @@ class Stage : public Object {
    * @param readers the readers of the \p tensor
    */
   ir::Tensor CacheRead(const std::string& memory_type,
-                       std::vector<ir::Tensor>& readers,
+                       std::vector<ir::Tensor>& readers,  // NOLINT
                        poly::StageMap stages);
 
   /**
@@ -303,7 +303,8 @@ class Stage : public Object {
   void ShowISL() const;
 
-  void AddForLoopInTransform(std::vector<std::vector<Expr>>& indices);
+  void AddForLoopInTransform(
+      std::vector<std::vector<Expr>>& indices);  // NOLINT
   /**
    * Create a cache for write to the original tensor.
    * @param tensor the tensor to create the cache for.
@@ -312,7 +313,7 @@ class Stage : public Object {
    */
   ir::Tensor CacheWrite(const std::string& memory_type,
                         poly::StageMap stages,
-                        ir::Tensor& key_tensor);
+                        ir::Tensor& key_tensor);  // NOLINT
 
   /**
    * Generate the `syncthreads()` code to sync all threads on CUDA backends.
......
@@ -236,12 +236,12 @@ inline auto __binary_op_fn_dispatch(T1 x, T2 y, F fn, std::false_type) {
 template <typename T1, typename T2, typename F>
 inline void __binary_op_visitor_dispatch(
-    CINNValue &v, T1 lhs, T2 rhs, F fn, std::true_type) {
+    CINNValue &v, T1 lhs, T2 rhs, F fn, std::true_type) {  // NOLINT
   v = CINNValue();
 }
 
 template <typename T1, typename T2, typename F>
 inline void __binary_op_visitor_dispatch(
-    CINNValue &v, T1 lhs, T2 rhs, F fn, std::false_type) {
+    CINNValue &v, T1 lhs, T2 rhs, F fn, std::false_type) {  // NOLINT
   v.Set(fn(lhs, rhs));
 }
......
@@ -500,25 +500,25 @@ __host__ __device__ inline float16 operator-(const float16& a) {
 #endif
 }
 
-__host__ __device__ inline float16& operator+=(float16& a,
+__host__ __device__ inline float16& operator+=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a + b;
   return a;
 }
 
-__host__ __device__ inline float16& operator-=(float16& a,
+__host__ __device__ inline float16& operator-=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a - b;
   return a;
 }
 
-__host__ __device__ inline float16& operator*=(float16& a,
+__host__ __device__ inline float16& operator*=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a * b;
   return a;
 }
 
-__host__ __device__ inline float16& operator/=(float16& a,
+__host__ __device__ inline float16& operator/=(float16& a,  // NOLINT
                                                const float16& b) {  // NOLINT
   a = a / b;
   return a;
......
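The float16 compound-assignment operators are a place where const-qualifying is impossible by construction: operator+= and friends must take and return a mutable reference to the left operand, so the per-line suppression is the only available fix. A self-contained sketch with a hypothetical f16 stand-in, not taken from the patch:

struct f16 {
  float v;
};

inline f16 operator+(const f16 &a, const f16 &b) { return f16{a.v + b.v}; }

// The language requires compound assignment to mutate 'a' and return it by
// reference, so the parameter cannot be const.
inline f16 &operator+=(f16 &a, const f16 &b) {  // NOLINT
  a = a + b;
  return a;
}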