Unverified commit 011f97bc, authored by Huihuang Zheng, committed by GitHub

[CINN] Remove the remaining old schedule; with this change it is completely removed. (#55566)

Remove the remaining old (pre-IR-schedule) code paths and the FLAGS_cinn_ir_schedule flag that guarded them.
Parent 669bcf54
@@ -32,7 +32,6 @@
 #include "paddle/cinn/runtime/flags.h"
 DECLARE_bool(auto_schedule_use_cost_model);
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace auto_schedule {
@@ -70,8 +69,6 @@ class TestAutoTuner : public ::testing::Test {
   void SetUp() override {
     srand(0);
-    // AutoTuner is combined with new IR Schedule
-    FLAGS_cinn_ir_schedule = true;
     std::unordered_set<std::string> fetch_ids;
     auto program = CreateAddReluProgram();
     auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
......
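With the flag and its assignment gone, the auto-tuning test fixtures need no special setup to use the IR schedule: it is now the only schedule path. Below is a minimal, hypothetical gtest sketch of what such a fixture looks like after this change; `AutoTunerFixtureSketch` is an illustrative name, and the real test's CINN helpers (`CreateAddReluProgram`, graph building, the tuner itself) are deliberately left out.

```cpp
#include <cstdlib>
#include <gtest/gtest.h>

// Sketch only: a post-change fixture sets nothing related to
// FLAGS_cinn_ir_schedule before building programs or running the tuner.
class AutoTunerFixtureSketch : public ::testing::Test {
 protected:
  void SetUp() override {
    srand(0);  // fixed seed keeps the tuner's random sampling reproducible
    // Previously the fixture also did:
    //   FLAGS_cinn_ir_schedule = true;
    // That line, and the flag itself, no longer exist.
  }
};

TEST_F(AutoTunerFixtureSketch, SetUpNeedsNoScheduleFlag) {
  SUCCEED();  // placeholder; the real tests build a graph and run the tuner
}
```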
@@ -27,8 +27,6 @@
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace auto_schedule {
@@ -55,7 +53,6 @@ class TestMeasurer : public ::testing::Test {
   std::vector<MeasureInput> inputs;
   void SetUp() override {
-    FLAGS_cinn_ir_schedule = true;
 #ifdef CINN_WITH_CUDA
     Target target = common::DefaultNVGPUTarget();
 #else
......
@@ -40,8 +40,6 @@
 #include "paddle/cinn/utils/string.h"
 #include "test/cpp/cinn/concrete_program_builder.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace auto_schedule {
@@ -155,7 +153,6 @@ TEST(AutoInline, AddReluInline) {
   frontend::Program program = builder.Build();
-  FLAGS_cinn_ir_schedule = true;
   auto graph = std::make_shared<Graph>(program, target);
   hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
......
@@ -29,7 +29,6 @@
 #include "paddle/cinn/utils/type_defs.h"
 DECLARE_bool(auto_schedule_use_cost_model);
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace auto_schedule {
@@ -70,7 +69,6 @@ std::shared_ptr<hlir::framework::Graph> CreateAddProgram(
 TEST(TestTaskRegistry, basic) {
   FLAGS_auto_schedule_use_cost_model = true;
-  FLAGS_cinn_ir_schedule = true;
 #ifdef CINN_WITH_CUDA
   Target target = common::DefaultNVGPUTarget();
......
@@ -35,8 +35,6 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 #include "paddle/cinn/utils/string.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace auto_schedule {
@@ -59,8 +57,6 @@ Program CreateAddProgram() {
 }
 TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
-  // Auto tuner is combined with IR schedule
-  FLAGS_cinn_ir_schedule = true;
   Context::Global().ResetNameId();
 #ifdef CINN_WITH_CUDA
   Target target = common::DefaultNVGPUTarget();
@@ -170,8 +166,6 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
 }
 TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
-  // Auto tuner is combined with IR schedule
-  FLAGS_cinn_ir_schedule = true;
   Context::Global().ResetNameId();
 #ifdef CINN_WITH_CUDA
   Target target = common::DefaultNVGPUTarget();
......
@@ -30,8 +30,6 @@
 #include "paddle/cinn/poly/stage.h"
 #include "paddle/cinn/utils/profiler.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
......
@@ -19,7 +19,6 @@
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "paddle/cinn/optim/transform_gpu_forloop.h"
-DECLARE_bool(cinn_ir_schedule);
 DECLARE_bool(cinn_use_cuda_vectorize);
 namespace cinn {
@@ -52,7 +51,6 @@ std::vector<ir::LoweredFunc> OpLowerer::Lower(const GroupPtr& group,
           << " , Op Pattern : " << group->op_pattern_kind;
   group->input_names.clear();
   group->output_names.clear();
-  if (FLAGS_cinn_ir_schedule) {
   switch (group->op_pattern_kind) {
     case framework::kElementWise:
     case framework::kBroadcast:
@@ -76,9 +74,6 @@ std::vector<ir::LoweredFunc> OpLowerer::Lower(const GroupPtr& group,
     default:
       LOG(FATAL) << "Group Pattern Kind Is Unknown!";
   }
-  } else {
-    LOG(FATAL) << "Previous IR Schedule Is Not Implemented!";
-  }
 }
 bool OpLowerer::ElementwiseScheduleDetermineFunction(Node* node) {
......
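The key change in `OpLowerer::Lower` is that the `if (FLAGS_cinn_ir_schedule) { ... } else { LOG(FATAL) ... }` wrapper disappears and the function dispatches directly on the fusion group's pattern kind. The following self-contained sketch shows that control-flow shape; the enum values and the returned descriptions are simplified stand-ins, not the real CINN declarations or lowering routines.

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// Simplified stand-in for framework::OpPatternKind (illustrative values only).
enum class OpPatternKind { kElementWise, kBroadcast, kReduction, kNonFusible };

// Sketch: one unconditional dispatch per group, no legacy-schedule branch.
std::string LowerGroupSketch(OpPatternKind kind) {
  switch (kind) {
    case OpPatternKind::kElementWise:
    case OpPatternKind::kBroadcast:
      return "lower with the injective IR schedule";
    case OpPatternKind::kReduction:
      return "lower with the reduction IR schedule";
    case OpPatternKind::kNonFusible:
      return "lower the group op by op";
  }
  // Mirrors the kept default case: LOG(FATAL) << "Group Pattern Kind Is Unknown!";
  throw std::runtime_error("Group Pattern Kind Is Unknown!");
}

int main() {
  std::cout << LowerGroupSketch(OpPatternKind::kBroadcast) << "\n";
  return 0;
}
```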
@@ -27,8 +27,6 @@
 #include "paddle/cinn/hlir/pe/broadcast.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
@@ -57,7 +55,6 @@ TEST(Operator, GetAttrs) {
   std::string func_name = "add1";
-  if (FLAGS_cinn_ir_schedule) {
   std::string out_name = "C";
   common::CINNValuePack cinn_input =
       common::CINNValuePack{{common::CINNValue(A),
@@ -72,21 +69,6 @@ TEST(Operator, GetAttrs) {
     LOG(INFO) << "Test Operator_ElementWise_Add_Test0's Strategy, func is :\n"
               << func;
   }
-  } else {
-    common::CINNValuePack cinn_input =
-        common::CINNValuePack{{common::CINNValue(A), common::CINNValue(B)}};
-    common::CINNValuePack rets = impl->fcompute(cinn_input);
-    ASSERT_EQ(rets.size(), 2UL);
-    rets = impl->fschedule(rets);
-    ASSERT_EQ(rets.size(), 2UL);
-    // the last element is a StageMap
-    for (int i = 0; i < rets->size() - 1; i++) {
-      ir::Expr temp = rets[i];
-      inputs.push_back(temp.as_tensor_ref());
-    }
-    auto func = Lower(func_name, rets.back(), inputs);
-    LOG(INFO) << "Test Strategy Codegen:\n" << func;
-  }
 }
 }  // namespace framework
......
@@ -26,8 +26,6 @@
 #include "paddle/cinn/ir/layout.h"
 #include "paddle/cinn/ir/op/ir_operators.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -35,8 +35,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -35,8 +35,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -38,7 +38,6 @@
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
......
@@ -38,8 +38,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -38,7 +38,6 @@
 #include "paddle/cinn/ir/tensor.h"
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
......
@@ -39,8 +39,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -38,8 +38,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -38,8 +38,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
@@ -163,12 +161,9 @@ std::shared_ptr<framework::OpStrategy> StrategyForRepeat(
     auto tensor_A = A.as_tensor_ref();
     VLOG(3) << "A shape: " << utils::Join(tensor_A->shape, ", ")
             << ", output_shapes: " << utils::Join(output_shapes[0], ", ");
-    std::string tensor_name = common::UniqName("T_Repeat_out");
-    if (FLAGS_cinn_ir_schedule) {
     CHECK_EQ(pack_args.size(), 2U);
-      tensor_name = pack_args[1].operator std::string();
-    }
+    std::string tensor_name = pack_args[1].operator std::string();
     std::vector<ir::Tensor> out = Repeat(tensor_A, repeats, axis, tensor_name);
     CHECK(out.size() == 1U) << "The size of Repeat's output should be 1";
@@ -186,7 +181,6 @@ std::shared_ptr<framework::OpStrategy> StrategyForRepeat(
   framework::CINNSchedule repeat_schedule([=](lang::Args args,
                                               lang::RetValue *ret) {
-    if (FLAGS_cinn_ir_schedule) {
     CHECK(!args.empty())
         << "The input argument of repeat schedule is empty! Please check.\n";
     common::CINNValuePack arg_pack = args[0];
@@ -209,21 +203,12 @@ std::shared_ptr<framework::OpStrategy> StrategyForRepeat(
       if (target.arch == Target::Arch::NVGPU) {
         pe::IRCudaScheduleInjective(ir_sch, output_shapes.front(), target);
       } else if (target.arch == Target::Arch::X86) {
-        pe::IRScheduleInjectiveCPU(
-            ir_sch, output_shapes.front(), target, true);
+        pe::IRScheduleInjectiveCPU(ir_sch, output_shapes.front(), target, true);
       }
     }
     std::vector<common::CINNValue> res{
         common::CINNValue(ir_sch.GetModule().GetExprs().at(0))};
     *ret = common::CINNValuePack{res};
-    } else {
-      CHECK(!args.empty())
-          << "The input argument of repeat schedule is empty! Please check.\n";
-      CINNValuePack arg_pack = args[0];
-      Expr out = arg_pack[0];
-      CHECK(out.as_tensor());
-      *ret = arg_pack;
-    }
   });
   auto strategy = std::make_shared<framework::OpStrategy>();
......
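`StrategyForRepeat` shows the pattern repeated across the op strategies touched by this diff: the output tensor name is no longer a `common::UniqName("T_Repeat_out")` fallback; it is always taken from the last entry of `pack_args`, and the schedule lambda always drives the IR schedule path. A small self-contained sketch of that naming contract is below; `PackArg` is a hypothetical stand-in for `common::CINNValue`, and the example values are illustrative only.

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical stand-in for common::CINNValue, which also converts to
// std::string via `operator std::string()`.
struct PackArg {
  std::string value;
  explicit operator std::string() const { return value; }
};

// Contract the new-IR path now relies on unconditionally:
// pack_args = {input tensor, output tensor name}.
std::string RepeatOutputName(const std::vector<PackArg>& pack_args) {
  assert(pack_args.size() == 2u);  // mirrors CHECK_EQ(pack_args.size(), 2U)
  return static_cast<std::string>(pack_args[1]);
}

int main() {
  std::vector<PackArg> pack_args = {{"tensor_A"}, {"T_Repeat_out_0"}};
  assert(RepeatOutputName(pack_args) == "T_Repeat_out_0");
  return 0;
}
```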
@@ -37,8 +37,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -38,8 +38,6 @@
 #include "paddle/cinn/lang/builtin.h"
 #include "paddle/cinn/lang/compute.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -27,8 +27,6 @@
 #include "paddle/cinn/ir/op/ir_operators.h"
 #include "paddle/cinn/utils/functional.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -29,8 +29,6 @@
 #include "paddle/cinn/hlir/pe/broadcast.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
......
@@ -30,8 +30,6 @@
 #include "paddle/cinn/hlir/pe/nn.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
@@ -49,7 +47,6 @@ Module LowerToModule(const std::string test_name,
                      const Target &target) {
   Module::Builder builder("module", target);
-  if (FLAGS_cinn_ir_schedule) {
   cinn_inputs.emplace_back(output_name);
   common::CINNValuePack cinn_input = common::CINNValuePack{cinn_inputs};
   input_names.push_back(output_name);
@@ -61,21 +58,6 @@ Module LowerToModule(const std::string test_name,
     LOG(INFO) << "Test" << test_name << "'s Strategy, func is :\n" << func;
     builder.AddFunction(func);
   }
-  } else {
-    common::CINNValuePack cinn_input = common::CINNValuePack{cinn_inputs};
-    common::CINNValuePack rets = impl->fcompute(cinn_input);
-    rets = impl->fschedule(rets);
-    // the last element is a StageMap
-    for (int i = 0; i < rets->size() - 1; i++) {
-      Expr temp = rets[i];
-      inputs.push_back(temp.as_tensor_ref());
-    }
-    auto func = Lower("fn_" + func_name, rets.back(), inputs);
-    LOG(INFO) << "Test Strategy Codegen:\n" << func;
-    builder.AddFunction(func);
-  }
   return builder.Build();
 }
......
@@ -39,7 +39,6 @@
 #include "paddle/cinn/hlir/pe/nn.h"
 #include "paddle/cinn/runtime/cinn_runtime.h"
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
@@ -94,34 +93,14 @@ std::pair<ir::Module, std::string> GenReduceCode(
       strategy(attrs, inputs, out_type, {output_shape}, target));
   std::vector<ir::LoweredFunc> func;
-  if (!FLAGS_cinn_ir_schedule) {
-    common::CINNValuePack cinn_input =
-        common::CINNValuePack{{common::CINNValue(X)}};
-    common::CINNValuePack rets = impl->fcompute(cinn_input);
-    rets = impl->fschedule(rets);
-    poly::StageMap stages = rets.back();
-    // the last element is a StageMap
-    for (int i = 0; i < rets->size() - 1; i++) {
-      Expr temp = rets[i];
-      if (!temp.as_tensor_ref()->buffer.defined() &&
-          !stages[temp.as_tensor_ref()]->inlined()) {
-        inputs.push_back(temp.as_tensor_ref());
-      }
-    }
-    func =
-        lang::LowerVec(func_name, rets.back(), inputs, {}, {}, nullptr, target);
-  } else {
   std::vector<std::string> input_output_nodes{"X", op_name};
-    func = GetFuncFromImpl(impl,
-                           common::CINNValuePack{{common::CINNValue(X),
-                                                  common::CINNValue(op_name)}},
-                           inputs,
-                           input_output_nodes,
-                           func_name,
-                           target);
-  }
+  func = GetFuncFromImpl(
+      impl,
+      common::CINNValuePack{{common::CINNValue(X), common::CINNValue(op_name)}},
+      inputs,
+      input_output_nodes,
+      func_name,
+      target);
   Module::Builder builder(func_name + "_builder", target);
   for (auto& f : func) {
@@ -139,11 +118,7 @@ std::pair<ir::Module, std::string> GenReduceCode(
   backends::CodeGenCUDA_Dev codegen(target);
   std::string source_code;
-  if (!FLAGS_cinn_ir_schedule) {
-    source_code = codegen.Compile(builder.Build());
-  } else {
   source_code = codegen.Compile(device_module);
-  }
   // LOG(INFO) << "compiled code:\n" << device_module;
   return std::pair<ir::Module, std::string>(host_module, source_code);
@@ -385,18 +360,12 @@ void TestCaseForReduce(const float init_val,
       dev_x, buffer_x->memory, buffer_x->memory_size, cudaMemcpyHostToDevice));
   dim3 grid;
   dim3 block;
-  if (!FLAGS_cinn_ir_schedule) {
-    grid = {n * c, 1, 1};
-    block = {h * w, 1, 1};
-  } else {
   grid = {c, 1, 1};
   int block_dim_x = n * w * h > 1024 ? 1024 : n * w * h;
   block = {block_dim_x, 1, 1};
-  }
   void* args[] = {&dev_x, &dev_z};
-  std::string new_test_name = test_name;
-  if (FLAGS_cinn_ir_schedule) new_test_name = "fn_" + new_test_name + "_kernel";
+  std::string new_test_name = "fn_" + test_name + "_kernel";
   cuda_module.LaunchKernel(0, new_test_name, grid, block, args);
   CUDA_CALL(cudaMemcpy(
       buffer_z->memory, dev_z, buffer_z->memory_size, cudaMemcpyDeviceToHost));
@@ -458,8 +427,7 @@ TEST(Operator, Operator_Reduction_Case_7) {
   CUDA_CALL(cudaSetDevice(0));
   runtime::cuda::CUDAModule cuda_module(ptx,
                                         runtime::cuda::CUDAModule::Kind::PTX);
-  std::string new_func_name = func_name;
-  if (FLAGS_cinn_ir_schedule) new_func_name = "fn_" + func_name;
+  std::string new_func_name = "fn_" + func_name;
   void* reduce_sum_kernel =
       cuda_module.GetFunction(0, new_func_name + "_kernel");
   CHECK(reduce_sum_kernel);
......
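Two conventions in these reduce tests are now relied on unconditionally: lowered device kernels are named `fn_<name>_kernel`, and the launch configuration is one block per output channel with the per-block thread count capped at CUDA's 1024-thread limit. A minimal self-contained sketch of both, assuming those values from the diff above (a plain struct replaces `dim3` so the snippet builds without CUDA headers):

```cpp
#include <algorithm>
#include <cassert>
#include <string>

struct LaunchDim { unsigned x, y, z; };  // stand-in for CUDA's dim3

// Host wrapper is lowered as "fn_<name>"; the device kernel adds "_kernel".
std::string DeviceKernelName(const std::string& test_name) {
  return "fn_" + test_name + "_kernel";
}

// Mirrors the launch config kept in TestCaseForReduce: one block per channel,
// n*h*w threads per block, capped at 1024.
LaunchDim GridFor(int c) { return {static_cast<unsigned>(c), 1u, 1u}; }
LaunchDim BlockFor(int n, int h, int w) {
  int block_dim_x = std::min(n * h * w, 1024);
  return {static_cast<unsigned>(block_dim_x), 1u, 1u};
}

int main() {
  assert(DeviceKernelName("reduce_sum") == "fn_reduce_sum_kernel");
  assert(GridFor(8).x == 8u);
  assert(BlockFor(16, 32, 32).x == 1024u);  // 16*32*32 = 16384, capped at 1024
  return 0;
}
```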
@@ -28,8 +28,6 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 #include "paddle/cinn/utils/string.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace op {
......
@@ -40,8 +40,6 @@
 #include "paddle/cinn/runtime/cuda/cuda_module.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace hlir {
 namespace framework {
......
@@ -30,7 +30,6 @@
 #include "paddle/cinn/optim/tensor_write_tell.h"
 #include "paddle/cinn/runtime/intrinsic.h"
 #include "paddle/cinn/utils/string.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace ir {
@@ -58,8 +57,7 @@ LoweredFunc _LoweredFunc_::Make(const std::string& name,
   n->PrepareCreateTempBufferExprs();
   n->PrepareAllocTempBufferExprs();
   n->AllocTempBuffer();
-  bool with_expr_gen_tensor = true;
-  if (FLAGS_cinn_ir_schedule) with_expr_gen_tensor = false;
+  bool with_expr_gen_tensor = false;
   n->PrepareBufferCastExprs(with_expr_gen_tensor);
   n->PrepareArgumentExprs();
   n->PrepareDeallocTempBufferExprs();
......
@@ -37,8 +37,6 @@
 #include "paddle/cinn/optim/unroll_loops.h"
 #include "paddle/cinn/optim/vectorize_loops.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace optim {
@@ -60,7 +58,7 @@ Expr Optimize(Expr e,
   VectorizeLoops(&copied, target);
   VLOG(4) << "After Optimize VectorizeLoops:" << copied;
 #ifdef CINN_WITH_CUDA
-  if (FLAGS_cinn_ir_schedule && copied.as_lowered_func()) {
+  if (copied.as_lowered_func()) {
     ir::SetCudaAxisInfo(&copied);
   }
   if (remove_gpu_for_loops) {
@@ -93,10 +91,8 @@ Expr Optimize(Expr e,
 ir::Module Optimize(const ir::Module& module, const Target& target) {
   auto copied = IRCopy(Expr(module));
-  if (FLAGS_cinn_ir_schedule) {
   UnrollLoop(&copied);
   VectorizeLoops(&copied, Target());
-  }
   VLOG(10) << "After VectorizeLoops:" << copied.as_module_ref();
   RemoveScheduleBlock(&copied);
   VLOG(10) << "After RemoveScheduleBlock:" << copied.as_module_ref();
......
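The second hunk above makes the module-level `Optimize` run `UnrollLoop` and `VectorizeLoops` unconditionally before `RemoveScheduleBlock`, instead of gating the first two on the deleted flag. A tiny self-contained sketch of that "always-on, fixed-order pass pipeline" shape is below; the pass bodies are placeholders, not CINN's real passes, and `ModuleSketch` stands in for `ir::Module`.

```cpp
#include <iostream>
#include <string>
#include <vector>

// Placeholder IR container; the real code operates on ir::Module / Expr.
struct ModuleSketch {
  std::vector<std::string> applied;
};

void UnrollLoop(ModuleSketch* m) { m->applied.push_back("UnrollLoop"); }
void VectorizeLoops(ModuleSketch* m) { m->applied.push_back("VectorizeLoops"); }
void RemoveScheduleBlock(ModuleSketch* m) { m->applied.push_back("RemoveScheduleBlock"); }

// After this diff the first two passes are no longer wrapped in
// `if (FLAGS_cinn_ir_schedule) { ... }`; the pipeline runs unconditionally.
ModuleSketch OptimizeSketch(ModuleSketch m) {
  UnrollLoop(&m);
  VectorizeLoops(&m);
  RemoveScheduleBlock(&m);
  return m;
}

int main() {
  for (const std::string& pass : OptimizeSketch({}).applied) {
    std::cout << pass << "\n";
  }
  return 0;
}
```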
@@ -28,8 +28,6 @@
 #include "paddle/cinn/pybind/bind.h"
 #include "paddle/cinn/runtime/flags.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn::pybind {
 namespace py = pybind11;
@@ -64,7 +62,6 @@ void BindFramework(pybind11::module *m) {
           }
           ir::LoweredFunc func;
-          if (FLAGS_cinn_ir_schedule) {
           std::string output_name = "out";
           temp_inputs.emplace_back(output_name);
           std::vector<std::string> input_output_names;
@@ -82,23 +79,6 @@ void BindFramework(pybind11::module *m) {
               target);
           CHECK_EQ(funcs.size(), 1U);
           func = funcs[0];
-          } else {
-            common::CINNValuePack C =
-                impl->fcompute(common::CINNValuePack{temp_inputs});
-            poly::StageMap stages = C.back();
-            // make sure all the tensors in the stages before schedule
-            // launch.
-            for (int i = 0; i < C->size() - 1; i++) {
-              ir::Expr temp = C[i];
-              stages->InsertLazily(temp.as_tensor_ref());
-            }
-            C = impl->fschedule(C);
-            for (int i = 0; i < C->size() - 1; i++) {
-              ir::Expr temp = C[i];
-              res.push_back(temp.as_tensor_ref());
-            }
-            func = Lower(key, stages, res);
-          }
           return func;
         });
......
@@ -89,10 +89,6 @@ DEFINE_bool(cinn_use_cuda_vectorize,
             BoolFromEnv("FLAGS_cinn_use_cuda_vectorize", false),
             "Whether use cuda vectroize on schedule config");
-DEFINE_bool(cinn_ir_schedule,
-            BoolFromEnv("FLAGS_cinn_ir_schedule", true),
-            "Whether use reconstructed schedule primitives.");
 DEFINE_bool(use_reduce_split_pass,
             BoolFromEnv("FLAGS_use_reduce_split_pass", false),
             "Whether use reduce split pass.");
......
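The flag's definition is deleted here, which is what forces every `DECLARE_bool(cinn_ir_schedule)` and `FLAGS_cinn_ir_schedule` use elsewhere in this diff to go as well: with gflags-style macros, `DEFINE_bool` creates the `FLAGS_*` symbol in exactly one translation unit and `DECLARE_bool` only references it, so a leftover reference would fail to link. A minimal sketch of that pairing, assuming plain gflags (CINN's `BoolFromEnv` default helper is not reproduced, and the flag name is illustrative):

```cpp
// flag_owner.cc -- defines the flag; this translation unit owns the symbol.
#include <gflags/gflags.h>

DEFINE_bool(example_use_new_path, true,
            "Illustrative flag only; not a real CINN flag.");

// flag_user.cc -- any other file that wants the flag only declares it:
//   #include <gflags/gflags.h>
//   DECLARE_bool(example_use_new_path);
//   void UseIt() { if (FLAGS_example_use_new_path) { /* ... */ } }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, /*remove_flags=*/true);
  return FLAGS_example_use_new_path ? 0 : 1;
}
```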
@@ -24,8 +24,6 @@
 #include "paddle/cinn/runtime/flags.h"
 #include "paddle/cinn/utils/timer.h"
-DECLARE_bool(cinn_ir_schedule);
 namespace cinn {
 namespace tests {
 using ir::Tensor;
@@ -87,7 +85,6 @@ Module OpBenchmarkTester::CreateCinnModule(
   auto impl = hlir::framework::OpStrategy::SelectImpl(
       strategy[op](attrs, input_tensors, out_types, input_shapes_, target_));
-  if (FLAGS_cinn_ir_schedule) {
   std::string output_name = "out";
   std::vector<common::CINNValue> temp_inputs;
   std::vector<ir::Tensor> all_arg_tensors;
@@ -151,8 +148,8 @@ Module OpBenchmarkTester::CreateCinnModule(
     res.push_back(funcs[i]);
   }
   for (int i = 0; i < res.size(); i++) {
-    res[i] = optim::Optimize(Expr(funcs[i]), target_, false)
-                 .as_lowered_func_ref();
+    res[i] =
+        optim::Optimize(Expr(funcs[i]), target_, false).as_lowered_func_ref();
   }
   for (auto func : res) {
@@ -164,41 +161,13 @@ Module OpBenchmarkTester::CreateCinnModule(
       for (auto& shape_dim : arg.buffer_arg()->shape) {
         LOG(INFO) << shape_dim << ",";
         CHECK(shape_dim.is_constant());
-        output_shape.push_back(
-            static_cast<int>(shape_dim.get_constant()));
+        output_shape.push_back(static_cast<int>(shape_dim.get_constant()));
       }
       output_shapes_.push_back(output_shape);
       break;
     }
   }
 }
-  } else {
-    std::vector<common::CINNValue> temp_inputs;
-    for (auto& tensor : input_tensors) {
-      temp_inputs.push_back(common::CINNValue(tensor));
-    }
-    common::CINNValuePack C =
-        impl->fcompute(common::CINNValuePack(temp_inputs));
-    stages = C.back();
-    C = impl->fschedule(C);
-    for (int i = 0; i < C->size() - 1; i++) {
-      ir::Expr temp = C[i];
-      stages->InsertLazily(temp.as_tensor_ref());
-      std::vector<Expr> output_shape_expr =
-          temp.as_tensor_ref()->domain_without_reduce_axis();
-      std::vector<int> output_shape;
-      for (auto& shape : output_shape_expr) {
-        LOG(INFO) << shape;
-        output_shape.push_back(common::AutoSimplify(shape).as_int32());
-      }
-      output_shapes_.push_back(output_shape);
-      rets.push_back(temp.as_tensor_ref());
-    }
-    auto func = Lower(op_name_, stages, rets);
-    LOG(INFO) << "After Lower, func is: \n" << func;
-    builder.AddFunction(func);
-  }
 } else {
   stages = CreateStages(input_tensors);
   outs = CreateSpecificStrategy(input_tensors, &stages);
......