Unverified · Commit 67c69dca · Authored by 6clc · Committed by GitHub

Code merge | Merge CINN into Paddle (#54749)

* feat(cmake): add cmake of cinn

* feat(cmake): add cmake of cinn python test

* feat(cmake): add jit

* feat(cmake): test/CMakeList.txt

* feat(cmake): rebase to develop

* feat(cmake): remove some flags

* fix(cmake): fix cinn's gflags depends

* feat(cmake): add ci scripts of cinn

* feat(cmake): copy code of cinn

* fix(cmake): fix cinn third_party model path

* gflags dynamic dependence

* fix ci build_demo

* tmp update to c++17 of cinn-only test

* fix cinn only with c++17
Parent 72b8c7c2

Too many changes to display. To preserve performance, only 1000 of 1000+ files are shown.
@@ -63,6 +63,17 @@ option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF)
option(WITH_CUSPARSELT "Compile PaddlePaddle with CUSPARSELT" OFF)
option(WITH_SETUP_INSTALL "Compile PaddlePaddle with setup.py" OFF)
option(WITH_SHARED_PHI "Compile PaddlePaddle with SHARED LIB of PHI" OFF)
option(CINN_ONLY "Compile CINN only in Paddle" OFF)
option(CINN_WITH_CUDNN "Compile CINN with CUDNN support" ON)
find_package(Git REQUIRED)
# Configure GIT_URL with a GitHub mirror to speed up cloning dependent repos
option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL})
if(NOT GIT_URL)
set(GIT_URL "https://github.com")
endif()
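# Illustration only: a mirror can be passed at configure time, e.g.
#   cmake .. -DGIT_URL=https://gitee.com   (hypothetical mirror URL)
# so the ${GIT_URL}/<org>/<repo>.git clone URLs used by dependent repos
# resolve against the mirror instead of github.com.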
# Note(zhouwei): It uses the options above, so put it here
include(init)
include(generic) # simplify cmake module
@@ -229,13 +240,6 @@ else()
  )
endif()
find_package(Threads REQUIRED)
@@ -569,6 +573,37 @@ include(third_party
include(flags) # set paddle compile flags
#------------- cinn cmake config start --------------
set(WITH_MKL_CBLAS ${WITH_MKL})
set(WITH_CUDA ${WITH_GPU})
set(WITH_CUDNN ${WITH_GPU})
if(WITH_CINN)
message(STATUS "Compile Paddle with CINN.")
include(cmake/cinn.cmake)
add_definitions(-DPADDLE_WITH_CINN)
# TODO(6clc): Use CINN_WITH_CUDNN to completely replace WITH_CUDNN in CINN.
# Use WITH_GPU to completely replace WITH_CUDA in CINN.
if(WITH_GPU)
set(WITH_CUDA ${WITH_GPU})
add_definitions(-DCINN_WITH_CUDA)
set(WITH_CUDNN ${CINN_WITH_CUDNN})
if(WITH_CUDNN)
add_definitions(-DCINN_WITH_CUDNN)
endif()
endif()
if(CINN_ONLY)
if(WITH_PYTHON)
add_subdirectory(python)
endif()
add_subdirectory(test)
return()
endif()
endif()
#------------- cinn cmake config end --------------
if(WITH_PROFILER)
  find_package(Gperftools REQUIRED)
  include_directories(${GPERFTOOLS_INCLUDE_DIR})
......
set(CINN_THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party")
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(DOWNLOAD_MODEL_DIR "${CINN_THIRD_PARTY_PATH}/model")
string(REGEX MATCH "-std=(c\\+\\+[^ ]+)" STD_FLAG "${CMAKE_CXX_FLAGS}")
if (NOT STD_FLAG)
if (NOT CMAKE_CXX_STANDARD)
message(STATUS "STD_FLAG and CMAKE_CXX_STANDARD not found, using default flag: -std=c++17")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
set(CMAKE_CXX_STANDARD 17)
else()
message(STATUS "Got CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD}, append -std=c++${CMAKE_CXX_STANDARD} to CMAKE_CXX_FLAGS")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${CMAKE_CXX_STANDARD}")
endif()
else()
string(REGEX MATCH "([0-9]+)" STD_VALUE "${STD_FLAG}")
message(STATUS "Got STD_FLAG=${STD_FLAG}, set CMAKE_CXX_STANDARD=${STD_VALUE}")
set(CMAKE_CXX_STANDARD ${STD_VALUE})
endif()
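# Illustration of the detection above (hypothetical flags): with
# CMAKE_CXX_FLAGS="-O2 -std=c++14", the first REGEX MATCH yields
# STD_FLAG="-std=c++14", the second yields STD_VALUE="14", and
# CMAKE_CXX_STANDARD is set to 14.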
if(NOT DEFINED ENV{runtime_include_dir})
message(
STATUS
"set runtime_include_dir: ${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda")
set(ENV{runtime_include_dir} "${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda")
add_definitions(
-DRUNTIME_INCLUDE_DIR="${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda")
endif()
if(WITH_TESTING)
add_definitions(-DCINN_WITH_TEST)
endif()
if(WITH_DEBUG)
add_definitions(-DCINN_WITH_DEBUG)
endif()
# TODO(zhhsplendid): CINN produces lots of warnings during early development.
# Under Paddle they would be treated as errors, so we disable warnings for now
# and will clean up the code in the future.
add_definitions(-w)
include(cmake/cinn/version.cmake)
# include the customized configures
if(NOT EXISTS ${CMAKE_BINARY_DIR}/cmake/cinn/config.cmake)
file(COPY ${PROJECT_SOURCE_DIR}/cmake/cinn/config.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake/cinn)
endif()
include(${CMAKE_BINARY_DIR}/cmake/cinn/config.cmake)
if(WITH_MKL)
generate_dummy_static_lib(LIB_NAME "cinn_mklml" GENERATOR "mklml.cmake")
target_link_libraries(cinn_mklml ${MKLML_LIB} ${MKLML_IOMP_LIB})
add_definitions(-DCINN_WITH_MKL_CBLAS)
endif()
if(WITH_MKLDNN)
add_definitions(-DCINN_WITH_MKLDNN)
endif()
if(WITH_GPU)
message(STATUS "Enable CINN CUDA")
add_definitions(-DCINN_WITH_CUDA)
message(STATUS "Enable CINN CUDNN")
add_definitions(-DCINN_WITH_CUDNN)
enable_language(CUDA)
find_package(CUDA REQUIRED)
include_directories(${CUDA_INCLUDE_DIRS})
include_directories(${CMAKE_SOURCE_DIR}/paddle/cinn/runtime/cuda)
include_directories(/usr/lib/x86_64-linux-gnu)
set(CUDA_SEPARABLE_COMPILATION ON)
cuda_select_nvcc_arch_flags(ARCH_FLAGS Auto)
list(APPEND CUDA_NVCC_FLAGS ${ARCH_FLAGS})
set(CMAKE_CUDA_STANDARD ${CMAKE_CXX_STANDARD})
message(
STATUS
"copy paddle/cinn/common/float16.h paddle/cinn/common/bfloat16.h to $ENV{runtime_include_dir}"
)
file(COPY paddle/cinn/common/float16.h paddle/cinn/common/bfloat16.h
DESTINATION $ENV{runtime_include_dir})
find_library(CUDASTUB libcuda.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/
REQUIRED)
find_library(CUBLAS libcublas.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib /usr/lib64 REQUIRED)
find_library(CUDNN libcudnn.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib /usr/lib64 REQUIRED)
find_library(CURAND libcurand.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib /usr/lib64 REQUIRED)
find_library(CUSOLVER libcusolver.so HINTS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 /usr/lib /usr/lib64 REQUIRED)
endif()
set(cinnapi_src CACHE INTERNAL "" FORCE)
set(core_src CACHE INTERNAL "" FORCE)
set(core_includes CACHE INTERNAL "" FORCE)
set(core_proto_includes CACHE INTERNAL "" FORCE)
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${CMAKE_BINARY_DIR})
include(cmake/generic.cmake)
include(cmake/cinn/system.cmake)
include(cmake/cinn/core.cmake)
include(cmake/cinn/external/absl.cmake)
include(cmake/cinn/nvrtc.cmake)
include(cmake/cinn/nvtx.cmake)
include(cmake/cinn/external/llvm.cmake)
include(cmake/cinn/external/isl.cmake)
include(cmake/cinn/external/ginac.cmake)
include(cmake/cinn/external/openmp.cmake)
include(cmake/cinn/external/jitify.cmake)
if(CINN_ONLY)
LINK_LIBRARIES(gflags)
endif()
set(LINK_FLAGS
"-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/cmake/cinn/export.map"
CACHE INTERNAL "")
set(global_test_args
"--cinn_x86_builtin_code_root=${CMAKE_SOURCE_DIR}/paddle/cinn/backends")
set(Python_VIRTUALENV FIRST)
if(NOT PYTHON_EXECUTABLE)
find_package(PythonInterp ${PY_VERSION} REQUIRED)
endif()
if(NOT PYTHON_LIBRARIES)
find_package(PythonLibs ${PY_VERSION} REQUIRED)
endif()
message(STATUS "PYTHON_LIBRARIES: ${PYTHON_LIBRARIES}")
message(STATUS "PYTHON_INCLUDE_DIR: ${PYTHON_INCLUDE_DIR}")
include_directories(${PYTHON_INCLUDE_DIR})
set(core_deps CACHE INTERNAL "" FORCE)
set(hlir_src CACHE INTERNAL "" FORCE)
# TODO(chenweihang): The logic below depends on adding the cinn subdirectory
# here, but it would be better to move this to paddle/CMakeLists.txt
add_subdirectory(paddle/cinn)
set(core_src "${cinnapi_src}")
cinn_cc_library(
cinnapi
SHARED
SRCS
${cinnapi_src}
DEPS
glog
${llvm_libs}
cinn_framework_proto
param_proto
auto_schedule_proto
schedule_desc_proto
absl
isl
ginac
pybind
${jitify_deps})
add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
if(WITH_MKL)
target_link_libraries(cinnapi cinn_mklml)
add_dependencies(cinnapi cinn_mklml)
if(WITH_MKLDNN)
target_link_libraries(cinnapi mkldnn)
add_dependencies(cinnapi mkldnn)
endif()
endif()
if(WITH_GPU)
target_link_libraries(
cinnapi
${CUDA_NVRTC_LIB}
${CUDA_LIBRARIES}
${CUDASTUB}
${CUBLAS}
${CUDNN}
${CURAND}
${CUSOLVER})
if(NVTX_FOUND)
target_link_libraries(cinnapi ${CUDA_NVTX_LIB})
endif()
endif()
function(gen_cinncore LINKTYPE)
set(CINNCORE_TARGET cinncore)
if(${LINKTYPE} STREQUAL "STATIC")
set(CINNCORE_TARGET cinncore_static)
endif()
cinn_cc_library(
${CINNCORE_TARGET}
${LINKTYPE}
SRCS
${core_src}
DEPS
glog
${llvm_libs}
cinn_framework_proto
param_proto
auto_schedule_proto
schedule_desc_proto
absl
isl
ginac)
add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
add_dependencies(${CINNCORE_TARGET} pybind)
target_link_libraries(${CINNCORE_TARGET} ${PYTHON_LIBRARIES})
if(WITH_MKL)
target_link_libraries(${CINNCORE_TARGET} cinn_mklml)
add_dependencies(${CINNCORE_TARGET} cinn_mklml)
if(WITH_MKLDNN)
target_link_libraries(${CINNCORE_TARGET} mkldnn)
add_dependencies(${CINNCORE_TARGET} mkldnn)
endif()
endif()
if(WITH_GPU)
target_link_libraries(
${CINNCORE_TARGET}
${CUDA_NVRTC_LIB}
${CUDA_LIBRARIES}
${CUDASTUB}
${CUBLAS}
${CUDNN}
${CURAND}
${CUSOLVER}
${jitify_deps})
if(NVTX_FOUND)
target_link_libraries(${CINNCORE_TARGET} ${CUDA_NVTX_LIB})
endif()
endif()
endfunction()
gen_cinncore(STATIC)
gen_cinncore(SHARED)
# --------distribute cinncore lib and include begin--------
set(PUBLISH_LIBS ON)
if(PUBLISH_LIBS)
set(core_includes
"${core_includes};paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh")
foreach(header ${core_includes})
get_filename_component(prefix ${header} DIRECTORY)
file(COPY ${header}
DESTINATION ${CMAKE_BINARY_DIR}/dist/cinn/include/${prefix})
endforeach()
foreach(proto_header ${core_proto_includes})
string(REPLACE ${CMAKE_BINARY_DIR}/ "" relname ${proto_header})
get_filename_component(prefix ${relname} DIRECTORY)
set(target_name ${CMAKE_BINARY_DIR}/dist/cinn/include/${relname})
add_custom_command(
TARGET cinnapi
POST_BUILD
COMMENT "copy generated proto header '${relname}' to dist"
COMMAND cmake -E copy ${proto_header} ${target_name} DEPENDS cinnapi)
endforeach()
add_custom_command(
TARGET cinnapi
POST_BUILD
COMMAND cmake -E copy ${CMAKE_BINARY_DIR}/libcinnapi.so
${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinnapi.so
COMMAND cmake -E copy_directory ${CINN_THIRD_PARTY_PATH}/install
${CMAKE_BINARY_DIR}/dist/third_party DEPENDS cinnapi)
add_custom_command(
TARGET cinncore_static
POST_BUILD
COMMAND cmake -E copy ${PROJECT_SOURCE_DIR}/tools/cinn/tutorials_demo/demo.cc
${CMAKE_BINARY_DIR}/dist/demo.cc
COMMAND cmake -E copy ${PROJECT_SOURCE_DIR}/tools/cinn/tutorials_demo/build_demo.sh
${CMAKE_BINARY_DIR}/dist/build_demo.sh
COMMAND cmake -E copy ${CMAKE_BINARY_DIR}/libcinncore_static.a
${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinncore_static.a
COMMAND
cmake -E copy
${CMAKE_BINARY_DIR}/paddle/cinn/frontend/paddle/libcinn_framework_proto.a
${CMAKE_BINARY_DIR}/dist/cinn/lib/libcinn_framework_proto.a
COMMAND
cmake -E copy ${CMAKE_BINARY_DIR}/paddle/cinn/hlir/pe/libparam_proto.a
${CMAKE_BINARY_DIR}/dist/cinn/lib/libparam_proto.a
COMMAND
cmake -E copy
${CMAKE_BINARY_DIR}/paddle/cinn/auto_schedule/libauto_schedule_proto.a
${CMAKE_BINARY_DIR}/dist/cinn/lib/libauto_schedule_proto.a
COMMAND
cmake -E copy ${CMAKE_BINARY_DIR}/paddle/cinn/ir/libschedule_desc_proto.a
${CMAKE_BINARY_DIR}/dist/cinn/lib/libschedule_desc_proto.a
COMMENT "distribute libcinncore_static.a and related header files." DEPENDS
cinncore_static)
endif()
# --------distribute cinncore lib and include end--------
set(CINN_LIB_NAME "libcinnapi.so")
set(CINN_LIB_LOCATION "${CMAKE_BINARY_DIR}/dist/cinn/lib")
set(CINN_LIB "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
######################################
# Add CINN's dependencies header files
######################################
# Add absl
set(ABSL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/absl/include")
include_directories(${ABSL_INCLUDE_DIR})
# Add isl
set(ISL_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/isl/include")
include_directories(${ISL_INCLUDE_DIR})
# Add LLVM
set(LLVM_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/third_party/llvm/include")
include_directories(${LLVM_INCLUDE_DIR})
######################################################
# Put external_cinn and dependencies together as a lib
######################################################
set(CINN_INCLUDE_DIR "${CMAKE_BINARY_DIR}/dist/cinn/include")
include_directories(${CINN_INCLUDE_DIR})
# The home path of ISL
# Required!
set(ISL_HOME "")
set(USE_OPENMP "intel")
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -fPIC -mavx -mfma -Wno-write-strings -Wno-psabi")
set(PADDLE_RESOURCE_URL
"http://paddle-inference-dist.bj.bcebos.com"
CACHE STRING "inference download url")
function(cinn_cc_library TARGET_NAME)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cinn_cc_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
if(cinn_cc_library_SRCS)
if(cinn_cc_library_SHARED OR cinn_cc_library_shared) # build *.so
add_library(${TARGET_NAME} SHARED ${cinn_cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cinn_cc_library_SRCS})
endif()
if(cinn_cc_library_DEPS)
# No need to link libwarpctc.so
target_link_libraries(${TARGET_NAME} ${cinn_cc_library_DEPS})
add_dependencies(${TARGET_NAME} ${cinn_cc_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${cinn_cc_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cinn_cc_library_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(cinn_cc_library_SRCS)
if(cinn_cc_library_DEPS)
cinn_merge_static_libs(${TARGET_NAME} ${cinn_cc_library_DEPS})
else()
message(
FATAL_ERROR
"Please specify source files or libraries in cinn_cc_library(${TARGET_NAME} ...)."
)
endif()
endif(cinn_cc_library_SRCS)
if((NOT ("${TARGET_NAME}" STREQUAL "cinn_gtest_main"))
AND (NOT ("${TARGET_NAME}" STREQUAL "utils"))
AND (NOT ("${TARGET_NAME}" STREQUAL "lib")))
target_link_libraries(${TARGET_NAME} Threads::Threads)
endif(
(NOT ("${TARGET_NAME}" STREQUAL "cinn_gtest_main"))
AND (NOT ("${TARGET_NAME}" STREQUAL "utils"))
AND (NOT ("${TARGET_NAME}" STREQUAL "lib")))
endfunction(cinn_cc_library)
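# Example usage of cinn_cc_library (hypothetical target/source names):
#   cinn_cc_library(my_pass SRCS my_pass.cc DEPS glog)           # static lib
#   cinn_cc_library(my_pass_so SHARED SRCS my_pass.cc DEPS glog) # shared lib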
list(APPEND CMAKE_CTEST_ARGUMENTS)
function(remove_gflags TARGET_NAME)
get_target_property(TARGET_LIBRARIES ${TARGET_NAME} LINK_LIBRARIES)
list(REMOVE_ITEM TARGET_LIBRARIES glog)
list(REMOVE_ITEM TARGET_LIBRARIES gflags)
set_property(TARGET ${TARGET_NAME} PROPERTY LINK_LIBRARIES
${TARGET_LIBRARIES})
endfunction()
function(cinn_cc_test TARGET_NAME)
if(WITH_TESTING AND CINN_ONLY)
set(options SERIAL)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cinn_cc_test "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
add_executable(${TARGET_NAME} ${cinn_cc_test_SRCS})
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(${TARGET_NAME} ${os_dependency_modules}
cinn_gtest_main gtest glog ${cinn_cc_test_DEPS})
add_dependencies(${TARGET_NAME} cinn_gtest_main gtest glog
${cinn_cc_test_DEPS})
add_test(
NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} "${cinn_cc_test_ARGS}"
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if(${cinn_cc_test_SERIAL})
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
# No unit test should exceed 6000 seconds (100 minutes).
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 6000)
remove_gflags(${TARGET_NAME})
endif()
endfunction()
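# Example usage of cinn_cc_test (hypothetical test name and path; the test is
# only registered when WITH_TESTING and CINN_ONLY are both ON):
#   cinn_cc_test(test_my_pass SRCS my_pass_test.cc DEPS cinncore
#                ARGS --cinn_x86_builtin_code_root=/path/to/backends)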
function(cinn_nv_library TARGET_NAME)
if(WITH_GPU)
set(options STATIC static SHARED shared)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cinn_nv_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
if(cinn_nv_library_SRCS)
if(cinn_nv_library_SHARED OR cinn_nv_library_shared) # build *.so
cuda_add_library(${TARGET_NAME} SHARED ${cinn_nv_library_SRCS})
else()
cuda_add_library(${TARGET_NAME} STATIC ${cinn_nv_library_SRCS})
endif()
if(cinn_nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${cinn_nv_library_DEPS})
target_link_libraries(${TARGET_NAME} ${cinn_nv_library_DEPS})
endif()
# cpplint code style
foreach(source_file ${cinn_nv_library_SRCS})
string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
list(APPEND cinn_nv_library_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
endif()
endforeach()
else(cinn_nv_library_SRCS)
if(cinn_nv_library_DEPS)
cinn_merge_static_libs(${TARGET_NAME} ${cinn_nv_library_DEPS})
else()
message(FATAL_ERROR
        "Please specify source file or library in cinn_nv_library.")
endif()
endif(cinn_nv_library_SRCS)
target_link_libraries(${TARGET_NAME} Threads::Threads)
endif()
endfunction(cinn_nv_library)
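# Example usage of cinn_nv_library (hypothetical names; sources are compiled
# with nvcc, and the call is a no-op unless WITH_GPU is ON):
#   cinn_nv_library(my_kernels SRCS my_kernels.cu DEPS my_host_lib)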
function(cinn_nv_binary TARGET_NAME)
if(WITH_GPU)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cinn_nv_binary "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${cinn_nv_binary_SRCS})
if(cinn_nv_binary_DEPS)
target_link_libraries(${TARGET_NAME} ${cinn_nv_binary_DEPS})
add_dependencies(${TARGET_NAME} ${cinn_nv_binary_DEPS})
common_link(${TARGET_NAME})
endif()
endif()
endfunction(cinn_nv_binary)
function(cinn_nv_test TARGET_NAME)
if(WITH_GPU AND WITH_TESTING AND CINN_ONLY)
set(options SERIAL)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS ARGS)
cmake_parse_arguments(cinn_nv_test "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
cuda_add_executable(${TARGET_NAME} ${cinn_nv_test_SRCS} OPTIONS "-std=c++${CMAKE_CUDA_STANDARD}")
get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(
${TARGET_NAME}
${cinn_nv_test_DEPS}
cinn_gtest_main
gtest
${os_dependency_modules}
${CUDNN_LIBRARY}
${CUBLAS_LIBRARIES}
${CUDA_LIBRARIES})
add_dependencies(${TARGET_NAME} ${cinn_nv_test_DEPS} cinn_gtest_main gtest)
common_link(${TARGET_NAME})
# add_test(${TARGET_NAME} ${TARGET_NAME})
add_test(
NAME ${TARGET_NAME}
COMMAND ${TARGET_NAME} "${cinn_nv_test_ARGS}"
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
if(cinn_nv_test_SERIAL)
set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
endif()
target_link_libraries(
${TARGET_NAME} Threads::Threads ${CUDA_NVRTC_LIB} ${CUDA_LIBRARIES}
${CUDA_cudart_static_LIBRARY}
${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs/libcuda.so)
if(NVTX_FOUND)
target_link_libraries(${TARGET_NAME} ${CUDA_NVTX_LIB})
endif()
remove_gflags(${TARGET_NAME})
endif()
endfunction(cinn_nv_test)
# Add a dependency so that TARGET depends on the test result of DEP; this function executes DEP during make.
function(add_run_test_dependency TARGET_NAME DEP_NAME)
if(WITH_TESTING AND CINN_ONLY)
set(custom_target_name ${TARGET_NAME}_TEST_OUTPUT_DEPENDENCY_ON_${DEP_NAME})
add_custom_target(
${custom_target_name}
COMMAND
cd ${CMAKE_CURRENT_BINARY_DIR} && ./${DEP_NAME}
--cinn_x86_builtin_code_root=${CMAKE_SOURCE_DIR}/paddle/cinn/backends
COMMAND cd ${CMAKE_BINARY_DIR}
DEPENDS ${DEP_NAME})
add_dependencies(${TARGET_NAME} ${DEP_NAME} ${custom_target_name})
endif(WITH_TESTING AND CINN_ONLY)
endfunction(add_run_test_dependency)
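# Example usage (hypothetical targets): make test_b depend on test_a having
# run successfully, e.g. when test_a produces data that test_b consumes:
#   add_run_test_dependency(test_b test_a)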
# Find all third_party modules used by the paddle static library,
# to reduce the dependencies when building the inference libs.
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
function(find_fluid_thirdparties TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path
${__target_path})
string(FIND "${__target_path}" "third_party" pos)
if(pos GREATER 1)
    get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THIRD_PARTY)
set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}")
endif()
endfunction(find_fluid_thirdparties)
function(cinn_merge_static_libs TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
# Get all propagated dependencies from the merged libraries
foreach(lib ${libs})
list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
endforeach()
if(libs_deps)
list(REMOVE_DUPLICATES libs_deps)
endif()
# To produce a library we need at least one source file.
# It is created by the add_custom_command below and also helps
# to track dependencies.
set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
if(APPLE) # Use OSX's libtool to merge archives
# Make the generated dummy source file depend on all static input
# libs. If an input lib changes, the source file is touched,
# which causes the desired effect (relink).
add_custom_command(
OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs})
# Generate dummy static lib
file(WRITE ${target_SRCS}
"const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
foreach(lib ${libs})
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
add_custom_command(
TARGET ${TARGET_NAME}
POST_BUILD
COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
COMMAND /usr/bin/libtool -static -o
"${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
endif(APPLE)
if(LINUX) # general UNIX: use "ar" to extract objects and re-add to a common lib
set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
foreach(lib ${libs})
set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
set(objdir ${target_DIR}/${lib}.objdir)
add_custom_command(
OUTPUT ${objdir}
COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
DEPENDS ${lib})
add_custom_command(
OUTPUT ${objlistfile}
COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
DEPENDS ${lib} ${objdir}
WORKING_DIRECTORY ${objdir})
list(APPEND target_OBJS "${objlistfile}")
endforeach()
# Make the generated dummy source file depend on all static input
# libs. If an input lib changes, the source file is touched,
# which causes the desired effect (relink).
add_custom_command(
OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs} ${target_OBJS})
# Generate dummy static lib
file(WRITE ${target_SRCS}
"const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
# Get the file name of the generated library
set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
add_custom_command(
TARGET ${TARGET_NAME}
POST_BUILD
COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
WORKING_DIRECTORY ${target_DIR})
endif(LINUX)
if(WIN32)
# Windows does not support gcc/nvcc combined compiling. Use msvc lib.exe to merge libs.
# Make the generated dummy source file depend on all static input
# libs. If an input lib changes, the source file is touched,
# which causes the desired effect (relink).
add_custom_command(
OUTPUT ${target_SRCS}
COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
DEPENDS ${libs})
# Generate dummy static lib
file(WRITE ${target_SRCS}
"const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
foreach(lib ${libs})
# Get the file names of the libraries to be merged
set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
endforeach()
# MSVC puts the library in a "/Release/xxxlib" directory by default
# COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib"
add_custom_command(
TARGET ${TARGET_NAME}
POST_BUILD
COMMAND cmake -E make_directory
"${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}"
COMMAND
lib
/OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib
${libfiles})
endif(WIN32)
endfunction(cinn_merge_static_libs)
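# Example usage of cinn_merge_static_libs (hypothetical libs): merge the
# object files of two static libs into a single archive libmy_bundle.a:
#   cinn_merge_static_libs(my_bundle my_lib_a my_lib_b)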
# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support
# Usage:
# paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
function(paddle_protobuf_generate_cpp SRCS HDRS)
if(NOT ARGN)
message(
SEND_ERROR
"Error: paddle_protobuf_generate_cpp() called without any proto files")
return()
endif()
set(${SRCS})
set(${HDRS})
foreach(FIL ${ARGN})
get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
get_filename_component(FIL_WE ${FIL} NAME_WE)
set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
list(APPEND ${SRCS} "${_protobuf_protoc_src}")
list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
add_custom_command(
OUTPUT "${_protobuf_protoc_src}" "${_protobuf_protoc_hdr}"
COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_SOURCE_DIR} --cpp_out
"${CMAKE_BINARY_DIR}" ${ABS_FIL}
DEPENDS ${ABS_FIL} protoc
COMMENT "Running C++ protocol buffer compiler on ${FIL}"
VERBATIM)
endforeach()
set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
set(${SRCS}
${${SRCS}}
PARENT_SCOPE)
set(${HDRS}
${${HDRS}}
PARENT_SCOPE)
endfunction()
function(cinn_proto_library TARGET_NAME)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS)
cmake_parse_arguments(cinn_proto_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
set(proto_srcs)
set(proto_hdrs)
paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${cinn_proto_library_SRCS})
cinn_cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS
${cinn_proto_library_DEPS} protobuf)
set("${TARGET_NAME}_HDRS"
${proto_hdrs}
PARENT_SCOPE)
set("${TARGET_NAME}_SRCS"
${proto_srcs}
PARENT_SCOPE)
endfunction()
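# Example usage of cinn_proto_library (hypothetical my.proto): compiles the
# generated .pb.cc into a library and exposes the generated files:
#   cinn_proto_library(my_proto SRCS my.proto)
#   # ${my_proto_HDRS} / ${my_proto_SRCS} now hold the generated files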
function(common_link TARGET_NAME)
if(WITH_PROFILER)
target_link_libraries(${TARGET_NAME} gperftools::profiler)
endif()
if(WITH_JEMALLOC)
target_link_libraries(${TARGET_NAME} jemalloc::jemalloc)
endif()
endfunction()
# This method is borrowed from Paddle-Lite.
function(download_and_uncompress INSTALL_DIR URL FILENAME)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
set(EXTERNAL_PROJECT_NAME "extern_lite_download_${FILENAME_EX}")
set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND
wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME}
${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf
${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND "")
endfunction()
function(gather_srcs SRC_GROUP)
set(options)
set(oneValueArgs)
set(multiValueArgs "SRCS")
cmake_parse_arguments(prefix "" "" "${multiValueArgs}" ${ARGN})
foreach(cpp ${prefix_SRCS})
set(${SRC_GROUP}
"${${SRC_GROUP}};${CMAKE_CURRENT_SOURCE_DIR}/${cpp}"
CACHE INTERNAL "")
endforeach()
endfunction()
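# Example usage of gather_srcs (hypothetical sources): appends the absolute
# paths of foo.cc and bar.cc to the cinnapi_src cache variable:
#   gather_srcs(cinnapi_src SRCS foo.cc bar.cc)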
function(core_gather_headers)
file(
GLOB includes
LIST_DIRECTORIES false
RELATIVE ${CMAKE_SOURCE_DIR}
*.h)
foreach(header ${includes})
set(core_includes
"${core_includes};${header}"
CACHE INTERNAL "")
endforeach()
endfunction()
{
global:
RegisterKernels;
local:
*;
};
include(ExternalProject)
set(ABSL_SOURCES_DIR ${CINN_THIRD_PARTY_PATH}/absl)
set(ABSL_INSTALL_DIR ${CINN_THIRD_PARTY_PATH}/install/absl)
set(ABSL_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(ABSL_REPOSITORY "https://github.com/abseil/abseil-cpp.git")
set(ABSL_TAG "20210324.2")
set(OPTIONAL_ARGS
"-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}"
"-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}"
"-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
"-DABSL_CXX_STANDARD=17"
"-DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}"
"-DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}"
"-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
"-DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}"
"-DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}")
ExternalProject_Add(
external_absl
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags
GIT_REPOSITORY ${ABSL_REPOSITORY}
GIT_TAG ${ABSL_TAG}
PREFIX ${ABSL_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS ${OPTIONAL_ARGS}
-DCMAKE_INSTALL_PREFIX=${ABSL_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${ABSL_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DWITH_GFLAGS=ON
-Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-DBUILD_TESTING=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${ABSL_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${ABSL_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_base.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_hash.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_wyhash.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_city.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_strings.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_throw_delegate.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_any_cast_impl.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_optional_access.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_bad_variant_access.a
BUILD_BYPRODUCTS ${ABSL_INSTALL_DIR}/lib/libabsl_raw_hash_set.a)
# It may be more convenient if we just include all absl libs
set(ABSL_LIB_NAMES
hash
wyhash
city
strings
throw_delegate
bad_any_cast_impl
bad_optional_access
bad_variant_access
raw_hash_set)
set(ABSL_LIBS "")
add_library(absl STATIC IMPORTED GLOBAL)
set_property(TARGET absl PROPERTY IMPORTED_LOCATION
${ABSL_INSTALL_DIR}/lib/libabsl_base.a)
if(NOT USE_PREBUILD_EXTERNAL)
add_dependencies(absl external_absl)
endif()
foreach(lib_name ${ABSL_LIB_NAMES})
target_link_libraries(absl
INTERFACE ${ABSL_INSTALL_DIR}/lib/libabsl_${lib_name}.a)
endforeach()
include_directories(${ABSL_INSTALL_DIR}/include)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include(ExternalProject)
set(BOOST_PROJECT "extern_boost")
# To release PaddlePaddle as a pip package, we have to follow the
# manylinux1 standard, which features Linux kernels and compilers
# as old as possible and recommends CentOS 5. Indeed, the earliest
# CentOS version that works with NVIDIA CUDA is CentOS 6. And a new
# version of boost, say, 1.66.0, doesn't build on CentOS 6. We
# checked that the devtools package of CentOS 6 installs boost 1.41.0.
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR
"boost_1_41_0"
CACHE STRING "" FORCE)
set(BOOST_URL
"http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz"
CACHE STRING "" FORCE)
message(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
set(BOOST_SOURCES_DIR ${CINN_THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR
"${BOOST_DOWNLOAD_DIR}"
CACHE PATH "boost include directory." FORCE)
set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
include_directories(${BOOST_INCLUDE_DIR})
ExternalProject_Add(
${BOOST_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DOWNLOAD_DIR ${BOOST_DOWNLOAD_DIR}
URL ${BOOST_URL}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${BOOST_SOURCES_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
UPDATE_COMMAND "")
if(${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32)
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
add_library(boost STATIC ${dummyfile})
else()
add_library(boost INTERFACE)
endif()
add_dependencies(boost ${BOOST_PROJECT})
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
include(ExternalProject)
# gmp-6.2.1 https://gmplib.org/download/gmp/gmp-6.2.1.tar.xz
# cln-1.3.6 https://www.ginac.de/CLN/cln-1.3.6.tar.bz2
# ginac-1.8.1 https://www.ginac.de/ginac-1.8.1.tar.bz2
# all build with CFLAGS="-fPIC -DPIC" CXXFLAGS="-fPIC -DPIC" --enable-static=yes
set(GINAC_DOWNLOAD_URL
https://paddle-inference-dist.bj.bcebos.com/CINN/ginac-1.8.1_cln-1.3.6_gmp-6.2.1.tar.gz
)
set(GINAC_MD5 ebc3e4b7770dd604777ac3f01bfc8b06)
ExternalProject_Add(
external_ginac
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${GINAC_DOWNLOAD_URL}
URL_MD5 ${GINAC_MD5}
PREFIX ${CINN_THIRD_PARTY_PATH}/ginac
SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/ginac
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libginac.a
BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libcln.a
BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a)
add_library(ginac STATIC IMPORTED GLOBAL)
add_dependencies(ginac external_ginac)
set_property(
TARGET ginac PROPERTY IMPORTED_LOCATION
${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libginac.a)
target_link_libraries(
ginac INTERFACE ${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libcln.a
${CINN_THIRD_PARTY_PATH}/install/ginac/lib/libgmp.a)
include_directories(${CINN_THIRD_PARTY_PATH}/install/ginac/include)
include(ExternalProject)
# isl https://github.com/inducer/ISL
# commit-id 6a1760fe46967cda2a06387793a6b7d4a0876581
# depends on llvm f9dc2b7079350d0fed3bb3775f496b90483c9e42
# depends on gmp-6.2.1
# static build
# CPPFLAGS="-fPIC -DPIC" ./configure --with-gmp-prefix=<gmp-install-path> --with-clang-prefix=<llvm-install-path> --enable-shared=no --enable-static=yes
set(ISL_DOWNLOAD_URL
https://paddle-inference-dist.bj.bcebos.com/CINN/isl-6a1760fe.tar.gz)
set(ISL_MD5 fff10083fb79d394b8a7b7b2089f6183)
ExternalProject_Add(
external_isl
${EXTERNAL_PROJECT_LOG_ARGS}
URL ${ISL_DOWNLOAD_URL}
URL_MD5 ${ISL_MD5}
PREFIX ${CINN_THIRD_PARTY_PATH}/isl
SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/isl
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
BUILD_BYPRODUCTS ${CINN_THIRD_PARTY_PATH}/install/isl/lib/libisl.a)
add_library(isl STATIC IMPORTED GLOBAL)
set_property(
TARGET isl PROPERTY IMPORTED_LOCATION
${CINN_THIRD_PARTY_PATH}/install/isl/lib/libisl.a)
add_dependencies(isl external_isl)
include_directories(${CINN_THIRD_PARTY_PATH}/install/isl/include)
if(NOT WITH_GPU)
set(JITIFY_FOUND OFF)
return()
endif()
include(ExternalProject)
set(JITIFY_SOURCE_PATH ${CINN_THIRD_PARTY_PATH}/install/jitify)
ExternalProject_Add(
external_jitify
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/NVIDIA/jitify.git"
GIT_TAG 57de649139c866eb83acacfe50c92ad7c6278776
PREFIX ${CINN_THIRD_PARTY_PATH}/jitify
SOURCE_DIR ${JITIFY_SOURCE_PATH}
CONFIGURE_COMMAND ""
PATCH_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND "")
include_directories(${JITIFY_SOURCE_PATH})
add_library(extern_jitify INTERFACE)
add_dependencies(extern_jitify external_jitify)
set(jitify_deps extern_jitify)
include(FetchContent)
# set(LLVM_DOWNLOAD_URL https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11.tar.gz)
# set(LLVM_MD5 39d32b6be466781dddf5869318dcba53)
set(LLVM_DOWNLOAD_URL
https://paddle-inference-dist.bj.bcebos.com/CINN/llvm11-glibc2.17.tar.gz)
set(LLVM_MD5 33c7d3cc6d370585381e8d90bd7c2198)
set(FETCHCONTENT_BASE_DIR ${CINN_THIRD_PARTY_PATH}/llvm)
set(FETCHCONTENT_QUIET OFF)
FetchContent_Declare(
external_llvm
URL ${LLVM_DOWNLOAD_URL}
URL_MD5 ${LLVM_MD5}
  PREFIX ${CINN_THIRD_PARTY_PATH}/llvm
  SOURCE_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm)
if(NOT LLVM_PATH)
FetchContent_GetProperties(external_llvm)
if(NOT external_llvm_POPULATED)
FetchContent_Populate(external_llvm)
endif()
set(LLVM_PATH ${CINN_THIRD_PARTY_PATH}/install/llvm)
set(LLVM_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm/lib/cmake/llvm)
set(MLIR_DIR ${CINN_THIRD_PARTY_PATH}/install/llvm/lib/cmake/mlir)
else()
set(LLVM_DIR ${LLVM_PATH}/lib/cmake/llvm)
set(MLIR_DIR ${LLVM_PATH}/lib/cmake/mlir)
endif()
if(${CMAKE_CXX_COMPILER} STREQUAL "clang++")
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
endif()
message(STATUS "set LLVM_DIR: ${LLVM_DIR}")
message(STATUS "set MLIR_DIR: ${MLIR_DIR}")
find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR})
find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR})
find_package(ZLIB REQUIRED)
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
include(AddLLVM)
include_directories(${LLVM_INCLUDE_DIRS})
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
include(AddLLVM)
include(TableGen)
include(AddMLIR)
message(STATUS "Found MLIR: ${MLIR_DIR}")
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
# To build with MLIR, LLVM is built from source with the following flags:
#[==[
cmake -G Ninja ../llvm \
-DLLVM_ENABLE_PROJECTS="mlir;clang" \
-DLLVM_BUILD_EXAMPLES=OFF \
-DLLVM_TARGETS_TO_BUILD="X86" \
-DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_ZLIB=OFF \
-DLLVM_ENABLE_RTTI=ON \
-DLLVM_ENABLE_TERMINFO=OFF \
-DCMAKE_INSTALL_PREFIX=./install
#]==]
# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit)
# Update: to build llvm in manylinux docker with glibc-2.17, and use it in manylinux and ubuntu docker,
# the patch https://gist.github.com/zhiqiu/6e8d969176dce13d98fd15338a16265e is needed.
add_definitions(${LLVM_DEFINITIONS})
llvm_map_components_to_libnames(
llvm_libs
Support
Core
irreader
X86
executionengine
orcjit
mcjit
all
codegen)
message(STATUS "LLVM libs: ${llvm_libs}")
get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS)
message(STATUS "MLIR libs: ${mlir_libs}")
add_definitions(${LLVM_DEFINITIONS})
# The minimum needed libraries for MLIR IR parse and transform.
set(MLIR_IR_LIBS
MLIRAnalysis
MLIRStandardOps
MLIRPass
MLIRParser
MLIRDialect
MLIRIR
MLIROptLib)
# td_base is the name of a xxx.td file (without the .td suffix)
function(mlir_tablegen_on td_base)
set(options)
set(oneValueArgs DIALECT)
cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
mlir_tablegen(${td_base}.hpp.inc -gen-op-decls)
mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
if(mlir_tablegen_on_DIALECT)
mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls
-dialect=${mlir_tablegen_on_DIALECT})
endif()
add_public_tablegen_target(${td_base}_IncGen)
add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
endfunction()
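# Example usage (hypothetical my_ops.td): generates my_ops.hpp.inc and
# my_ops.cpp.inc, plus dialect declarations for the given dialect:
#   mlir_tablegen_on(my_ops DIALECT my_dialect)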
function(mlir_add_rewriter td_base)
set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
mlir_tablegen(${td_base}.hpp.inc -gen-rewriters
"-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass")
add_public_tablegen_target(${td_base}_IncGen)
add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
endfunction()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(USE_OPENMP STREQUAL "gnu")
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
add_definitions(-DCINN_USE_OPENMP)
set(WITH_OPENMP ON)
message(STATUS "Build with OpenMP ${OpenMP_CXX_LIBRARIES}")
message(STATUS "CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS})
else()
set(WITH_OPENMP OFF)
endif()
elseif(USE_OPENMP STREQUAL "intel")
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
message(STATUS "CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS})
add_definitions(-DCINN_USE_OPENMP)
set(WITH_OPENMP ON)
message(STATUS "Build with OpenMP " ${MKLML_IOMP_LIB})
else()
set(WITH_OPENMP OFF)
endif()
endif()
if(${CMAKE_CXX_COMPILER} STREQUAL "clang++")
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++ -lc++abi")
endif()
message(STATUS "set LLVM_DIR: ${LLVM_DIR}")
message(STATUS "set MLIR_DIR: ${MLIR_DIR}")
find_package(LLVM REQUIRED CONFIG HINTS ${LLVM_DIR})
find_package(MLIR REQUIRED CONFIG HINTS ${MLIR_DIR})
find_package(ZLIB REQUIRED)
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
include(AddLLVM)
include_directories(${LLVM_INCLUDE_DIRS})
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
include(AddLLVM)
include(TableGen)
include(AddMLIR)
message(STATUS "Found MLIR: ${MLIR_DIR}")
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
# To build with MLIR, LLVM is built from source with the following flags:
#[==[
cmake -G Ninja ../llvm \
-DLLVM_ENABLE_PROJECTS=mlir \
-DLLVM_BUILD_EXAMPLES=OFF \
-DLLVM_TARGETS_TO_BUILD="X86" \
-DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_ZLIB=OFF \
-DLLVM_ENABLE_RTTI=ON \
#]==]
# The matched llvm-project version is f9dc2b7079350d0fed3bb3775f496b90483c9e42 (currently a temporary commit)
add_definitions(${LLVM_DEFINITIONS})
llvm_map_components_to_libnames(
llvm_libs
Support
Core
irreader
X86
executionengine
orcjit
mcjit
all
codegen)
message(STATUS "LLVM libs: ${llvm_libs}")
get_property(mlir_libs GLOBAL PROPERTY MLIR_ALL_LIBS)
message(STATUS "MLIR libs: ${mlir_libs}")
add_definitions(${LLVM_DEFINITIONS})
# The minimum needed libraries for MLIR IR parse and transform.
set(MLIR_IR_LIBS
MLIRAnalysis
MLIRStandardOps
MLIRPass
MLIRParser
MLIRDialect
MLIRIR
MLIROptLib)
# td_base is the name of a xxx.td file (without the .td suffix)
function(mlir_tablegen_on td_base)
set(options)
set(oneValueArgs DIALECT)
cmake_parse_arguments(mlir_tablegen_on "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
set(LLVM_TARGET_DEFINITIONS ${td_base}.td)
mlir_tablegen(${td_base}.hpp.inc -gen-op-decls)
mlir_tablegen(${td_base}.cpp.inc -gen-op-defs)
if(mlir_tablegen_on_DIALECT)
mlir_tablegen(${td_base}_dialect.hpp.inc --gen-dialect-decls
-dialect=${mlir_tablegen_on_DIALECT})
endif()
add_public_tablegen_target(${td_base}_IncGen)
add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen)
endfunction()
if(NOT WITH_GPU)
return()
endif()
find_package(PkgConfig)
find_library(
CUDA_NVRTC_LIB libnvrtc nvrtc
HINTS "${CUDA_TOOLKIT_ROOT_DIR}/lib64" "${LIBNVRTC_LIBRARY_DIR}"
"${CUDA_TOOLKIT_ROOT_DIR}/lib/x64" /usr/lib64 /usr/local/cuda/lib64)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(LibNVRTC DEFAULT_MSG CUDA_NVRTC_LIB)
message(STATUS "found NVRTC: ${CUDA_NVRTC_LIB}")
mark_as_advanced(CUDA_NVRTC_LIB)
if(NOT LIBNVRTC_FOUND)
message(
FATAL_ERROR
"Cuda NVRTC Library not found: Specify the LIBNVRTC_LIBRARY_DIR where libnvrtc is located"
)
endif()
if((NOT WITH_GPU)
OR WIN32
OR APPLE)
set(NVTX_FOUND OFF)
return()
endif()
set(NVTX_ROOT
"/usr"
CACHE PATH "NVTX ROOT")
find_path(
NVTX_INCLUDE_DIR nvToolsExt.h
PATHS ${NVTX_ROOT} ${NVTX_ROOT}/include $ENV{NVTX_ROOT}
$ENV{NVTX_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
NO_DEFAULT_PATH)
get_filename_component(__libpath_hint ${CUDA_CUDART_LIBRARY} PATH)
set(TARGET_ARCH "x86_64")
if(NOT ${CMAKE_SYSTEM_PROCESSOR})
set(TARGET_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()
list(
APPEND
NVTX_CHECK_LIBRARY_DIRS
${NVTX_ROOT}
${NVTX_ROOT}/lib64
${NVTX_ROOT}/lib
${NVTX_ROOT}/lib/${TARGET_ARCH}-linux-gnu
$ENV{NVTX_ROOT}
$ENV{NVTX_ROOT}/lib64
$ENV{NVTX_ROOT}/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/targets/${TARGET_ARCH}-linux/lib)
find_library(
CUDA_NVTX_LIB
NAMES libnvToolsExt.so
PATHS ${NVTX_CHECK_LIBRARY_DIRS} ${NVTX_INCLUDE_DIR} ${__libpath_hint}
NO_DEFAULT_PATH
DOC "Path to the NVTX library.")
if(NVTX_INCLUDE_DIR AND CUDA_NVTX_LIB)
set(NVTX_FOUND ON)
else()
set(NVTX_FOUND OFF)
endif()
if(NVTX_FOUND)
include_directories(${NVTX_INCLUDE_DIR})
add_definitions(-DCINN_WITH_NVTX)
endif()
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Detects the OS and sets appropriate variables.
# CMAKE_SYSTEM_NAME only gives us a coarse-grained name of the OS CMake is
# building for, but a finer-grained host system name like centos is necessary
# in some scenarios to distinguish systems for customization.
#
# for instance, protobuf libs path is <install_dir>/lib64
# on CentOS, but <install_dir>/lib on other systems.
if(UNIX AND NOT APPLE)
# exclude Apple from the *nix OS family
set(LINUX TRUE)
endif()
if(WIN32)
set(HOST_SYSTEM "win32")
else()
if(APPLE)
set(HOST_SYSTEM "macosx")
exec_program(
sw_vers ARGS
-productVersion
OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
string(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
if(NOT DEFINED $ENV{MACOSX_DEPLOYMENT_TARGET})
# Set cache variable - end user may change this during ccmake or cmake-gui configure.
set(CMAKE_OSX_DEPLOYMENT_TARGET
${MACOS_VERSION}
CACHE
STRING
"Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value."
)
endif()
set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
else()
if(EXISTS "/etc/issue")
file(READ "/etc/issue" LINUX_ISSUE)
if(LINUX_ISSUE MATCHES "CentOS")
set(HOST_SYSTEM "centos")
elseif(LINUX_ISSUE MATCHES "Debian")
set(HOST_SYSTEM "debian")
elseif(LINUX_ISSUE MATCHES "Ubuntu")
set(HOST_SYSTEM "ubuntu")
elseif(LINUX_ISSUE MATCHES "Red Hat")
set(HOST_SYSTEM "redhat")
elseif(LINUX_ISSUE MATCHES "Fedora")
set(HOST_SYSTEM "fedora")
endif()
string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION
"${LINUX_ISSUE}")
endif()
if(EXISTS "/etc/redhat-release")
file(READ "/etc/redhat-release" LINUX_ISSUE)
if(LINUX_ISSUE MATCHES "CentOS")
set(HOST_SYSTEM "centos")
endif()
endif()
if(NOT HOST_SYSTEM)
set(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
endif()
endif()
endif()
# query number of logical cores
cmake_host_system_information(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
mark_as_advanced(HOST_SYSTEM CPU_CORES)
message(
STATUS
"Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
message(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
# external dependencies log output
set(EXTERNAL_PROJECT_LOG_ARGS
LOG_DOWNLOAD
0 # Wrap download in script to log output
LOG_UPDATE
1 # Wrap update in script to log output
LOG_CONFIGURE
1 # Wrap configure in script to log output
LOG_BUILD
0 # Wrap build in script to log output
LOG_TEST
1 # Wrap test in script to log output
LOG_INSTALL
0 # Wrap install in script to log output
)
# Get the latest git tag.
set(CINN_VERSION $ENV{CINN_VERSION})
set(tmp_version "HEAD")
set(TAG_VERSION_REGEX "[0-9]+\\.[0-9]+\\.[0-9]+(\\.(a|b|rc)\\.[0-9]+)?")
set(COMMIT_VERSION_REGEX "[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+[0-9a-f]+")
while("${CINN_VERSION}" STREQUAL "")
# Check current branch name
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH_NAME
RESULT_VARIABLE GIT_BRANCH_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_BRANCH_RESULT})
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 --always
${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_TAG_NAME
RESULT_VARIABLE GIT_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_RESULT})
# Check if current branch is release branch
if(${GIT_BRANCH_NAME} MATCHES "release/${TAG_VERSION_REGEX}")
# Check the tag is a correct version
if(${GIT_TAG_NAME} MATCHES "${COMMIT_VERSION_REGEX}")
# if no tag was found, set CINN_VERSION to 0.0.0 to represent latest
set(CINN_VERSION "0.0.0")
elseif(${GIT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" CINN_VERSION ${GIT_TAG_NAME})
else() # otherwise, get the previous git tag name.
set(tmp_version "${GIT_TAG_NAME}~1")
endif()
else()
execute_process(
COMMAND ${GIT_EXECUTABLE} describe --exact-match --tags ${tmp_version}
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_EXACT_TAG_NAME
RESULT_VARIABLE GIT_EXACT_TAG_RESULT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT ${GIT_EXACT_TAG_RESULT})
# Check if current branch is tag branch
if(${GIT_EXACT_TAG_NAME} MATCHES "v${TAG_VERSION_REGEX}")
string(REPLACE "v" "" CINN_VERSION ${GIT_EXACT_TAG_NAME})
else()
set(CINN_VERSION "0.0.0")
endif()
else()
# otherwise, we always set CINN_VERSION to 0.0.0 to represent latest
set(CINN_VERSION "0.0.0")
endif()
endif()
else()
set(CINN_VERSION "0.0.0")
message(WARNING "Cannot add CINN version from git tag")
endif()
else()
set(CINN_VERSION "0.0.0")
message(WARNING "Cannot add CINN version for wrong git branch result")
endif()
endwhile()
string(REPLACE "-" "." CINN_VER_LIST ${CINN_VERSION})
string(REPLACE "." ";" CINN_VER_LIST ${CINN_VER_LIST})
list(GET CINN_VER_LIST 0 CINN_MAJOR_VER)
list(GET CINN_VER_LIST 1 CINN_MINOR_VER)
list(GET CINN_VER_LIST 2 CINN_PATCH_VER)
math(EXPR CINN_VERSION_INTEGER "${CINN_MAJOR_VER} * 1000000
+ ${CINN_MINOR_VER} * 1000 + ${CINN_PATCH_VER}")
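# For illustration: CINN_VERSION "1.2.3" gives
# CINN_VERSION_INTEGER = 1*1000000 + 2*1000 + 3 = 1002003.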
add_definitions(-DCINN_VERSION=${CINN_VERSION})
add_definitions(-DCINN_VERSION_INTEGER=${CINN_VERSION_INTEGER})
message(
STATUS
"CINN version is ${CINN_VERSION} (major: ${CINN_MAJOR_VER}, minor: ${CINN_MINOR_VER}, patch: ${CINN_PATCH_VER})"
)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT WITH_CINN)
return()
endif()
if(NOT CINN_GIT_TAG)
set(CINN_GIT_TAG develop)
endif()
message(STATUS "CINN version: " ${CINN_GIT_TAG})
# TODO(zhhsplendid): CINN has lots of warnings during early development.
# They will be treated as errors under paddle. We set no-error now and we will
# clean the code in the future.
add_definitions(-w)
######################################
# Build CINN from Git External Project
######################################
include(ExternalProject)
set(CINN_PREFIX_DIR ${THIRD_PARTY_PATH}/CINN)
set(CINN_OPTIONAL_ARGS
-DPY_VERSION=${PY_VERSION}
-DWITH_CUDA=${WITH_GPU}
-DWITH_CUDNN=${WITH_GPU}
-DWITH_MKL_CBLAS=${WITH_MKL}
-DWITH_MKLDNN=${WITH_MKL}
-DPUBLISH_LIBS=ON
-DWITH_TESTING=ON
-DPYTHON_EXECUTABLE=${PYTHON_EXECUTABLE}
-DPYTHON_INCLUDE_DIR=${PYTHON_INCLUDE_DIR}
-DPYTHON_LIBRARIES=${PYTHON_LIBRARIES})
set(CINN_BUILD_COMMAND ${CMAKE_COMMAND} --build . --target cinnapi -j)
set(CINN_BINARY_DIR ${CINN_PREFIX_DIR}/src/external_cinn-build)
set(CINN_LIB_NAME "libcinnapi.so")
set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib")
set(CINN_LIB "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
ExternalProject_Add(
external_cinn
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git"
GIT_TAG ${CINN_GIT_TAG}
PREFIX ${CINN_PREFIX_DIR}
BUILD_COMMAND ${CINN_BUILD_COMMAND}
INSTALL_COMMAND ""
CMAKE_ARGS ${CINN_OPTIONAL_ARGS}
BUILD_BYPRODUCTS ${CINN_LIB})
ExternalProject_Get_Property(external_cinn BINARY_DIR)
ExternalProject_Get_Property(external_cinn SOURCE_DIR)
set(CINN_SOURCE_DIR ${SOURCE_DIR})
message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}")
message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}")
######################################
# Add CINN's dependencies header files
######################################
# Add absl
set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include")
include_directories(${ABSL_INCLUDE_DIR})
# Add isl
set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include")
include_directories(${ISL_INCLUDE_DIR})
# Add LLVM
set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include")
include_directories(${LLVM_INCLUDE_DIR})
######################################################
# Put external_cinn and dependencies together as a lib
######################################################
set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include")
add_library(cinn SHARED IMPORTED GLOBAL)
set_target_properties(cinn PROPERTIES IMPORTED_LOCATION
"${CINN_LIB_LOCATION}/${CINN_LIB_NAME}")
include_directories(${CINN_INCLUDE_DIR})
add_dependencies(cinn external_cinn)
@@ -56,9 +56,14 @@ else()
    "${GTEST_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libgmock.a"
    CACHE FILEPATH "gmock libraries." FORCE)
  set(GTEST_CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
  if(CINN_ONLY)
    set(GTEST_CMAKE_CXX_FLAGS "-std=c++17 ${CMAKE_CXX_FLAGS}")
  else()
    set(GTEST_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  endif()
endif()
if(WITH_MKLML)
  # wait for mklml downloading completed
  set(GTEST_DEPENDS ${MKLML_PROJECT})
......
@@ -24,7 +24,7 @@ set(SOURCE_INCLUDE_DIR ${SOURCE_DIR}/include)
include_directories(${PYBIND_INCLUDE_DIR})
set(PYBIND_PATCH_COMMAND "")
if(NOT WIN32 AND NOT CINN_ONLY)
  file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/pybind/cast.h.patch
       native_dst)
  # Note: [Why calling some `git` commands before `patch`?]
......
@@ -260,6 +260,36 @@ if(${CMAKE_VERSION} VERSION_GREATER "3.5.2")
endif()
########################### include third_party according to flags ###############################
# cinn_only includes third-party libraries separately
if(CINN_ONLY)
include(external/zlib)
include(external/gflags)
include(external/glog)
include(external/gtest)
include(external/protobuf)
if(WITH_PYTHON)
include(external/pybind11)
endif()
if(WITH_MKL)
include(external/mklml)
endif()
if(WITH_MKLDNN)
include(external/mkldnn)
endif()
return()
endif()
if(WITH_CINN)
if(WITH_MKL)
add_definitions(-DCINN_WITH_MKL_CBLAS)
endif()
if(WITH_MKLDNN)
add_definitions(-DCINN_WITH_MKLDNN)
endif()
endif()
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
@@ -474,20 +504,6 @@ if(WITH_LITE)
  include(external/lite)
endif()
if(WITH_CINN)
message(STATUS "Compile Paddle with CINN.")
include(external/cinn)
add_definitions(-DPADDLE_WITH_CINN)
if(WITH_GPU)
add_definitions(-DCINN_WITH_CUDA)
add_definitions(-DCINN_WITH_CUDNN)
endif()
if(WITH_MKL)
add_definitions(-DCINN_WITH_MKL_CBLAS)
add_definitions(-DCINN_WITH_MKLDNN)
endif()
endif()
if(WITH_CRYPTO)
  include(external/cryptopp) # download, build, install cryptopp
  list(APPEND third_party_deps extern_cryptopp)
......
if(WITH_TESTING)
cinn_cc_library(cinn_gtest_main SRCS gtest_main.cc DEPS gtest gflags)
endif()
add_subdirectory(auto_schedule)
add_subdirectory(common)
add_subdirectory(utils)
add_subdirectory(poly)
add_subdirectory(runtime)
add_subdirectory(ir)
add_subdirectory(backends)
add_subdirectory(lang)
add_subdirectory(optim)
add_subdirectory(hlir)
if(CINN_ONLY)
add_subdirectory(pybind)
endif()
add_subdirectory(frontend)
# Download a model
download_and_uncompress("${DOWNLOAD_MODEL_DIR}" "${PADDLE_RESOURCE_URL}" "lite_naive_model.tar.gz")
core_gather_headers()
add_subdirectory(analysis)
add_subdirectory(cost_model)
add_subdirectory(database)
add_subdirectory(measure)
add_subdirectory(post_schedule_rule)
add_subdirectory(search_space)
add_subdirectory(search_strategy)
add_subdirectory(task)
add_subdirectory(task_scheduler)
add_subdirectory(tests)
cinn_proto_library(auto_schedule_proto SRCS auto_schedule.proto DEPS schedule_desc_proto)
core_gather_headers()
gather_srcs(cinnapi_src SRCS auto_tuner.cc)
#cinn_cc_test(test_auto_tuner SRCS auto_tuner_test.cc DEPS cinncore)
foreach(header ${auto_schedule_proto_HDRS})
set(core_proto_includes "${core_proto_includes};${header}" CACHE INTERNAL "")
endforeach()
core_gather_headers()
gather_srcs(cinnapi_src SRCS analyze_ir.cc)
cinn_cc_test(test_analyze_ir SRCS analyze_ir_test.cc DEPS cinncore)
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include <glog/logging.h>
#include <algorithm>
#include <string>
#include <unordered_set>
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/collect_ir_nodes.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/ir_copy.h"
#include "paddle/cinn/optim/optimize.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
namespace cinn {
namespace auto_schedule {
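// Extracts the Vars among the given index expressions; indices that are not
// plain variables (e.g. constants or compound expressions) are skipped.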
std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
std::vector<ir::Var> result;
for (const ir::Expr& e : indices) {
// TODO: do we need to convert other types, such as constant numbers, to Var?
if (e.As<ir::_Var_>() != nullptr) {
ir::Expr copy_e = optim::IRCopy(e);
ir::_Var_* var_ref = copy_e.As<ir::_Var_>();
result.emplace_back(ir::Var(var_ref));
}
}
return result;
}
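// Fills the read/write buffer ranges of a ScheduleBlock by collecting the
// Load/Store nodes in its body; does nothing if they were already set.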
void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
if (!sche_block->read_buffers.empty() || !sche_block->write_buffers.empty()) {
return;
}
ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) {
const ir::Load* load_expr = x->As<ir::Load>();
if (load_expr != nullptr) {
const ir::Tensor t = load_expr->tensor.as_tensor_ref();
sche_block->read_buffers.emplace_back(ir::BufferRange(t->buffer, IndicesToVars(load_expr->indices)));
return false;
}
const ir::Store* store_expr = x->As<ir::Store>();
if (store_expr != nullptr) {
const ir::Tensor t = store_expr->tensor.as_tensor_ref();
sche_block->write_buffers.emplace_back(ir::BufferRange(t->buffer, IndicesToVars(store_expr->indices)));
return false;
}
return false;
});
}
bool ContainsNodeType(ir::Expr expr, const std::unordered_set<ir::IrNodeTy>& node_types) {
std::set<ir::Expr> collection = ir::CollectIRNodesWithoutTensor(
expr, [&](const Expr* x) { return node_types.find(x->node_type()) != node_types.end(); });
return !collection.empty();
}
std::unordered_set<std::string> GetOutputNamesFromLoweredFunc(const std::vector<ir::LoweredFunc>& lowered_funcs) {
std::unordered_set<std::string> result;
for (const ir::LoweredFunc& func : lowered_funcs) {
for (const ir::Argument& arg : func->args) {
if (arg.is_output()) {
result.insert(arg.name());
}
}
}
return result;
}
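// Heuristic: a block is considered worth multi-level tiling when it writes
// exactly one buffer and at least one non-reduce block iter var does not
// index some non-reduction read buffer (the typical matmul-like pattern).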
bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize) {
const ir::ScheduleBlock* sche_block = sche_block_realize.schedule_block.As<ir::ScheduleBlock>();
if (sche_block->write_buffers.size() != 1 || sche_block->read_buffers.empty()) {
return false;
}
const ir::Expr& write_buffer = sche_block->write_buffers[0].As<ir::_BufferRange_>()->buffer;
// Enumerate each read region, get the number of schedule block iter vars
// which are not used to index the read region
int total_unused_iter_vars = 0;
for (const ir::Expr& read_buffer_expr : sche_block->read_buffers) {
const ir::_BufferRange_* read_buffer = read_buffer_expr.As<ir::_BufferRange_>();
// Skip the reduction buffer
if (read_buffer->buffer == write_buffer) {
continue;
}
// Collect the vars in schedule block that are used to index the read region
std::unordered_set<std::string> vars_index_read;
for (const Var& range : read_buffer->ranges) {
vars_index_read.insert(range->name);
}
// Check the block iter vars are not used to index the read region
int n_unused_block_vars = 0;
for (const ir::Var& block_iter_var : sche_block->iter_vars) {
if (!block_iter_var->is_reduce_axis) {
bool iter_var_in_read = false;
for (const std::string& var : vars_index_read) {
if (var == block_iter_var->name) {
iter_var_in_read = true;
break;
}
}
if (!iter_var_in_read) {
++n_unused_block_vars;
}
}
}
total_unused_iter_vars += n_unused_block_vars;
}
return total_unused_iter_vars >= 1;
}
ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body) {
ir::ModuleExpr mod_expr(std::vector<ir::Expr>({body}));
ir::IRSchedule ir_sch(mod_expr);
// temp_bufs may be deleted during auto-tuning (e.g. by auto-inline),
// so we have to check the old temp bufs and set them as local buffers.
for (const ir::Buffer& buf : old_func->temp_bufs) {
const std::string& buf_name = buf->name;
std::vector<ir::Expr> all_block_realizes = ir_sch.GetAllBlocks();
for (ir::Expr& e : all_block_realizes) {
const ir::ScheduleBlockRealize* sche_block_realize = e.As<ir::ScheduleBlockRealize>();
const std::string& sche_name = sche_block_realize->schedule_block.As<ir::ScheduleBlock>()->name;
if (buf_name == "_" + sche_name) {
VLOG(6) << "Set local buffer for temp buffer " << buf_name;
ir_sch.SetBuffer(e, "local", true);
break;
}
}
}
ir::Expr updated_body = ir_sch.GetModule().GetExprs()[0];
#ifdef CINN_WITH_CUDA
optim::OptimizeExprGPU(&updated_body);
#endif
// Get new temp bufs by analyzing.
std::vector<ir::Buffer> new_temp_bufs = lang::GetTempBuffers(old_func->args, updated_body);
ir::LoweredFunc new_func = ir::_LoweredFunc_::Make(old_func->name, old_func->args, updated_body, new_temp_bufs);
#ifdef CINN_WITH_CUDA
if (target == common::DefaultNVGPUTarget()) {
new_func->PrepareCudaAxisInfoFromBody();
}
#endif
new_func = optim::Optimize(Expr(new_func), target, false).as_lowered_func_ref();
new_func->PrepareBufferCastExprs(/*with_expr_gen_tensor = */ false);
return new_func;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/ir/lowered_func.h"
namespace cinn {
namespace auto_schedule {
void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block);
bool ContainsNodeType(ir::Expr expr, const std::unordered_set<ir::IrNodeTy>& node_types);
/**
 * Collects all input lowered_funcs and returns the names of all output arguments.
*/
std::unordered_set<std::string> GetOutputNamesFromLoweredFunc(const std::vector<ir::LoweredFunc>& lowered_funcs);
/**
 * Determines whether a schedule block needs multi-level tiling.
*/
bool NeedsMultiLevelTiling(const ir::ScheduleBlockRealize& sche_block_realize);
/**
 * Updates a LoweredFunc by regenerating its related fields from a new function body.
*/
ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target, const ir::LoweredFunc& old_func, ir::Expr& body);
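// A minimal usage sketch (hypothetical `old_func` and `tuned_body` values,
// e.g. produced by lang::LowerVec and a tuned ir::IRSchedule respectively):
//   ir::Expr tuned_body = ir_sch.GetModule().GetExprs()[0];
//   ir::LoweredFunc new_func =
//       UpdateFuncWithNewBody(common::DefaultHostTarget(), old_func, tuned_body);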
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <sstream>
#include <vector>
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/poly/stage.h"
#include "paddle/cinn/utils/string.h"
namespace cinn {
namespace auto_schedule {
TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
ir::Expr M(32);
ir::Expr N(32);
lang::Placeholder<float> A("A", {M, N});
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Analyzing for Expr:";
VLOG(6) << ast_expr;
std::vector<Expr> vec_ast{ast_expr};
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
std::vector<ir::Expr> all_block_realizes = ir_sch.GetAllBlocks();
ASSERT_EQ(all_block_realizes.size(), 1UL);
ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes[0].As<ir::ScheduleBlockRealize>();
ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As<ir::ScheduleBlock>();
AnalyzeScheduleBlockReadWriteBuffer(sche_block);
/*
* the sche_block_realize will be:
* ScheduleBlock(B)
* {
* i0, i1 = axis.bind(i, j)
* read_buffers(_A[i0(undefined:undefined), i1(undefined:undefined)])
* write_buffers(_B[i0(undefined:undefined), i1(undefined:undefined)])
* B[i0, i1] = A[i0, i1]
* }
*/
VLOG(6) << "ScheduleBlockRealize: ";
VLOG(6) << all_block_realizes[0];
ASSERT_EQ(sche_block->read_buffers.size(), 1UL);
std::stringstream read_ss;
read_ss << sche_block->read_buffers[0];
ASSERT_EQ(read_ss.str(), "_A[i0(0:32), i1(0:32)]");
ASSERT_EQ(sche_block->write_buffers.size(), 1UL);
std::stringstream write_ss;
write_ss << sche_block->write_buffers[0];
ASSERT_EQ(write_ss.str(), "_B[i0(0:32), i1(0:32)]");
}
TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
ir::Expr M(32);
ir::Expr N(128);
lang::Placeholder<float> A("A", {M});
lang::Placeholder<float> B("B", {N});
ir::Tensor C = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec("AddDiffShape", stages, {C}, {}, {}, nullptr, target, true);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr before MultiLevelTiling: ";
VLOG(6) << ast_expr;
std::vector<Expr> vec_ast{ast_expr};
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
std::vector<ir::Expr> all_block_realizes = ir_sch.GetAllBlocks();
ASSERT_EQ(all_block_realizes.size(), 1UL);
ir::ScheduleBlockRealize* sche_block_realize = all_block_realizes[0].As<ir::ScheduleBlockRealize>();
ir::ScheduleBlock* sche_block = sche_block_realize->schedule_block.As<ir::ScheduleBlock>();
AnalyzeScheduleBlockReadWriteBuffer(sche_block);
VLOG(6) << "ScheduleBlockRealize: ";
VLOG(6) << all_block_realizes[0];
ASSERT_EQ(sche_block->read_buffers.size(), 2UL);
std::vector<std::string> expect_read = {"_A[i0(0:32)]", "_B[i1(0:128)]"};
ASSERT_EQ(sche_block->read_buffers.size(), expect_read.size());
for (size_t i = 0; i < expect_read.size(); ++i) {
std::stringstream read_ss;
read_ss << sche_block->read_buffers[i];
ASSERT_EQ(read_ss.str(), expect_read[i]);
}
ASSERT_EQ(sche_block->write_buffers.size(), 1UL);
std::stringstream write_ss;
write_ss << sche_block->write_buffers[0];
ASSERT_EQ(write_ss.str(), "_C[i0(0:32), i1(0:128)]");
}
TEST(AnalyzeIr, ContainsNodeType) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
ir::Expr M(32);
ir::Expr N(32);
lang::Placeholder<float> A("A", {M, N});
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Analyzing for Expr:";
VLOG(6) << ast_expr;
ASSERT_TRUE(ContainsNodeType(ast_expr, {ir::IrNodeTy::Load, ir::IrNodeTy::Store}));
ASSERT_TRUE(ContainsNodeType(ast_expr, {ir::IrNodeTy::Load, ir::IrNodeTy::IfThenElse}));
ASSERT_FALSE(ContainsNodeType(ast_expr, {ir::IrNodeTy::IfThenElse, ir::IrNodeTy::Sum}));
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax ="proto3";
package cinn.auto_schedule.proto;
import "paddle/cinn/ir/schedule_desc.proto";
message TuningRecord {
string task_key = 1;
double execution_cost = 2;
double predicted_cost = 3;
cinn.ir.proto.ScheduleDesc trace = 4;
}
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include <glog/logging.h>
#include <pybind11/embed.h>
#include <algorithm>
#include <memory>
#include <utility>
#include "paddle/cinn/auto_schedule/database/jsonfile_database.h"
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include "paddle/cinn/auto_schedule/measure/simple_builder.h"
#include "paddle/cinn/auto_schedule/measure/simple_runner.h"
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/hlir/framework/op.h"
#include "paddle/cinn/hlir/framework/visualize_helper.h"
#include "paddle/cinn/utils/string.h"
namespace cinn {
namespace auto_schedule {
AutoTuner::AutoTuner(const common::Target& target, hlir::framework::Graph* graph) : target_(target), graph_(graph) {}
void AutoTuner::Initialize(const Config& config, hlir::framework::GraphCompiler* graph_compiler) {
// create builder, runner, and schedule measurer
builder_ = std::make_unique<SimpleBuilder>(graph_compiler);
runner_ = std::make_unique<SimpleRunner>(config.runner_repeat_times);
schedule_measurer_ = std::make_unique<ScheduleMeasurer>(builder_.get(), runner_.get());
// initialize database
database_ = Database::Make(config.database_config);
// create tasks
TaskCreator task_creator;
tasks_ = task_creator.CreateTuneTaskOpLevel(graph_);
const auto& dtype_dict = graph_->GetAttrs<absl::flat_hash_map<std::string, common::Type>>("inferdtype");
const auto& shape_dict = graph_->GetAttrs<absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(dtype_dict, shape_dict, target_);
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (auto i = 0; i < tasks_.size(); ++i) {
auto&& task = tasks_[i];
task.Initialize(shape_dict, dtype_dict, op_lowerer_.get());
// Register the initial ModuleExpr corresponding to the task
task_registry->Regist(task.serialized_key, ir::ModuleExpr(task.GetLoweredFuncBodyExprs()));
VLOG(3) << "Add a task, id:" << i << ", serialized_key:\n" << task.serialized_key;
}
// create task optimizers
utils::LinearRandomEngine::StateType initial_seed = utils::LinearRandomEngine::GetDeviceRandomValue();
task_optimizers_.resize(tasks_.size());
std::transform(tasks_.begin(), tasks_.end(), task_optimizers_.begin(), [&](TuneTask& task) {
return std::make_unique<TaskOptimizer>(
&task, schedule_measurer_.get(), database_.get(), utils::ForkRandomState(&initial_seed));
});
// create task scheduler
task_scheduler_ = TaskScheduler::Make(tasks_, config.task_schedule_config, config.task_schedule_strategy);
}
void PrintResult(std::shared_ptr<hlir::framework::Graph::Group> group) {
if (!VLOG_IS_ON(3)) {
return;
}
auto nodes = group->CollectNodes();
VLOG(3) << "Node size:" << nodes.size();
VLOG(3) << "Group {";
for (auto* node : nodes) {
VLOG(3) << " " << hlir::framework::DebugString(node);
}
VLOG(3) << "}";
}
void PrintResult(const FunctionGroup& functions) {
if (!VLOG_IS_ON(3)) {
return;
}
VLOG(3) << "Function size:" << functions.size();
for (auto i = 0; i < functions.size(); ++i) {
const ir::LoweredFunc& func = functions.at(i);
VLOG(3) << "LoweredFunc-" << i << " detail:\n" << func;
}
}
void PrintResult(const TuningResult& result) {
if (!VLOG_IS_ON(3)) {
return;
}
VLOG(3) << "###### Debug TuningResult ######\n";
VLOG(3) << "Tuned SubGraph num:" << result.subgraphs.size();
for (auto i = 0; i < result.subgraphs.size(); ++i) {
VLOG(3) << "****** SubGraph-" << i << " Detail ******\n";
PrintResult(result.subgraphs.at(i));
VLOG(3) << "****** SubGraph End ******";
}
VLOG(3) << "Tuned FunctionGroup num:" << result.function_groups.size();
for (auto i = 0; i < result.function_groups.size(); ++i) {
VLOG(3) << "****** FunctionGroup-" << i << " Detail ******\n";
PrintResult(result.function_groups.at(i));
VLOG(3) << "****** FunctionGroup End ******";
}
VLOG(3) << "###### TuningResult End ######";
}
TuningResult AutoTuner::Tune(const TuningOptions& options) {
CHECK_GT(options.num_tuning_rounds, 0) << "Invalid config";
VLOG(3) << "Begin tuning with round num=" << options.num_tuning_rounds << ", tasks size=" << tasks_.size();
TuningResult result;
result.subgraphs.resize(tasks_.size());
result.function_groups.resize(tasks_.size());
// A task only tunes the schedule for now, so we populate its sub_graph
// as the default result of graph tuning; this should be updated once
// graph tuning is supported.
for (auto i = 0; i < tasks_.size(); ++i) {
auto&& task = tasks_.at(i);
result.subgraphs[i] = task.subgraph;
}
for (int r = 0; r < options.num_tuning_rounds; ++r) {
VLOG(3) << "<<<<<< Round " << r << " >>>>>>";
int run_id = -1;
task_scheduler_->Reset();
while ((run_id = task_scheduler_->NextTaskId()) != -1) {
VLOG(3) << "Start tuning Task-" << run_id;
auto* opt = task_optimizers_.at(run_id).get();
auto function_group = opt->Optimize(options);
VLOG(3) << "Task-" << run_id << " finished, print optimized functions:\n";
PrintResult(function_group);
// update the best schedules searched so far.
result.function_groups.at(run_id) = std::move(function_group);
}
}
PrintResult(result);
return result;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include "paddle/cinn/auto_schedule/tuning.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
namespace cinn {
namespace auto_schedule {
// This class is the entrance of auto-tuning. Users can use it to tune a
// graph (not supported yet) and to search for a series of schedules that
// are likely to achieve better performance.
// Internally, it creates the necessary components and uses them to perform tuning.
class AutoTuner {
public:
// Configures how to perform auto-tuning, such as the way tasks are
// created, the task scheduling strategy, and so on.
struct Config {
std::string task_schedule_strategy = "round_robin";
TaskScheduler::Config task_schedule_config;
int runner_repeat_times = 1;
DatabaseConfig database_config;
};
AutoTuner(const common::Target& target, hlir::framework::Graph* graph);
// Initialize tuner with specific config and auxiliary objects.
void Initialize(const Config& config, hlir::framework::GraphCompiler* graph_compiler);
// Perform the tuning process and return the final result
TuningResult Tune(const TuningOptions& options);
private:
const common::Target& target_;
hlir::framework::Graph* graph_;
std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
// Tasks to tune
std::vector<TuneTask> tasks_;
// Scheduler that select a task to tune at every turn.
std::unique_ptr<TaskScheduler> task_scheduler_;
// The actors that perform auto-tuning; each optimizer takes one task.
std::vector<std::unique_ptr<TaskOptimizer>> task_optimizers_;
// Classes used to measure AutoTune samples
std::unique_ptr<ScheduleBuilder> builder_;
std::unique_ptr<ScheduleRunner> runner_;
std::unique_ptr<ScheduleMeasurer> schedule_measurer_;
// The database to store tuning record
std::unique_ptr<Database> database_;
};
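// A minimal usage sketch (assuming `graph` and `graph_compiler` were built
// elsewhere, as in the tests):
//   AutoTuner tuner(common::DefaultHostTarget(), graph.get());
//   tuner.Initialize(AutoTuner::Config(), graph_compiler.get());
//   TuningResult result = tuner.Tune(TuningOptions());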
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cstdlib>
#include <iostream>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
DECLARE_bool(auto_schedule_use_cost_model);
DECLARE_bool(cinn_ir_schedule);
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
using ::cinn::hlir::framework::Node;
using ::cinn::hlir::framework::Scope;
class TestAutoTuner : public ::testing::Test {
public:
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
std::shared_ptr<Graph> graph;
std::shared_ptr<Scope> compiled_scope;
std::unique_ptr<GraphCompiler> graph_compiler;
std::unique_ptr<AutoTuner> tuner;
frontend::Program CreateAddReluProgram() {
frontend::NetBuilder builder("test");
auto a = builder.CreateInput(Float(32), {1, 64, 112, 112}, "A");
auto b = builder.CreateInput(Float(32), {64}, "B");
auto c = builder.Add(a, b, 1);
auto d = builder.Relu(c);
return builder.Build();
}
void SetUp() override {
srand(0);
// AutoTuner is combined with new IR Schedule
FLAGS_cinn_ir_schedule = true;
std::unordered_set<std::string> fetch_ids;
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
compiled_scope = BuildScope(target, graph);
graph_compiler = std::make_unique<GraphCompiler>(target, compiled_scope, graph);
tuner = std::make_unique<AutoTuner>(target, graph.get());
}
TuningResult InitializeAndTune(const AutoTuner::Config& config, const TuningOptions& options) {
tuner->Initialize(config, graph_compiler.get());
return tuner->Tune(options);
}
virtual void BasicCheckResult(const TuningResult& result) {
ASSERT_EQ(1, result.subgraphs.size());
auto nodes = result.subgraphs.front()->CollectNodes();
ASSERT_EQ(nodes.size(), 4UL);
ASSERT_EQ(nodes[0]->op()->name, "broadcast_to");
ASSERT_EQ(nodes[1]->op()->name, "fill_constant");
ASSERT_EQ(nodes[2]->op()->name, "elementwise_add");
ASSERT_EQ(nodes[3]->op()->name, "max");
ASSERT_EQ(result.function_groups.size(), 1UL);
ASSERT_EQ(result.function_groups[0].size(), 1UL);
}
virtual void ApplyTunedAndRun(const TuningResult& result) {
// build runtime program with tuning result
GraphCompiler::CompileOptions compile_options;
compile_options.with_instantiate_variables = true;
compile_options.Apply(result);
ASSERT_EQ(1, compile_options.groups.size());
ASSERT_EQ(1, compile_options.lowered_funcs.size());
VLOG(6) << "Print lowered_funcs before building";
VLOG(6) << compile_options.lowered_funcs[0][0];  // only one group is compiled
auto runtime_program = graph_compiler->Build(compile_options).runtime_program;
ASSERT_EQ(1, runtime_program->size());
runtime_program->Execute();
}
void ZeroMeasure() {
// set config and options
AutoTuner::Config tuning_config;
tuning_config.task_schedule_strategy = "round_robin";
TuningOptions tuning_options;
tuning_options.num_measure_trials = 0;
auto result = InitializeAndTune(tuning_config, tuning_options);
BasicCheckResult(result);
ApplyTunedAndRun(result);
}
void NonZeroMeasure() {
// set config and options
AutoTuner::Config tuning_config;
tuning_config.task_schedule_strategy = "round_robin";
TuningOptions tuning_options;
tuning_options.num_measure_trials = 4;
tuning_options.num_samples_per_iteration = 2;
auto result = InitializeAndTune(tuning_config, tuning_options);
BasicCheckResult(result);
ApplyTunedAndRun(result);
}
};
TEST_F(TestAutoTuner, ZeroMeasure_DisableCostModel) {
FLAGS_auto_schedule_use_cost_model = false;
ZeroMeasure();
}
TEST_F(TestAutoTuner, ZeroMeasure_EnableCostModel) {
FLAGS_auto_schedule_use_cost_model = true;
ZeroMeasure();
}
TEST_F(TestAutoTuner, NonZeroMeasure_DisableCostModel) {
FLAGS_auto_schedule_use_cost_model = false;
NonZeroMeasure();
}
TEST_F(TestAutoTuner, NonZeroMeasure_EnableCostModel) {
FLAGS_auto_schedule_use_cost_model = true;
NonZeroMeasure();
}
} // namespace auto_schedule
} // namespace cinn
core_gather_headers()
gather_srcs(cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc feature_extractor.cc)
cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
cinn_cc_test(test_feature_extractor SRCS feature_extractor_test.cc DEPS cinncore)
cinn_cc_test(test_feature SRCS feature_test.cc DEPS cinncore)
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
#include <glog/logging.h>
#include <atomic>
#include <vector>
#include "paddle/cinn/auto_schedule/cost_model/feature.h"
#include "paddle/cinn/auto_schedule/cost_model/feature_extractor.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir_schedule.h"
namespace cinn {
namespace auto_schedule {
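// Predict returns SearchState::NOT_INIT_COST until the model has been
// trained at least once (Train() stores 1 into trained_times_).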
float ExprCostModel::Predict(const ir::ModuleExpr& sample, const common::Target& target) const {
if (trained_times_.load() == 0) {
return SearchState::NOT_INIT_COST;
}
FeatureExtractor extractor;
Feature feature = extractor.Extract(sample, target);
std::vector<float> feature_numbers = feature.ToFixedSizeVector();
std::vector<float> pred = XgbCostModel::Predict({feature_numbers});
return pred[0];
}
void ExprCostModel::Train(const std::vector<const ir::ModuleExpr*>& samples,
const std::vector<float>& labels,
const common::Target& target) {
trained_times_.store(1);
size_t total_size = samples.size();
CHECK_EQ(total_size, labels.size()) << "Samples must have same size as labels";
std::vector<std::vector<float>> train_feature_numbers(total_size);
FeatureExtractor extractor;
for (size_t i = 0; i < total_size; ++i) {
CHECK(samples[i] != nullptr) << "Train samples cannot be nullptr";
Feature feature = extractor.Extract(*samples[i], target);
train_feature_numbers[i] = feature.ToFixedSizeVector();
}
XgbCostModel::Train(train_feature_numbers, labels);
}
void ExprCostModel::Update(const std::vector<const ir::ModuleExpr*>& samples,
const std::vector<float>& labels,
const common::Target& target) {
++trained_times_;
size_t total_size = samples.size();
CHECK_EQ(total_size, labels.size()) << "Samples must have same size as labels";
std::vector<std::vector<float>> train_feature_numbers(total_size);
FeatureExtractor extractor;
for (size_t i = 0; i < total_size; ++i) {
CHECK(samples[i] != nullptr) << "Train samples cannot be nullptr";
Feature feature = extractor.Extract(*samples[i], target);
train_feature_numbers[i] = feature.ToFixedSizeVector();
}
XgbCostModel::Update(train_feature_numbers, labels);
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <atomic>
#include <vector>
#include "paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h"
#include "paddle/cinn/ir/ir_schedule.h"
namespace cinn {
namespace auto_schedule {
/**
 * A C++ cost model which trains and predicts on ir::Expr.
*/
class ExprCostModel : public XgbCostModel {
public:
virtual float Predict(const ir::ModuleExpr& sample, const common::Target& target) const;
void Train(const std::vector<const ir::ModuleExpr*>& samples,
const std::vector<float>& labels,
const common::Target& target);
void Update(const std::vector<const ir::ModuleExpr*>& samples,
const std::vector<float>& labels,
const common::Target& target);
private:
std::atomic<int> trained_times_{0};
};
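// A minimal usage sketch (assuming `samples` and the measured `costs` are
// gathered elsewhere during tuning):
//   ExprCostModel model;
//   model.Train(samples, costs, common::DefaultHostTarget());
//   float cost = model.Predict(*samples[0], common::DefaultHostTarget());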
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/feature.h"
#include <glog/logging.h>
#include <vector>
#include "paddle/cinn/common/target.h"
namespace cinn {
namespace auto_schedule {
Feature::Feature()
: target_(common::UnkTarget()),
stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block
current_loop_block_index_(0),
parent_indices_(1, -1) {}
Feature::Feature(const common::Target& target)
: target_(target),
stack_encoded_feature_(1), // initialize a LoopBlockFeature as root block
current_loop_block_index_(0),
parent_indices_(1, -1) {}
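// Flattens the variable-length per-loop-block features into a fixed-size
// vector: slot 0 encodes the target (1 for NVGPU, 0 otherwise), each counter
// is accumulated weighted by the iteration product of its loop block, and
// every entry is slog-scaled at the end.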
std::vector<float> Feature::ToFixedSizeVector() {
std::vector<float> ret(LoopBlockFeature::kTotalSize + 1, 0); // LoopBlockFeature::kTotalSize plus 1 for target
if (target_ == common::DefaultNVGPUTarget()) {
ret[0] = 1;
} // else 0 for other cases
// the feature counts of loop[i] must be multiplied by iter_multi_num[i]
std::vector<int> iter_multi_num;
for (size_t i = 0; i < stack_encoded_feature_.size(); ++i) {
int j = 1;
const LoopBlockFeature& loop_feature = stack_encoded_feature_[i];
int loop_prod = 1;
int parent_prod = 1;
if (i != 0) {
parent_prod = iter_multi_num[parent_indices_[i]];
loop_prod = parent_prod * loop_feature.loop_length;
}
iter_multi_num.push_back(loop_prod);
ret[j] += (loop_feature.float_add_or_sub * loop_prod);
++j;
ret[j] += (loop_feature.float_mul * loop_prod);
++j;
ret[j] += (loop_feature.float_div_or_mod * loop_prod);
++j;
ret[j] += (loop_feature.float_cmp * loop_prod);
++j;
ret[j] += (loop_feature.float_math_func * loop_prod);
++j;
ret[j] += (loop_feature.float_other_call * loop_prod);
++j;
ret[j] += (loop_feature.int_add_or_sub * loop_prod);
++j;
ret[j] += (loop_feature.int_mul * loop_prod);
++j;
ret[j] += (loop_feature.int_div_or_mod * loop_prod);
++j;
ret[j] += (loop_feature.int_cmp * loop_prod);
++j;
ret[j] += (loop_feature.int_math_func * loop_prod);
++j;
ret[j] += (loop_feature.int_other_call * loop_prod);
++j;
ret[j] += (loop_feature.bool_op * loop_prod);
++j;
ret[j] += (loop_feature.select_op * loop_prod);
++j;
ret[j] += (loop_feature.mem_alloc * loop_prod);
++j;
ret[j] += (loop_feature.mem_free * loop_prod);
++j;
ret[j] += (loop_feature.mem_read * loop_prod);
++j;
ret[j] += (loop_feature.mem_write * loop_prod);
++j;
ret[j] += (loop_feature.float_reduce_sum_or_sub * loop_prod);
++j;
ret[j] += (loop_feature.float_reduce_mul * loop_prod);
++j;
ret[j] += (loop_feature.float_reduce_div * loop_prod);
++j;
ret[j] += (loop_feature.float_reduce_max_or_min * loop_prod);
++j;
ret[j] += (loop_feature.float_broadcast * loop_prod);
++j;
ret[j] += (loop_feature.int_reduce_sum_or_sub * loop_prod);
++j;
ret[j] += (loop_feature.int_reduce_mul * loop_prod);
++j;
ret[j] += (loop_feature.int_reduce_div * loop_prod);
++j;
ret[j] += (loop_feature.int_reduce_max_or_min * loop_prod);
++j;
ret[j] += (loop_feature.int_broadcast * loop_prod);
++j;
ret[j + static_cast<int>(loop_feature.loop_opt_type)] += 1;
j += LoopBlockFeature::kOptApplySize;
ret[j] += (loop_feature.len_blockIdx_x * parent_prod);
++j;
ret[j] += (loop_feature.len_blockIdx_y * parent_prod);
++j;
ret[j] += (loop_feature.len_blockIdx_z * parent_prod);
++j;
ret[j] += (loop_feature.len_threadIdx_x * parent_prod);
++j;
ret[j] += (loop_feature.len_threadIdx_y * parent_prod);
++j;
ret[j] += (loop_feature.len_threadIdx_z * parent_prod);
++j;
ret[j] += (loop_feature.len_vthread * parent_prod);
++j;
ret[j] += (loop_feature.vectorize_factor * parent_prod);
++j;
}
for (size_t i = 0; i < ret.size(); ++i) {
ret[i] = slog(ret[i]);
}
return ret;
}
void Feature::IntoLoopBlock() {
stack_encoded_feature_.emplace_back(LoopBlockFeature());
stack_encoded_feature_[current_loop_block_index_].num_sub_loops += 1;
parent_indices_.push_back(current_loop_block_index_);
current_loop_block_index_ = stack_encoded_feature_.size() - 1;
}
void Feature::ExitLoopBlock() { current_loop_block_index_ = parent_indices_[current_loop_block_index_]; }
LoopBlockFeature& Feature::CurrentLoopBlock() { return stack_encoded_feature_[current_loop_block_index_]; }
const LoopBlockFeature& Feature::CurrentLoopBlock() const { return stack_encoded_feature_[current_loop_block_index_]; }
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir_schedule.h"
namespace cinn {
namespace auto_schedule {
/* Loop feature enums */
enum class ForOptimizeFeatureEnum : int { kNone, kGpuBind, kParallel, kUnroll, kVectorize };
/* function to scale feature numbers */
inline float slog(float x) { return x < 0 ? std::log2(-x + 1) : std::log2(x + 1); }
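// e.g. slog(0) == 0, slog(1) == 1, slog(3) == 2, and slog(-3) == 2; the sign
// is discarded so that scaled features stay non-negative.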
class LoopBlockFeature {
public:
// TODO(zhhsplendid): distinguish more types such as float16, float32,
// float64, etc. However, the speed gap between float and int is larger
// than the gap between different bit widths, so we only distinguish int
// and float here.
/* Arithmetic features */
int float_add_or_sub = 0;
int float_mul = 0;
int float_div_or_mod = 0;
int float_cmp = 0;
int float_math_func = 0;
int float_other_call = 0; // like simple assign, cast, etc.
int int_add_or_sub = 0;
int int_mul = 0;
int int_div_or_mod = 0;
int int_cmp = 0;
int int_math_func = 0;
int int_other_call = 0; // like simple assign, cast, etc.
int bool_op = 0;
int select_op = 0;
static constexpr int kArithSize = 6 * 2 + 2;
/**
 * Buffer memory features, i.e. the numbers of memory operations.
 * Note that memory operations of different sizes can run at different
 * speeds, but the difference is usually small; a meticulous TODO would be
 * to also collect operand sizes (alloc size, write size, and so on).
*/
int mem_alloc = 0;
int mem_free = 0;
int mem_read = 0;
int mem_write = 0;
static constexpr int kMemSize = 4;
/**
* Reduce and Broadcast features
*/
int float_reduce_sum_or_sub = 0;
int float_reduce_mul = 0;
int float_reduce_div = 0;
int float_reduce_max_or_min = 0;
int float_broadcast = 0;
int int_reduce_sum_or_sub = 0;
int int_reduce_mul = 0;
int int_reduce_div = 0;
int int_reduce_max_or_min = 0;
int int_broadcast = 0;
static constexpr int kReduceBroadcastSize = 10;
/* Loop type features */
// TODO: maybe add a loop position (Inner, Outer, Middle) feature
ForOptimizeFeatureEnum loop_opt_type = ForOptimizeFeatureEnum::kNone;
static constexpr int kOptApplySize = 5;
/* Thread features if loop is optimized by GPU or CPU parallelism.
* Useless in other cases.
*/
int len_blockIdx_x = 0;
int len_blockIdx_y = 0;
int len_blockIdx_z = 0;
int len_threadIdx_x = 0;
int len_threadIdx_y = 0;
int len_threadIdx_z = 0;
int len_vthread = 0; // length of virtual thread
int vectorize_factor = 0;
static constexpr int kThreadFeatureSize = 8;
static constexpr int kTotalSize = kArithSize + kMemSize + kReduceBroadcastSize + kOptApplySize + kThreadFeatureSize;
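// kTotalSize = 14 (arith) + 4 (mem) + 10 (reduce/broadcast) + 5 (opt apply)
// + 8 (thread) = 41; ToFixedSizeVector() returns kTotalSize + 1 floats, the
// extra leading slot encoding the target.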
/* Non-feature attributes, used for bookkeeping during feature extraction */
// Number of sub loop blocks directly inside the current one
int num_sub_loops = 0;
// Number of repeats of this loop, -1 represents unknown
int loop_length = 1;
};
/**
* Feature of Expr. It is used in CostModel
*/
class Feature {
public:
Feature();
Feature(const common::Target& target);
// Converts the variable-length loop block features to a fixed-size vector
std::vector<float> ToFixedSizeVector();
// Call when visiting into a loop block, to start collecting its LoopBlockFeature
void IntoLoopBlock();
// Call when exiting a loop block, to finish collecting its LoopBlockFeature
void ExitLoopBlock();
// The current loop block which we should collect feature on
LoopBlockFeature& CurrentLoopBlock();
// The current loop block which we should collect feature on
const LoopBlockFeature& CurrentLoopBlock() const;
private:
// We encode the computation features as a variable-length vector.
// The root compute block is not a loop, but we treat it as a size-1 loop.
// Blocks are encoded like a stack. Each LoopBlockFeature contains a
// num_sub_loops indicating how many next-level sub-loop-blocks it contains.
//
// For example, code like:
//
// some_compute_0
// loop1 {
// some_compute_1
// loop2 {
// some_compute_2
// }
// }
//
// loop3 {
// some_compute_3
// }
//
// We go through the code and push loops into stack, then the features are encoded as
// [loop_block_feature_0, loop_block_feature_1, loop_block_feature_2, loop_block_feature_3]
// where loop_block_feature_i stores the features of some_compute_i (such
// as number of arithmetic operations)
//
// loop_block_feature_0.num_sub_loops = 2
// loop_block_feature_1.num_sub_loops = 1
// loop_block_feature_2.num_sub_loops = 0
// loop_block_feature_3.num_sub_loops = 0
std::vector<LoopBlockFeature> stack_encoded_feature_;
int current_loop_block_index_;
std::vector<int> parent_indices_;
common::Target target_;
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/feature_extractor.h"
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/optim/ir_copy.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
namespace cinn {
namespace auto_schedule {
using namespace ::cinn::ir;
FeatureExtractor::FeatureExtractor() {}
void FeatureExtractor::Visit(const Expr *x) { IRVisitor::Visit(x); }
Feature FeatureExtractor::Extract(const ir::ModuleExpr &mod_expr, const common::Target &target) {
feature_ = Feature(target);
for (const ir::Expr &e : mod_expr.GetExprs()) {
Visit(&e);
}
return feature_;
}
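// The node types below carry no feature signal themselves; the macro only
// recurses into their defined sub-expressions.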
#define VisitDoNothing(NodeType) \
void FeatureExtractor::Visit(const NodeType *x) { \
std::vector<const Expr *> sub_exprs = x->expr_fields(); \
for (const Expr *e : sub_exprs) { \
if (e->defined()) { \
Visit(e); \
} \
} \
}
VisitDoNothing(IntImm);
VisitDoNothing(UIntImm);
VisitDoNothing(FloatImm);
VisitDoNothing(StringImm);
VisitDoNothing(Block);
VisitDoNothing(_Module_);
VisitDoNothing(_Var_);
VisitDoNothing(_LoweredFunc_);
VisitDoNothing(ScheduleBlock);
VisitDoNothing(ScheduleBlockRealize);
VisitDoNothing(Ramp);
VisitDoNothing(_Buffer_);
VisitDoNothing(_BufferRange_);
#define NotVisitExprFields(NodeType) \
void FeatureExtractor::Visit(const NodeType *x) {}
NotVisitExprFields(_Tensor_)
#define VisitForDtypePattern(NodeType, member) \
void FeatureExtractor::Visit(const NodeType *x) { \
if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) { \
feature_.CurrentLoopBlock().float_##member += x->type().lanes(); \
} else { \
feature_.CurrentLoopBlock().int_##member += x->type().lanes(); \
} \
std::vector<const Expr *> sub_exprs = x->expr_fields(); \
for (const Expr *e : sub_exprs) { \
if (e->defined()) { \
Visit(e); \
} \
} \
}
VisitForDtypePattern(Add, add_or_sub);
VisitForDtypePattern(Sub, add_or_sub);
VisitForDtypePattern(Minus, add_or_sub);
VisitForDtypePattern(Mul, mul);
VisitForDtypePattern(Div, div_or_mod);
VisitForDtypePattern(Mod, div_or_mod);
VisitForDtypePattern(FracOp, div_or_mod);
VisitForDtypePattern(EQ, cmp);
VisitForDtypePattern(NE, cmp);
VisitForDtypePattern(GT, cmp);
VisitForDtypePattern(GE, cmp);
VisitForDtypePattern(LT, cmp);
VisitForDtypePattern(LE, cmp);
VisitForDtypePattern(Call, math_func);
VisitForDtypePattern(PrimitiveNode, math_func);
VisitForDtypePattern(Cast, other_call);
VisitForDtypePattern(Let, other_call);
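// Sum/Product are n-ary nodes: n operands imply n - 1 binary operations,
// hence the (operands().size() - 1) increment below.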
#define VisitForMultiOperandsDtypePattern(NodeType, member) \
void FeatureExtractor::Visit(const NodeType *x) { \
if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) { \
feature_.CurrentLoopBlock().float_##member += (x->operands().size() - 1); \
} else { \
feature_.CurrentLoopBlock().int_##member += (x->operands().size() - 1); \
} \
std::vector<const Expr *> sub_exprs = x->expr_fields(); \
for (const Expr *e : sub_exprs) { \
if (e->defined()) { \
Visit(e); \
} \
} \
}
VisitForMultiOperandsDtypePattern(Sum, add_or_sub);
VisitForMultiOperandsDtypePattern(Product, mul);
#define VisitCountMemberPattern(NodeType, member) \
void FeatureExtractor::Visit(const NodeType *x) { \
feature_.CurrentLoopBlock().member += 1; \
std::vector<const Expr *> sub_exprs = x->expr_fields(); \
for (const Expr *e : sub_exprs) { \
if (e->defined()) { \
Visit(e); \
} \
} \
}
VisitCountMemberPattern(And, bool_op);
VisitCountMemberPattern(Or, bool_op);
VisitCountMemberPattern(Not, bool_op);
VisitCountMemberPattern(Max, select_op);
VisitCountMemberPattern(Min, select_op);
VisitCountMemberPattern(IfThenElse, select_op);
VisitCountMemberPattern(Select, select_op);
VisitCountMemberPattern(Alloc, mem_alloc);
VisitCountMemberPattern(Free, mem_free);
VisitCountMemberPattern(Load, mem_read);
VisitCountMemberPattern(Store, mem_write);
/* Visit for loops */
void FeatureExtractor::Visit(const For *x) {
feature_.IntoLoopBlock();
LoopBlockFeature &loop_feature = feature_.CurrentLoopBlock();
if (x->min.is_constant() && x->extent.is_constant()) {
loop_feature.loop_length = (x->extent.get_constant() - x->min.get_constant());
} else {
loop_feature.loop_length = -1; // -1 represents unknown
}
if (x->is_parallel()) {
loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kParallel;
loop_feature.len_vthread = loop_feature.loop_length;
} else if (x->is_unrolled()) {
loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kUnroll;
} else if (x->is_vectorized()) {
loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kVectorize;
loop_feature.vectorize_factor = x->vectorize_info().factor;
} else if (x->is_binded()) {
loop_feature.loop_opt_type = ForOptimizeFeatureEnum::kGpuBind;
const BindInfo &bind_info = x->bind_info();
int offset = bind_info.offset;
if (bind_info.for_type == ForType::GPUBlock) {
if (offset == 0) {
loop_feature.len_blockIdx_x = loop_feature.loop_length;
} else if (offset == 1) {
loop_feature.len_blockIdx_y = loop_feature.loop_length;
} else if (offset == 2) {
loop_feature.len_blockIdx_z = loop_feature.loop_length;
}
} else if (bind_info.for_type == ForType::GPUThread) {
if (offset == 0) {
loop_feature.len_threadIdx_x = loop_feature.loop_length;
} else if (offset == 1) {
loop_feature.len_threadIdx_y = loop_feature.loop_length;
} else if (offset == 2) {
loop_feature.len_threadIdx_z = loop_feature.loop_length;
}
}
}
std::vector<const Expr *> sub_exprs = x->expr_fields();
for (const Expr *e : sub_exprs) {
Visit(e);
}
feature_.ExitLoopBlock();
}
void FeatureExtractor::Visit(const PolyFor *x) {
Expr copy = optim::IRCopy(Expr(x));
feature_.IntoLoopBlock();
optim::TransformPolyForToFor(&copy);
ir::For *loop = copy.As<For>();
CHECK(loop != nullptr);
Visit(loop);
feature_.ExitLoopBlock();
}
/* Visit for Reduce and Broadcast */
void FeatureExtractor::Visit(const Reduce *x) {
if (x->type() == common::F32() || x->type() == common::F16() || x->type() == common::F64()) {
switch (x->reduce_type) {
case Reduce::ReduceType::kSum:
feature_.CurrentLoopBlock().float_reduce_sum_or_sub += x->type().lanes();
break;
case Reduce::ReduceType::kSub:
feature_.CurrentLoopBlock().float_reduce_sum_or_sub += x->type().lanes();
break;
case Reduce::ReduceType::kDiv:
feature_.CurrentLoopBlock().float_reduce_div += x->type().lanes();
break;
case Reduce::ReduceType::kMul:
feature_.CurrentLoopBlock().float_reduce_mul += x->type().lanes();
break;
case Reduce::ReduceType::kMax:
feature_.CurrentLoopBlock().float_reduce_max_or_min += x->type().lanes();
break;
case Reduce::ReduceType::kMin:
feature_.CurrentLoopBlock().float_reduce_max_or_min += x->type().lanes();
break;
}
} else {
switch (x->reduce_type) {
case Reduce::ReduceType::kSum:
feature_.CurrentLoopBlock().int_reduce_sum_or_sub += x->type().lanes();
break;
case Reduce::ReduceType::kSub:
feature_.CurrentLoopBlock().int_reduce_sum_or_sub += x->type().lanes();
break;
case Reduce::ReduceType::kDiv:
feature_.CurrentLoopBlock().int_reduce_div += x->type().lanes();
break;
case Reduce::ReduceType::kMul:
feature_.CurrentLoopBlock().int_reduce_mul += x->type().lanes();
break;
case Reduce::ReduceType::kMax:
feature_.CurrentLoopBlock().int_reduce_max_or_min += x->type().lanes();
break;
case Reduce::ReduceType::kMin:
feature_.CurrentLoopBlock().int_reduce_max_or_min += x->type().lanes();
break;
}
}
std::vector<const Expr *> sub_exprs = x->expr_fields();
for (const Expr *e : sub_exprs) {
Visit(e);
}
}
VisitForDtypePattern(Broadcast, broadcast);
/* Visit for IntrinsicOp */
void FeatureExtractor::Visit(const IntrinsicOp *x) {
switch (x->getKind()) {
#define __(op__) \
case IntrinsicKind::k##op__: \
Visit(llvm::dyn_cast<intrinsics::op__>(x)); \
break;
INTRINSIC_KIND_FOR_EACH(__)
#undef __
}
}
VisitDoNothing(intrinsics::BufferGetDataHandle);
VisitDoNothing(intrinsics::BufferGetDataConstHandle);
VisitDoNothing(intrinsics::PodValueToX);
VisitDoNothing(intrinsics::BufferCreate);
VisitDoNothing(intrinsics::GetAddr);
VisitDoNothing(intrinsics::ArgsConstruct);
VisitForDtypePattern(intrinsics::BuiltinIntrin, other_call)
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/cinn/auto_schedule/cost_model/feature.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/ir/ir_visitor.h"
namespace cinn {
namespace auto_schedule {
class FeatureExtractor : public ir::IRVisitor {
public:
FeatureExtractor();
Feature Extract(const ir::ModuleExpr& mod_expr, const common::Target& target);
void Visit(const Expr* x) override;
#define __(op__) void Visit(const ir::op__* x) override;
NODETY_FORALL(__)
#undef __
#define __(op__) virtual void Visit(const ir::intrinsics::op__* x);
INTRINSIC_KIND_FOR_EACH(__)
#undef __
private:
Feature feature_;
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/feature_extractor.h"
#include <gtest/gtest.h>
#include <pybind11/embed.h>
#include <cmath>
#include <unordered_set>
#include <vector>
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/lang/builtin.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/poly/stage.h"
namespace cinn {
namespace auto_schedule {
TEST(FeatureExtractor, SimpleAssign) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
ir::Expr M(32);
ir::Expr N(32);
lang::Placeholder<float> A("A", {M, N});
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec("SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr to test: " << ast_expr;
std::vector<Expr> vec_ast{ast_expr};
ir::ModuleExpr mod_expr(vec_ast);
FeatureExtractor extractor;
Feature feature = extractor.Extract(mod_expr, target);
std::vector<float> to_check = feature.ToFixedSizeVector();
ASSERT_EQ(to_check.size(), static_cast<size_t>(LoopBlockFeature::kTotalSize + 1));
VLOG(6) << "Feature data before slog:";
for (size_t i = 0; i < to_check.size(); ++i) {
VLOG(6) << i << " " << (std::pow(2, to_check[i]) - 1);
if (i != 0 && i != 17 && i != 18 && i != 29) {
ASSERT_EQ(to_check[i], 0);
}
}
// target
#ifdef CINN_WITH_CUDA
ASSERT_EQ(to_check[0], 1);
#else
ASSERT_EQ(to_check[0], 0);
#endif
  // mem_read
  ASSERT_EQ(to_check[17], slog(M.get_constant() * N.get_constant()));
  // mem_write
  ASSERT_EQ(to_check[18], slog(M.get_constant() * N.get_constant()));
// non-opt loops, including root block
ASSERT_EQ(to_check[29], slog(3));
}
TEST(FeatureExtractor, MatrixMultiply) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
ir::Expr M(2);
ir::Expr N(2);
ir::Expr K(4);
lang::Placeholder<float> A("A", {M, K});
lang::Placeholder<float> B("B", {K, N});
ir::Var k(K.as_int32(), "reduce_axis_k");
ir::Tensor C = lang::Compute(
{M, N}, [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); }, "C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec("MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true);
std::vector<Expr> vec_ast{funcs[0]->body};
ir::ModuleExpr mod_expr(vec_ast);
ir::IRSchedule ir_sch(mod_expr);
std::vector<ir::Expr> blocks = ir_sch.GetAllBlocks();
std::vector<ir::Expr> loops = ir_sch.GetLoops(blocks[0]);
ir_sch.Bind(loops.back(), "threadIdx.x");
ir::Expr ast_expr = mod_expr.GetExprs()[0];
VLOG(6) << "Expr to test: " << ast_expr;
FeatureExtractor extractor;
Feature feature = extractor.Extract(mod_expr, target);
std::vector<float> to_check = feature.ToFixedSizeVector();
ASSERT_EQ(to_check.size(), static_cast<size_t>(LoopBlockFeature::kTotalSize + 1));
  std::unordered_set<size_t> non_zero_indices = {0, 1, 2, 17, 18, 29, 30, 37};
for (size_t i = 0; i < to_check.size(); ++i) {
VLOG(6) << i << " " << (std::pow(2, to_check[i]) - 1);
    if (!non_zero_indices.count(i)) {
ASSERT_EQ(to_check[i], 0);
}
}
// target
#ifdef CINN_WITH_CUDA
ASSERT_EQ(to_check[0], 1);
#else
ASSERT_EQ(to_check[0], 0);
#endif
float out_loop = M.get_constant() * N.get_constant();
float total_loop = out_loop * K.get_constant();
// float_mul
ASSERT_EQ(to_check[1], slog(total_loop));
// float_add_or_sub
ASSERT_EQ(to_check[2], slog(total_loop));
// mem_read
ASSERT_EQ(to_check[17], slog(total_loop * 3));
// mem_write
ASSERT_EQ(to_check[18], slog(total_loop + out_loop));
// non-opt loops, including root block
ASSERT_EQ(to_check[29], slog(3));
// GpuBind loop
ASSERT_EQ(to_check[30], slog(1));
// GpuBind loop
ASSERT_EQ(to_check[37], slog(out_loop));
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/feature.h"
#include <gtest/gtest.h>
#include <pybind11/embed.h>
namespace cinn {
namespace auto_schedule {
TEST(Feature, Basic) {
// TODO(zhhsplendid): add some basic tests
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h"
#include <dirent.h>
#include <glog/logging.h>
#include <pybind11/embed.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <atomic>
#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
#include <mutex>
#include <regex>
#include <string>
#include <utility>
#include <vector>
#include "paddle/cinn/common/python_interpreter_guard.h"
namespace cinn {
namespace auto_schedule {
std::atomic<int> XgbCostModel::xgb_cost_model_count_(0);
// Convert 1D vector to py numpy
template <typename Dtype>
pybind11::array VectorToNumpy(const std::vector<Dtype>& vec) {
return pybind11::array(pybind11::cast(vec));
}
// Convert 2D vector to py numpy
template <typename Dtype>
pybind11::array VectorToNumpy(const std::vector<std::vector<Dtype>>& vec) {
if (vec.size() == 0) {
return pybind11::array(pybind11::dtype::of<Dtype>(), {0, 0});
}
std::vector<size_t> shape{vec.size(), vec[0].size()};
pybind11::array ret(pybind11::dtype::of<Dtype>(), shape);
Dtype* py_data = static_cast<Dtype*>(ret.mutable_data());
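  // copy the source rows one by one into the contiguous 2-D numpy buffer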
for (size_t i = 0; i < vec.size(); ++i) {
assert(vec[i].size() == shape[1] && "Sub vectors must have same size in VectorToNumpy");
memcpy(py_data + (shape[1] * i), vec[i].data(), shape[1] * sizeof(Dtype));
}
return ret;
}
// The Pybind default Python interpreter doesn't contain some paths in
// sys.path, so we have to add them.
//
// Note: the Pybind default Python interpreter only uses the default Python.
// Something may go wrong when users use a virtual Python environment.
void AddDistPkgToPythonSysPath() {
pybind11::module sys_py_mod = pybind11::module::import("sys");
  // short version such as "3.7", "3.8", "3.10", ...; cut at the second dot so
  // that double-digit minor versions are handled correctly
  std::string py_version = sys_py_mod.attr("version").cast<std::string>();
  std::string py_short_version = py_version.substr(0, py_version.find('.', py_version.find('.') + 1));
std::string site_pkg_str = "/usr/local/lib/python" + py_short_version + "/dist-packages";
sys_py_mod.attr("path").attr("append")(site_pkg_str);
// TODO(zhhsplendid): warning to users if setuptools hasn't been installed
DIR* site_pkg_dir = opendir(site_pkg_str.c_str());
if (site_pkg_dir != nullptr) {
std::regex setuptool_regex("setuptools-.*-py" + py_short_version + "\\.egg");
struct dirent* entry = nullptr;
while ((entry = readdir(site_pkg_dir)) != nullptr) {
if (std::regex_match(entry->d_name, setuptool_regex)) {
sys_py_mod.attr("path").attr("append")(site_pkg_str + "/" + entry->d_name);
}
}
closedir(site_pkg_dir);
}
}
XgbCostModel::XgbCostModel() {
common::PythonInterpreterGuard::Guard();
int previous = xgb_cost_model_count_.fetch_add(1);
if (previous == 0) {
AddDistPkgToPythonSysPath();
}
xgb_module_ = pybind11::module::import("xgboost");
xgb_booster_ = xgb_module_.attr("Booster")();
}
void XgbCostModel::Train(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) {
update_samples_ = samples;
update_labels_ = labels;
pybind11::array np_samples = VectorToNumpy<float>(samples);
pybind11::array np_labels = VectorToNumpy<float>(labels);
pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples, np_labels);
xgb_booster_ = xgb_module_.attr("train")(pybind11::dict(), dmatrix, pybind11::int_(kTrainRound_));
}
std::vector<float> XgbCostModel::Predict(const std::vector<std::vector<float>>& samples) const {
pybind11::array np_samples = VectorToNumpy<float>(samples);
pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples);
pybind11::array py_result = xgb_booster_.attr("predict")(dmatrix);
return py_result.cast<std::vector<float>>();
}
void XgbCostModel::Update(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) {
update_samples_.insert(update_samples_.end(), samples.begin(), samples.end());
update_labels_.insert(update_labels_.end(), labels.begin(), labels.end());
pybind11::array np_samples = VectorToNumpy<float>(update_samples_);
pybind11::array np_labels = VectorToNumpy<float>(update_labels_);
pybind11::object dmatrix = xgb_module_.attr("DMatrix")(np_samples, np_labels);
xgb_booster_ = xgb_module_.attr("train")(pybind11::dict(), dmatrix, pybind11::int_(kTrainRound_));
}
void XgbCostModel::Save(const std::string& path) { xgb_booster_.attr("save_model")(pybind11::str(path)); }
void XgbCostModel::Load(const std::string& path) { xgb_booster_.attr("load_model")(pybind11::str(path)); }
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <pybind11/embed.h>
#include <atomic>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "paddle/cinn/common/cost_model.h"
namespace cinn {
namespace auto_schedule {
/**
* A C++ cost model which calls Python xgboost via pybind
*
 * Note: this class manages the Python interpreter lifetime internally.
 * If you have to call other Python functions outside of this class and run
 * into interpreter lifetime conflicts, you can check
 * cinn::common::PythonInterpreterGuard.
 *
 * For cinn::common::PythonInterpreterGuard, see:
 * cinn/common/python_interpreter_guard.h and .cc
*
* For pybind interpreter lifetime management, see:
*
* https://pybind11.readthedocs.io/en/stable/advanced/embedding.html#interpreter-lifetime
* https://pybind11.readthedocs.io/en/stable/reference.html#_CPPv422initialize_interpreterbiPPCKcb
*/
class XgbCostModel : public CostModel {
public:
XgbCostModel();
~XgbCostModel() = default;
void Train(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) override;
std::vector<float> Predict(const std::vector<std::vector<float>>& samples) const override;
void Update(const std::vector<std::vector<float>>& samples, const std::vector<float>& labels) override;
void Save(const std::string& path) override;
void Load(const std::string& path) override;
private:
// Python xgboost module
pybind11::module xgb_module_;
// Object points to Python xgb.Booster()
pybind11::object xgb_booster_;
// atomic int to handle python interpreter lifetime and package dependency
static std::atomic<int> xgb_cost_model_count_;
// Default train rounds
static constexpr int kTrainRound_ = 10;
std::vector<std::vector<float>> update_samples_;
std::vector<float> update_labels_;
};
} // namespace auto_schedule
} // namespace cinn
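// Usage sketch (added for illustration, not part of the original sources):
// how the XgbCostModel declared above might be driven end to end. The function
// name and file path are hypothetical; it assumes the xgboost Python package
// is importable from the embedded interpreter, as XgbCostModel itself requires.
inline void XgbCostModelUsageSketch() {
  cinn::auto_schedule::XgbCostModel model;
  std::vector<std::vector<float>> samples = {{1.f, 2.f}, {3.f, 4.f}};
  std::vector<float> labels = {0.5f, 1.5f};
  model.Train(samples, labels);                       // runs kTrainRound_ boosting rounds
  std::vector<float> preds = model.Predict(samples);  // one score per sample
  model.Save("/tmp/xgb_cost_model.bin");              // delegates to xgboost's Booster.save_model
}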
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/cost_model/xgb_cost_model.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <pybind11/embed.h>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <vector>
namespace cinn {
namespace auto_schedule {
TEST(CostModel, Basic) {
XgbCostModel cost_model;
srand(time(NULL));
int batch_size = 16;
int feature_size = 8;
std::vector<float> labels(batch_size, 1.0);
std::vector<std::vector<float>> samples(batch_size, std::vector<float>(feature_size));
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < feature_size; ++j) {
samples[i][j] = rand() % 10;
}
}
cost_model.Train(samples, labels);
std::vector<float> pred = cost_model.Predict(samples);
std::string path = "./test_cost_model.cpp_save_model";
cost_model.Save(path);
XgbCostModel load_cost_model;
load_cost_model.Load(path);
std::vector<float> load_pred = load_cost_model.Predict(samples);
ASSERT_EQ(pred.size(), load_pred.size());
for (size_t i = 0; i < pred.size(); ++i) {
ASSERT_FLOAT_EQ(pred[i], load_pred[i]);
VLOG(6) << "pred[" << i << "] = " << pred[i];
}
std::remove(path.c_str());
cost_model.Update(samples, labels);
pred = cost_model.Predict(samples);
for (size_t i = 0; i < pred.size(); ++i) {
VLOG(6) << "pred[" << i << "] = " << pred[i];
}
}
} // namespace auto_schedule
} // namespace cinn
core_gather_headers()
gather_srcs(cinnapi_src SRCS database.cc jsonfile_database.cc)
cinn_cc_test(test_database SRCS database_test.cc DEPS cinncore)
cinn_cc_test(test_jsonfile_database SRCS jsonfile_database_test.cc DEPS cinncore)
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/database/database.h"
#include <numeric>
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/json_util.h>
#include "paddle/cinn/auto_schedule/database/jsonfile_database.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/ir/schedule_desc.h"
namespace cinn {
namespace auto_schedule {
bool TuningRecord::Compare::operator()(const TuningRecord& lhs, const TuningRecord& rhs) const {
return lhs.execution_cost < rhs.execution_cost;
}
proto::TuningRecord TuningRecord::ToProto() const {
proto::TuningRecord record_proto;
record_proto.set_task_key(task_key);
record_proto.set_execution_cost(execution_cost);
record_proto.set_predicted_cost(predicted_cost);
record_proto.mutable_trace()->CopyFrom(trace);
return record_proto;
}
Database::Database(int capacity_per_task) : capacity_per_task_(capacity_per_task) {
CHECK_GT(capacity_per_task_, 0) << "capacity_per_task_ should be greater than 0";
}
std::unique_ptr<Database> Database::Make(const DatabaseConfig& config) {
if (config.type == DatabaseType::kMemory) {
return std::make_unique<Database>(config.capacity_per_task);
} else if (config.type == DatabaseType::kJSONFile) {
return std::make_unique<JSONFileDatabase>(config.capacity_per_task, config.record_file_path, true);
}
LOG(FATAL) << "Unimplemented database type.";
return nullptr;
}
void Database::Insert(const TuningRecord& record) {
auto& records = key2record_[record.task_key];
records.emplace(record);
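  // records are kept in ascending order of execution_cost (TuningRecord::Compare),
  // so evicting the last element drops the worst record once capacity is exceeded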
if (records.size() > capacity_per_task_) {
records.erase(std::prev(records.end()));
}
}
bool Database::AddRecord(const TuningRecord& record) {
CHECK(!record.task_key.empty()) << "task_key of TuningRecord can't be empty";
Insert(record);
return Commit(record);
}
std::vector<TuningRecord> Database::LookUp(const std::string& task_key) {
auto fit = key2record_.find(task_key);
if (fit == key2record_.end()) {
return {};
}
std::vector<TuningRecord> results;
results.reserve(fit->second.size());
results.assign(fit->second.begin(), fit->second.end());
return results;
}
std::vector<TuningRecord> Database::GetTopK(const std::string& task_key, int k) {
auto fit = key2record_.find(task_key);
if (fit == key2record_.end() || k <= 0) {
return {};
}
if (k > capacity_per_task_) {
LOG(WARNING) << "Top k=" << k << " is greater than the capacity, will adjust k=" << capacity_per_task_;
k = capacity_per_task_;
}
std::vector<TuningRecord> results;
results.reserve(k);
for (const TuningRecord& record : fit->second) {
results.emplace_back(record);
if (results.size() == k) {
break;
}
}
return results;
}
size_t Database::Size() {
  return std::accumulate(key2record_.begin(), key2record_.end(), size_t(0),
                         [](size_t res, const auto& kv) -> size_t { return res + kv.second.size(); });
}
size_t Database::Count(const std::string& task_key) {
auto fit = key2record_.find(task_key);
if (fit == key2record_.end()) {
return 0;
}
return fit->second.size();
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/cinn/auto_schedule/auto_schedule.pb.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/ir/schedule_desc.pb.h"
namespace cinn {
namespace auto_schedule {
// Record related data about tuning process of a measure candidate
struct TuningRecord {
// the unique key to identify a task
std::string task_key;
// the predicted cost of CostModel
float predicted_cost; // unit: us
// the ScheduleDesc of this tuning process
ir::proto::ScheduleDesc trace;
// the cost time of the candidate executed during measure
double execution_cost; // unit: us
TuningRecord() = default;
TuningRecord(const proto::TuningRecord& record)
: task_key(record.task_key()),
predicted_cost(record.predicted_cost()),
trace(record.trace()),
execution_cost(record.execution_cost()) {}
TuningRecord(const std::string& task_key, const SearchState& state, double execution_cost)
: task_key(task_key),
predicted_cost(state->predicted_cost),
trace(state->ir_schedule.GetTraceDesc().ToProto()),
execution_cost(execution_cost) {}
// convert to proto object
proto::TuningRecord ToProto() const;
// a binary compare function that denotes when the left
// will be sorted in the front of the right
struct Compare {
bool operator()(const TuningRecord& lhs, const TuningRecord& rhs) const;
};
};
enum class DatabaseType : int { kMemory, kJSONFile };
struct DatabaseConfig {
DatabaseType type = DatabaseType::kMemory;
int capacity_per_task = 2;
std::string record_file_path = "/tmp/tuning_record.json";
};
// A database that supports inserting and looking up historical tuning results
// with specified traits. It can be implemented with a concrete storage to
// save/load the underlying data, such as memory, a file or a database server;
// this base class can be regarded as one using memory as its underlying
// storage medium.
class Database {
public:
explicit Database(int capacity_per_task);
~Database() = default;
// Create a Database with the specific config
static std::unique_ptr<Database> Make(const DatabaseConfig& config);
// add a record into the database
bool AddRecord(const TuningRecord& record);
// return all records whose task_keys are equal to the specified key
std::vector<TuningRecord> LookUp(const std::string& task_key);
// return the states of the top k in sorted candidates
std::vector<TuningRecord> GetTopK(const std::string& task_key, int k);
// return the total number of stored candidates
size_t Size();
// return the number of stored candidates with specified key
size_t Count(const std::string& task_key);
protected:
// commit the newly added record into underlying storage
virtual bool Commit(const TuningRecord& record) { return true; }
// insert a newly added record into memory storage
void Insert(const TuningRecord& record);
// map task_key to its records
std::unordered_map<std::string, std::multiset<TuningRecord, TuningRecord::Compare>> key2record_;
// the max number of candidates stored
const int capacity_per_task_;
};
} // namespace auto_schedule
} // namespace cinn
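// Usage sketch (added for illustration, not part of the original sources):
// wiring the DatabaseConfig/Database::Make factory declared above. The
// function name, task key and config values are hypothetical.
inline void DatabaseUsageSketch() {
  using cinn::auto_schedule::Database;
  using cinn::auto_schedule::DatabaseConfig;
  using cinn::auto_schedule::DatabaseType;
  using cinn::auto_schedule::TuningRecord;
  DatabaseConfig config;
  config.type = DatabaseType::kJSONFile;  // persist each record as one JSON line
  config.capacity_per_task = 4;           // keep at most 4 records per task
  config.record_file_path = "/tmp/tuning_record.json";
  std::unique_ptr<Database> db = Database::Make(config);
  // After records are added via db->AddRecord(...), the two best
  // (lowest execution_cost) candidates of a task can be queried:
  std::vector<TuningRecord> best = db->GetTopK("hypothetical_task_key", 2);
  (void)best;
}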
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/database/database.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/cinn/auto_schedule/auto_schedule.pb.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/ir/ir_schedule.h"
namespace cinn {
namespace auto_schedule {
class TestDatabase : public ::testing::Test {
public:
TestDatabase() : test_db(2) {
auto state = SearchState(ir::IRSchedule());
test_db.AddRecord(TuningRecord("k1", state, 1.0));
test_db.AddRecord(TuningRecord("k2", state, 2.0));
test_db.AddRecord(TuningRecord("k2", state, 3.0));
test_db.AddRecord(TuningRecord("k3", state, 3.0));
test_db.AddRecord(TuningRecord("k3", state, 4.0));
test_db.AddRecord(TuningRecord("k3", state, 5.0));
test_db.AddRecord(TuningRecord("k4", state, 4.0));
}
void SetUp() override {}
Database test_db;
};
TEST_F(TestDatabase, Basic) {
ASSERT_EQ(test_db.Size(), 6);
auto records = test_db.LookUp("k3");
// check the max number of stored candidates will
// be restricted to capacity_per_task
ASSERT_EQ(test_db.Count("k3"), 2);
ASSERT_EQ(records.size(), 2);
EXPECT_EQ(records[0].execution_cost, 3.0);
EXPECT_EQ(records[1].execution_cost, 4.0);
}
TEST_F(TestDatabase, GetTopK) {
ASSERT_TRUE(test_db.GetTopK("k5", 2).empty());
ASSERT_EQ(test_db.GetTopK("k4", 3).size(), 1);
test_db.AddRecord(TuningRecord("k4", SearchState(ir::IRSchedule(), 1.2), 2.0));
test_db.AddRecord(TuningRecord("k4", SearchState(ir::IRSchedule(), 1.0), 3.0));
auto records = test_db.GetTopK("k4", 3);
ASSERT_EQ(records.size(), 2);
EXPECT_FLOAT_EQ(records[0].predicted_cost, 1.2);
EXPECT_FLOAT_EQ(records[1].predicted_cost, 1.0);
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/database/jsonfile_database.h"
#include <google/protobuf/message.h>
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/json_util.h>
#include <fstream>
#include "paddle/cinn/auto_schedule/auto_schedule.pb.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/utils/multi_threading.h"
namespace cinn {
namespace auto_schedule {
// append a line to file
void AppendLineToFile(const std::string& file_path, const std::string& line) {
std::ofstream os(file_path, std::ofstream::app);
CHECK(os.good()) << "Cannot open the file to write: " << file_path;
os << line << std::endl;
}
// read lines from a json file
std::vector<std::string> ReadLinesFromFile(const std::string& file_path, bool allow_new_file) {
std::ifstream is(file_path);
if (is.good()) {
std::vector<std::string> json_strs;
for (std::string str; std::getline(is, str);) {
json_strs.push_back(str);
}
return json_strs;
}
CHECK(allow_new_file) << "File doesn't exist: " << file_path;
std::ofstream os(file_path);
CHECK(os.good()) << "Cannot create new file: " << file_path;
return {};
}
JSONFileDatabase::JSONFileDatabase(int capacity_per_task, const std::string& record_file_path, bool allow_new_file)
: Database(capacity_per_task), record_file_path_(record_file_path) {
VLOG(3) << "Auto schedule will save/load tuning records on file:" << record_file_path;
auto json_lines = ReadLinesFromFile(record_file_path_, allow_new_file);
std::vector<cinn::auto_schedule::proto::TuningRecord> all_records_proto(json_lines.size());
// convert JSON string to proto object
auto worker_fn = [this, &json_lines, &all_records_proto](int index) {
cinn::auto_schedule::proto::TuningRecord record_proto;
auto status = google::protobuf::util::JsonStringToMessage(json_lines[index], &record_proto);
CHECK(status.ok()) << "Failed to parse JSON: " << json_lines[index];
all_records_proto[index].Swap(&record_proto);
};
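  // a non-positive thread count (-1 here) lets parallel_run pick the number of
  // worker threads itself (see cinn/utils/multi_threading.h)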
utils::parallel_run(worker_fn, utils::SequenceDispatcher(0, json_lines.size()), -1);
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (const auto& record_proto : all_records_proto) {
std::string task_key = record_proto.task_key();
if (task_registry->Has(task_key)) {
VLOG(4) << "Add a measured TuningRecord with task_key=" << task_key;
Insert(TuningRecord(record_proto));
}
}
}
// convert a TuningRecord object to string in JSON format
std::string JSONFileDatabase::RecordToJSON(const TuningRecord& record) {
proto::TuningRecord record_proto = record.ToProto();
std::string json_string;
auto status = google::protobuf::util::MessageToJsonString(record_proto, &json_string);
CHECK(status.ok()) << "Failed to serialize record to JSON, task key = " << record.task_key;
VLOG(4) << "json_string = \n" << json_string;
return json_string;
}
bool JSONFileDatabase::Commit(const TuningRecord& record) {
std::string json_string = RecordToJSON(record);
AppendLineToFile(record_file_path_, json_string);
return true;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/cinn/auto_schedule/database/database.h"
namespace cinn {
namespace auto_schedule {
// JSONFileDatabase is a database implemented on top of a JSON file to save/load the underlying data.
class JSONFileDatabase : public Database {
public:
/*!
* \brief Build a JSONFileDatabase object from a json file.
* \param capacity_per_task The max number of candidates stored.
* \param record_file_path The path of the json file.
* \param allow_new_file Whether to create new file when the given path is not found.
*/
JSONFileDatabase(int capacity_per_task, const std::string& record_file_path, bool allow_new_file);
~JSONFileDatabase() = default;
// convert a TuningRecord object to string in JSON format
std::string RecordToJSON(const TuningRecord& record);
protected:
// commit the newly added record into json file
bool Commit(const TuningRecord& record) override;
  // the path of the json file used to save tuning records.
std::string record_file_path_;
};
// append a line to file
void AppendLineToFile(const std::string& file_path, const std::string& line);
// read lines from a json file
std::vector<std::string> ReadLinesFromFile(const std::string& file_path, bool allow_new_file = true);
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/database/jsonfile_database.h"
#include <google/protobuf/util/message_differencer.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/ir_schedule.h"
#include "paddle/cinn/optim/ir_copy.h"
namespace cinn {
namespace auto_schedule {
// Return lowered IR ASTs for the example functions used in this test
std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape, const Target& target) {
  CHECK(shape.size() == 2) << "shape should be 2-D";
std::vector<Expr> domain;
for (auto i = 0; i < shape.size(); ++i) {
domain.emplace_back(shape[i]);
}
Placeholder<float> A("A", domain);
ir::Tensor B, C;
B = Compute(
domain, [&A](Var i, Var j) { return A(i, j); }, "B");
C = Compute(
domain, [&B](Var i, Var j) { return B(i, j); }, "C");
return cinn::lang::LowerVec("test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
}
// Create a new IRSchedule with copied ir::LoweredFunc AST
ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs, const std::string& task_key) {
std::vector<Expr> exprs;
for (auto&& func : lowered_funcs) {
exprs.emplace_back(optim::IRCopy(func->body));
}
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
task_registry->Regist(task_key, ir::ModuleExpr(exprs));
return ir::IRSchedule(ir::ModuleExpr(exprs));
}
class TestJSONFileDatabase : public ::testing::Test {
public:
TestJSONFileDatabase() : record_file_path("/tmp/test_record.json"), test_db(2, record_file_path, true) {}
void SetUp() override { lowered_funcs = LowerCompute({32, 32}, target); }
void TearDown() override {
auto isFileExists = [](const std::string& file_path) -> bool {
std::ifstream f(file_path.c_str());
return f.good();
};
if (isFileExists(record_file_path)) {
if (remove(record_file_path.c_str()) == 0) {
LOG(INFO) << "Successfully deleted file: " << record_file_path;
} else {
LOG(INFO) << "failed to delete file: " << record_file_path;
}
} else {
LOG(INFO) << "file: " << record_file_path << "does not exist.";
}
}
std::string record_file_path;
JSONFileDatabase test_db;
std::vector<ir::LoweredFunc> lowered_funcs;
Target target = common::DefaultHostTarget();
};
TEST_F(TestJSONFileDatabase, Serialize) {
ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs, "test");
auto fused = ir_sch.Fuse("B", {0, 1});
VLOG(3) << "after Fuse, Expr: " << fused;
TuningRecord record1("test", SearchState(std::move(ir_sch), 2.0), 1.0);
std::string str = test_db.RecordToJSON(record1);
VLOG(3) << "RecordToJSON: " << str;
  // Because protobuf serialization does not guarantee field order, we list all possible results.
std::string case1 =
"{\"taskKey\":\"test\",\"executionCost\":1,\"predictedCost\":2,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\","
"\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"loops_index\",\"dtype\":\"INTS\",\"ints\":[0,1]},{\"name\":\"block_"
"name\",\"dtype\":\"STRING\",\"s\":\"B\"}]}]}}";
std::string case2 =
"{\"taskKey\":\"test\",\"executionCost\":1,\"predictedCost\":2,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\","
"\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"block_name\",\"dtype\":\"STRING\",\"s\":\"B\"},{\"name\":\"loops_"
"index\",\"dtype\":\"INTS\",\"ints\":[0,1]}]}]}}";
EXPECT_EQ(true, str == case1 || str == case2);
}
TEST_F(TestJSONFileDatabase, SaveLoad) {
ir::IRSchedule ir_sch1 = MakeIRSchedule(lowered_funcs, "k1");
auto fused1 = ir_sch1.Fuse("B", {0, 1});
ir::IRSchedule ir_sch2 = MakeIRSchedule(lowered_funcs, "k2");
test_db.AddRecord(TuningRecord("k1", SearchState(std::move(ir_sch1), 1.5), 1.0));
test_db.AddRecord(TuningRecord("k2", SearchState(std::move(ir_sch2), 3.5), 3.0));
std::vector<std::string> strs = ReadLinesFromFile(record_file_path);
ASSERT_EQ(strs.size(), 2);
  // Because protobuf serialization does not guarantee field order, we list all possible results.
std::string case1 =
"{\"taskKey\":\"k1\",\"executionCost\":1,\"predictedCost\":1.5,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\","
"\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"loops_index\",\"dtype\":\"INTS\",\"ints\":[0,1]},{\"name\":\"block_"
"name\",\"dtype\":\"STRING\",\"s\":\"B\"}]}]}}";
std::string case2 =
"{\"taskKey\":\"k1\",\"executionCost\":1,\"predictedCost\":1.5,\"trace\":{\"steps\":[{\"type\":\"FuseWithName\","
"\"outputs\":[\"e0\"],\"attrs\":[{\"name\":\"block_name\",\"dtype\":\"STRING\",\"s\":\"B\"},{\"name\":\"loops_"
"index\",\"dtype\":\"INTS\",\"ints\":[0,1]}]}]}}";
EXPECT_EQ(true, strs[0] == case1 || strs[0] == case2);
EXPECT_EQ(strs[1], "{\"taskKey\":\"k2\",\"executionCost\":3,\"predictedCost\":3.5,\"trace\":{}}");
}
TEST_F(TestJSONFileDatabase, Basic) {
test_db.AddRecord(TuningRecord("k1", SearchState(MakeIRSchedule(lowered_funcs, "k1"), 1.0), 1.0));
test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0));
test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 3.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 8.0), 3.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 7.0), 4.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 6.0), 5.0));
test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.0), 4.0));
ASSERT_EQ(test_db.Size(), 6);
auto records = test_db.LookUp("k3");
// check the max number of stored candidates will
// be restricted to capacity_per_task
ASSERT_EQ(test_db.Count("k3"), 2);
ASSERT_EQ(records.size(), 2);
EXPECT_EQ(records[0].execution_cost, 3.0);
EXPECT_EQ(records[1].execution_cost, 4.0);
}
TEST_F(TestJSONFileDatabase, GetTopK) {
test_db.AddRecord(TuningRecord("k1", SearchState(MakeIRSchedule(lowered_funcs, "k1"), 1.0), 1.0));
test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0));
test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 3.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 3.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 4.0));
test_db.AddRecord(TuningRecord("k3", SearchState(MakeIRSchedule(lowered_funcs, "k3"), 1.0), 5.0));
test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 2.0), 4.0));
test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.2), 2.0));
test_db.AddRecord(TuningRecord("k4", SearchState(MakeIRSchedule(lowered_funcs, "k4"), 1.0), 3.0));
auto records = test_db.GetTopK("k4", 3);
ASSERT_EQ(records.size(), 2);
EXPECT_FLOAT_EQ(records[0].predicted_cost, 1.2);
EXPECT_FLOAT_EQ(records[1].predicted_cost, 1.0);
}
TEST_F(TestJSONFileDatabase, Reload) {
ir::IRSchedule ir_sch = MakeIRSchedule(lowered_funcs, "k1");
auto fused = ir_sch.Fuse("B", {0, 1});
test_db.AddRecord(TuningRecord("k1", SearchState(std::move(ir_sch), 1.0), 1.0));
test_db.AddRecord(TuningRecord("k2", SearchState(MakeIRSchedule(lowered_funcs, "k2"), 1.0), 2.0));
auto records = test_db.LookUp("k1");
ASSERT_EQ(records.size(), 1);
JSONFileDatabase new_db(2, record_file_path, false);
ASSERT_EQ(new_db.Size(), 2);
auto loaded_records = new_db.LookUp("k1");
ASSERT_EQ(records.size(), loaded_records.size());
EXPECT_EQ(records[0].task_key, loaded_records[0].task_key);
EXPECT_EQ(records[0].execution_cost, loaded_records[0].execution_cost);
EXPECT_EQ(records[0].predicted_cost, loaded_records[0].predicted_cost);
// check the equality of trace info between original TuningRecord and the loaded TuningRecord
const auto& lhs_trace = records[0].trace;
const auto& rhs_trace = loaded_records[0].trace;
google::protobuf::util::MessageDifferencer dif;
static const google::protobuf::Descriptor* descriptor = cinn::ir::proto::ScheduleDesc_Step::descriptor();
dif.TreatAsSet(descriptor->FindFieldByName("attrs"));
EXPECT_TRUE(dif.Compare(lhs_trace, rhs_trace));
// check the equality of module expr between original TuningRecord
// and the loaded TuningRecord by replaying with tracing ScheduleDesc
ir::IRSchedule lhs_sch = MakeIRSchedule(lowered_funcs, "k1");
ir::IRSchedule rhs_sch = MakeIRSchedule(lowered_funcs, "k1");
ir::ScheduleDesc::ReplayWithProto(lhs_trace, &lhs_sch);
ir::ScheduleDesc::ReplayWithProto(rhs_trace, &rhs_sch);
auto lhs_exprs = lhs_sch.GetModule().GetExprs();
auto rhs_exprs = rhs_sch.GetModule().GetExprs();
ASSERT_EQ(lhs_exprs.size(), rhs_exprs.size());
for (auto i = 0; i < lhs_exprs.size(); ++i) {
std::string lhs = utils::GetStreamCnt(lhs_exprs.at(i));
std::string rhs = utils::GetStreamCnt(rhs_exprs.at(i));
size_t remove_prefix_len = 28;
ASSERT_EQ(lhs.erase(0, remove_prefix_len), rhs.erase(0, remove_prefix_len));
}
}
} // namespace auto_schedule
} // namespace cinn
core_gather_headers()
gather_srcs(cinnapi_src SRCS schedule_measurer.cc simple_builder.cc simple_runner.cc)
cinn_cc_test(test_simple_runner SRCS simple_runner_test.cc DEPS cinncore)
cinn_cc_test(test_measurer SRCS measurer_test.cc DEPS cinncore)
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/instruction.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
namespace cinn {
namespace auto_schedule {
// The input to a measurer
struct MeasureInput {
// The task object related to this measurement.
const TuneTask* task;
// lowered Exprs to be measured
std::vector<ir::LoweredFunc> lowered_funcs;
  // Used to pass arguments whose values may be specified in advance;
  // defaults to null.
const std::map<std::string, cinn_pod_value_t>* execution_args = nullptr;
};
// The result of a measurement
struct MeasureResult {
  // The average execution time over a specified number of repeated runs.
double execution_cost = 0.0; // unit: us
// The time cost of the whole measurement process including
// building and running
double elapsed_time = 0.0; // unit: us
  // used to return detailed messages once an error occurs during measurement;
  // empty if nothing goes wrong
std::string error_msg;
};
// The result of building with input schedule
struct BuildResult {
  // The scope that owns detailed compilation info of parameters in the runtime program
const hlir::framework::Scope* compiled_scope;
// The executable program
std::unique_ptr<hlir::framework::Program> runtime_program;
};
// This interface defines how to generate executable objects
// from an input schedule. A builder should not hold stateful data
// related to any task, so that it can be called in parallel across
// multiple task-tuning processes.
class ScheduleBuilder {
public:
virtual BuildResult Build(const MeasureInput& input) = 0;
};
// This interface defines how to run the built result. Like ScheduleBuilder above,
// a runner should be implemented without being bound to a specific task.
class ScheduleRunner {
public:
virtual MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) = 0;
};
} // namespace auto_schedule
} // namespace cinn
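// Illustrative stub (added for illustration, not part of the original
// sources): the smallest ScheduleRunner satisfying the interface above. Real
// implementations, such as the SimpleRunner used by the tests, execute the
// built runtime_program and time it; this one only shows the contract.
namespace cinn {
namespace auto_schedule {
class NoopRunner : public ScheduleRunner {
 public:
  MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) override {
    (void)input;
    (void)build_result;
    MeasureResult result;
    result.execution_cost = 0.0;  // pretend execution is free
    return result;
  }
};
}  // namespace auto_schedule
}  // namespace cinn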
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <memory>
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include "paddle/cinn/auto_schedule/measure/simple_builder.h"
#include "paddle/cinn/auto_schedule/measure/simple_runner.h"
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/runtime/flags.h"
DECLARE_bool(cinn_ir_schedule);
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
frontend::Program CreateAddReluProgram() {
constexpr int M = 32;
constexpr int N = 24;
frontend::NetBuilder builder("test");
auto a = builder.CreateInput(Float(32), {M, N}, "A");
auto b = builder.CreateInput(Float(32), {M, N}, "B");
auto c = builder.Add(a, b);
auto d = builder.Relu(c);
return builder.Build();
}
class TestMeasurer : public ::testing::Test {
public:
std::unique_ptr<GraphCompiler> graph_compiler;
std::vector<TuneTask> tasks;
std::vector<MeasureInput> inputs;
void SetUp() override {
FLAGS_cinn_ir_schedule = true;
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
std::unordered_set<std::string> fetch_ids;
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
auto scope = BuildScope(target, graph);
graph_compiler = std::make_unique<GraphCompiler>(target, scope, graph);
TaskCreator task_creator;
tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
const auto& dtype_dict = graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>("inferdtype");
const auto& shape_dict = graph->GetAttrs<absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(dtype_dict, shape_dict, target);
inputs.reserve(tasks.size());
for (int i = 0; i < tasks.size(); ++i) {
auto* task = &tasks[i];
task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
MeasureInput input;
input.task = task;
input.lowered_funcs = task->lowered_funcs;
inputs.emplace_back(input);
}
}
};
class ThrowExceptionBuilder : public ScheduleBuilder {
struct Exception : public std::exception {
    const char* what() const noexcept override { return "BuildError"; }
};
BuildResult Build(const MeasureInput& input) override { throw Exception(); }
};
class ThrowExceptionRunner : public ScheduleRunner {
struct Exception : public std::exception {
    const char* what() const noexcept override { return "RunError"; }
};
MeasureResult Run(const MeasureInput& input, const BuildResult& build_result) override { throw Exception(); }
};
TEST_F(TestMeasurer, Basic) {
auto builder = std::make_unique<SimpleBuilder>(graph_compiler.get());
auto runner = std::make_unique<SimpleRunner>(1);
auto measurer = std::make_unique<ScheduleMeasurer>(builder.get(), runner.get());
std::vector<MeasureResult> results = measurer->Measure(inputs);
ASSERT_EQ(inputs.size(), results.size());
}
TEST_F(TestMeasurer, CatchException) {
auto builder = std::make_unique<SimpleBuilder>(graph_compiler.get());
auto runner = std::make_unique<SimpleRunner>(1);
auto throw_builder = std::make_unique<ThrowExceptionBuilder>();
auto throw_runner = std::make_unique<ThrowExceptionRunner>();
auto measurer_with_build_error = std::make_unique<ScheduleMeasurer>(throw_builder.get(), runner.get(), 2);
std::vector<MeasureResult> results = measurer_with_build_error->Measure(inputs);
ASSERT_EQ(inputs.size(), results.size());
EXPECT_EQ(results[0].error_msg, "Build failed, error: BuildError\n");
// TODO(CtfGo): test parallel build after we support thread-safe compilation
auto measurer_with_run_error = std::make_unique<ScheduleMeasurer>(builder.get(), throw_runner.get(), 1);
results = measurer_with_run_error->Measure(inputs);
ASSERT_EQ(inputs.size(), results.size());
EXPECT_EQ(results[0].error_msg, "Run failed, error: RunError\n");
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include <exception>
#include "paddle/cinn/utils/multi_threading.h"
namespace cinn {
namespace auto_schedule {
ScheduleMeasurer::ScheduleMeasurer(ScheduleBuilder* builder, ScheduleRunner* runner, int num_threads)
: builder_(builder), runner_(runner), num_threads_(num_threads) {}
std::vector<MeasureResult> ScheduleMeasurer::Measure(const std::vector<MeasureInput>& inputs) {
if (inputs.empty()) {
LOG(WARNING) << "inputs is empty";
return {};
}
std::vector<BuildResult> build_results(inputs.size());
std::vector<MeasureResult> results(inputs.size());
// define how to build a candidate with the specified index
auto build_fn = [builder = builder_, &inputs, &build_results, &results](int index) {
VLOG(6) << "Build candidate index: " << index;
auto m_start = std::chrono::steady_clock::now();
try {
build_results[index] = builder->Build(inputs[index]);
} catch (std::exception& e) {
results[index].error_msg = utils::StringFormat("Build failed, error: %s\n", e.what());
}
auto time_span = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - m_start);
results[index].elapsed_time += static_cast<double>(time_span.count());
};
// define how to run a candidate with the specified index
auto run_fn = [runner = runner_, &inputs, &build_results, &results](int index) {
VLOG(6) << "Run candidate index: " << index;
auto m_start = std::chrono::steady_clock::now();
try {
// if error occurred in building, then skip running
if (results[index].error_msg.empty()) {
results[index] = runner->Run(inputs[index], build_results[index]);
}
} catch (std::exception& e) {
results[index].error_msg = utils::StringFormat("Run failed, error: %s\n", e.what());
}
auto time_span = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - m_start);
results[index].elapsed_time += static_cast<double>(time_span.count());
};
// measure a candidate by calling build and run successively
auto measure_fn = [&build_fn, &run_fn](int index) {
build_fn(index);
run_fn(index);
};
  // the default num_threads_ is 1, in which case all measurements are performed sequentially in place.
utils::parallel_run(measure_fn, utils::SequenceDispatcher(0, inputs.size()), num_threads_);
VLOG(4) << "Measure " << inputs.size() << " candidates";
return results;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/cinn/auto_schedule/measure/measure.h"
namespace cinn {
namespace auto_schedule {
// Entrance of schedule measurement. It mainly includes two processes:
// building the input schedules and running the generated code.
class ScheduleMeasurer {
public:
ScheduleMeasurer(ScheduleBuilder* builder, ScheduleRunner* runner, int num_threads = 1);
// Measure a batch of inputs and return all results once.
std::vector<MeasureResult> Measure(const std::vector<MeasureInput>& inputs);
private:
// The handle to implemented ScheduleBuilder
ScheduleBuilder* builder_;
// The handle to implemented ScheduleRunner
ScheduleRunner* runner_;
// The number of threads used to perform measurement,
// if it is greater than 1 that means parallel measurement.
const int num_threads_;
};
} // namespace auto_schedule
} // namespace cinn
core_gather_headers()
gather_srcs(cinnapi_src SRCS cooperative_process.cc)
if(WITH_CUDA)
  cinn_nv_test(test_cooperative_process SRCS cooperative_process_test.cc DEPS
               cinncore auto_gen_rule_test_helper test_program_builder)
endif()
add_subdirectory(mutate_rule)
core_gather_headers()
gather_srcs(cinnapi_src SRCS evolutionary_search.cc)
cinn_cc_test(test_evolutionary_search SRCS evolutionary_search_test.cc DEPS cinncore test_program_builder)