diff --git a/python/akg/build_module.py b/python/akg/build_module.py
index 7a4f91bec0e21ca8e3b8f9248545972b5b36560c..6c593a3c0581abed1373822448657aa5329c9e49 100644
--- a/python/akg/build_module.py
+++ b/python/akg/build_module.py
@@ -80,9 +80,9 @@ def build_config(**kwargs):
 
 
 @vc_util.check_input_type(schedule.Schedule, (list, tuple), (list, tuple), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool, bool, bool)
+                          (dict, type(None)), (dict, type(None)), bool, bool, bool, str)
 def lower(sch, args, shape_params=None, name="default_function", binds=None, attrs=None,
-          simple_mode=False, polyhedral=False, tuning=False, aicpu=False):
+          simple_mode=False, polyhedral=False, tuning=False, target="cce"):
     """Lowering function."""
     tmp_binds = None
     if binds is not None:
@@ -96,7 +96,7 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
     cfg = _api_internal._GetCurrentBuildConfig()
     ret = _api_internal._Lower(sch, args, shape_params, name,
                                tmp_binds, tmp_attrs, simple_mode,
-                               polyhedral, tuning, aicpu, cfg)
+                               polyhedral, tuning, target, cfg)
 
     level = tmp_attrs.get("help_tiling")
     if tuning or (level is not None and level > help_tiling_level['None']):
@@ -116,9 +116,9 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
 
 
 @vc_util.check_input_type(schedule.Schedule, (list, tuple), (list, tuple, type(None)), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool)
+                          (dict, type(None)), (dict, type(None)), bool, str)
 def build_to_func(inputs, args, shape_params=None, name="default_function",
-                  binds=None, attrs=None, polyhedral=False, aicpu=False):
+                  binds=None, attrs=None, polyhedral=False, target="cce"):
     """Build module."""
     tmp_binds = None
     if binds is not None:
@@ -132,14 +132,21 @@ def build_to_func(inputs, args, shape_params=None, name="default_function",
         shape_params = []
     cfg = _api_internal._GetCurrentBuildConfig()
     return _api_internal._BuildToFunc(inputs, args, shape_params, name, tmp_binds, tmp_attrs,
-                                      polyhedral, aicpu, cfg)
+                                      polyhedral, target, cfg)
 
 @vc_util.check_input_type(schedule.Schedule, (list, tuple), (str, type(None)), (list, tuple), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool)
-def build(inputs, args, target=None, shape_params=None, name="default_function",
-          binds=None, attrs=None, polyhedral=False, aicpu=False):
+                          (dict, type(None)), (dict, type(None)), bool)
+def build(inputs, args, target='cce', shape_params=None, name="default_function",
+          binds=None, attrs=None, polyhedral=False):
+    """Lower ``inputs`` with build_to_func and build the result into a module.
+
+    ``target`` is forwarded both to build_to_func and to _BuildToModule. Passing
+    ``None`` is still accepted for backward compatibility and means ``'cce'``.
+    """
+    # Normalize before forwarding: build_to_func type-checks ``target`` as a plain str.
+    if target is None:
+        target = 'cce'
     tmp_rst = build_to_func(inputs, args, shape_params=shape_params, name=name, binds=binds,
-                            attrs=attrs, polyhedral=polyhedral, aicpu=aicpu)
+                            attrs=attrs, polyhedral=polyhedral, target=target)
 
-    tmp_target = target if target is not None else 'cce'
-    return _api_internal._BuildToModule(tmp_rst, tmp_target)
+    return _api_internal._BuildToModule(tmp_rst, target)
diff --git a/python/akg/ms/op_build.py b/python/akg/ms/op_build.py
index 1bb70b08cf14362839e63cef6dd16bc09c056f5a..0e1628960c810ed1c210ed864c56f3f58a2372d2 100644
--- a/python/akg/ms/op_build.py
+++ b/python/akg/ms/op_build.py
@@ -42,7 +42,6 @@ def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_na
         logging.error("Device %s is not in [aicore, aicpu].", device)
         return None
 
-    aicpu = device == "aicpu"
     polyhedral = True
     dump_ir = os.getenv(MS_AKG_DUMP_IR) == "on"
 
@@ -57,9 +56,9 @@ def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_na
             if attrs:
                 binds = attrs.pop(BINDS, None)
                 rst = akg.build_to_func(s, args, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
-                                        binds=binds, aicpu=aicpu)
+                                        binds=binds, target=device)
             else:
-                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, aicpu=aicpu)
+                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, target=device)
 
     except Exception:
         logging.error(traceback.format_exc())
diff --git a/python/akg/utils/kernel_exec.py b/python/akg/utils/kernel_exec.py
index 861107b74e71a964c446541c18a1cc2e83a87378..97ebf1616e34b427b4387141909bfb69a6d5f1d3 100644
--- a/python/akg/utils/kernel_exec.py
+++ b/python/akg/utils/kernel_exec.py
@@ -724,13 +724,14 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
         if TensorUtils.is_output_value(output):
             op_var = op_var + [output]
 
-    if sch_tmpl != None:
-        assert(sch_tmpl['target'] == 'cuda')
+    if sch_tmpl is not None:
+        if sch_tmpl['target'] != 'cuda':
+            raise ValueError("Only support cuda as target when using schedule template.")
         kernel_name = kernel_name if kernel_name != "" else sch_tmpl['op_name']
         with akg.tvm.target.cuda() as target:
             s = sch_tmpl['schedule'](sch_tmpl['output'])
-            with akg.tvm.build_config(dump_pass_ir = True):
-                mod = akg.tvm.build(s, op_var, target, target_host = 'stackvm', name = kernel_name)
+            with akg.build_config(dump_pass_ir=True):
+                mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds)
                 dump_cuda_meta.dump(mod, kernel_name, s, op_var)
                 return mod
 
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 15030ab426a94c898c2b3ffcbd4920cc7309e624..ece83690f468ca6dd6da96f3ceeeaec39fd6bc17 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -436,7 +436,7 @@ void FixParametricBinds(const Map<Tensor, Buffer> &binds, const Array<NodeRef> &
 
 NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
               const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
-              bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config) {
+              bool polyhedral, bool tuning, const std::string &target, const BuildConfig &config) {
   ir::TestExprCompuationSimplify();
   CHECK(sch.defined()) << "sch is not defined.";
   CHECK(!name.empty()) << "name is empty.";
@@ -486,6 +486,41 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   auto new_sch = sch.normalize();
   auto bounds = air::schedule::InferBound(new_sch);
   Stmt stmt = make_pass("schedule.ScheduleOps", new_sch, bounds, false);
+
+  if (target == "cuda") {
+    // Phase 1
+    stmt = NEXT_PASS(RewriteForTensorCore, stmt, new_sch, binds_0);
+    stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64, config->instrument_bound_checkers);
+    stmt = NEXT_PASS(CanonicalSimplify, stmt);
+
+    // Phase 2
+    if (!simple_mode) {
+      stmt = NEXT_PASS(LoopPartition, stmt, config->partition_const_loop);
+    }
+    if (config->disable_vectorize) {
+      stmt = NEXT_PASS(SkipVectorize, stmt);
+    } else {
+      stmt = NEXT_PASS(VectorizeLoop, stmt);
+    }
+    stmt = NEXT_PASS(InjectVirtualThread, stmt);
+    stmt = NEXT_PASS(InjectDoubleBuffer, stmt, config->double_buffer_split_loop);
+    stmt = NEXT_PASS(StorageRewrite, stmt);
+    stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth,
+                     config->auto_unroll_max_extent, config->unroll_explicit);
+
+    // Phase 3
+    stmt = NEXT_PASS(Simplify, stmt);
+    stmt = NEXT_PASS(RemoveNoOp, stmt);
+    if (config->instrument_bound_checkers) {
+      stmt = NEXT_PASS(InstrumentBoundCheckers, stmt);
+    }
+    if (simple_mode) {
+      return stmt;
+    }
+    LoweredFunc lowered_func = NEXT_PASS(MakeAPI, stmt, name, arg_list_0, 0, config->restricted_func);
+    return lowered_func;
+  }
+
   if (!polyhedral) {
     // for conv-matmul manual schedule
     stmt = NEXT_PASS(AutoMadPragmaAttr, stmt, true);
@@ -518,7 +553,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
   PassMgr::SetArgs(arg_list_0);
 
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(MathIntrinRewrite, stmt);
   }
 
@@ -527,7 +562,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   }
 
   // Phase 1
-  if (!aicpu && polyhedral) {
+  if (target != "aicpu" && polyhedral) {
     stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
     stmt = NEXT_PASS(CheckShapeParams, stmt, binds_0);
     stmt = NEXT_PASS(AlignPartitionCCE, stmt);
@@ -597,12 +632,13 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   }
 
   // micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails
-  bool need_micro_tuning = !aicpu && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
+  bool need_micro_tuning =
+    target != "aicpu" && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
   const int max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1);
   int enter_count = 0;
   Stmt stmt_before_poly = stmt;
   while (enter_count < max_enter_poly_times) {
-    if (!aicpu && polyhedral) {
+    if (target != "aicpu" && polyhedral) {
       Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic);
       enter_count++;
       CHECK_EQ(poly_res.size(), 2);
@@ -704,7 +740,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
     // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
     //                       4 : whether to partition convolution or not
-    if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
+    if (target != "aicpu" && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
       stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
     }
 
@@ -731,7 +767,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
       stmt = NEXT_PASS(FixLoopExtent, stmt);
     }
 
-    if (!aicpu) {
+    if (target != "aicpu") {
       stmt = NEXT_PASS(AutoPragma, stmt);
     }
     stmt = NEXT_PASS(EliminateAtomicDma, stmt);
@@ -741,7 +777,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
     if (is_dynamic) {
       stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
-                      global_attrs.GetIntAttr(kEnableScalarAlign, false));
+                       global_attrs.GetIntAttr(kEnableScalarAlign, false));
     } else {
       stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
       stmt = NEXT_PASS(OptimizePragma, stmt);
@@ -815,7 +851,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
       stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
     }
     stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
-    if (!aicpu) {
+    if (target != "aicpu") {
       stmt = NEXT_PASS(InjectPipe, stmt);
     }
     stmt = NEXT_PASS(ModDivEliminate, stmt);
@@ -853,7 +889,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
   stmt = NEXT_PASS(SpecialValueReplacer, stmt);
   stmt = NEXT_PASS(Simplify, stmt);
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(InjectSync, stmt);
   }
 
@@ -925,52 +961,70 @@ void BuildForDevice(const Array<LoweredFunc> &flist, const std::string &target_n
   TVMContext context{kDLCce, 0};
   DLDeviceType device_type = context.device_type;
+  if (target_name == "cuda") {
+    // BindDeviceType below binds the host functions to device_type; CUDA builds must bind to the GPU, not CCE.
+    context = {kDLGPU, 0};
+    device_type = context.device_type;
+  }
 
-  Array<LoweredFunc> out_flist_0;
+  Array<LoweredFunc> fhost;
   Array<LoweredFunc> fdevice;
-  for (const auto &func : flist) {
+  for (auto func : flist) {
     if (func->func_type == air::LoweredFuncType::kMixedFunc) {
+      if (target_name == "cuda") {
+        if (BuildConfig::Current()->detect_global_barrier) {
+          func = NEXT_PASS(ThreadSync, func, "global");
+        }
+        func = NEXT_PASS(ThreadSync, func, "shared");
+        func = NEXT_PASS(ThreadSync, func, "warp");
+        func = NEXT_PASS(InferFragment, func);
+        func = NEXT_PASS(LowerThreadAllreduce, func, target->thread_warp_size);
+      }
       Array<LoweredFunc> fsplits = NEXT_PASS(SplitHostDevice, func);
-      out_flist_0.push_back(fsplits[0]);
+      fhost.push_back(fsplits[0]);
       for (size_t idx = 1; idx < fsplits.size(); idx++) {
         fdevice.push_back(fsplits[idx]);
       }
     } else if (func->func_type == air::LoweredFuncType::kHostFunc) {
-      out_flist_0.push_back(func);
+      fhost.push_back(func);
     } else if (func->func_type == air::LoweredFuncType::kDeviceFunc) {
-      out_flist_0.push_back(func);
+      fdevice.push_back(func);
     } else {
       LOG(FATAL) << "unknown function type " << func->func_type;
     }
   }
 
-  Array<LoweredFunc> out_flist_1;
-  for (const auto &func : out_flist_0) {
-    LoweredFunc lowered_func = NEXT_PASS(BindDeviceType, func, static_cast<int>(device_type));
-    out_flist_1.push_back(lowered_func);
+  if (target_name == "cuda") {
+    for (size_t i = 0; i < fdevice.size(); ++i) {
+      fdevice.Set(i, NEXT_PASS(LowerWarpMemory, fdevice[i], target->thread_warp_size));
+    }
   }
-  Array<LoweredFunc> out_flist_2;
-  for (const auto &func : out_flist_1) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerTVMBuiltin, func);
-    out_flist_2.push_back(lowered_func);
+
+  for (size_t i = 0; i < fhost.size(); ++i) {
+    fhost.Set(i, NEXT_PASS(BindDeviceType, fhost[i], static_cast<int>(device_type)));
+    fhost.Set(i, NEXT_PASS(LowerTVMBuiltin, fhost[i]));
   }
 
   Target target_host = Target::Create(target_host_name);
-  Array<LoweredFunc> fdevice_0;
-  for (const auto &func : fdevice) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerIntrin, func, target->target_name);
-    fdevice_0.push_back(lowered_func);
+
+  for (size_t i = 0; i < fdevice.size(); ++i) {
+    if (target_name == "cuda") {
+      fdevice.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fdevice[i]));
+    }
+    fdevice.Set(i, NEXT_PASS(LowerIntrin, fdevice[i], target->target_name));
   }
 
-  Array<LoweredFunc> out_flist_3;
-  for (const auto &func : out_flist_2) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerIntrin, func, target_host->target_name);
-    out_flist_3.push_back(lowered_func);
+  for (size_t i = 0; i < fhost.size(); ++i) {
+    if (target_name == "cuda") {
+      fhost.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fhost[i]));
+    }
+    fhost.Set(i, NEXT_PASS(LowerIntrin, fhost[i], target_host->target_name));
+    fhost.Set(i, NEXT_PASS(CombineContextCall, fhost[i]));
   }
-  for (const auto &func : out_flist_3) {
-    LoweredFunc lowered_func = NEXT_PASS(CombineContextCall, func);
-    out_flist->push_back(lowered_func);
+
+  for (const auto &func : fhost) {
+    out_flist->push_back(func);
   }
-  *out_mdev = air::codegen::Build(fdevice_0, target_name, g_external_call_name);
+  *out_mdev = air::codegen::Build(fdevice, target_name, g_external_call_name);
   return;
 }
 
@@ -987,7 +1036,7 @@ TVM_REGISTER_NODE_TYPE(BuildRstNode);
 
 BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars,
                      const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
+                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
                      const BuildConfig &config) {
   CHECK(inputs.defined()) << "inputs is not defined.";
   CHECK(!name.empty()) << "name is empty.";
@@ -1005,7 +1054,7 @@ BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, cons
     attrs = in_attrs;
   }
 
-  auto rst = Lower(inputs, args, shape_vars, name, binds, attrs, false, polyhedral, false, aicpu, config);
+  auto rst = Lower(inputs, args, shape_vars, name, binds, attrs, false, polyhedral, false, target, config);
   return BuildRstNode::make(rst, name);
 }
 
@@ -1073,11 +1122,11 @@ air::runtime::Module BuildToModule(const NodeRef &ref, const std::string &target
 }
 
 air::runtime::Module BuildModule(const Schedule &inputs, const Array<NodeRef> &in_args,
-                                  const Array<NodeRef> &shape_vars, const std::string &target_name,
-                                  const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                                  const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
-                                  const BuildConfig &config) {
-  auto func = BuildToFunc(inputs, in_args, shape_vars, name, in_binds, in_attrs, polyhedral, aicpu, config);
+                                 const Array<NodeRef> &shape_vars, const std::string &target_name,
+                                 const std::string &name, const Map<Tensor, Buffer> &in_binds,
+                                 const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
+                                 const BuildConfig &config) {
+  auto func = BuildToFunc(inputs, in_args, shape_vars, name, in_binds, in_attrs, polyhedral, target, config);
 
   return BuildToModule(func, target_name);
 }
diff --git a/src/composite/composite.cc b/src/composite/composite.cc
index 8be07afe9c56d66c9173901e563e496e0a69f6e8..7ad315bed3f801e092b471c8398ff8947f114b99 100644
--- a/src/composite/composite.cc
+++ b/src/composite/composite.cc
@@ -454,7 +454,7 @@ NodeRef composite_with_json_to_func(const std::string &json_str, Map<std::string
   CHECK(config.defined());
   config->dump_pass_ir = akg_dump_pass_ir != nullptr;
   attrs.Set("pragma_reschedule", make_const(Int(32), 1));
-  auto build_rst = akg::BuildToFunc(sch, args, shape_vars, kernel_name, in_binds, attrs, true, false, config);
+  auto build_rst = akg::BuildToFunc(sch, args, shape_vars, kernel_name, in_binds, attrs, true, "cce", config);
   CHECK(build_rst.defined());
   return build_rst;
 }
@@ -519,7 +519,7 @@ NodeRef composite_lower(const std::string &json_str, Map<std::string, NodeRef> a
   akg::BuildConfig config = akg::BuildConfig::Current();
   CHECK(config.defined());
   bool tuning = attrs.find("tuning") != attrs.end();
-  return akg::Lower(sch, args, shape_vars, kernel_name, in_binds, attrs, false, true, tuning, false, config);
+  return akg::Lower(sch, args, shape_vars, kernel_name, in_binds, attrs, false, true, tuning, "cce", config);
 }
 
 TVM_REGISTER_GLOBAL("composite_with_json_to_func").set_body_typed(composite_with_json_to_func);
diff --git a/src/include/build_module.h b/src/include/build_module.h
index 2716748e1d6067d09e217233f9434e6926700d46..020b4e2d0cbffd8df3d1465d60bf08d3878dba79 100644
--- a/src/include/build_module.h
+++ b/src/include/build_module.h
@@ -47,19 +47,19 @@ class MemoryAllocationException : public std::exception {
 
 NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
               const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
-              bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config);
+              bool polyhedral, bool tuning, const std::string &target, const BuildConfig &config);
 
 air::runtime::Module BuildModule(const Schedule &inputs, const Array<NodeRef> &in_args,
                                   const Array<NodeRef> &shape_vars, const std::string &target_name,
                                   const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                                  const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
+                                  const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
                                   const BuildConfig &config);
 
 class BuildRst;
 
 BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars,
                      const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu, const BuildConfig &config);
+                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target, const BuildConfig &config);
 
 air::runtime::Module BuildToModule(const NodeRef &ref, const std::string &target_name = "cce");