diff --git a/README.md b/README.md
index cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1..c4c5decec54300b2fc08cd64461cf371b3d2ce91 100644
--- a/README.md
+++ b/README.md
@@ -14,9 +14,8 @@ English | [简体中文](./README_cn.md)
 
 Welcome to the PaddlePaddle GitHub.
 
-PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
-PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
-
+PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers, 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
 
 
 ## Installation
diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
index 7f21bcee87ab705097d3c2beaf799e5f2d93b833..2af407f711ec10097e5692d7e189cf847bcfeaf1 100644
--- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
+++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
   platform::CUDADeviceGuard gpuGuard;
   for (auto& place : places) {
     gpuGuard.SetDeviceIndex(place.GetDeviceId());
-    auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
+    auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
     barrierTensors.push_back(dt);
   }
   auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
@@ -417,7 +417,7 @@ void CheckTensorsInDifferentDevices(const std::vector<Tensor>& tensors,
   std::set<Place> used_devices;
 
   for (const auto& t : tensors) {
-    PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true,
+    PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true,
                       platform::errors::InvalidArgument(
                           "Tensors must be CUDA and dense tensor."));
 
diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc
index be4c5423943f5076201b75e307094c75d3d9c103..79961cca85ae03d654ea59b4e58c814dd2c4c543 100644
--- a/paddle/fluid/distributed/collective/reducer.cc
+++ b/paddle/fluid/distributed/collective/reducer.cc
@@ -321,7 +321,7 @@ EagerReducer::EagerReducer(
   if (find_unused_vars_each_step_) {
     global_used_vars_ = paddle::experimental::empty(
         ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
-        TransToBackend(inner_place_));
+        inner_place_);
   }
 }
 
@@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups(
     } else {
       // process the dense gradient.
       InitializeDenseGroups(tensor_indices_, &group);
-      // experimental::Backend backend =  TransToBackend(inner_place_);
       group.dense_contents_ = paddle::experimental::empty(
-          ScalarArray({group.all_length_}), group.dtype_,
-          TransToBackend(inner_place_));
+          ScalarArray({group.all_length_}), group.dtype_, inner_place_);
     }
 
     // map tensors to this group by VariableLocator
diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc
index b485beca57a214bc00cb813e9de6a53eca1e67ea..81ea92d1c3c4848ceb6ff3b333030a90eccc0b79 100644
--- a/paddle/fluid/eager/api/utils/tensor_utils.cc
+++ b/paddle/fluid/eager/api/utils/tensor_utils.cc
@@ -43,8 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue(
     const phi::DataType& dtype, const phi::DataLayout& layout, float value,
     bool is_leaf) {
   paddle::experimental::Tensor out = paddle::experimental::full(
-      phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype,
-      phi::TransToPhiBackend(place));
+      phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);
 
   auto meta = EagerUtils::autograd_meta(&out);
   if (is_leaf) {
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 5b7315e1ee7b1a2f67b1025bb4f47c0ea051f5d7..97114b9fe1189a9d9774eeb223e69926c3e11cbc 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -29,7 +29,7 @@ yaml_types_mapping = {
     'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t',  'size_t' : 'size_t', \
     'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
     'str' : 'std::string', \
-    'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
+    'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
     'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
     'Tensor' : 'Tensor',
     'Tensor[]' : 'std::vector<Tensor>',
@@ -56,6 +56,14 @@ def ParseArguments():
 #################
 ###  Helpers  ###
 #################
+def RecoverBaseNameOfInplaceFunction(function_name):
+    return function_name[:-1]
+
+
+def GetInplacedFunctionName(function_name):
+    return function_name + "_"
+
+
 def FindGradName(string):
     return string + "_grad"
 
@@ -149,6 +157,24 @@ def ReadBwdFile(filepath):
 ######################
 ###  Yaml Parsers  ###
 ######################
+def ParseInplaceInfo(string):
+    # string: "(x -> out0), (y -> out2)"
+    inplace_map = {}
+    for pair in string.split(","):
+        pair = pair.strip()
+        if pair.startswith("("):
+            pair = pair[1:]
+
+        if pair.endswith(")"):
+            pair = pair[:-1]
+
+        key = pair.split("->")[0].strip()
+        val = pair.split("->")[1].strip()
+        inplace_map[key] = val
+
+    return inplace_map
+
+
 def RemoveSpecialSymbolsInName(string):
     # Remove any name after '@'
     ret = string.split("@")[0]
@@ -684,9 +710,10 @@ std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std:
 
 def GenerateNodeCreationCodes(
         fwd_api_name, bwd_api_name, forward_inputs_position_map,
-        forward_outputs_position_map, forward_attrs_list,
+        forward_outputs_position_map, forward_attrs_list, forward_call_str,
         backward_fwd_input_map, backward_grad_input_map,
-        backward_grad_output_map, backward_attrs_list, optional_inputs):
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        inplace_map):
     # fwd_api_name = ""
     # forward_inputs_position_map = { "name" : [type, fwd_position] }
     # forward_outputs_position_map = { "name" : [type, fwd_position] }
@@ -724,19 +751,19 @@ def GenerateNodeCreationCodes(
         output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
         if num_fwd_outputs == 1:
             if IsPlainTensorType(rtype):
-                output_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
+                output_autograd_meta = f"        egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
             else:
                 assert IsVectorTensorType(rtype)
-                output_autograd_meta = f"    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
-                output_autograd_meta += f"    std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+                output_autograd_meta = f"        std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
+                output_autograd_meta += f"        std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
         else:
             # Tuple api_result
             if IsPlainTensorType(rtype):
-                output_autograd_meta = f"    egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
+                output_autograd_meta = f"        egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
             else:
                 assert IsVectorTensorType(rtype)
-                output_autograd_meta = f"    std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
-                output_autograd_meta += f"    std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
+                output_autograd_meta = f"        std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
+                output_autograd_meta += f"        std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
 
         outputs_autograd_meta_list.append(output_autograd_meta)
         pass_stop_gradient_args_list.append(output_autograd_meta_name)
@@ -745,16 +772,34 @@ def GenerateNodeCreationCodes(
     outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
     pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
 
+    # Check Inplace
+    check_inplace_str = ""
+    bump_inplace_version_str = ""
+    for inplace_name in inplace_map.keys():
+        inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
+        check_inplace_str += f"""
+    // Check Inplace
+    egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
+"""
+
+        bump_inplace_version_str += f"""
+    // Bump Inplace Version
+    {inplace_name}.bump_inplace_version();
+    VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n
+"""
+
     # Node Construction
     num_bwd_inputs = len(backward_grad_input_map.keys())
     num_bwd_outputs = len(backward_grad_output_map.keys())
-    grad_node_name = GetGradNodeName(fwd_api_name)
-    node_construction_str = f"        auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
+    grad_node_name = GetGradNodeName(
+        RecoverBaseNameOfInplaceFunction(
+            fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name)
+    node_construction_str = f"            auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
 
     # SetAttributes
     set_attributes_list = []
     for name, _, _, _ in backward_attrs_list:
-        set_attributes = f"        grad_node->SetAttribute{name}({name});"
+        set_attributes = f"            grad_node->SetAttribute{name}({name});"
         set_attributes_list.append(set_attributes)
     set_attributes_str = "\n".join(set_attributes_list)
 
@@ -765,9 +810,9 @@ def GenerateNodeCreationCodes(
 
         if is_fwd_input:
             if is_optional:
-                set_tensor_wrappers = f"        if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
+                set_tensor_wrappers = f"            if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
             else:
-                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({name}, true);"
+                set_tensor_wrappers = f"            grad_node->SetTensorWrapper{name}({name}, true);"
         else:
             if num_fwd_outputs > 1:
                 # Aligned with forward output position
@@ -778,9 +823,9 @@ def GenerateNodeCreationCodes(
                 tw_name = f"api_result"
 
             if is_optional:
-                set_tensor_wrappers = f"        if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
+                set_tensor_wrappers = f"            if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
             else:
-                set_tensor_wrappers = f"        grad_node->SetTensorWrapper{name}({tw_name}, false);"
+                set_tensor_wrappers = f"            grad_node->SetTensorWrapper{name}({tw_name}, false);"
         set_tensor_wrappers_list.append(set_tensor_wrappers)
     set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
 
@@ -789,8 +834,8 @@ def GenerateNodeCreationCodes(
     set_edges_list = []
     for name, (_, pos) in forward_inputs_position_map.items():
         input_autograd_meta_name = GetAutoGradMetaName(name)
-        set_grad_out_meta = f"        grad_node->SetGradOutMeta({name}, {pos});"
-        set_edges = f"        grad_node->AddEdges({input_autograd_meta_name}, {pos});"
+        set_grad_out_meta = f"            grad_node->SetGradOutMeta({name}, {pos});"
+        set_edges = f"            grad_node->AddEdges({input_autograd_meta_name}, {pos});"
         set_grad_out_meta_list.append(set_grad_out_meta)
         set_edges_list.append(set_edges)
     set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
@@ -804,14 +849,14 @@ def GenerateNodeCreationCodes(
     num_outputs = len(forward_outputs_position_map.keys())
     for name, (_, pos) in forward_outputs_position_map.items():
         output_autograd_meta_name = GetAutoGradMetaName(name)
-        set_out_rank = f"        egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
-        set_history = f"        egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
+        set_out_rank = f"            egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
+        set_history = f"            egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
         if num_outputs == 1:
-            set_retain_grad = f"        egr::EagerUtils::CheckAndRetainGrad(api_result);"
-            set_grad_in_meta = f"        grad_node->SetGradInMeta(api_result, {pos});"
+            set_retain_grad = f"            egr::EagerUtils::CheckAndRetainGrad(api_result);"
+            set_grad_in_meta = f"            grad_node->SetGradInMeta(api_result, {pos});"
         else:
-            set_retain_grad = f"        egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
-            set_grad_in_meta = f"        grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
+            set_retain_grad = f"            egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
+            set_grad_in_meta = f"            grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
 
         set_out_rank_list.append(set_out_rank)
         set_history_list.append(set_history)
@@ -823,55 +868,64 @@ def GenerateNodeCreationCodes(
     set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
     set_retain_grad_str = "\n".join(set_retain_grad_list)
 
+    node_event_name = fwd_api_name + " node_creation"
+    NODE_CREATION_TEMPLATE = """
+        paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
+        """
+    node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
+
     NODE_CREATION_TEMPLATE = """
 
     // Get AutoGradMeta
-{}
 {}
     bool trace_backward = egr::Controller::Instance().HasGrad();
-
     bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
-    if(require_any_grad) {{
-        egr::EagerUtils::PassStopGradient({});
-        
-        // Node Construction
 {}
-
-        // SetAttributes
+    // Forward API Call
+    {}
 {}
-
-        // SetTensorWrappers
+    {{
 {}
-
-        // SetGradOutMeta & SetEdges
 {}
+        if(require_any_grad) {{
+            egr::EagerUtils::PassStopGradient({});
+            
+            // Node Construction
 {}
-
-        // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+            // SetAttributes
 {}
+            // SetTensorWrappers
 {}
+            // SetGradOutMeta & SetEdges
 {}
 {}
-
+            // SetOutRank & SetHistory & SetGradInMeta & RetainGrad
+{}
+{}
+{}
+{}
+        }}
     }}
 
 """
     node_creation_str = NODE_CREATION_TEMPLATE.format(
-        inputs_autograd_meta_str, outputs_autograd_meta_str,
-        compute_require_grad_args_str, pass_stop_gradient_args_str,
-        node_construction_str, set_attributes_str, set_tensor_wrappers_str,
-        set_grad_out_meta_str, set_edges_str, set_out_rank_str, set_history_str,
-        set_grad_in_meta_str, set_retain_grad_str)
+        inputs_autograd_meta_str, compute_require_grad_args_str,
+        check_inplace_str, forward_call_str, bump_inplace_version_str,
+        node_creation_event_str, outputs_autograd_meta_str,
+        pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
+        set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
+        set_out_rank_str, set_history_str, set_grad_in_meta_str,
+        set_retain_grad_str)
 
     return node_creation_str
 
 
-def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
-                              forward_inputs_position_map,
-                              forward_outputs_position_map, forward_attrs_list,
-                              backward_fwd_input_map, backward_grad_input_map,
-                              backward_grad_output_map, backward_attrs_list,
-                              optional_inputs, intermediate_outputs):
+def GenerateForwardDefinition(
+        fwd_api_name, bwd_api_name, forward_inputs_position_map,
+        forward_outputs_position_map, forward_attrs_list,
+        backward_fwd_input_map, backward_grad_input_map,
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        intermediate_outputs, inplace_map):
     # fwd_api_name = ""
     # forward_inputs_position_map = { "name" : [type, fwd_position] }
     # forward_outputs_position_map = { "name" : [type, fwd_position] }
@@ -895,7 +949,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
             if is_optional:
                 arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
             else:
-                arg_str = f"const paddle::experimental::Tensor& {name}"
+                if inplace_map and name in inplace_map.keys():
+                    arg_str = f"paddle::experimental::Tensor& {name}"
+                else:
+                    arg_str = f"const paddle::experimental::Tensor& {name}"
         else:
             assert IsVectorTensorType(ttype)
             arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
@@ -958,26 +1015,16 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
 
     node_creation_str = GenerateNodeCreationCodes(
         fwd_api_name, bwd_api_name, forward_inputs_position_map,
-        forward_outputs_position_map, forward_attrs_list,
+        forward_outputs_position_map, forward_attrs_list, forward_call_str,
         backward_fwd_input_map, backward_grad_input_map,
-        backward_grad_output_map, backward_attrs_list, optional_inputs)
-
-    node_event_name = fwd_api_name + " node_creation"
-    NODE_CREATION_TEMPLATE = """{{\n
-           paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
-           {}\n
-        }}"""
-    node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name,
-                                                      node_creation_str)
+        backward_grad_output_map, backward_attrs_list, optional_inputs,
+        inplace_map)
 
     dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
 
     FORWARD_FUNCTION_TEMPLATE = """
 {} {}({}) {{
     {}
-
-    // Forward API Call
-    {}
     
 {}
 
@@ -989,7 +1036,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
     forward_function_name = GetForwardFunctionName(fwd_api_name)
     forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
         returns_type_str, forward_function_name, inputs_args_definition_str,
-        dygraph_event_str, forward_call_str, node_creation_str, returns_str)
+        dygraph_event_str, node_creation_str, returns_str)
     forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
 
     return forward_function_str, forward_function_declaration_str
@@ -1191,6 +1238,10 @@ if __name__ == "__main__":
             fwd_args_str = fwd_api['args']
             fwd_returns_str = fwd_api['output']
 
+            inplace_map = {}
+            if 'inplace' in fwd_api.keys():
+                inplace_map = ParseInplaceInfo(fwd_api['inplace'])
+
             bwd_api_name = fwd_api['backward']
             assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
             bwd_api = grad_api_dict[bwd_api_name]
@@ -1287,7 +1338,7 @@ if __name__ == "__main__":
                 forward_outputs_position_map, orig_forward_attrs_list,
                 backward_fwd_input_map, backward_grad_input_map,
                 backward_grad_output_map, backward_attrs_list, optional_inputs,
-                intermediate_outputs)
+                intermediate_outputs, {})
             print("Generated Forward Definition: ", forward_definition_str)
             print("Generated Forward Declaration: ", forward_declaration_str)
             yaml_forward_definition_str += definition_declaration_pair[0]
@@ -1298,6 +1349,30 @@ if __name__ == "__main__":
                                       forward_outputs_position_map,
                                       orig_forward_attrs_list)
 
+            # Inplaced Version Dygraph Function Generation
+            if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
+                fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
+
+                # Node Definition Generation
+                definition_declaration_pair = GenerateForwardDefinition(
+                    fwd_api_name_inplaced, bwd_api_name,
+                    forward_inputs_position_map, forward_outputs_position_map,
+                    forward_attrs_list, backward_fwd_input_map,
+                    backward_grad_input_map, backward_grad_output_map,
+                    backward_attrs_list, optional_inputs, intermediate_outputs,
+                    inplace_map)
+                print("Generated Inplaced Forward Definition: ",
+                      forward_definition_str)
+                print("Generated Inplaced Forward Declaration: ",
+                      forward_declaration_str)
+                forward_definition_str += definition_declaration_pair[0]
+                forward_declaration_str += definition_declaration_pair[1]
+
+                # For python-level API dispatch
+                CollectCoreOpsInformation(
+                    fwd_api_name_inplaced, forward_inputs_position_map,
+                    forward_outputs_position_map, forward_attrs_list)
+
         if len(namespace) > 0:
             forward_definition_str += f"""namespace {namespace} {{
     {yaml_forward_definition_str}
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index 203e0098a64d4aec79aff72454d1d11461ea068e..5a732212a5649b6899394b80399d3af32707197d 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -15,7 +15,7 @@
 import os
 import argparse
 import logging
-from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
+from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
 
 ###########################
 ## Global Configurations ##
@@ -43,7 +43,7 @@ atype_to_parsing_function = {
     "std::vector<std::string>": "CastPyArg2Strings",
     "paddle::experimental::Scalar": "CastPyArg2Scalar",
     "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray",
-    "paddle::experimental::Backend": "CastPyArg2Backend",
+    "paddle::experimental::Place": "CastPyArg2Place",
     "paddle::experimental::DataType": "CastPyArg2DataType",
 }
 
@@ -71,6 +71,14 @@ RECORD_EVENT_TEMPLATE = \
 "    paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
 
 
+RETURN_INPLACE_PYOBJECT_TEMPLATE = \
+"""
+    ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info, \"final_state_{}\", \"{}\");
+    ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info, \"final_state_{}\", \"{}\");
+    return ToPyObject(out, return_id, args, arg_id);
+"""
+
+
 PYTHON_C_FUNCTION_TEMPLATE = \
 """
 static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
@@ -94,7 +102,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
     
     PyEval_RestoreThread(tstate);
     tstate = nullptr;
-    return ToPyObject(out);
+{}
   }}
   catch(...) {{
     if (tstate) {{
@@ -287,9 +295,10 @@ class PythonCSingleFunctionGenerator:
         self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
             forward_inputs_list, forward_returns_list)
 
-    def GeneratePythonCFunction(self):
+    def GeneratePythonCFunction(self, inplace_map):
         namespace = self.namespace
-        forward_api_name = self.forward_api_name
+        forward_api_name = GetInplacedFunctionName(
+            self.forward_api_name) if inplace_map else self.forward_api_name
         forward_attrs_list = self.forward_attrs_list
         forward_inputs_position_map = self.forward_inputs_position_map
         forward_outputs_position_map = self.forward_outputs_position_map
@@ -341,19 +350,31 @@ class PythonCSingleFunctionGenerator:
             fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
                 "::", namespace, GetForwardFunctionName(forward_api_name))
 
+        if inplace_map:
+            assert len(
+                inplace_map
+            ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
+            for inplace_input, inplace_output in inplace_map.items():
+                return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
+                    forward_api_name, inplace_input, forward_api_name,
+                    inplace_output)
+                break
+        else:
+            return_str = "    return ToPyObject(out);"
+
         # Generate Record Event for performance profiling
         pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
             "pythonc_record_event", forward_api_name, "pybind_imperative_func")
         self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
             forward_api_name, pythonc_record_event_str, forward_api_name,
             get_eager_tensor_str, parse_attributes_str, fwd_function_name,
-            dygraph_function_call_str)
+            dygraph_function_call_str, return_str)
 
         # Generate Python-C Function Registration
         self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
             forward_api_name, namespace, forward_api_name, forward_api_name)
 
-    def run(self):
+    def run(self, inplace_map):
         # Initialized is_forward_only
         self.CollectIsForwardOnly()
 
@@ -384,7 +405,7 @@ class PythonCSingleFunctionGenerator:
         )
 
         # Code Generation
-        self.GeneratePythonCFunction()
+        self.GeneratePythonCFunction(inplace_map)
         logging.info(
             f"Generated Python-C Function: {self.python_c_function_str}")
         logging.info(
@@ -416,12 +437,23 @@ class PythonCYamlGenerator:
         for forward_api_content in forward_api_list:
             f_generator = PythonCSingleFunctionGenerator(forward_api_content,
                                                          namespace)
-            status = f_generator.run()
+            status = f_generator.run({})
 
             if status == True:
                 self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
                 self.python_c_functions_str += f_generator.python_c_function_str + "\n"
 
+            if 'inplace' in forward_api_content.keys():
+                inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
+
+                f_generator_inplace = PythonCSingleFunctionGenerator(
+                    forward_api_content, namespace)
+                status = f_generator_inplace.run(inplace_map)
+
+                if status == True:
+                    self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
+                    self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
+
     def InferNameSpace(self):
         yaml_path = self.yaml_path
         if "sparse" in yaml_path:
diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
index c8b2d22dcf95139db47704be86a6f64554f7c0ba..5fec38bf25a433542e09e0eec0cb52c61cff7167 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc
@@ -90,7 +90,7 @@ TEST(Tensor, MemberFunction) {
   VLOG(6) << "Set impl";
   CHECK_EQ(et3.initialized(), true);
   CHECK_EQ(et3.is_cpu(), true);
-  CHECK_EQ(et3.is_cuda(), false);
+  CHECK_EQ(et3.is_gpu(), false);
   CHECK_EQ(et3.numel(), 2);
   auto expected_dim = phi::make_ddim({1, 2});
   CHECK_EQ(et3.dims(), expected_dim);
diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc
index f7c006dbcb1a9a23ec619c8d790df8a093530eee..f67dea7402864b324756914e0ad23c7d87b8e867 100644
--- a/paddle/fluid/operators/graph_send_recv_op.cc
+++ b/paddle/fluid/operators/graph_send_recv_op.cc
@@ -38,7 +38,7 @@ class GraphSendRecvGradOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto in_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto in_dims = ctx->GetInputDim("X");
     ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
   }
 
@@ -68,6 +68,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
                          "tensors of Dst_index.")
         .SetDefault("SUM")
         .InEnum({"SUM", "MEAN", "MIN", "MAX"});
+    AddAttr<int64_t>(
+        "out_size",
+        "(int64_t, default 0)"
+        "Define the first dimension of Output tensor."
+        "If set default 0, then the shape of Out is the same with X.")
+        .SetDefault(0);
     AddComment(R"DOC(
 Graph Learning Send_Recv combine operator.
 
@@ -93,6 +99,7 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetType("graph_send_recv_grad");
     op->SetInput("Src_index", this->Input("Src_index"));
     op->SetInput("Dst_index", this->Input("Dst_index"));
+    op->SetInput("X", this->Input("X"));
 
     if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
       op->SetInput("Dst_count", this->Output("Dst_count"));
@@ -100,7 +107,6 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
 
     if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
         BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
-      op->SetInput("X", this->Input("X"));
       op->SetInput("Out", this->Output("Out"));
     }
 
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
index ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0..c08f1920205daa7f6d5d3f032afcf5d832230ced 100644
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -152,6 +152,21 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
   }
 }
 
+template <typename T>
+__device__ __forceinline__ void ReduceMeanAndVar(T* mean, T* var, T x_mean,
+                                                 T x_var, int size) {
+  const int nc = blockIdx.x;
+  x_mean = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
+      x_mean, kps::AddFunctor<T>());
+  x_var = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
+      x_var, kps::AddFunctor<T>());
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    mean[nc] = static_cast<T>(x_mean / size);
+    var[nc] = static_cast<T>(x_var / size);
+  }
+}
+
 template <typename T>
 __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
   int i = blockIdx.x;
@@ -162,10 +177,7 @@ __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
     x_mean += val;
     x_var += val * val;
   }
-  x_mean /= size;
-  x_var /= size;
-  CudaAtomicAddWithWarp(&mean[i], x_mean);
-  CudaAtomicAddWithWarp(&var[i], x_var);
+  ReduceMeanAndVar<T>(mean, var, x_mean, x_var, size);
 }
 
 template <typename T, typename AccT, int VecSize>
@@ -174,21 +186,12 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
   int i = blockIdx.x;
   AccT x_mean = static_cast<AccT>(0);
   AccT x_var = static_cast<AccT>(0);
-  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * size;
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   phi::Array<const T*, 1> ins;
   ins[0] = x;
   ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
-
-  x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      x_mean, kps::AddFunctor<AccT>());
-  x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      x_var, kps::AddFunctor<AccT>());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    mean[i] = static_cast<T>(x_mean / size);
-    var[i] = static_cast<T>(x_var / size);
-  }
+  ReduceMeanAndVar<AccT>(mean, var, x_mean, x_var, size);
 }
 
 template <typename T, int flags>
@@ -272,10 +275,6 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
     auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
     Tensor temp_var;
     temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
-
-    set_zero(dev_ctx, mean, static_cast<T>(0));
-    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
-
     auto* x_data = x->data<T>();
     auto* y_data = y->data<T>();
     auto* mean_data = mean->data<T>();
@@ -319,7 +318,7 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
       block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       dim3 grids(x_dims[0] * groups);
       dim3 blocks(block_size_nchw);
-      if (size < vec_size) {
+      if (size < vec_size * block_size_nchw) {
         ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
             x_data, mean_data, temp_var_data, size);
       } else {
@@ -328,6 +327,8 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
             x_data, mean_data, temp_var_data, size);
       }
     } else {
+      set_zero(dev_ctx, mean, static_cast<T>(0));
+      set_zero(dev_ctx, &temp_var, static_cast<T>(0));
       GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
           x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
           temp_var_data);
@@ -424,24 +425,15 @@ __global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
   int i = blockIdx.x;
   AccT ds_sum = static_cast<AccT>(0);
   AccT db_sum = static_cast<AccT>(0);
-  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
   x += i * imsize;
+  const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
 
   phi::Array<const T*, 2> ins;
   ins[0] = x;
   ins[1] = dy;
   ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
                                     &ds_sum);
-
-  ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      ds_sum, kps::AddFunctor<AccT>());
-  db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
-      db_sum, kps::AddFunctor<AccT>());
-  __syncthreads();
-  if (threadIdx.x == 0) {
-    ds[i] = ds_sum;
-    db[i] = db_sum;
-  }
+  ReduceMeanAndVar<AccT>(db, ds, db_sum, ds_sum, 1);
 }
 
 template <typename T>
@@ -455,8 +447,7 @@ __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
     ds_sum += dy[index] * x[index];
     db_sum += dy[index];
   }
-  CudaAtomicAddWithWarp(&ds[nc], ds_sum);
-  CudaAtomicAddWithWarp(&db[nc], db_sum);
+  ReduceMeanAndVar<T>(db, ds, db_sum, ds_sum, 1);
 }
 
 template <typename T>
@@ -641,13 +632,7 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
       }
       block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
       dim3 blocks(block_size_nchw);
-      if (imsize < vec_size) {
-        if (d_scale) {
-          set_zero(dev_ctx, d_scale, static_cast<T>(0));
-        }
-        if (d_bias) {
-          set_zero(dev_ctx, d_bias, static_cast<T>(0));
-        }
+      if (imsize < vec_size * block_size_nchw) {
         ScalarGetDsDbCUDAKernel<
             T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
             imsize, x_data, dy_data, ds_data, db_data);
@@ -687,7 +672,6 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
             imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
             dy_data, d_x_data);
       }
-
     } else {
       if (d_scale) {
         set_zero(dev_ctx, d_scale, static_cast<T>(0));
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 65cca94814e88111239aef3559285d6fe321a72d..ff1ddb4175feff0b059c7ba43e77b306197a11a0 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -265,67 +265,6 @@ class ReduceKernel : public framework::OpKernel<T> {
         framework::TransToPhiDataType(cast_out_dtype), output);
   }
 };
-template <typename DeviceContext, typename OutT, typename Functor>
-class BoolReduceKernel : public framework::OpKernel<OutT> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    bool reduce_all = context.Attr<bool>("reduce_all");
-    auto* input = context.Input<Tensor>("X");
-    auto* output = context.Output<Tensor>("Out");
-    output->mutable_data<OutT>(context.GetPlace());
-
-    auto dims = context.Attr<std::vector<int>>("dim");
-    bool keep_dim = context.Attr<bool>("keep_dim");
-
-    // The dims has full dim, set the reduce_all is True
-    const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
-    std::set<int> dims_set(dims.begin(), dims.end());
-    bool full_dim = true;
-    for (auto i = 0; i < input_dim_size; i++) {
-      if (dims_set.find(i) == dims_set.end()) {
-        full_dim = false;
-        break;
-      }
-    }
-    reduce_all = (reduce_all || full_dim);
-
-    if (reduce_all) {
-      // Flatten and reduce 1-D tensor
-      auto x = EigenVector<OutT>::Flatten(*input);
-      auto out = EigenScalar<OutT>::From(*output);
-      auto& place =
-          *context.template device_context<DeviceContext>().eigen_device();
-      auto reduce_dim = Eigen::array<int, 1>({{0}});
-      Functor functor;
-      functor(place, &x, &out, reduce_dim);
-    } else {
-      int ndim = input->dims().size();
-      int rdim = dims.size();
-      // comments for accelerating compiling temporarily.
-      if (ndim > 6) {
-        HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
-                                                     dims, keep_dim);
-      } else {
-        HANDLE_DIM(6, 5);
-        HANDLE_DIM(6, 4);
-        HANDLE_DIM(6, 3);
-        HANDLE_DIM(6, 2);
-        HANDLE_DIM(6, 1);
-        HANDLE_DIM(5, 4);
-        HANDLE_DIM(5, 3);
-        HANDLE_DIM(5, 2);
-        HANDLE_DIM(5, 1);
-        HANDLE_DIM(4, 3);
-        HANDLE_DIM(4, 2);
-        HANDLE_DIM(4, 1);
-        HANDLE_DIM(3, 2);
-        HANDLE_DIM(3, 1);
-        HANDLE_DIM(2, 1);
-        HANDLE_DIM(1, 1);
-      }
-    }
-  }
-};
 
 template <typename DeviceContext, typename T, typename Functor>
 void LaunchReduceGradKernel(const framework::ExecutionContext& context,
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 1052f93d32ec3cb626577c4b584cc6172c83da2e..75f0babdfe85b61ae2a7fef7f026dc924dc9e511 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -132,8 +132,7 @@ void InitTensorWithTensor(TensorObject* self,
     self->tensor.set_impl(impl);
     VLOG(4) << "Same place, do ShareDataWith";
   } else {
-    self->tensor.set_impl(
-        src.copy_to(phi::TransToPhiBackend(place), true).impl());
+    self->tensor.set_impl(src.copy_to(place, true).impl());
     VLOG(4) << "Different place, do TensorCopy";
   }
   if (src.get_autograd_meta()) {
@@ -156,8 +155,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
   } else {
     auto temp =
         paddle::experimental::Tensor(std::make_shared<phi::DenseTensor>(src));
-    self->tensor.set_impl(
-        temp.copy_to(phi::TransToPhiBackend(place), true).impl());
+    self->tensor.set_impl(temp.copy_to(place, true).impl());
     VLOG(4) << "Different place, do TensorCopy";
   }
   egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc
index 528bd75eb0013b95057d7549e083b2fa1318cac1..f3c48309e69fe8b40099b076f673e4ba8c8bcabd 100644
--- a/paddle/fluid/pybind/eager_functions.cc
+++ b/paddle/fluid/pybind/eager_functions.cc
@@ -159,7 +159,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
   auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2);
   bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
 
-  dst = src.copy_to(phi::TransToPhiBackend(place), blocking);
+  dst = src.copy_to(place, blocking);
   egr::EagerUtils::autograd_meta(&dst)->SetStopGradient(
       egr::EagerUtils::autograd_meta(&(src))->StopGradient());
   egr::EagerUtils::autograd_meta(&dst)->SetPersistable(
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index 49745e5679d9af3c8cf07ba0cac679217a691052..cce663e410ddb5d4d987e58d16d578ec85ee6142 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -175,7 +175,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
           pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
       nullptr);
 
-  if (self->tensor.is_cpu()) {
+  if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
     platform::CPUPlace place;
@@ -184,7 +184,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
                                     pybind11::detail::array_proxy(array)->data),
                          place, dense_tensor->data(), sizeof_dtype * numel);
 #if defined(PADDLE_WITH_CUDA)
-  } else if (self->tensor.is_cuda()) {
+  } else if (self->tensor.is_gpu()) {
     auto dense_tensor =
         std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
 
@@ -218,8 +218,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
   EAGER_TRY
   auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0);
   bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
-  auto cp_tensor =
-      self->tensor.copy_to(phi::TransToPhiBackend(place), blocking);
+  auto cp_tensor = self->tensor.copy_to(place, blocking);
   egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
   egr::EagerUtils::autograd_meta(&cp_tensor)
       ->SetPersistable(
@@ -231,8 +230,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
 static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
                                    PyObject* kwargs) {
   EAGER_TRY
-  auto cp_tensor =
-      self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true);
+  auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true);
   egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
   egr::EagerUtils::autograd_meta(&cp_tensor)
       ->SetPersistable(
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 24bb2c9e65cde6909d5f934eb41b0a363044dde9..2e884b212aff307d8f83815f8edac37cb884f3f0 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -929,28 +929,10 @@ std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
   return result;
 }
 
-paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
-                                                const std::string& op_type,
-                                                ssize_t arg_pos) {
-  if (obj == Py_None) {
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "%s(): argument (position %d) must be "
-        "int or place, but got %s",
-        op_type, arg_pos + 1,
-        ((PyTypeObject*)obj->ob_type)->tp_name));  // NOLINT
-  }
-
-  PyTypeObject* type = obj->ob_type;
-  auto type_name = std::string(type->tp_name);
-  if (type_name == "int") {
-    int value = CastPyArg2Int(obj, op_type, arg_pos);
-    return static_cast<paddle::experimental::Backend>(value);
-  } else {
-    platform::Place place = CastPyArg2Place(obj, arg_pos);
-    return phi::TransToPhiBackend(place);
-  }
-
-  return paddle::experimental::Backend::CPU;
+paddle::experimental::Place CastPyArg2Place(PyObject* obj,
+                                            const std::string& op_type,
+                                            ssize_t arg_pos) {
+  return CastPyArg2Place(obj, arg_pos);
 }
 
 paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index fba1485bcf44ea70db286225fbbe3c70caceb4bd..3500082ba645f35b0f9587985af63a311d12128f 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -154,9 +154,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
 paddle::experimental::ScalarArray CastPyArg2ScalarArray(
     PyObject* obj, const std::string& op_type, ssize_t arg_pos);
 
-paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
-                                                const std::string& op_type,
-                                                ssize_t arg_pos);
+paddle::experimental::Place CastPyArg2Place(PyObject* obj,
+                                            const std::string& op_type,
+                                            ssize_t arg_pos);
 
 paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
                                                   const std::string& op_type,
diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h
index c58ebe69523eb97a4357bea061de64e5e01ec181..6fab6643f398dca696640b184c76ec90dec342cd 100644
--- a/paddle/phi/api/include/tensor.h
+++ b/paddle/phi/api/include/tensor.h
@@ -31,7 +31,6 @@ using gpuStream_t = hipStream_t;
 
 #include "paddle/phi/api/ext/dll_decl.h"
 #include "paddle/phi/api/ext/place.h"
-#include "paddle/phi/common/backend.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/common/place.h"
@@ -269,12 +268,20 @@ class PADDLE_API Tensor final {
   bool is_cpu() const;
 
   /**
-   * @brief Determine whether the tensor device is CUDA
+   * @brief Determine whether the tensor device is GPU
    *
    * @return true
    * @return false
    */
-  bool is_cuda() const;
+  bool is_gpu() const;
+
+  /**
+   * @brief Determine whether the tensor device is GPU_PINNED
+   *
+   * @return true
+   * @return false
+   */
+  bool is_gpu_pinned() const;
 
   /* Part 4: Data Access methods */
 
@@ -407,11 +414,11 @@ class PADDLE_API Tensor final {
   /**
    * @brief Transfer the current Tensor to the specified device and return.
    *
-   * @param backend, The target backend of which the tensor will copy to.
+   * @param place, The target place of which the tensor will copy to.
    * @param blocking, Should we copy this in sync way.
    * @return Tensor
    */
-  Tensor copy_to(Backend backend, bool blocking) const;
+  Tensor copy_to(Place place, bool blocking) const;
 
   /**
    * @brief Transfer the source Tensor to current Tensor.
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index fc1afb26bf4143e5c75398b3dc1042581e1f1546..98f28ddcbdb3362cfbde9dc53176a78b9b952576 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
 #include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/meta_tensor.h"
 #include "paddle/phi/infermeta/binary.h"
@@ -31,9 +32,10 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
+Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
+  kernel_key_set.backend_set =
+      kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
   auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       "copy", kernel_key);
@@ -57,8 +59,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
                                     phi::DenseTensor*);
 
   auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-  (*kernel_fn)(
-      *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out);
+  (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
 
   return out;
 }
diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h
index 5acb68a3281332565d0b094a37fc8ee38c4904ab..48eda2d954647e8c2786b29b615cfc6166734bca 100644
--- a/paddle/phi/api/lib/api_custom_impl.h
+++ b/paddle/phi/api/lib/api_custom_impl.h
@@ -15,15 +15,14 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/api/include/tensor.h"
-#include "paddle/phi/common/backend.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 
 namespace paddle {
 namespace experimental {
 
-// TODO(chenweihang): Replace backend by place when place is ready
-Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking);
+Tensor copy_to_impl(const Tensor& x, Place place, bool blocking);
 
 std::vector<Tensor> split_impl(const Tensor& x,
                                const ScalarArray& num_or_sections,
diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h
index 88f7b086715d6535be1d5b30e02497c89026fb6f..2aa4f969221d9c0f8936e7fc7c3a41cc04a63454 100644
--- a/paddle/phi/api/lib/backend_set.h
+++ b/paddle/phi/api/lib/backend_set.h
@@ -35,7 +35,7 @@ class BackendSet final {
       : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
                                                        1)) {}
 
-  uint64_t bitset() const { return bitset_; }
+  inline uint64_t bitset() const { return bitset_; }
 
   bool inline Has(Backend b) const {
     PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED.");
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index 8bf5f3b481a0e041b439ffd99a8ac017f4aae50e..7d886e50dbc23ef20557956ae1ae9503fbefbb45 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -39,7 +39,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
                                const TransformFlag& transform_flag) {
   bool ret = transform_flag.need_trans_backend() &&
              target != Backend::ALL_BACKEND &&
-             !platform::is_same_place(input, phi::TransToPhiPlace(target));
+             phi::TransToPhiBackend(input) != target;
   return ret;
 }
 
@@ -180,21 +180,20 @@ std::shared_ptr<phi::DenseTensor> PrepareData(
     const phi::TensorArgDef& target_args_def,
     const TransformFlag& transform_flag) {
   const auto& tensor_in = input.impl();
-  VLOG(6) << tensor_in->dtype() << "\t" << target_args_def.dtype;
-  if (!transform_flag.NeedTransform() || !tensor_in->initialized() ||
+  phi::DenseTensor& dense_tensor =
+      *static_cast<phi::DenseTensor*>(tensor_in.get());
+  if (!transform_flag.NeedTransform() || !dense_tensor.initialized() ||
       (!NeedTransformPlace(
-           tensor_in->place(), target_args_def.backend, transform_flag) &&
+           dense_tensor.place(), target_args_def.backend, transform_flag) &&
        !NeedTransformDataType(
-           tensor_in->dtype(), target_args_def.dtype, transform_flag) &&
+           dense_tensor.dtype(), target_args_def.dtype, transform_flag) &&
        !NeedTransformLayout(
-           tensor_in->layout(), target_args_def.layout, transform_flag))) {
+           dense_tensor.layout(), target_args_def.layout, transform_flag))) {
     return std::static_pointer_cast<phi::DenseTensor>(tensor_in);
   }
 
   phi::DenseTensor out =
-      TransformData(*(static_cast<phi::DenseTensor*>(tensor_in.get())),
-                    target_args_def,
-                    transform_flag);
+      TransformData(dense_tensor, target_args_def, transform_flag);
   return std::make_shared<phi::DenseTensor>(std::move(out));
 }
 
diff --git a/paddle/phi/api/lib/data_type_set.h b/paddle/phi/api/lib/data_type_set.h
index ecc1b37c3a6af0d55f1cf71e860fd87657ed759b..4b5e6bde247002ca96a964f8739ea06a2036d580 100644
--- a/paddle/phi/api/lib/data_type_set.h
+++ b/paddle/phi/api/lib/data_type_set.h
@@ -30,7 +30,7 @@ class DataTypeSet final {
                     ? 0
                     : 1ULL << (static_cast<uint8_t>(dtype) - 1)) {}
 
-  uint64_t bitset() const { return bitset_; }
+  inline uint64_t bitset() const { return bitset_; }
 
   bool inline Has(DataType dtype) const {
     PD_CHECK(dtype != DataType::UNDEFINED,
diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc
index 5e334b9b727dc19f6bba7dc8c9c15b2414d13abb..c2f7a7981f001119608c276a643a68199d0df0c4 100644
--- a/paddle/phi/api/lib/kernel_dispatch.cc
+++ b/paddle/phi/api/lib/kernel_dispatch.cc
@@ -16,13 +16,16 @@ limitations under the License. */
 
 #include "paddle/phi/api/include/context_pool.h"
 #include "paddle/phi/core/compat/convert_utils.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
 
 namespace paddle {
 namespace experimental {
 namespace detail {
 
-BackendSet GetTensorBackendSet(const Tensor& t) {
-  BackendSet backend_set(phi::TransToPhiBackend(t.inner_place()));
+BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
+  BackendSet backend_set(phi::TransToPhiBackend(t.place()));
   switch (t.layout()) {
     case DataLayout::MKLDNN:
       backend_set = backend_set | BackendSet(Backend::MKLDNN);
@@ -35,6 +38,11 @@ BackendSet GetTensorBackendSet(const Tensor& t) {
 }
 
 std::size_t CountLeadingZeros(uint64_t val) {
+#if defined(__clang__) || defined(__GNUC__)
+  return __builtin_clzl(val);
+#elif defined(_MSC_VER)
+  return __lzcnt64(val);
+#else
   if (val == 0) {
     return 64;
   }
@@ -48,6 +56,7 @@ std::size_t CountLeadingZeros(uint64_t val) {
     }
   }
   return zero_bits;
+#endif
 }
 
 }  // namespace detail
@@ -82,13 +91,17 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) {
   return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor);
 }
 
-Backend ParseBackend(Backend backend) { return backend; }
+Backend ParseBackend(const Place& place) {
+  return phi::TransToPhiBackend(place);
+}
 Backend ParseBackend(const Tensor& tensor) {
   return phi::TransToPhiBackend(tensor.inner_place());
 }
 
-Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) {
-  return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor);
+Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) {
+  return place.GetType() != phi::AllocationType::UNDEFINED
+             ? ParseBackend(place)
+             : ParseBackend(tensor);
 }
 
 DataLayout ParseLayout(DataLayout layout) { return layout; }
diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h
index 9a09bc2183ad73857d5afee8909d957e65c5a664..25b74e7fe31b955a8b25529871c503179b302629 100644
--- a/paddle/phi/api/lib/kernel_dispatch.h
+++ b/paddle/phi/api/lib/kernel_dispatch.h
@@ -33,7 +33,7 @@ namespace paddle {
 namespace experimental {
 
 namespace detail {
-BackendSet GetTensorBackendSet(const Tensor& t);
+BackendSet GetTensorBackendSet(const phi::TensorBase& t);
 std::size_t CountLeadingZeros(uint64_t val);
 }  // namespace detail
 
@@ -93,11 +93,13 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   // TODO(chenweihang): deal with multiple diff input Tensors
   // TODO(chenweihang): add global device guard method to set backend
   void operator()(const Tensor& x) {
-    key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
-    // TODO(chenweihang): selecte multi layout and dtype
-    key_set.layout = x.layout();
-    key_set.dtype = x.type();
-    dtype_set = dtype_set | DataTypeSet(x.dtype());
+    const phi::TensorBase& tensor = *x.impl();
+    key_set.backend_set =
+        key_set.backend_set | detail::GetTensorBackendSet(tensor);
+    // TODO(chenweihang): select multi layout and dtype
+    key_set.layout = tensor.layout();
+    key_set.dtype = tensor.dtype();
+    dtype_set = dtype_set | DataTypeSet(key_set.dtype);
     auto promote_result = PromoteTypes(dtype_set);
     if (promote_result != DataType::UNDEFINED) {
       key_set.dtype = promote_result;
@@ -105,11 +107,12 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
   }
 
   void operator()(const std::vector<Tensor>& x) {
+    const phi::TensorBase& tensor = *x.at(0).impl();
     key_set.backend_set =
-        key_set.backend_set | detail::GetTensorBackendSet(x[0]);
-    // TODO(chenweihang): selecte multi layout and dtype
-    key_set.layout = x[0].layout();
-    key_set.dtype = x[0].type();
+        key_set.backend_set | detail::GetTensorBackendSet(tensor);
+    // TODO(chenweihang): select multi layout and dtype
+    key_set.layout = tensor.layout();
+    key_set.dtype = tensor.dtype();
   }
 
   // skip other type args, these args don't used in kernel selection
@@ -154,7 +157,7 @@ DataType ParseDataType(const Tensor& tensor);
 DataType ParseDataType(const std::vector<Tensor>& tensors);
 DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor);
 
-Backend ParseBackend(Backend backend);
+Backend ParseBackend(const Place& place);
 Backend ParseBackend(const Tensor& tensor);
 template <typename T, typename... Args>
 Backend ParseBackend(T t, Args... args) {
@@ -163,7 +166,7 @@ Backend ParseBackend(T t, Args... args) {
   return static_cast<Backend>(64 -
                               detail::CountLeadingZeros(backend_set.bitset()));
 }
-Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor);
+Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor);
 
 DataLayout ParseLayout(DataLayout layout);
 DataLayout ParseLayout(const Tensor& tensor);
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index 066287d4244797e316a34d524eca171e94afcfdf..b9b6ca36f673b03ba0b548424711833f21e612a9 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -163,10 +163,14 @@ bool Tensor::is_cpu() const {
   return paddle::platform::is_cpu_place(inner_place());
 }
 
-bool Tensor::is_cuda() const {
+bool Tensor::is_gpu() const {
   return paddle::platform::is_gpu_place(inner_place());
 }
 
+bool Tensor::is_gpu_pinned() const {
+  return paddle::platform::is_cuda_pinned_place(inner_place());
+}
+
 /* Part 4: Data Access methods */
 
 template <typename T>
diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc
index cc797507e68ec11005d6ac35d5dca2d19418598d..c6214052f7bc30ef8bf801fe20171256d8d0b142 100644
--- a/paddle/phi/api/lib/tensor_method.cc
+++ b/paddle/phi/api/lib/tensor_method.cc
@@ -27,14 +27,14 @@ namespace paddle {
 namespace experimental {
 // declare cast api
 Tensor cast(const Tensor &x, DataType out_dtype);
-Tensor copy_to(const Tensor &x, Backend backend, bool blocking);
+Tensor copy_to(const Tensor &x, Place place, bool blocking);
 
 Tensor Tensor::cast(DataType target_type) const {
   return experimental::cast(*this, target_type);
 }
 
-Tensor Tensor::copy_to(Backend backend, bool blocking) const {
-  return experimental::copy_to(*this, backend, blocking);
+Tensor Tensor::copy_to(Place place, bool blocking) const {
+  return experimental::copy_to(*this, place, blocking);
 }
 
 template <typename T>
@@ -44,7 +44,7 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
                   "`copy_to` method without template argument instead. "
                   "reason: copying a Tensor to another device does not need "
                   "to specify the data type template argument.";
-  return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false);
+  return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false);
 }
 
 template PADDLE_API Tensor
diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h
index 53ddd499a7e24e09ded721619407b8679cab7628..4c6d47597bd2cc12c397794eb575a74960542a6d 100644
--- a/paddle/phi/common/place.h
+++ b/paddle/phi/common/place.h
@@ -203,5 +203,10 @@ namespace paddle {
 namespace experimental {
 using AllocationType = phi::AllocationType;
 using Place = phi::Place;
+using CPUPlace = phi::CPUPlace;
+using GPUPlace = phi::GPUPlace;
+using GPUPinnedPlace = phi::GPUPinnedPlace;
+using XPUPlace = phi::XPUPlace;
+using NPUPlace = phi::NPUPlace;
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc
index 67245f1da5a6b65cdf81d2cd329d7d6b3828852e..667cee10675d8a6b756a6af701b65c63b3b359de 100644
--- a/paddle/phi/core/compat/convert_utils.cc
+++ b/paddle/phi/core/compat/convert_utils.cc
@@ -26,13 +26,14 @@ limitations under the License. */
 namespace phi {
 
 Backend TransToPhiBackend(const phi::Place& place) {
-  if (place.GetType() == phi::AllocationType::CPU) {
+  auto allocation_type = place.GetType();
+  if (allocation_type == phi::AllocationType::CPU) {
     return Backend::CPU;
-  } else if (place.GetType() == phi::AllocationType::GPU) {
+  } else if (allocation_type == phi::AllocationType::GPU) {
     return Backend::GPU;
-  } else if (place.GetType() == phi::AllocationType::XPU) {
+  } else if (allocation_type == phi::AllocationType::XPU) {
     return Backend::XPU;
-  } else if (place.GetType() == phi::AllocationType::CUSTOM) {
+  } else if (allocation_type == phi::AllocationType::CUSTOM) {
     return static_cast<Backend>(
         static_cast<size_t>(Backend::NUM_BACKENDS) +
         GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc
index 556fb874470dd248dcf7c77e8a8ac3510bd6f63e..a72b8d913f81d721bb02116b84306fff401a8873 100644
--- a/paddle/phi/infermeta/ternary.cc
+++ b/paddle/phi/infermeta/ternary.cc
@@ -145,6 +145,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                             const MetaTensor& src_index,
                             const MetaTensor& dst_index,
                             const std::string& pool_type,
+                            int64_t out_size,
                             MetaTensor* out,
                             MetaTensor* dst_count) {
   auto src_index_dims = src_index.dims();
@@ -187,11 +188,23 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                         "Src_index and Dst_index should have the same shape."));
 
   auto dims = x.dims();
-  out->set_dims(dims);
+  if (out_size <= 0) {
+    out->set_dims(dims);
+  } else {
+    std::vector<int64_t> dims_ = phi::vectorize(dims);
+    if (dims_.size() > 0) {
+      dims_[0] = out_size;
+    }
+    out->set_dims(phi::make_ddim(dims_));
+  }
   out->set_dtype(x.dtype());
 
   if (pool_type == "MEAN") {
-    dst_count->set_dims({dims[0]});
+    if (out_size <= 0) {
+      dst_count->set_dims({dims[0]});
+    } else {
+      dst_count->set_dims({out_size});
+    }
     dst_count->set_dtype(DataType::INT32);
   }
 }
diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h
index 42a0f35dc1d8d6aef13b631d355a4cee951a4ed1..8521a1ee855c66aa0f2adcf22dcdbe2ca7218cbe 100644
--- a/paddle/phi/infermeta/ternary.h
+++ b/paddle/phi/infermeta/ternary.h
@@ -51,6 +51,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
                             const MetaTensor& src_index,
                             const MetaTensor& dst_index,
                             const std::string& pool_type,
+                            int64_t out_size,
                             MetaTensor* out,
                             MetaTensor* dst_count);
 
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
index 8538461b1b83b887223224bac231600221547f11..6a83cee1ae40d1dec6c3d7d2f4499d7eb71390f9 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc
@@ -23,15 +23,14 @@
 namespace phi {
 
 template <typename T, typename IndexT, typename Functor>
-void GraphSendRecvCpuGradLoop(const int& input_size,
-                              const int& index_size,
+void GraphSendRecvCpuGradLoop(const int& index_size,
                               const IndexT* s_index,
                               const IndexT* d_index,
                               const DenseTensor& src,
+                              const DenseTensor& input,
                               DenseTensor* dst,
                               const std::string& pool_type,
                               const int* dst_count = nullptr,
-                              const DenseTensor* input = nullptr,
                               const DenseTensor* output = nullptr) {
   if (pool_type == "SUM") {
     Functor functor;
@@ -55,7 +54,7 @@ void GraphSendRecvCpuGradLoop(const int& input_size,
     for (int i = 0; i < index_size; ++i) {
       const IndexT& forward_src_idx = d_index[i];
       const IndexT& forward_dst_idx = s_index[i];
-      auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
+      auto input_slice = input.Slice(forward_src_idx, forward_src_idx + 1);
       auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
       auto eigen_input = phi::EigenVector<T>::Flatten(input_slice);
       auto eigen_output = phi::EigenVector<T>::Flatten(output_slice);
@@ -73,18 +72,18 @@ template <typename Context, typename T, typename IndexT>
 void GraphSendRecvGradOpKernelLaunchHelper(
     const Context& ctx,
     const DenseTensor& out_grad,
+    const DenseTensor& x,
     const DenseTensor& src_index,
     const DenseTensor& dst_index,
     const std::string& pool_type,
     DenseTensor* x_grad,
     const DenseTensor* dst_count = nullptr,
-    const DenseTensor* x = nullptr,
     const DenseTensor* out = nullptr) {
   const int& index_size = dst_index.dims()[0];
 
   ctx.template Alloc<T>(x_grad);
   T* p_output = x_grad->data<T>();
-  const auto& src_dims = out_grad.dims();
+  const auto& src_dims = x.dims();
   int64_t memset_size = 1;
   for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
   const size_t& memset_bytes = memset_size * sizeof(T);
@@ -97,29 +96,22 @@ void GraphSendRecvGradOpKernelLaunchHelper(
 
   if (pool_type == "SUM") {
     GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
-        src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type);
+        index_size, d_index, s_index, out_grad, x, x_grad, pool_type);
   } else if (pool_type == "MEAN") {
     const int* s_count = dst_count->data<int>();
     // Functor not used here.
-    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
-                                                                    index_size,
-                                                                    d_index,
-                                                                    s_index,
-                                                                    out_grad,
-                                                                    x_grad,
-                                                                    pool_type,
-                                                                    s_count);
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
+        index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count);
   } else if (pool_type == "MIN" || pool_type == "MAX") {
     // Functor not used here.
-    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(src_dims[0],
-                                                                    index_size,
+    GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(index_size,
                                                                     d_index,
                                                                     s_index,
                                                                     out_grad,
+                                                                    x,
                                                                     x_grad,
                                                                     pool_type,
                                                                     nullptr,
-                                                                    x,
                                                                     out);
   }
 }
@@ -127,7 +119,7 @@ void GraphSendRecvGradOpKernelLaunchHelper(
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
@@ -139,23 +131,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
     GraphSendRecvGradOpKernelLaunchHelper<Context, T, int32_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvGradOpKernelLaunchHelper<Context, T, int64_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   }
 }
diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
index fecbd4b1d7aa05c94d2cb713f302abd6f2aeb1c4..8f71ba12cc4fa25bc20e337bf3986518e3ce3672 100644
--- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc
@@ -83,6 +83,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
                                        const DenseTensor& src_index,
                                        const DenseTensor& dst_index,
                                        const std::string& pool_type,
+                                       int64_t out_size,
                                        DenseTensor* out,
                                        DenseTensor* dst_count = nullptr) {
   const int& index_size = src_index.dims()[0];
@@ -91,7 +92,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
   T* p_output = out->data<T>();
   const auto& src_dims = x.dims();
   int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
+  if (out_size <= 0) {
+    for (int i = 0; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  } else {
+    memset_size = out_size;
+    for (int i = 1; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  }
   const size_t& memset_bytes = memset_size * sizeof(T);
   memset(p_output, 0, memset_bytes);
 
@@ -129,15 +139,16 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count) {
   auto index_type = src_index.dtype();
   if (index_type == phi::DataType::INT32) {
     GraphSendRecvOpKernelLaunchHelper<Context, T, int32_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvOpKernelLaunchHelper<Context, T, int64_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   }
 }
 
diff --git a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc
index 78a7ae8d415b5d4b18fdf8e469576db50f739e38..4b3b1fc16e9c426daccb6364f926a7c44ab63e59 100644
--- a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc
@@ -99,8 +99,8 @@ void ReduceSumGradKernel(const Context& dev_ctx,
 
   ReduceGradKernel<Context, T, funcs::SumGradFunctor, true>(dev_ctx,
                                                             x,
-                                                            out_grad,
                                                             paddle::none,
+                                                            out_grad,
                                                             dims,
                                                             keep_dim,
                                                             reduce_all,
@@ -121,8 +121,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
                           DenseTensor* x_grad) {
   ReduceGradKernel<Context, T, funcs::MeanGradFunctor, true>(dev_ctx,
                                                              x,
-                                                             out_grad,
                                                              paddle::none,
+                                                             out_grad,
                                                              dims,
                                                              keep_dim,
                                                              reduce_all,
diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h
index 1b7641762e2639acf3db540280891b518f22eed2..3bd6c19545e16c1bca2881c4804db43af2b0b227 100644
--- a/paddle/phi/kernels/funcs/mode.h
+++ b/paddle/phi/kernels/funcs/mode.h
@@ -14,7 +14,7 @@
 
 #pragma once
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(__NVCC__) || defined(__HIPCC__)
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>
@@ -143,7 +143,7 @@ static void ModeAssign(const Type& input_height,
   }
 }
 
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#if defined(__NVCC__) || defined(__HIPCC__)
 template <typename T>
 static void GetModebySort(const phi::GPUContext& dev_ctx,
                           const DenseTensor* input_tensor,
diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h
index 3a1d9b8ea7a7a36c31f31f7fc60dffc1f827d34e..16e00414ad772a162d33fb18c0dfafc889e53951 100644
--- a/paddle/phi/kernels/funcs/select_impl.cu.h
+++ b/paddle/phi/kernels/funcs/select_impl.cu.h
@@ -168,57 +168,82 @@ __global__ void CumsumOneBlock(
   }
 }
 
+// where_index
 template <typename OutT,
           typename MT,
           typename InT,
-          typename IdT,
           typename Functor,
           int VecSize,
           int IsBoundary,
-          int IsMaskData>
+          int MaskData>
 struct SelectCaller {
-  __device__ void inline operator()(OutT *store_data,
+  __device__ void inline operator()(OutT *out,
                                     const MT *mask_data,
                                     const InT *in,
                                     Functor func,
-                                    int num,
-                                    int data_offset) {
-    // where_index op
-    IdT index_reg[VecSize];
-    // Set data index of global
-    kps::InitWithDataIndex<IdT, VecSize, 1, 1>(&index_reg[0], data_offset);
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
+    int64_t in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
+    // set index
+    kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
     // Get store data according to mask_idt
-    kps::OperatorTernary<MT, IdT, OutT, Functor>(
-        store_data, mask_data, &index_reg[0], func, VecSize);
+    kps::OperatorTernary<MT, int64_t, OutT, Functor>(
+        store_data, mask_data, &in_data[0], func, VecSize);
+    kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
   }
 };
 
+// masked_select
 template <typename OutT,
           typename MT,
           typename InT,
-          typename IdT,
           typename Functor,
           int VecSize,
           int IsBoundary>
-struct SelectCaller<OutT,
-                    MT,
-                    InT,
-                    IdT,
-                    Functor,
-                    VecSize,
-                    IsBoundary,
-                    1> {  // masked_select
-  __device__ void inline operator()(OutT *store_data,
+struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
+  __device__ void inline operator()(OutT *out,
                                     const MT *mask_data,
                                     const InT *in,
                                     Functor func,
-                                    int num,
-                                    int data_offset) {
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
     InT in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
     kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
     // Get store data according to mask_idt
     kps::OperatorTernary<MT, InT, OutT, Functor>(
         store_data, mask_data, &in_data[0], func, VecSize);
+    kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
+  }
+};
+
+// masked_select_grad
+template <typename OutT,
+          typename MT,
+          typename InT,
+          typename Functor,
+          int VecSize,
+          int IsBoundary>
+struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
+  __device__ void inline operator()(OutT *out,
+                                    const MT *mask_data,
+                                    const InT *in,
+                                    Functor func,
+                                    int data_offset,
+                                    int store_num,
+                                    int thread_fix,
+                                    int num) {
+    InT in_data[VecSize];
+    OutT store_data[VecSize * phi::DDim::kMaxRank];
+    kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
+    kps::OperatorTernary<MT, InT, OutT, Functor>(
+        store_data, mask_data, &in_data[0], func, VecSize);
+    kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
   }
 };
 
@@ -253,7 +278,6 @@ SelectKernelImpl(OutT *out,
   IdT num_thread[kCVecSize];
   IdT cumsum_thread[kCVecSize];
 
-  OutT store_data[VecSize * phi::DDim::kMaxRank];
   MT mask_data[VecSize];
   IdT mask_idt[VecSize];
   // init data_pr
@@ -271,17 +295,15 @@ SelectKernelImpl(OutT *out,
   // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
   // thread_fix
   kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
-  // Get store data(index) according to mask_idt
-  SelectCaller<OutT, MT, InT, IdT, Functor, VecSize, IsBoundary, MaskData>
-      compute;
-  compute(&store_data[0], &mask_data[0], in, func, num, data_offset);
   // get thread_fix
   int thread_fix =
       (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
   // get how many data need to store
   int store_num = static_cast<int>(num_thread[0]) * store_rank;
   // thread store num data, each thread may has different num
-  kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
+  // Get store data(index) according to mask_idt
+  SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, MaskData> select;
+  select(out, mask_data, in, func, data_offset, store_num, thread_fix, num);
 }
 
 template <typename MT,
@@ -303,15 +325,17 @@ __global__ void SelectKernel(OutT *out,
   int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
   int repeat = 0;
   int size = VecSize * BLOCK_ID_X;
+  CT block_store_offset = 0;
   for (; data_offset < main_offset; data_offset += stride) {
     // Cumsum index
     int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
-    // niuliling todo: us ReadData API
-    int block_store_offset = cumsum[idx_cumsum];
+    kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
+    int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
+    int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
     SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, false>(
-        out + block_store_offset * store_rank,
+        out + out_fix,
         mask + data_offset,
-        in + data_offset,
+        in + in_fix,
         func,
         size,
         data_offset,
@@ -323,12 +347,13 @@ __global__ void SelectKernel(OutT *out,
   if (num > 0) {
     // Cumsum index
     int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
-    // niuliling todo: us ReadData API
-    int block_store_offset = static_cast<int>(cumsum[idx_cumsum]);
+    kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
+    int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
+    int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
     SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, true>(
-        out + block_store_offset * store_rank,
+        out + out_fix,
         mask + data_offset,
-        in + data_offset,
+        in + in_fix,
         func,
         num,
         data_offset,
@@ -402,6 +427,7 @@ void SelectKernel(const KPDevice &dev_ctx,
   const int kCumVesize = 2;
   const int block_c = 256;
   const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c));
+
   using Add = kps::AddFunctor<CT>;
   CumsumOneBlock<CT, CT, Add, kCumVesize><<<1, block_c, 0, stream>>>(
       count_data, cumsum_data, size_count_block, main_offset_c, Add());
@@ -418,10 +444,13 @@ void SelectKernel(const KPDevice &dev_ctx,
   dev_ctx.Wait();
   // 3.1.2 allock for out with total_true_num
   std::vector<int64_t> out_dim = {static_cast<int64_t>(total_true_num)};
-  if (SelectData == 0) {  // where_index
+
+  if (SelectData == 1) {
+    out->Resize(phi::make_ddim(out_dim));
+  } else if (SelectData == 0) {  // == 0 where_index
     out_dim.push_back(rank);
+    out->Resize(phi::make_ddim(out_dim));
   }
-  out->Resize(phi::make_ddim(out_dim));
   auto out_data = out->mutable_data<OutT>(cuda_place);
   // 3.2 get true data's index according to cond_data and cumsum_data
   if (total_true_num <= 0) return;
diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu
index 4545f9ce436ea4028d43d3a91ae46a21cde41bb5..a16c8369cc9e5b71ab483b5c2eb9a3d2e61f900a 100644
--- a/paddle/phi/kernels/gpu/copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/copy_kernel.cu
@@ -87,7 +87,8 @@ void Copy(const Context& dev_ctx,
                  : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     paddle::memory::Copy(
         dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
+  } else if ((paddle::platform::is_cpu_place(src_place) ||
+              paddle::platform::is_cuda_pinned_place(src_place)) &&  // NOLINT
              paddle::platform::is_gpu_place(dst_place)) {
     auto src_cpu_place = src_place;
     auto dst_gpu_place = dst_place;
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
index 75692966b4662c12feaefdf5bfc7311ed9d628b8..8bd3337280d7596ebd7a638b1cc86cf37b70632d 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu
@@ -28,19 +28,19 @@ template <typename Context, typename T, typename IndexT>
 void GraphSendRecvGradOpCUDAKernelLaunchHelper(
     const Context& ctx,
     const DenseTensor& out_grad,
+    const DenseTensor& x,
     const DenseTensor& src_index,
     const DenseTensor& dst_index,
     const std::string& pool_type,
     DenseTensor* x_grad,
     const DenseTensor* dst_count = nullptr,
-    const DenseTensor* x = nullptr,
     const DenseTensor* out = nullptr) {
   const int& index_size = dst_index.dims()[0];
 
   ctx.template Alloc<T>(x_grad);
   T* p_output = x_grad->data<T>();
 
-  const auto& src_dims = out_grad.dims();
+  const auto& src_dims = x.dims();
   int64_t memset_size = 1;
   for (int i = 0; i < src_dims.size(); ++i) {
     memset_size *= src_dims[i];
@@ -86,7 +86,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
     ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
         p_src, d_index, s_index, p_output, index_size, slice_size, s_count);
   } else if (pool_type == "MAX" || pool_type == "MIN") {
-    const T* ptr_input = x->data<T>();
+    const T* ptr_input = x.data<T>();
     const T* ptr_output = out->data<T>();
     ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
         p_src,
@@ -103,7 +103,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
@@ -115,23 +115,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
     GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int64_t>(
         ctx,
         out_grad,
+        x,
         src_index,
         dst_index,
         pool_type,
         x_grad,
         dst_count.get_ptr(),
-        x.get_ptr(),
         out.get_ptr());
   }
 }
diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
index fab306f831a6f401b858b0dfe11011463827a50b..2826c071d6ec3e21e42f4a8f3947cde78ab85e2d 100644
--- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu
@@ -32,6 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                            const DenseTensor& src_index,
                                            const DenseTensor& dst_index,
                                            const std::string& pool_type,
+                                           int64_t out_size,
                                            DenseTensor* out,
                                            DenseTensor* dst_count = nullptr) {
   const int& index_size = src_index.dims()[0];
@@ -39,8 +40,15 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
   T* p_output = out->data<T>();
   const auto& src_dims = x.dims();
   int64_t memset_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) {
-    memset_size *= src_dims[i];
+  if (out_size <= 0) {
+    for (int i = 0; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
+  } else {
+    memset_size = out_size;
+    for (int i = 1; i < src_dims.size(); ++i) {
+      memset_size *= src_dims[i];
+    }
   }
   const size_t& memset_bytes = memset_size * sizeof(T);
   if (pool_type == "SUM" || pool_type == "MEAN") {
@@ -100,6 +108,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                     IndexT>><<<grid, block, 0, ctx.stream()>>>(
         p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
+    if (out_size > 0) {
+      input_size = out_size;
+    }
     int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
     int64_t grid_max =
         grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
@@ -114,6 +125,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
                                     IndexT>><<<grid, block, 0, ctx.stream()>>>(
         p_src, s_index, d_index, p_output, index_size, slice_size, functor);
 
+    if (out_size > 0) {
+      input_size = out_size;
+    }
     int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
     int64_t grid_min =
         grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
@@ -130,6 +144,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
 
     ctx.template Alloc<int32_t>(dst_count);
     int32_t* p_dst_count = dst_count->data<int32_t>();
+    if (out_size > 0) {
+      input_size = out_size;
+    }
 
 #ifdef PADDLE_WITH_HIP
     hipMemset(p_dst_count, 0, input_size * sizeof(int));
@@ -155,15 +172,16 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count) {
   auto index_type = src_index.dtype();
   if (index_type == phi::DataType::INT32) {
     GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   } else if (index_type == phi::DataType::INT64) {
     GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(
-        ctx, x, src_index, dst_index, pool_type, out, dst_count);
+        ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
   }
 }
 
diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
index 71b7cd8750462fdf0dad20b2b221bd18cc6dbbe6..5d0097af2ca9abe4c7a4feb2d312068a5150ae1b 100644
--- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu
@@ -17,38 +17,31 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
-#include "paddle/phi/backends/cpu/cpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/masked_select_grad_kernel.h"
 
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
 namespace phi {
 
-__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx])
-      mask_array[idx] = 1;
-    else
-      mask_array[idx] = 0;
-  }
-}
+template <typename MT, typename InT, typename OutT>
+struct MaskedSelectGradFunctor {
+  HOSTDEVICE MaskedSelectGradFunctor() {}
 
-template <typename T>
-__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
-                                         const bool* mask,
-                                         const T* input,
-                                         T* out,
-                                         int size) {
-  int idx = blockDim.x * blockIdx.x + threadIdx.x;
-  for (; idx < size; idx += blockDim.x * gridDim.x) {
-    if (mask[idx]) {
-      int index = mask_prefix_sum[idx];
-      out[idx] = input[index];
-    } else {
-      out[idx] = 0;
+  HOSTDEVICE inline void operator()(OutT* out,
+                                    const MT* mask,
+                                    const InT* value,
+                                    int num) {
+    int read_fix = 0;
+    for (int idx = 0; idx < num; idx++) {
+      if (mask[idx]) {
+        out[idx] = value[read_fix++];
+      } else {
+        out[idx] = 0;
+      }
     }
   }
-}
+};
 
 template <typename T, typename Context>
 void MaskedSelectGradKernel(const Context& dev_ctx,
@@ -56,42 +49,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
                             const DenseTensor& x,
                             const DenseTensor& mask,
                             DenseTensor* x_grad) {
-  auto* mask_data = mask.data<bool>();
-  auto* input_data = out_grad.data<T>();
-  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
-
-  auto input_size = out_grad.numel();
   auto mask_size = mask.numel();
-  auto mask_dim = mask.dims();
-
-  auto out_size = mask_size;
-
-  DenseTensor mask_array;
-  DenseTensor mask_prefix_sum;
-  mask_array.Resize(mask_dim);
-  mask_prefix_sum.Resize(mask_dim);
-
-  int32_t* mask_array_data =
-      mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int32_t* mask_prefix_sum_data =
-      mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
-  int threads = 512;
-  int grid = (mask_size + threads - 1) / threads;
-  auto stream = dev_ctx.stream();
-  SetMaskArrayT<<<grid, threads, 0, stream>>>(
-      mask_data, mask_array_data, mask_size);
-
-  thrust::device_ptr<int32_t> mask_array_dev_ptr =
-      thrust::device_pointer_cast(mask_array_data);
-  thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
-                                                mask_array_dev_ptr + mask_size);
-  thrust::exclusive_scan(thrust::device,
-                         mask_array_vec.begin(),
-                         mask_array_vec.end(),
-                         mask_prefix_sum_data);
-
-  SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
-      mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
+  auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
+  if (mask_size <= 0) return;
+  using Functor = MaskedSelectGradFunctor<bool, T, T>;
+  phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
+      dev_ctx, mask, out_grad, x_grad, Functor());
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu
index b443ae6b8fb5e6c3bf5264a50d25205a419f22ad..8986c97583e2086eb9ba53aee34745533dbc91c5 100644
--- a/paddle/phi/kernels/gpu/masked_select_kernel.cu
+++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu
@@ -17,11 +17,12 @@
 #include <thrust/reverse.h>
 #include <thrust/scan.h>
 
-#include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/masked_select_kernel.h"
 
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
 namespace phi {
 
 template <typename MT, typename InT, typename OutT>
diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu
index 9538533f70d597e21b393d2650d56bebd823c360..616679057ffce29b8d911d56d5cf428801138589 100644
--- a/paddle/phi/kernels/gpu/where_index_kernel.cu
+++ b/paddle/phi/kernels/gpu/where_index_kernel.cu
@@ -20,13 +20,14 @@
 namespace cub = hipcub;
 #endif
 
-#include "paddle/phi/core/ddim.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #include "paddle/phi/kernels/funcs/select_impl.cu.h"
 #include "paddle/phi/kernels/where_index_kernel.h"
 
+#include "paddle/phi/core/ddim.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
 namespace phi {
 template <typename T1, typename T2, typename OutT>
 struct IndexFunctor {
diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
index d163e6e278a0755c63be3192ae749ddbac5262d4..3694c8f1e6c9905586ea82d49a87b6b1a12e1e71 100644
--- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h
@@ -23,7 +23,7 @@ namespace phi {
 template <typename T, typename Context>
 void GraphSendRecvGradKernel(const Context& ctx,
                              const DenseTensor& out_grad,
-                             paddle::optional<const DenseTensor&> x,
+                             const DenseTensor& x,
                              paddle::optional<const DenseTensor&> out,
                              const DenseTensor& src_index,
                              const DenseTensor& dst_index,
diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h
index 95dbdc4443ad003c3b9e7e584976cbe5c0503e20..51768fbc18f019da7713f2f95334e445620da91b 100644
--- a/paddle/phi/kernels/graph_send_recv_kernel.h
+++ b/paddle/phi/kernels/graph_send_recv_kernel.h
@@ -25,6 +25,7 @@ void GraphSendRecvKernel(const Context& ctx,
                          const DenseTensor& src_index,
                          const DenseTensor& dst_index,
                          const std::string& pool_type,
+                         int64_t out_size,
                          DenseTensor* out,
                          DenseTensor* dst_count);
 
diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
index 65d903a7fe426c6eed6cba6f38e8c636001d47b0..1877a4ecc227e36cad474d8a5c11de5ff7a1946c 100644
--- a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h
@@ -33,7 +33,7 @@ void FrobeniusNormGradKernel(const Context& ctx,
                              DataType out_dtype,
                              DenseTensor* dx) {
   ReduceGradKernel<Context, T, funcs::FrobeniusNormGradFunctor>(
-      ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
+      ctx, x, out, dout, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
 }
 
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h
index f56d3d3ed50f7e72910115f7ec28914a5eade2e8..0b1c43b5f040227b691fca6bdc257fbf8c697957 100644
--- a/paddle/phi/kernels/impl/reduce_grad.h
+++ b/paddle/phi/kernels/impl/reduce_grad.h
@@ -87,8 +87,8 @@ template <typename Context,
           bool kNoNeedBufferY = false>
 void ReduceGradKernel(const Context& dev_ctx,
                       const DenseTensor& x,
-                      const DenseTensor& out_grad,
                       const paddle::optional<DenseTensor>& out,
+                      const DenseTensor& out_grad,
                       const std::vector<int64_t>& dims,
                       bool keep_dim,
                       bool reduce_all,
diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
index 4a74416e3916492e6d3a40e09ca347db485fff7c..0a0c1abac808646db157b39ef929b007a25e6985 100644
--- a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h
@@ -24,8 +24,8 @@ namespace phi {
 template <typename T, typename Context>
 void ReduceMaxGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
-                         const DenseTensor& out_grad,
                          const DenseTensor& out,
+                         const DenseTensor& out_grad,
                          const std::vector<int64_t>& dims,
                          bool keep_dim,
                          bool reduce_all,
@@ -34,8 +34,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
                          DenseTensor* x_grad) {
   ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
                                                            x,
-                                                           out_grad,
                                                            out,
+                                                           out_grad,
                                                            dims,
                                                            keep_dim,
                                                            reduce_all,
diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
index baaa544f137366f1e0343c25bc373cc08350f7fd..965fc686e27838ccc3ec73b7ae7d8c5267cd59bf 100644
--- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h
@@ -24,8 +24,8 @@ namespace phi {
 template <typename T, typename Context>
 void ReduceMinGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
-                         const DenseTensor& out_grad,
                          const DenseTensor& out,
+                         const DenseTensor& out_grad,
                          const std::vector<int64_t>& dims,
                          bool keep_dim,
                          bool reduce_all,
@@ -34,8 +34,8 @@ void ReduceMinGradKernel(const Context& dev_ctx,
                          DenseTensor* x_grad) {
   ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
                                                            x,
-                                                           out_grad,
                                                            out,
+                                                           out_grad,
                                                            dims,
                                                            keep_dim,
                                                            reduce_all,
diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
index 6b93e98cec0168ab55e15e3401a72738f79d3a07..fb361e34205582bd3478696ade10602240b7d66b 100644
--- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h
@@ -24,8 +24,8 @@ namespace phi {
 template <typename T, typename Context>
 void ReduceProdGradKernel(const Context& dev_ctx,
                           const DenseTensor& x,
-                          const DenseTensor& out_grad,
                           const DenseTensor& out,
+                          const DenseTensor& out_grad,
                           const std::vector<int64_t>& dims,
                           bool keep_dim,
                           bool reduce_all,
@@ -34,8 +34,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
                           DenseTensor* x_grad) {
   ReduceGradKernel<Context, T, funcs::ProdGradFunctor>(dev_ctx,
                                                        x,
-                                                       out_grad,
                                                        out,
+                                                       out_grad,
                                                        dims,
                                                        keep_dim,
                                                        reduce_all,
diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h
index 1d4181f3b9a89509ada2a8fe27d584a9b5aa039c..993349f2d9e14112e2acbd0bc098f3a00351232b 100644
--- a/paddle/phi/kernels/primitive/datamover_primitives.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives.h
@@ -123,6 +123,15 @@ __device__ __forceinline__ void WriteData(T* dst,
     dst[i] = src[i];
   }
 }
+
+template <typename T>
+__device__ __forceinline__ void ReadData(T* dst,
+                                         const T* __restrict__ src,
+                                         int num) {
+  for (int i = 0; i < num; i++) {
+    dst[i] = src[i];
+  }
+}
 #undef INT_BITS
 }  // namespace details
 
diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h
index ee6f3d19a094d29546e82e7138933eceb96459d0..a4b472c445888fd45ff497884d309778b7447151 100644
--- a/paddle/phi/kernels/reduce_grad_kernel.h
+++ b/paddle/phi/kernels/reduce_grad_kernel.h
@@ -43,8 +43,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void ReduceProdGradKernel(const Context& dev_ctx,
                           const DenseTensor& x,
-                          const DenseTensor& out_grad,
                           const DenseTensor& out,
+                          const DenseTensor& out_grad,
                           const std::vector<int64_t>& dims,
                           bool keep_dim,
                           bool reduce_all,
@@ -55,8 +55,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void ReduceMaxGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
-                         const DenseTensor& out_grad,
                          const DenseTensor& out,
+                         const DenseTensor& out_grad,
                          const std::vector<int64_t>& dims,
                          bool keep_dim,
                          bool reduce_all,
@@ -67,8 +67,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
 template <typename T, typename Context>
 void ReduceMinGradKernel(const Context& dev_ctx,
                          const DenseTensor& x,
-                         const DenseTensor& out_grad,
                          const DenseTensor& out,
+                         const DenseTensor& out_grad,
                          const std::vector<int64_t>& dims,
                          bool keep_dim,
                          bool reduce_all,
diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc
index fa1fac5d23779597fee7f8a6e4e467c02d6d4c94..803bb50b438a58eb49beb50caa43f8fb0f408f8c 100644
--- a/paddle/phi/ops/compat/batch_norm_sig.cc
+++ b/paddle/phi/ops/compat/batch_norm_sig.cc
@@ -18,10 +18,17 @@ namespace phi {
 
 KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
   bool is_test = paddle::any_cast<bool>(ctx.Attr("is_test"));
-  bool use_global_stats = paddle::any_cast<bool>(ctx.Attr("use_global_stats"));
+  bool use_global_stats =
+      ctx.HasAttr("use_global_stats")
+          ? paddle::any_cast<bool>(ctx.Attr("use_global_stats"))
+          : false;
   bool trainable_statistics =
-      paddle::any_cast<bool>(ctx.Attr("trainable_statistics"));
-  bool fuse_with_relu = paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"));
+      ctx.HasAttr("trainable_statistics")
+          ? paddle::any_cast<bool>(ctx.Attr("trainable_statistics"))
+          : false;
+  bool fuse_with_relu = ctx.HasAttr("fuse_with_relu")
+                            ? paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"))
+                            : false;
   // Dispenable `MomentumTensor` is useless now
   if (is_test && !use_global_stats && !trainable_statistics &&
       !fuse_with_relu) {
diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc
index dacb8b25a89f9c151198f573c3fc4fda37e2939e..fa4da0704c9871f8b5969fdf79da6dcb8617cae9 100644
--- a/paddle/phi/ops/compat/graph_send_recv_sig.cc
+++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc
@@ -16,6 +16,14 @@ limitations under the License. */
 
 namespace phi {
 
+KernelSignature GraphSendRecvOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("graph_send_recv",
+                         {"X", "Src_index", "Dst_index"},
+                         {"pool_type", "out_size"},
+                         {"Out", "Dst_count"});
+}
+
 KernelSignature GraphSendRecvGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
@@ -27,5 +35,8 @@ KernelSignature GraphSendRecvGradOpArgumentMapping(
 
 }  // namespace phi
 
+PD_REGISTER_ARG_MAPPING_FN(graph_send_recv,
+                           phi::GraphSendRecvOpArgumentMapping);
+
 PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad,
                            phi::GraphSendRecvGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc
index 4bca0523801c1a94f90197c93cc495c2c4f56eeb..273badee623816ee32d25b52f8951dd0f854d8a4 100644
--- a/paddle/phi/ops/compat/reduce_sig.cc
+++ b/paddle/phi/ops/compat/reduce_sig.cc
@@ -149,7 +149,7 @@ KernelSignature ReduceMaxGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
       "max_grad",
-      {"X", GradVarName("Out"), "Out"},
+      {"X", "Out", GradVarName("Out")},
       {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
       {GradVarName("X")});
 }
@@ -158,7 +158,7 @@ KernelSignature ReduceMinGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
       "min_grad",
-      {"X", GradVarName("Out"), "Out"},
+      {"X", "Out", GradVarName("Out")},
       {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
       {GradVarName("X")});
 }
@@ -167,7 +167,7 @@ KernelSignature ReduceProdGradOpArgumentMapping(
     const ArgumentMappingContext& ctx) {
   return KernelSignature(
       "prod_grad",
-      {"X", GradVarName("Out"), "Out"},
+      {"X", "Out", GradVarName("Out")},
       {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
       {GradVarName("X")});
 }
diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc
index dd008ff36d50a8e30166d478b916ac180d564c7c..a2bd1f2cad9fcfa2e29b6fd433a3065a3bcb4c10 100644
--- a/paddle/phi/tests/api/test_data_transform.cc
+++ b/paddle/phi/tests/api/test_data_transform.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 
 #include "paddle/phi/api/include/api.h"
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -39,10 +40,10 @@ TEST(API, data_transform_same_place) {
   auto x = paddle::experimental::full({3, 3},
                                       1.0,
                                       experimental::DataType::COMPLEX128,
-                                      experimental::Backend::CPU);
+                                      experimental::CPUPlace());
 
   auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
+      {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
 
   std::vector<phi::dtype::complex<double>> sum(9, 6.0);
 
@@ -74,10 +75,10 @@ TEST(API, data_transform_same_place) {
 TEST(Tensor, data_transform_diff_place) {
   // 1. create tensor
   auto x = paddle::experimental::full(
-      {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU);
+      {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace());
 
   auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU);
+      {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace());
 
   std::vector<float> sum(9, 6.0);
 
@@ -95,7 +96,7 @@ TEST(Tensor, data_transform_diff_place) {
   ASSERT_EQ(out.impl()->place(),
             phi::TransToPhiPlace(experimental::Backend::GPU));
 
-  auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true);
+  auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true);
 
   auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(ref_out.impl());
   for (size_t i = 0; i < 9; i++) {
diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc
index 05a5563344966e4d0664a83051dc84b3f2da499a..ca4a264e511bd574b7f4640bab84862df3f676b0 100644
--- a/paddle/phi/tests/api/test_scale_benchmark.cc
+++ b/paddle/phi/tests/api/test_scale_benchmark.cc
@@ -30,7 +30,7 @@ namespace tests {
 
 TEST(API, scale) {
   auto x = experimental::full(
-      {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
+      {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
 
   const size_t cycles = 300;
   phi::tests::Timer timer;
diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc
index 66c478e4c000150b8bfd50d6cb5b79e90d7a5d80..4e8755be0c773ec38f454aad03b924e4daee532d 100644
--- a/paddle/phi/tests/api/test_to_api.cc
+++ b/paddle/phi/tests/api/test_to_api.cc
@@ -69,10 +69,10 @@ TEST(API, copy_to) {
 
 // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = paddle::experimental::copy_to(x, phi::Backend::GPU, false);
-  auto out = paddle::experimental::copy_to(tmp, phi::Backend::CPU, true);
+  auto tmp = paddle::experimental::copy_to(x, phi::GPUPlace(), false);
+  auto out = paddle::experimental::copy_to(tmp, phi::CPUPlace(), true);
 #else
-  auto out = paddle::experimental::copy_to(x, phi::Backend::CPU, false);
+  auto out = paddle::experimental::copy_to(x, phi::CPUPlace(), false);
 #endif
 
   // 3. check result
@@ -85,10 +85,10 @@ TEST(Tensor, copy_to) {
 
 // 2. test API
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  auto tmp = x.copy_to(phi::Backend::GPU, false);
-  auto out = tmp.copy_to(phi::Backend::CPU, true);
+  auto tmp = x.copy_to(phi::GPUPlace(), false);
+  auto out = tmp.copy_to(phi::CPUPlace(), true);
 #else
-  auto out = x.copy_to(phi::Backend::CPU, false);
+  auto out = x.copy_to(phi::CPUPlace(), false);
 #endif
 
   // 3. check result
diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/launch/__init__.py
similarity index 55%
rename from python/paddle/distributed/run/__init__.py
rename to python/paddle/distributed/launch/__init__.py
index f25ddb794cc4d573429ac960e646bd8125c48d16..f39bb76114345d7144e1f5b005eb42792a84cf14 100644
--- a/python/paddle/distributed/run/__init__.py
+++ b/python/paddle/distributed/launch/__init__.py
@@ -12,74 +12,69 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .job.container import Container
-from .job.pod import Pod
-from .job.job import Job
-from . import plugins
-
-#__all__ = [Container, Pod, Job]
+__all__ = []
 '''
-Paddle distribution training entry ``python -m paddle.distributed.run``.
+Paddle distributed training entry ``python -m paddle.distributed.launch``.
 
 Help
 
 # for arg usage and explanation, try the following command
-# python -m paddle.distributed.run -h
+# python -m paddle.distributed.launch -h
 
 Collective Mode
 
 Case 1: 1 node
 
 use all visible devices
-# python -m paddle.distributed.run train.py
+# python -m paddle.distributed.launch train.py
 
 use specified devices
-# python -m paddle.distributed.run --devices=0,1,2,3 train.py
+# python -m paddle.distributed.launch --devices=0,1,2,3 train.py
 
 Case 2: multi-node, auto detect ip/port
 
-# python -m paddle.distributed.run --np 2 train.py
+# python -m paddle.distributed.launch --nnodes 2 train.py
 # auto print following command
-# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py
+# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py
 # then copy and paste above command to other nodes
 
 Case 3: multi-node, specified master/rendezvous server
 
-# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py
+# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py
 # the master ip must be one of the node and the port must available
 
 Parameter Server Mode
 
 Case 1.1: 1 node, 1 ps, 1 trainer
 
-# python -m paddle.distributed.run --mode ps train.py
-# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py
+# python -m paddle.distributed.launch --mode ps train.py
+# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py
 
 Case 1.2: 1 node, 2 ps, 2 trainer
 
-# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py
+# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py
 
 Case 2: 2 node, 2 ps, 2 trainer per node
 
-# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py
+# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py
 # auto print following command
-# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
+# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
 # then copy and paste above command to other nodes
 
 Case 3: multi-node, specified master/rendezvous server
 
-# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
+# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
 # the master ip must be one of the node and the port must available
 
 Case 4: specified servers and trainers in each node
 
-python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
+python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
 
 
 Elastic Mode
 
 # run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout
-# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py
+# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py
 
 # once the peer number changes between 2:3, the strategy holds
 
diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/launch/__main__.py
similarity index 61%
rename from python/paddle/distributed/run/__main__.py
rename to python/paddle/distributed/launch/__main__.py
index e32df59a328081e33aa86b42ed9b8e489ac399e8..9cd6f4408c9897d5eaeaa0fd31602c4a0f6de09f 100644
--- a/python/paddle/distributed/run/__main__.py
+++ b/python/paddle/distributed/launch/__main__.py
@@ -15,14 +15,28 @@
 from .context import Context
 from . import controllers
 
-# initialize the context to run
-ctx = Context()
 
-# initialize the selected controller
-c = controllers.init(ctx)
+def launch():
+    # initialize the context to run
+    ctx = Context()
 
-# run the pods
-c.run()
+    if ctx.is_legacy_mode():
 
-# manager or just wait pod
-c.finalize()
+        # legacy mode
+        from paddle.distributed.fleet import launch
+        launch.launch()
+
+    else:
+
+        # initialize the selected controller
+        c = controllers.init(ctx)
+
+        # run the pods
+        c.run()
+
+        # manager or just wait pod
+        c.finalize()
+
+
+if __name__ == "__main__":
+    launch()
diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e13bb2a5f0ba720037e488e92d1777a9a9111b68
--- /dev/null
+++ b/python/paddle/distributed/launch/context/__init__.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.distributed.launch import plugins
+
+from .node import Node
+from .status import Status
+from .args_envs import parse_args, fetch_envs, env_args_mapping
+
+import logging
+
+
+class Context(object):
+    def __init__(self, enable_plugin=True):
+        self.args, self.unknown_args = parse_args()
+        self.envs = fetch_envs()
+        self.logger = self.get_logger()
+
+        self.node = Node()
+        self.status = Status()
+
+        self.set_env_in_args()
+
+        # design for event queue, later
+        self.events = []
+
+        if enable_plugin:
+            self._enable_plugin()
+
+    def is_legacy_mode(self):
+        if self.args.legacy:
+            return True
+
+        if len(self.unknown_args) > 0:
+            self.logger.warning("Compatible mode enable with args {}".format(
+                self.unknown_args))
+            return True
+
+        legacy_env_list = [
+            'DISTRIBUTED_TRAINER_ENDPOINTS',
+            'PADDLE_ELASTIC_JOB_ID',
+            'PADDLE_DISTRI_BACKEND',
+            'FLAGS_START_PORT',
+        ]
+
+        for env in legacy_env_list:
+            if env in self.envs:
+                self.logger.warning(
+                    "ENV {} is deprecated, legacy launch enable".format(env))
+                return True
+
+        if self.args.master:
+            return False
+
+        return False
+
+    def get_envs(self):
+        return self.envs.copy()
+
+    def _enable_plugin(self):
+        for pl in plugins.enabled_plugins:
+            pl(self)
+
+    def get_logger(self, level=logging.INFO):
+        logger = logging.getLogger("LAUNCH")
+        logger.setLevel(self.args.log_level.upper() or level)
+        formatter = logging.Formatter(
+            fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
+        ch = logging.StreamHandler()
+        ch.setFormatter(formatter)
+        logger.addHandler(ch)
+        return logger
+
+    def set_env_in_args(self):
+        for k, v in env_args_mapping.items():
+            if k in self.envs:
+                setattr(self.args, v, self.envs[k])
diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py
new file mode 100644
index 0000000000000000000000000000000000000000..d504a11e5f3d1cfc06c73cbb4b723e8758de52ce
--- /dev/null
+++ b/python/paddle/distributed/launch/context/args_envs.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from argparse import ArgumentParser, REMAINDER
+
+env_args_mapping = {
+    'POD_IP': 'host',
+    'PADDLE_MASTER': 'master',
+    'PADDLE_DEVICES': 'devices',
+    'PADDLE_NNODES': 'nnodes',
+    'PADDLE_MODE': 'mode',
+    'PADDLE_LOG_LEVEL': 'log_level',
+    'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
+    'PADDLE_JOB_ID': 'job_id',
+    'PADDLE_RANK': 'rank',
+    'PADDLE_LOG_DIR': 'log_dir',
+    'PADDLE_MAX_RESTART': 'max_restart',
+    'PADDLE_ELASTIC_LEVEL': 'elastic_level',
+    'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
+    'PADDLE_SERVER_NUM': 'server_num',
+    'PADDLE_TRAINER_NUM': 'trainer_num',
+    'PADDLE_SERVERS_ENDPOINTS': 'servers',
+    'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
+    'PADDLE_GLOO_PORT': 'gloo_port',
+    'PADDLE_WITH_GLOO': 'with_gloo',
+}
+
+
+def fetch_envs():
+    os.environ.pop('http_proxy', None)
+    os.environ.pop('https_proxy', None)
+
+    return os.environ.copy()
+
+
+def parse_args():
+    parser = ArgumentParser()
+
+    base_group = parser.add_argument_group("Base Parameters")
+
+    base_group.add_argument(
+        "--master",
+        type=str,
+        default=None,
+        help="the master/rendezvous server, ip:port")
+
+    base_group.add_argument(
+        "--legacy", type=bool, default=False, help="use legacy launch")
+
+    base_group.add_argument(
+        "--rank", type=int, default=-1, help="the peer rank")
+
+    base_group.add_argument(
+        "--log_level", type=str, default="INFO", help="log level. Default INFO")
+
+    base_group.add_argument(
+        "--nnodes",
+        type=str,
+        default="1",
+        help="the number of peers, i.e. pod/node number")
+
+    base_group.add_argument(
+        "--nproc_per_node",
+        type=int,
+        default=None,
+        help="the number of processes in a pod")
+
+    base_group.add_argument(
+        "--log_dir",
+        type=str,
+        default="log",
+        help="the path for each process's log. Default ./log")
+    base_group.add_argument(
+        "--mode",
+        type=str,
+        default="collective",
+        help="run mode of the job, collective/ps/ps-heter")
+
+    base_group.add_argument(
+        "--job_id",
+        type=str,
+        default="default",
+        help="unique id of the job. Default default")
+
+    base_group.add_argument(
+        "--devices",
+        type=str,
+        default=None,
+        help="accelerate devices. as --gpus,npus,xps")
+
+    base_group.add_argument("--host", type=str, default=None, help="host ip")
+
+    base_group.add_argument(
+        "training_script",
+        type=str,
+        help="the full path of py script,"
+        "followed by arguments for the "
+        "training script")
+
+    base_group.add_argument('training_script_args', nargs=REMAINDER)
+
+    ps_group = parser.add_argument_group("Parameter-Server Parameters")
+    # for parameter server
+    ps_group.add_argument(
+        "--servers", type=str, default='', help="servers endpoints full list")
+    ps_group.add_argument(
+        "--trainers", type=str, default='', help="trainers endpoints full list")
+
+    ps_group.add_argument(
+        "--trainer_num", type=int, default=None, help="number of trainers")
+    ps_group.add_argument(
+        "--server_num", type=int, default=None, help="number of servers")
+    ps_group.add_argument(
+        "--gloo_port", type=int, default=6767, help="gloo http port")
+    ps_group.add_argument(
+        "--with_gloo", type=str, default="0", help="use gloo or not")
+
+    # parameter elastic mode
+    elastic_group = parser.add_argument_group("Elastic Parameters")
+    elastic_group.add_argument(
+        "--max_restart",
+        type=int,
+        default=3,
+        help="the times can restart. Default 3")
+
+    elastic_group.add_argument(
+        "--elastic_level",
+        type=int,
+        default=-1,
+        help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
+    )
+
+    elastic_group.add_argument(
+        "--elastic_timeout",
+        type=int,
+        default=30,
+        help="seconds to wait before elastic perform training")
+
+    return parser.parse_known_args()
diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py
new file mode 100644
index 0000000000000000000000000000000000000000..9163e7abd918371ddf4eca388bc912b630684f1f
--- /dev/null
+++ b/python/paddle/distributed/launch/context/device.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class DeviceType:
+    CPU = 'cpu'
+    GPU = 'gpu'
+    XPU = 'xpu'
+    NPU = 'npu'
+    MLU = 'mlu'
+
+
+class Device(object):
+    def __init__(self, dtype=None, memory="", labels=""):
+        self._dtype = dtype
+        self._memory = memory
+        self._labels = labels
+
+    def __str__(self):
+        return ",".join(self._labels)
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def count(self):
+        return len(self._labels) or 1
+
+    @property
+    def memory(self):
+        return self._memory
+
+    @property
+    def labels(self):
+        return self._labels
+
+    @labels.setter
+    def labels(self, lbs):
+        if isinstance(lbs, str):
+            self._labels = lbs.split(',')
+        elif isinstance(lbs, list):
+            self._labels = lbs
+        else:
+            self._labels = []
+
+    def get_selected_flag_key(self):
+        if self._dtype == DeviceType.CPU:
+            return 'FLAGS_selected_cpus'
+        if self._dtype == DeviceType.GPU:
+            return 'FLAGS_selected_gpus'
+        if self._dtype == DeviceType.NPU:
+            return 'FLAGS_selected_npus'
+        if self._dtype == DeviceType.XPU:
+            return 'FLAGS_selected_xpus'
+        if self._dtype == DeviceType.MLU:
+            return 'FLAGS_selected_mlus'
+        return 'FLAGS_selected_devices'
+
+    def get_selected_flag_label(self, idx):
+        if idx < len(self._labels):
+            return self._labels[idx]
+        else:
+            return '0'
+
+    def selected_flags(self, idx=None):
+        if idx is None:
+            return {self.get_selected_flag_key(): ','.join(self._labels)}
+        else:
+            return {
+                self.get_selected_flag_key(): self.get_selected_flag_label(idx)
+            }
+
+    @classmethod
+    def parse_device(self):
+        dev = Device()
+        visible_devices = None
+        if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.GPU
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif 'XPU_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.XPU
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.NPU
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+        elif 'MLU_VISIBLE_DEVICES' in os.environ:
+            dev._dtype = DeviceType.MLU
+            visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
+
+        if visible_devices is not None and visible_devices != 'all':
+            dev._labels = visible_devices.split(',')
+        else:
+            return self.detect_device()
+
+        return dev
+
+    @classmethod
+    def detect_device(self):
+        import paddle.fluid as fluid
+
+        dev = Device()
+        num = 0
+        visible_devices = None
+        if fluid.core.is_compiled_with_cuda():
+            dev._dtype = DeviceType.GPU
+            num = fluid.core.get_cuda_device_count()
+            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
+                "NVIDIA_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_xpu():
+            dev._dtype = DeviceType.XPU
+            num = fluid.core.get_xpu_device_count()
+            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_npu():
+            dev._dtype = DeviceType.NPU
+            num = fluid.core.get_npu_device_count()
+            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
+        elif fluid.core.is_compiled_with_mlu():
+            dev._dtype = DeviceType.MLU
+            num = fluid.core.get_mlu_device_count()
+            visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
+
+        if num == 0:
+            dev._dtype = DeviceType.CPU
+        elif visible_devices is None or visible_devices == "all":
+            dev._labels = [str(x) for x in range(0, num)]
+        else:
+            dev._labels = visible_devices.split(',')
+
+        return dev
+
+
+if __name__ == '__main__':
+    d = Device.parse_device()
+    print(d.get_selected_flag())
diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/launch/context/event.py
similarity index 100%
rename from python/paddle/distributed/run/context/event.py
rename to python/paddle/distributed/launch/context/event.py
diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/launch/context/node.py
similarity index 100%
rename from python/paddle/distributed/run/context/node.py
rename to python/paddle/distributed/launch/context/node.py
diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/launch/context/resource.py
similarity index 100%
rename from python/paddle/distributed/run/context/resource.py
rename to python/paddle/distributed/launch/context/resource.py
diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/launch/context/status.py
similarity index 100%
rename from python/paddle/distributed/run/context/status.py
rename to python/paddle/distributed/launch/context/status.py
diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py
similarity index 98%
rename from python/paddle/distributed/run/controllers/__init__.py
rename to python/paddle/distributed/launch/controllers/__init__.py
index e5557151ad5489cb4af0c34b3ad47c31774b3326..706131300f0d884b216071a6ea3fe75caaf714df 100644
--- a/python/paddle/distributed/run/controllers/__init__.py
+++ b/python/paddle/distributed/launch/controllers/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__all__ = ["init"]
+__all__ = []
 
 from .collective import CollectiveController
 from .collective import CollectiveElasticController
diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py
similarity index 95%
rename from python/paddle/distributed/run/controllers/collective.py
rename to python/paddle/distributed/launch/controllers/collective.py
index c4feb54428a07265693c0969e6e385a380e22f3d..c3fa4e6e07de9bca8acd8db585be679e3cf0244c 100644
--- a/python/paddle/distributed/run/controllers/collective.py
+++ b/python/paddle/distributed/launch/controllers/collective.py
@@ -89,6 +89,10 @@ class CollectiveController(Controller):
                 "PADDLE_TRAINERS_NUM": "{}".format(global_size),
                 "PADDLE_RANK_IN_NODE": str(i),
             }
+            if self.pod.replicas == 1:
+                e.update(self.ctx.node.device.selected_flags())
+            else:
+                e.update(self.ctx.node.device.selected_flags(i))
             self.add_container(envs=e, log_tag=i)
 
         return True
@@ -106,7 +110,8 @@ class CollectiveElasticController(CollectiveController):
     def register(self):
         if self.job.id == 'default':
             self.ctx.logger.warning(
-                'Using default job name may cause conflict, add --id in args')
+                'Using default job name may cause conflict, add --job_id in args'
+            )
 
         self.master.register_heartbeat(self.job.id, self.pod.name)
 
@@ -114,6 +119,8 @@ class CollectiveElasticController(CollectiveController):
         '''
         watch self and peer status, return true to exit
         '''
+
+        self.ctx.logger.info("Watching {}".format(self.pod))
         while not self.ctx.status.is_done():
             # self status
             status = self.pod.watch(timeout=2)
@@ -171,13 +178,8 @@ class CollectiveElasticController(CollectiveController):
                 continue
 
             self.master.set_status(self.ctx.status.RUNNING)
-            self.ctx.status.run()
-
-            assert len(self.pod.containers) > 0, "No container in the pod"
-            self.ctx.logger.debug("Run {}".format(self.pod))
-            self.ctx.logger.debug("Run {}".format(self.pod.containers[0]))
 
-            self.pod.deploy()
+            self.deploy_pod()
 
             if self.watch():
                 break
diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py
similarity index 90%
rename from python/paddle/distributed/run/controllers/controller.py
rename to python/paddle/distributed/launch/controllers/controller.py
index 2d904cf2a2cca5b9abaab06d1545c03c160e3d93..60e34b85a12bc46636f7edb9bbb4926eddfc73ba 100644
--- a/python/paddle/distributed/run/controllers/controller.py
+++ b/python/paddle/distributed/launch/controllers/controller.py
@@ -16,9 +16,9 @@ import sys
 import os
 import signal
 
-from paddle.distributed.run.job import Job
-from paddle.distributed.run.job import Pod
-from paddle.distributed.run.job import Container
+from paddle.distributed.launch.job.job import Job
+from paddle.distributed.launch.job.pod import Pod
+from paddle.distributed.launch.job.container import Container
 
 from .master import Master
 
@@ -39,38 +39,43 @@ class ControllerBase(object):
         self.ctx = ctx
         self.master = Master.factory(self.ctx)
 
-        self.job = Job(np=self.ctx.args.np,
+        self.job = Job(nnodes=self.ctx.args.nnodes,
                        mode=self.ctx.args.mode,
-                       id=self.ctx.args.id)
+                       jid=self.ctx.args.job_id)
         self.pod = Pod()
 
         self.join_server = None
 
-    def run(self):
-        self.build_job()
-        self.build_pod()
+    def deploy_pod(self):
 
-        if len(self.pod.containers) < 1:
-            self.ctx.logger.error("No container in the pod {}".format(self.pod))
-            return
+        assert len(self.pod.containers) > 0, "No container in the pod"
 
         self.ctx.logger.info("Run {}".format(self.pod))
         self.ctx.logger.debug(self.pod.containers[0])
 
+        self.ctx.status.run()
         self.pod.deploy()
 
+    def run(self):
+        self.build_job()
+        self.build_pod()
+
+        self.deploy_pod()
+
         self.watch()
 
     def watch(self) -> bool:
+        self.ctx.logger.info("Watching {}".format(self.pod))
+
         status = self.pod.watch()
 
         if status == self.ctx.status.COMPLETED:
             self.ctx.logger.info("Pod {}".format(status))
         elif status == self.ctx.status.FAILED:
+            fc = self.pod.failed_container()
             self.ctx.logger.info("Pod {}".format(status))
-            self.ctx.logger.error("Container failed !!!\n{}".format(
-                self.pod.failed_container()))
-            self.pod.tail()
+            self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
+            fc[0].tail()
             self.pod.stop()
 
     def stop(self, sigint=None):
diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py
similarity index 96%
rename from python/paddle/distributed/run/controllers/master.py
rename to python/paddle/distributed/launch/controllers/master.py
index 257ba3bad8da3c331ac303b7a3ee415461fd13b8..f9f484eb125eed264cb7e8658476255281a048cf 100644
--- a/python/paddle/distributed/run/controllers/master.py
+++ b/python/paddle/distributed/launch/controllers/master.py
@@ -12,8 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from paddle.distributed.run.utils.kv_client import KVClient
-from paddle.distributed.run.utils.kv_server import KVServer
+from paddle.distributed.launch.utils.kv_client import KVClient
+from paddle.distributed.launch.utils.kv_server import KVServer
 
 import time
 import sys
@@ -84,7 +84,7 @@ class HTTPMaster(Master):
 
             print("Copy the following command to other nodes to run.")
             cmd = [
-                sys.executable.split('/')[-1], "-m", "paddle.distributed.run"
+                sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"
             ]
             cmd.extend(["--master", self.endpoint])
             cmd.extend(sys.argv[1:])
@@ -118,9 +118,12 @@ class HTTPMaster(Master):
         self._stop_server()
 
     def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
+
         if size < 2:
             return [value], 0
 
+        self.ctx.logger.info("Waiting peer ready...")
+
         self.lazy_init()
 
         while not self.ctx.status.is_done():
@@ -130,7 +133,7 @@ class HTTPMaster(Master):
                 self.ctx.logger.warning("master not ready")
                 time.sleep(0.1)
 
-        # 'aaaaaa' make suer main pod (master server) as rank 0
+        # 'aaaaaa' make sure main pod (master server) as rank 0
         ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key
         k = "{}/{}/{}".format(prefix, ky, rank)
 
@@ -177,6 +180,12 @@ class ETCDMaster(Master):
         sync_peers gather all value for key under scope prefix
         result always be sorted either by rank or alphabet of pod.name
         '''
+
+        if size < 2:
+            return [value], 0
+
+        self.ctx.logger.info("Waiting peer ready...")
+
         path = "{}/{}/{}".format(prefix, key, rank)
 
         self.client.delete_prefix(prefix)
diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py
similarity index 98%
rename from python/paddle/distributed/run/controllers/ps.py
rename to python/paddle/distributed/launch/controllers/ps.py
index cc43c336cf1862fe075e5a4463b1f5b666a5005c..d3d0ef59bfd2f8102fe92beecef100c2079b1698 100644
--- a/python/paddle/distributed/run/controllers/ps.py
+++ b/python/paddle/distributed/launch/controllers/ps.py
@@ -22,7 +22,8 @@ class PSController(Controller):
     @classmethod
     def enable(cls, ctx):
         if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len(
-                ctx.args.servers) > 0:
+                ctx.args.servers) > 0 or ctx.args.trainer_num or len(
+                    ctx.args.trainers) > 0:
             ctx.logger.debug("{} enabled".format(cls.__name__))
             ctx.args.mode = ControleMode.PS
             return True
diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch/job/__init__.py
similarity index 78%
rename from python/paddle/distributed/launch.py
rename to python/paddle/distributed/launch/job/__init__.py
index e02a439025b77f9ae612aa790bc521b521fb481f..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644
--- a/python/paddle/distributed/launch.py
+++ b/python/paddle/distributed/launch/job/__init__.py
@@ -1,18 +1,13 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
-# 
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 #     http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from paddle.distributed.fleet import launch
-launch.launch()
-
-__all__ = []
diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/launch/job/container.py
similarity index 90%
rename from python/paddle/distributed/run/job/container.py
rename to python/paddle/distributed/launch/job/container.py
index 651932d6c88378034d7ab9cb05bac00ee3ea7ddf..7105cae9024f25f4b5d63a8d74d5d492777d4046 100644
--- a/python/paddle/distributed/run/job/container.py
+++ b/python/paddle/distributed/launch/job/container.py
@@ -13,12 +13,11 @@
 # limitations under the License.
 
 from collections import OrderedDict
-from paddle.distributed.run.utils.process_context import ProcessContext
+from paddle.distributed.launch.utils.process_context import ProcessContext
 
 from .status import Status
 
 import os, copy, sys
-import time
 
 
 class Container(object):
@@ -78,6 +77,11 @@ class Container(object):
         kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)}
         self._env.update(kwargs)
 
+    def _valide_env(self):
+        for k, v in self._env.items():
+            assert isinstance(k, str) and isinstance(
+                v, str), 'env {}:{} must be str'.format(k, v)
+
     def _get_fd(self, pth):
         if not pth:
             return None
@@ -90,12 +94,12 @@ class Container(object):
         except:
             return None
 
-    def start(self, timeout=-1):
-        end = time.time() + timeout
-
+    def start(self):
         if self._proc and self._proc.alive():
             return True
 
+        self._valide_env()
+
         self._stdout = self._get_fd(self._out) or sys.stdout
         if self._out == self._err:
             self._stderr = self._stdout
@@ -106,14 +110,6 @@ class Container(object):
             self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
         self._proc.start()
 
-        while timeout > 0 and time.time() < end:
-            if self._proc.alive():
-                time.sleep(0.1)
-                continue
-            if self._proc.exit_code() == 0:
-                return True
-            return False
-
     def terminate(self, force=False):
         if self._log_handler:
             self._log_handler.close()
@@ -125,9 +121,11 @@ class Container(object):
     def wait(self, timeout=None):
         self._proc.wait(timeout)
 
+    @property
     def exit_code(self):
         return self._proc.exit_code() if self._proc else -1
 
+    @property
     def status(self):
         if not self._proc:
             return Status.UNINIT
@@ -141,9 +139,9 @@ class Container(object):
     def __str__(self):
         return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format(
             self._rank,
-            self.status(),
+            self.status,
             self._entrypoint,
-            self.exit_code(),
+            self.exit_code,
             self.errfile,
             self._env, )
 
diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/launch/job/job.py
similarity index 89%
rename from python/paddle/distributed/run/job/job.py
rename to python/paddle/distributed/launch/job/job.py
index 3469ed862576faed3bd7546710927f638b8fe0d5..31827968ddce605b424e53f5609ff302e5e4ac95 100644
--- a/python/paddle/distributed/run/job/job.py
+++ b/python/paddle/distributed/launch/job/job.py
@@ -20,16 +20,16 @@ class JobMode:
 
 
 class Job(object):
-    def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"):
+    def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"):
         self._mode = mode
-        self._id = id
+        self._id = jid
 
         self._replicas = 0
         self._replicas_min = self._replicas
         self._replicas_max = self._replicas
         self._elastic = False
 
-        self.set_replicas(str(np))
+        self.set_replicas(str(nnodes))
 
     def __str__(self):
         return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format(
@@ -64,8 +64,8 @@ class Job(object):
     def replicas(self, replicas):
         self._replicas = replicas
 
-    def set_replicas(self, np: str):
-        np = str(np) if np else '1'
+    def set_replicas(self, nnodes: str):
+        np = str(nnodes) if nnodes else '1'
 
         if ':' in np:
             nps = np.split(':')
diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/launch/job/pod.py
similarity index 78%
rename from python/paddle/distributed/run/job/pod.py
rename to python/paddle/distributed/launch/job/pod.py
index f7c31edce1d552befac3a6f54e5e79c326b31c67..701adf45f94e880adef6108b6f7e77ea58155a9e 100644
--- a/python/paddle/distributed/run/job/pod.py
+++ b/python/paddle/distributed/launch/job/pod.py
@@ -34,7 +34,7 @@ class PodSepc(object):
         #self.status: Status = None
 
         self._rank = -1
-        self._init_timeout = 120  # 2 min timeout for each init container
+        self._init_timeout = None
         self._restart = -1
         self._replicas = 0  # number of containers
         self._exit_code = 0
@@ -45,15 +45,15 @@ class Pod(PodSepc):
         super().__init__()
 
     def __str__(self):
-        return "Pod: {}, replicas {}, status {}".format(self.name,
-                                                        self.replicas,
-                                                        self.status())
+        return "Pod: {}, replicas {}, status {}".format(
+            self.name, self.replicas, self.status)
 
     def failed_container(self):
+        cs = []
         for c in self._containers:
-            if c.status() == Status.FAILED:
-                return c
-        return None
+            if c.status == Status.FAILED:
+                cs.append(c)
+        return cs
 
     @property
     def name(self):
@@ -65,7 +65,7 @@ class Pod(PodSepc):
 
     @replicas.setter
     def replicas(self, r):
-        self._replicas = r
+        self._replicas = max(r, 1)
 
     @property
     def rank(self):
@@ -98,13 +98,15 @@ class Pod(PodSepc):
     @property
     def exit_code(self):
         for c in self._containers:
-            if c.exit_code() != 0:
-                return c.exit_code()
+            if c.exit_code != 0:
+                return c.exit_code
         return 0
 
     def deploy(self):
+        # init container should stop before run containers
         for i in self._init_containers:
-            i.start(self._init_timeout)
+            i.start()
+            i.wait(self._init_timeout)
 
         for c in self._containers:
             c.start()
@@ -120,6 +122,7 @@ class Pod(PodSepc):
         for c in self._containers:
             c.wait(None)
 
+    @property
     def status(self):
         if self.is_failed():
             return Status.FAILED
@@ -127,6 +130,9 @@ class Pod(PodSepc):
         if self.is_completed():
             return Status.COMPLETED
 
+        if self.is_running():
+            return Status.RUNNING
+
         return Status.READY
 
     def reset(self):
@@ -135,31 +141,31 @@ class Pod(PodSepc):
 
     def is_failed(self):
         for c in self._containers:
-            if c.status() == Status.FAILED:
+            if c.status == Status.FAILED:
                 return True
         return False
 
     def is_completed(self):
         for c in self._containers:
-            if c.status() != Status.COMPLETED:
+            if c.status != Status.COMPLETED:
+                return False
+        return True
+
+    def is_running(self):
+        for c in self._containers:
+            if c.status != Status.RUNNING:
                 return False
         return True
 
     def logs(self, idx=None):
         if idx is None:
-            if self.failed_container():
-                self.failed_container().logs()
-            else:
-                self._containers[0].logs()
+            self._containers[0].logs()
         else:
             self._containers[idx].logs()
 
     def tail(self, idx=None):
         if idx is None:
-            if self.failed_container():
-                self.failed_container().tail()
-            else:
-                self._containers[0].tail()
+            self._containers[0].tail()
         else:
             self._containers[idx].tail()
 
@@ -175,10 +181,10 @@ class Pod(PodSepc):
         end = time.time() + timeout
         while timeout < 0 or time.time() < end:
             for c in self._containers:
-                if c.status() in any_list:
-                    return c.status()
+                if c.status in any_list:
+                    return c.status
 
-            s = [c.status() for c in self._containers]
+            s = [c.status for c in self._containers]
             if len(set(s)) == 1 and s[0] in all_list:
                 return s[0]
 
diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/launch/job/status.py
similarity index 100%
rename from python/paddle/distributed/run/job/status.py
rename to python/paddle/distributed/launch/job/status.py
diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py
similarity index 71%
rename from python/paddle/distributed/run/plugins/__init__.py
rename to python/paddle/distributed/launch/plugins/__init__.py
index ec91402a7aad359c9860cf737a78cb7c1f1375d1..1862f75a77f65d39715e031b0ba72ebea6ab5523 100644
--- a/python/paddle/distributed/run/plugins/__init__.py
+++ b/python/paddle/distributed/launch/plugins/__init__.py
@@ -30,15 +30,26 @@ def process_args(ctx):
     argdev = ctx.args.devices
     if argdev:
         ctx.node.device.labels = argdev.split(',')
-        ctx.node.device.count = len(ctx.node.device.labels)
         ctx.logger.debug('Device reset by args {}'.format(argdev))
 
 
 def collective_compatible(ctx):
     if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
-        ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0]
+        eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')
+        hosts = set([h.split(':')[0] for h in eps])
+        ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
+        ctx.args.nnodes = len(hosts)
+        ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
+            eps))
+    '''
     if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
-        ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0]
+        eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
+        hosts = set([h.split(':')[0] for h in eps])
+        ctx.args.master = eps[0]
+        ctx.args.nnodes = len(hosts)
+        ctx.logger.info(
+            'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps))
+    '''
 
 
 def rewrite_host_ip(ctx):
diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/launch/utils/__init__.py
similarity index 78%
rename from python/paddle/distributed/run/job/__init__.py
rename to python/paddle/distributed/launch/utils/__init__.py
index 66d2abbce21ebc72cda5373a3d3a242c077beaa8..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644
--- a/python/paddle/distributed/run/job/__init__.py
+++ b/python/paddle/distributed/launch/utils/__init__.py
@@ -11,15 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from .pod import Pod
-from .job import Job
-from .container import Container
-from .status import Status
-
-__all__ = [
-    'Pod',
-    'Job',
-    'Container',
-    'Status',
-]
diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/launch/utils/kv_client.py
similarity index 100%
rename from python/paddle/distributed/run/utils/kv_client.py
rename to python/paddle/distributed/launch/utils/kv_client.py
diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/launch/utils/kv_server.py
similarity index 100%
rename from python/paddle/distributed/run/utils/kv_server.py
rename to python/paddle/distributed/launch/utils/kv_server.py
diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py
similarity index 100%
rename from python/paddle/distributed/run/utils/process_context.py
rename to python/paddle/distributed/launch/utils/process_context.py
diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py
deleted file mode 100644
index 86dff0f1f8056e784268a6ef3a3ebabb44aa9c6d..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/run/context/__init__.py
+++ /dev/null
@@ -1,219 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from argparse import ArgumentParser, REMAINDER
-import os, copy
-
-from paddle.distributed.run import plugins
-
-from .node import Node
-from .status import Status
-
-import logging
-
-
-class Context(object):
-    def __init__(self, enable_plugin=True):
-        os.environ.pop('http_proxy', None)
-        os.environ.pop('https_proxy', None)
-
-        self.args = self.parse_args()
-        self.envs = self.fetch_envs()
-        self.logger = self.get_logger()
-
-        self.node = Node()
-        self.status = Status()
-
-        self.set_env_in_args()
-
-        # design for event queue, later
-        self.events = []
-
-        if enable_plugin:
-            self._enable_plugin()
-
-    def get_envs(self):
-        return self.envs.copy()
-
-    def _enable_plugin(self):
-        for pl in plugins.enabled_plugins:
-            pl(self)
-
-    def parse_args(self):
-        parser = ArgumentParser()
-
-        base_group = parser.add_argument_group("Base Parameters")
-
-        base_group.add_argument(
-            "--master",
-            type=str,
-            default=None,
-            help="the master/rendezvous server, ip:port")
-
-        base_group.add_argument(
-            "--rank", type=int, default=-1, help="the peer rank")
-
-        base_group.add_argument(
-            "--log", type=str, default="INFO", help="log level. Default INFO")
-
-        base_group.add_argument(
-            "--np",
-            type=str,
-            default="1",
-            help="the number of peers, i.e. pod/node number")
-
-        base_group.add_argument(
-            "--nproc_per_node",
-            type=int,
-            default=None,
-            help="the number of processes in a pod")
-
-        base_group.add_argument(
-            "--log_dir",
-            type=str,
-            default="log",
-            help="the path for each process's log. Default ./log")
-        base_group.add_argument(
-            "--mode",
-            type=str,
-            default="collective",
-            help="run mode of the job, collective/ps/ps-heter")
-
-        base_group.add_argument(
-            "--id",
-            type=str,
-            default="default",
-            help="unique id of the job. Default default")
-
-        base_group.add_argument(
-            "--devices",
-            type=str,
-            default=None,
-            help="accelerate devices. as --gpus,npus,xps")
-
-        base_group.add_argument(
-            "--host", type=str, default=None, help="host ip")
-
-        base_group.add_argument(
-            "training_script",
-            type=str,
-            help="the full path of py script,"
-            "followed by arguments for the "
-            "training script")
-
-        base_group.add_argument('training_script_args', nargs=REMAINDER)
-
-        ps_group = parser.add_argument_group("Parameter-Server Parameters")
-        # for parameter server
-        ps_group.add_argument(
-            "--servers",
-            type=str,
-            default='',
-            help="servers endpoints full list")
-        ps_group.add_argument(
-            "--trainers",
-            type=str,
-            default='',
-            help="trainers endpoints full list")
-
-        ps_group.add_argument(
-            "--trainer_num", type=int, default=None, help="number of trainers")
-        ps_group.add_argument(
-            "--server_num", type=int, default=None, help="number of servers")
-        ps_group.add_argument(
-            "--gloo_port", type=int, default=6767, help="gloo http port")
-        ps_group.add_argument(
-            "--with_gloo", type=str, default="0", help="use gloo or not")
-
-        # parameter elastic mode
-        elastic_group = parser.add_argument_group("Elastic Parameters")
-        elastic_group.add_argument(
-            "--max_restart",
-            type=int,
-            default=3,
-            help="the times can restart. Default 3")
-
-        elastic_group.add_argument(
-            "--elastic_level",
-            type=int,
-            default=-1,
-            help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
-        )
-
-        elastic_group.add_argument(
-            "--elastic_timeout",
-            type=int,
-            default=30,
-            help="seconds to wait before elastic perform training")
-        return parser.parse_args()
-
-    def _valide_env(self, key):
-        if key in ['POD_IP']:
-            return True
-        if key.endswith('_VISIBLE_DEVICES'):
-            return True
-        if key.startswith('PADDLE_'):
-            return True
-
-        return False
-
-    def fetch_envs(self):
-        ge = os.environ.copy()
-
-        black_env_list = ['http_proxy', 'https_proxy']
-        for key in black_env_list:
-            ge.pop(key, None)
-
-        return ge
-        '''
-        # use black list instead white list
-        return {k: ge[k] for k in ge if self._valide_env(k)}
-        '''
-
-    def get_logger(self, level=logging.INFO):
-        logger = logging.getLogger("PADDLERUN")
-        logger.setLevel(self.args.log.upper() or level)
-        formatter = logging.Formatter(
-            fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
-        ch = logging.StreamHandler()
-        ch.setFormatter(formatter)
-        logger.addHandler(ch)
-        return logger
-
-    def set_env_in_args(self):
-        env_args = {
-            'POD_IP': 'host',
-            'PADDLE_MASTER': 'master',
-            'PADDLE_DEVICES': 'devices',
-            'PADDLE_NP': 'np',
-            'PADDLE_MODE': 'mode',
-            'PADDLE_LOG': 'log',
-            'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
-            'PADDLE_JOB_ID': 'id',
-            'PADDLE_RANK': 'rank',
-            'PADDLE_LOG_DIR': 'log_dir',
-            'PADDLE_MAX_RESTlRT': 'max_restart',
-            'PADDLE_ELASTIC_LEVEL': 'elastic_level',
-            'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
-            'PADDLE_SERVER_NUM': 'server_num',
-            'PADDLE_TRAINER_NUM': 'trainer_num',
-            'PADDLE_SERVERS_ENDPOINTS': 'servers',
-            'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
-            'PADDLE_GLOO_PORT': 'gloo_port',
-            'PADDLE_WITH_GLOO': 'with_gloo',
-        }
-
-        for k, v in env_args.items():
-            if k in self.envs:
-                setattr(self.args, v, self.envs[k])
diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py
deleted file mode 100644
index d8bbd851ccf83a1ebfac60758576384bbe1aa4f4..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/run/context/device.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-
-
-class DeviceType:
-    CPU = 'cpu'
-    GPU = 'gpu'
-    XPU = 'xpu'
-    NPU = 'npu'
-
-
-class Device(object):
-    def __init__(self, dtype=None, count=1, memory="", labels=""):
-        self.dtype = dtype
-        self.count = count
-        self.memory = memory
-        self.labels = labels
-
-    def __str__(self):
-        return ",".join(self.labels)
-
-    @classmethod
-    def parse_device(self):
-        dev = Device()
-        visible_devices = None
-        if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
-            dev.dtype = DeviceType.GPU
-            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
-                "NVIDIA_VISIBLE_DEVICES")
-        elif 'XPU_VISIBLE_DEVICES' in os.environ:
-            dev.dtype = DeviceType.XPU
-            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
-            dev.dtype = DeviceType.NPU
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-
-        if visible_devices and visible_devices != 'all':
-            dev.labels = visible_devices.split(',')
-            dev.count = len(dev.labels)
-        else:
-            return self.detect_device()
-
-        return dev
-
-    @classmethod
-    def detect_device(self):
-        import paddle.fluid as fluid
-
-        dev = Device()
-        num = 0
-        visible_devices = None
-        if fluid.core.is_compiled_with_cuda():
-            dev.dtype = DeviceType.GPU
-            num = fluid.core.get_cuda_device_count()
-            visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
-                "NVIDIA_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_xpu():
-            dev.dtype = DeviceType.XPU
-            num = fluid.core.get_xpu_device_count()
-            visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
-        elif fluid.core.is_compiled_with_npu():
-            dev.dtype = DeviceType.NPU
-            num = fluid.core.get_npu_device_count()
-            visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
-
-        if num == 0:
-            dev.dtype = DeviceType.CPU
-        elif visible_devices is None or visible_devices == "all" or visible_devices == "":
-            dev.labels = [str(x) for x in range(0, num)]
-            dev.count = num
-        else:
-            dev.labels = visible_devices.split(',')
-            dev.count = len(dev.labels)
-
-        return dev
diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py
deleted file mode 100644
index 0809ed5864da9f3bea29235621e7c29b75823391..0000000000000000000000000000000000000000
--- a/python/paddle/distributed/run/plugins/ip.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import socket
-
-
-def get_local_ip(ctx):
-    _, ip = _get_host_name_ip()
-    ctx.args.host = ip
-    ctx.envs["POD_IP"] = ip
-
-
-def _get_host_name_ip():
-    try:
-        host_name = socket.gethostname()
-        host_ip = socket.gethostbyname(host_name)
-        return host_name, host_ip
-    except:
-        return None
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 6957850d205794363183b4e6ca58a6daf3e11358..f4334085620f510e3d520f89332b754a93aa120a 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -1155,7 +1155,8 @@ class Layer(object):
                 layers[name] = None
             else:
                 _buffers = self.__dict__.get('_buffers', None)
-                if type(value) == core.VarBase:
+                if type(value) == core.VarBase or \
+                    type(value) == core.eager.Tensor:
                     if _buffers is None:
                         raise ValueError(
                             "super(YourLayer, self).__init__() should be called first"
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 44e6f8e8f2a6d11371f21fff5a9dccefcd72ebed..2acf530eea3fbd2d7fbc9cf04d2c792b7175035c 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -949,7 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
 endif()
 
 # setting timeout value as 15S
-set_tests_properties(test_run PROPERTIES TIMEOUT 200)
+set_tests_properties(test_run PROPERTIES TIMEOUT 120)
 set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
index aba95a68ab790828a63b1f46afaaa87f7826d248..9b99e553d182bc5f9ce8cd1835d7600ebfb956f2 100644
--- a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
+++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh
@@ -17,5 +17,4 @@
 set -e
 # use default values
 # FIXME: random fails on Unknown command lines -c (or -m).
-launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
-CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
+CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op.py
diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
index 98ef339e04535bb943add02b6cf6efe490f0354b..8166598677a3eb7ce7a4cb42b8a96b6b100aeb20 100644
--- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py
+++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py
@@ -637,6 +637,10 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
                 self.assertTrue(tensor3.persistable, True)
                 self.assertTrue(tensor3.stop_gradient, True)
                 self.assertTrue(tensor3.place.is_gpu_place())
+                tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
+                tensor5 = tensor4._copy_to(core.CUDAPlace(0), True)
+                self.assertTrue(
+                    np.array_equal(tensor4.numpy(), tensor5.numpy()))
             else:
                 tensor3 = tensor2._copy_to(core.CPUPlace(), True)
                 self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
index 68b354775d13e659492d7b4355ecf162546d2f27..30f943e3248e90820abbef26214e81cf691f50a4 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py
@@ -304,6 +304,35 @@ class API_GraphSendRecvOpTest(unittest.TestCase):
                 "two value is\
                 {}\n{}, check diff!".format(np_res, ret_res))
 
+    def test_set_outsize_gpu(self):
+        if paddle.fluid.core.is_compiled_with_cuda():
+            x = paddle.to_tensor(
+                np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32")
+            src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32")
+            dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32")
+            res = paddle.incubate.graph_send_recv(x, src_index, dst_index,
+                                                  "sum")
+            out_size = paddle.max(dst_index) + 1
+            res_set_outsize = paddle.incubate.graph_send_recv(
+                x, src_index, dst_index, "sum", out_size)
+
+            np_res = np.array(
+                [[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32")
+            np_res_set_outsize = np.array(
+                [[0, 2, 3], [1, 6, 8]], dtype="float32")
+
+            self.assertTrue(
+                np.allclose(
+                    np_res, res, atol=1e-6),
+                "two value is\
+                {}\n{}, check diff!".format(np_res, res))
+            self.assertTrue(
+                np.allclose(
+                    np_res_set_outsize, res_set_outsize, atol=1e-6),
+                "two value is\
+                {}\n{}, check diff!"
+                .format(np_res_set_outsize, res_set_outsize))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
index 6736783d1bc4627cb0b447ad578e6ba4bd5a7b76..27f7903a42e8a1c073b9840b8eebe950b526f5cb 100644
--- a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
+++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py
@@ -195,6 +195,8 @@ class TestDygraphInplaceFlatten(TestDygraphInplace):
         return var.flatten_()
 
 
+"""
+# This case will fail while using `_C_ops.final_state_scatter`.
 class TestDygraphInplaceScatter(TestDygraphInplace):
     def init_data(self):
         self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
@@ -213,6 +215,7 @@ class TestDygraphInplaceScatter(TestDygraphInplace):
             [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
 
         return paddle.scatter_(var, index, updates, overwrite=False)
+"""
 
 
 class TestDygraphInplaceElu(TestDygraphInplace):
diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py
index 8fe5fb9bb9455aa58d84bda03f9e9e16038a3be0..498aecf7c6e75cbb3dd1e73e4aae6151ed65f0db 100644
--- a/python/paddle/fluid/tests/unittests/test_run.py
+++ b/python/paddle/fluid/tests/unittests/test_run.py
@@ -39,9 +39,7 @@ import os
 env = os.environ.copy()
 assert "PADDLE_PSERVERS_IP_PORT_LIST" in env
 assert "PADDLE_TRAINER_ENDPOINTS" in env
-#assert "PADDLE_PSERVER_ENDPOINTS" in env
-#assert "PADDLE_TRAINER_ENDPOINTS" in env
-#assert "PADDLE_ROLE" in env
+assert "PADDLE_ROLE" in env
 #assert "PADDLE_RANK" in env
 '''
 
@@ -62,27 +60,24 @@ class Collective_Test(unittest.TestCase):
         write_file(pyname, colpyfile)
 
     def pdrun(self, args, env=None):
-        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
         if args:
             cmd.extend(args.split(" "))
         cmd.extend([pyname])
         proc = subprocess.Popen(cmd, env)
         return proc
 
-    '''
     def test_collective_1(self):
-        args = "--id test1"
+        args = "--job_id test1"
         p = self.pdrun(args)
         p.wait()
         self.assertTrue(p.poll() == 0)
 
-    '''
-
     def test_collective_2(self):
         if os.path.exists('./log'):
             shutil.rmtree('./log')
 
-        args = "--id test2 --devices 0,1,2"
+        args = "--job_id test2 --devices 0,1,2"
         p = self.pdrun(args)
         p.wait()
         self.assertTrue(p.poll() == 0)
@@ -95,7 +90,7 @@ class Collective_Test(unittest.TestCase):
             shutil.rmtree('./log')
 
         port = random.randrange(6000, 8000)
-        args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
+        args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
             port)
         p1 = self.pdrun(args)
         p2 = self.pdrun(args)
@@ -113,14 +108,13 @@ class PS_Test(unittest.TestCase):
         write_file(pyname, pspyfile)
 
     def pdrun(self, args, env=None):
-        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
+        cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
         if args:
             cmd.extend(args.split(" "))
         cmd.extend([pyname])
         proc = subprocess.Popen(cmd, env)
         return proc
 
-    '''
     def test_ps_1(self):
         args = "--mode ps"
         p = self.pdrun(args)
@@ -131,21 +125,20 @@ class PS_Test(unittest.TestCase):
         if os.path.exists('./log'):
             shutil.rmtree('./log')
 
-        args = "--id ps2 --server_num=2 --trainer_num=2"
+        args = "--job_id ps2 --server_num=2 --trainer_num=2"
         p = self.pdrun(args)
         p.wait()
         self.assertTrue(p.poll() == 0)
 
         c = get_files('log', 'ps2')
         self.assertTrue(len(c) == 5)
-    '''
 
     def test_ps_3(self):
         if os.path.exists('./log'):
             shutil.rmtree('./log')
 
         port = random.randrange(6000, 8000)
-        args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
+        args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
             port)
         p1 = self.pdrun(args)
         p2 = self.pdrun(args)
@@ -161,7 +154,7 @@ class PS_Test(unittest.TestCase):
         if os.path.exists('./log'):
             shutil.rmtree('./log')
 
-        args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
+        args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
         p1 = self.pdrun(args)
         p1.wait()
         self.assertTrue(p1.poll() == 0)
diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py
index 45810621e42076aedbb1c416e7f6f5d90f1c0f6e..05f6a80a442f28fc00d6766747d571664bb389fc 100644
--- a/python/paddle/incubate/operators/graph_send_recv.py
+++ b/python/paddle/incubate/operators/graph_send_recv.py
@@ -19,7 +19,12 @@ from paddle.fluid import core
 from paddle import _C_ops
 
 
-def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
+def graph_send_recv(x,
+                    src_index,
+                    dst_index,
+                    pool_type="sum",
+                    out_size=None,
+                    name=None):
     r"""
 
     Graph Learning Send_Recv combine operator.
@@ -27,7 +32,7 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
     This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory 
     consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index`
     to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor 
-    in different pooling types, like sum, mean, max, or min.
+    in different pooling types, like sum, mean, max, or min. Besides, we can set `out_size` to get necessary output shape.
 
     .. code-block:: text
 
@@ -43,6 +48,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
 
            pool_type = "sum"
 
+           out_size = None
+
            Then:
 
            Out = [[0, 2, 3],
@@ -56,6 +63,9 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
                             The available data type is int32, int64. 
         pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`.
                          Default value is `sum`.
+        out_size (int64|None): We can set `out_size` to get necessary output shape. If not set, then this 
+                              attribute will not be used. If set, it should be equal with or larger than
+                              max(dst_index) + 1.
         name (str, optional): Name for the operation (optional, default is None).
                               For more information, please refer to :ref:`api_guide_Name`.
 
@@ -75,6 +85,21 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
             out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
             # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]
 
+            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+            indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+            src_index = indexes[:, 0]
+            dst_index = indexes[:, 1]
+            out_size = paddle.max(dst_index) + 1
+            out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size)
+            # Outputs: [[0., 2., 3.], [[2., 8., 10.]]]
+
+            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+            indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
+            src_index = indexes[:, 0]
+            dst_index = indexes[:, 1]
+            out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
+            # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]]
+
     """
 
     if pool_type not in ["sum", "mean", "max", "min"]:
@@ -82,9 +107,16 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
             "pool_type should be `sum`, `mean`, `max` or `min`, but received %s"
             % pool_type)
 
+    # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1.
+
     if in_dygraph_mode():
-        out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type',
-                                          pool_type.upper())
+        if out_size is None or out_size <= 0:
+            out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index,
+                                              'pool_type', pool_type.upper())
+        else:
+            out, tmp = _C_ops.graph_send_recv(
+                x, src_index, dst_index, 'pool_type',
+                pool_type.upper(), 'out_size', out_size)
         return out
 
     check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"),
@@ -105,5 +137,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
                 "Dst_index": dst_index},
         outputs={"Out": out,
                  "Dst_count": dst_count},
-        attrs={"pool_type": pool_type.upper()})
+        attrs={
+            "pool_type": pool_type.upper(),
+            "out_size": 0 if out_size is None or out_size <= 0 else out_size
+        })
     return out
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index cef6ba48f83994cfcc47ae408bed1bbe80ddb178..ed7fd91b7a7aa90f4f595d7e33f29d065113bc36 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -578,6 +578,8 @@ def relu_(x, name=None):
     Inplace version of ``relu`` API, the output Tensor will be inplaced with input ``x``.
     Please refer to :ref:`api_nn_cn_relu`.
     """
+    if _in_eager_mode():
+        return _C_ops.final_state_relu_(x)
     return _C_ops.relu_(x)
 
 
diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py
index bdb0eabe2bbb2968a00d5baf3f9ada14e05a635e..9cef336aa54ae4daae14a8bb99d70470e5dd7511 100644
--- a/python/paddle/tensor/creation.py
+++ b/python/paddle/tensor/creation.py
@@ -127,7 +127,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
                     "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
                     "this means the input data contains nested lists with different lengths. "
                 )
-        elif isinstance(data, paddle.Tensor):
+        elif isinstance(data, (paddle.Tensor, core.eager.Tensor)):
             data = data._copy_to(place, False)
             data = _handle_dtype(data, dtype)
             data.stop_gradient = stop_gradient
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index a4972061d420853b4790dce5df457d981127194c..98b9811f85c1f77e39a4ddd027aa426d5ae72fc4 100755
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1735,12 +1735,12 @@ def tile(x, repeat_times, name=None):
 
     Args:
         x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
-        repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
+        repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
             should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
         name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
 
     Returns:
-        N-D Tensor. The data type is the same as ``x``.
+        N-D Tensor. The data type is the same as ``x``. The size of the i-th dimension is equal to ``x[i] * repeat_times[i]``.
 
     Examples:
         .. code-block:: python
@@ -1750,16 +1750,18 @@ def tile(x, repeat_times, name=None):
             data = paddle.to_tensor([1, 2, 3], dtype='int32')
             out = paddle.tile(data, repeat_times=[2, 1])
             np_out = out.numpy()
-            # [[1, 2, 3], [1, 2, 3]]
+            # [[1, 2, 3]
+            #  [1, 2, 3]]
 
-            out = paddle.tile(data, repeat_times=[2, 2])
+            out = paddle.tile(data, repeat_times=(2, 2))
             np_out = out.numpy()
-            # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
+            # [[1, 2, 3, 1, 2, 3]
+            #  [1, 2, 3, 1, 2, 3]]
 
-            repeat_times = paddle.to_tensor([2, 1], dtype='int32')
+            repeat_times = paddle.to_tensor([1, 2], dtype='int32')
             out = paddle.tile(data, repeat_times=repeat_times)
             np_out = out.numpy()
-            # [[1, 2, 3], [1, 2, 3]]
+            # [[1, 2, 3, 1, 2, 3]]
     """
     if paddle.in_dynamic_mode():
         if _in_eager_mode():
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index ba9598db64c416dafe7f8ebbed267913e14a3960..ee6503a39c6f126b52db23d302a59b2ef1196f74 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -36,9 +36,9 @@
     func : conj
 
 - api : copy_to
-  args : (Tensor x, Backend backend, bool blocking)
+  args : (Tensor x, Place place, bool blocking)
   output : Tensor
-  invoke : copy_to_impl(x, backend, blocking)
+  invoke : copy_to_impl(x, place, blocking)
 
 - api : divide
   args : (Tensor x, Tensor y)
@@ -57,7 +57,7 @@
     func : dot
 
 - api : empty
-  args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
+  args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
   output: Tensor
   infer_meta :
     func : CreateInferMeta
@@ -69,7 +69,7 @@
     backend : place
 
 - api : empty_like
-  args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {})
   output: Tensor
   infer_meta :
     func : CreateLikeInferMeta
@@ -89,7 +89,7 @@
     func : flatten
 
 - api : full
-  args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
+  args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
   output: Tensor
   infer_meta :
     func : CreateInferMeta
@@ -101,7 +101,7 @@
     backend : place
 
 - api : full_like
-  args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
+  args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {})
   output: Tensor
   infer_meta :
     func : CreateLikeInferMeta
@@ -138,7 +138,7 @@
     func : multiply
 
 - api : ones_like
-  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
   output : Tensor
   invoke : full_like(x, 1, dtype, place)
 
@@ -219,7 +219,7 @@
     data_type : x
 
 - api : zeros_like
-  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
+  args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {})
   output : Tensor
   invoke : full_like(x, 0, dtype, place)
 
@@ -231,7 +231,7 @@
     func : OneHotInferMeta
   kernel :
     func : one_hot
-    
+
 - api : digamma
   args : (Tensor x)
   output : Tensor
@@ -1484,3 +1484,16 @@
   kernel :
     func : mv
   backward : mv_grad
+
+
+
+#  =================================== sep0
+
+
+#  =================================== sep1
+
+
+#  =================================== sep2
+
+
+#  =================================== sep3
diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py
index bf3d7b3d19eab806706f1d2d654957aac5b33434..a42691d320875ef8d8eaf04faadbb99ab7b41cbf 100644
--- a/python/paddle/utils/code_gen/api_base.py
+++ b/python/paddle/utils/code_gen/api_base.py
@@ -99,7 +99,7 @@ class BaseAPI(object):
             'double': 'double',
             'bool': 'bool',
             'str': 'const std::string&',
-            'Backend': 'Backend',
+            'Place': 'Place',
             'DataLayout': 'DataLayout',
             'DataType': 'DataType',
             'int64[]': 'const std::vector<int64_t>&',
@@ -118,7 +118,7 @@ class BaseAPI(object):
             'float': 'paddle::optional<float>',
             'double': 'paddle::optional<double>',
             'bool': 'paddle::optional<bool>',
-            'Backend': 'paddle::optional<Backend>',
+            'Place': 'paddle::optional<Place>',
             'DataLayout': 'paddle::optional<DataLayout>',
             'DataType': 'paddle::optional<DataType>',
             'int64[]': 'paddle::optional<std::vector<int64_t>>',
@@ -327,9 +327,9 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
         attr_layout_count = 0
         attr_data_type_count = 0
         for attr_name in attrs['names']:
-            if attrs['attr_info'][attr_name][0] == 'Backend':
+            if attrs['attr_info'][attr_name][0] == 'Place':
                 assert kernel['backend'] is not None, \
-                    f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
+                    f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually."
                 attr_backend_count = attr_backend_count + 1
             if attrs['attr_info'][attr_name][0] == 'DataLayout':
                 assert kernel['layout'] is not None, \
@@ -348,8 +348,8 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
                 assert len(
                     vars_list
                 ) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
-                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
-                    f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
+                assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \
+                    f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type."
                 kernel_select_code = kernel_select_code + f"""
   kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
 """
diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml
index 0c55735f3b105b25003d0cd527ea1b4e952ddf95..c6611781b69478fcd86e75458c121b8fee736540 100644
--- a/python/paddle/utils/code_gen/backward.yaml
+++ b/python/paddle/utils/code_gen/backward.yaml
@@ -56,6 +56,16 @@
   kernel :
     func : abs_grad
 
+- backward_api : relu_grad
+  forward : relu (Tensor x) -> Tensor(out)
+  args : (Tensor x, Tensor out_grad)
+  output : Tensor(x_grad)
+  infer_meta :
+    func : UnchangedInferMeta
+    param : [x]
+  kernel :
+    func : relu_grad
+
 - backward_api : trunc_grad
   forward : trunc (Tensor x) -> Tensor(out)
   args : (Tensor out_grad)
@@ -868,3 +878,16 @@
     param : [x, vec]
   kernel :
     func : mv_grad
+
+
+
+#  =================================== sep0
+
+
+#  =================================== sep1
+
+
+#  =================================== sep2
+
+
+#  =================================== sep3
diff --git a/python/setup.py.in b/python/setup.py.in
index 44998bd3e1675f2a3f77edd26c9cd8fa85121b6a..0a10e9dcc698d7433ceab5dd35ee4d5fa1729636 100755
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -282,6 +282,12 @@ packages=['paddle',
           'paddle.distribution',
           'paddle.distributed.sharding',
           'paddle.distributed.fleet',
+          'paddle.distributed.launch',
+          'paddle.distributed.launch.context',
+          'paddle.distributed.launch.controllers',
+          'paddle.distributed.launch.job',
+          'paddle.distributed.launch.plugins',
+          'paddle.distributed.launch.utils',
           'paddle.distributed.fleet.base',
           'paddle.distributed.fleet.elastic',
           'paddle.distributed.fleet.meta_optimizers',
@@ -727,7 +733,7 @@ with redirect_stdout():
         },
         entry_points={
             'console_scripts': [
-                'fleetrun = paddle.distributed.fleet.launch:launch'
+                'fleetrun = paddle.distributed.launch.__main__:launch'
             ]
         },
         classifiers=[