Commit 9501b7e5 authored by P phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -14,9 +14,8 @@ English | [简体中文](./README_cn.md)
Welcome to the PaddlePaddle GitHub.
PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
PaddlePaddle originated from industrial practices, with a dedication and commitment to industrialization. It has been widely adopted across a wide range of sectors, including manufacturing, agriculture, and enterprise services, while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
PaddlePaddle originated from industrial practices, with a dedication and commitment to industrialization. It has been widely adopted across a wide range of sectors, including manufacturing, agriculture, and enterprise services, serving more than 4 million developers and 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
## Installation
......
......@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
barrierTensors.push_back(dt);
}
auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
......@@ -417,7 +417,7 @@ void CheckTensorsInDifferentDevices(const std::vector<Tensor>& tensors,
std::set<Place> used_devices;
for (const auto& t : tensors) {
PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true,
PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true,
platform::errors::InvalidArgument(
"Tensors must be CUDA and dense tensor."));
......
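The two hunks above are part of this commit's Backend-to-Place migration: the barrier tensor is now created on a concrete phi::GPUPlace() rather than the phi::Backend::GPU enum value, and the device predicate is renamed from is_cuda() to is_gpu(). A minimal sketch of the new call shapes, using only signatures that appear in this diff (a fragment, not compilable outside the Paddle tree):

// Allocate a scalar barrier tensor directly on the GPU via a phi::Place.
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
// Device checks are keyed on the place kind rather than "CUDA".
PADDLE_ENFORCE_EQ(dt.is_gpu() && dt.is_dense_tensor(), true,
                  platform::errors::InvalidArgument(
                      "Tensors must be CUDA and dense tensor."));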
......@@ -321,7 +321,7 @@ EagerReducer::EagerReducer(
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
TransToBackend(inner_place_));
inner_place_);
}
}
......@@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups(
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
// experimental::Backend backend = TransToBackend(inner_place_);
group.dense_contents_ = paddle::experimental::empty(
ScalarArray({group.all_length_}), group.dtype_,
TransToBackend(inner_place_));
ScalarArray({group.all_length_}), group.dtype_, inner_place_);
}
// map tensors to this group by VariableLocator
......
......@@ -43,8 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue(
const phi::DataType& dtype, const phi::DataLayout& layout, float value,
bool is_leaf) {
paddle::experimental::Tensor out = paddle::experimental::full(
phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype,
phi::TransToPhiBackend(place));
phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);
auto meta = EagerUtils::autograd_meta(&out);
if (is_leaf) {
......
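The reducer and CreateTensorWithValue hunks drop the TransToBackend()/TransToPhiBackend() hop: paddle::experimental::empty and paddle::experimental::full now take the phi::Place the caller already holds. A sketch of the two call shapes after this change (n, place, ddim, value, and dtype are placeholders for whatever the caller has at hand):

auto flags  = paddle::experimental::empty(
    ScalarArray({static_cast<int32_t>(n)}), DataType::INT32, place);
auto filled = paddle::experimental::full(
    phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);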
......@@ -29,7 +29,7 @@ yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
......@@ -56,6 +56,14 @@ def ParseArguments():
#################
### Helpers ###
#################
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def FindGradName(string):
return string + "_grad"
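These helpers encode the naming scheme the generator relies on for inplace support: an inplaced API is the base name plus a trailing underscore, its backward is the base name plus "_grad", and RecoverBaseNameOfInplaceFunction strips the underscore again (used later so an inplaced op reuses the grad node of its base op). A standalone C++ mirror of the convention, for illustration only:

#include <cassert>
#include <string>

std::string GetInplacedFunctionName(const std::string& name) { return name + "_"; }
std::string RecoverBaseNameOfInplaceFunction(const std::string& name) {
  return name.substr(0, name.size() - 1);  // drop the trailing '_'
}
std::string FindGradName(const std::string& name) { return name + "_grad"; }

int main() {
  assert(GetInplacedFunctionName("relu") == "relu_");
  assert(RecoverBaseNameOfInplaceFunction("relu_") == "relu");
  assert(FindGradName("relu") == "relu_grad");
}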
......@@ -149,6 +157,24 @@ def ReadBwdFile(filepath):
######################
### Yaml Parsers ###
######################
def ParseInplaceInfo(string):
# string: "(x -> out0), (y -> out2)"
inplace_map = {}
for pair in string.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
inplace_map[key] = val
return inplace_map
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
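ParseInplaceInfo above turns the yaml inplace annotation, for example "(x -> out0), (y -> out2)", into a map from each inplaced input to the output it aliases. The same parsing, mirrored as a standalone C++ snippet purely for illustration:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

std::map<std::string, std::string> ParseInplaceInfo(const std::string& spec) {
  std::map<std::string, std::string> inplace_map;
  std::stringstream ss(spec);
  std::string pair;
  auto strip = [](const std::string& s) {
    size_t l = s.find_first_not_of(" ()");
    size_t r = s.find_last_not_of(" ()");
    return s.substr(l, r - l + 1);
  };
  while (std::getline(ss, pair, ',')) {
    size_t arrow = pair.find("->");
    inplace_map[strip(pair.substr(0, arrow))] = strip(pair.substr(arrow + 2));
  }
  return inplace_map;
}

int main() {
  auto m = ParseInplaceInfo("(x -> out0), (y -> out2)");
  std::cout << m["x"] << " " << m["y"] << "\n";  // prints: out0 out2
}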
......@@ -684,9 +710,10 @@ std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std:
def GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs):
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -724,19 +751,19 @@ def GenerateNodeCreationCodes(
output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
if num_fwd_outputs == 1:
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
else:
# Tuple api_result
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
outputs_autograd_meta_list.append(output_autograd_meta)
pass_stop_gradient_args_list.append(output_autograd_meta_name)
......@@ -745,16 +772,34 @@ def GenerateNodeCreationCodes(
outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
# Check Inplace
check_inplace_str = ""
bump_inplace_version_str = ""
for inplace_name in inplace_map.keys():
inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
check_inplace_str += f"""
// Check Inplace
egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
"""
bump_inplace_version_str += f"""
// Bump Inplace Version
{inplace_name}.bump_inplace_version();
VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n
"""
# Node Construction
num_bwd_inputs = len(backward_grad_input_map.keys())
num_bwd_outputs = len(backward_grad_output_map.keys())
grad_node_name = GetGradNodeName(fwd_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
grad_node_name = GetGradNodeName(
RecoverBaseNameOfInplaceFunction(
fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
# SetAttributes
set_attributes_list = []
for name, _, _, _ in backward_attrs_list:
set_attributes = f" grad_node->SetAttribute{name}({name});"
set_attributes = f" grad_node->SetAttribute{name}({name});"
set_attributes_list.append(set_attributes)
set_attributes_str = "\n".join(set_attributes_list)
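For an API carrying an inplace annotation, the template fragments above wrap the forward call between an inplace check and a version bump, and the grad node is constructed from the base (non-underscored) API name so the inplaced variant reuses the existing GradNode class. The emitted code has roughly the following shape for a hypothetical op relu_ with pair (x -> out); the variable and class names below are illustrative, not the generator's exact output:

// Check Inplace
egr::EagerUtils::CheckInplace(x, p_autograd_x, require_any_grad);
// Forward API Call
auto api_result = paddle::experimental::relu_(x);
// Bump Inplace Version
x.bump_inplace_version();
VLOG(3) << "Tensor(" << x.name() << ") uses Inplace Strategy.";
if (require_any_grad) {
  // Node Construction: grad node of the *base* op "relu", so the inplaced
  // variant shares its GradNode class (class name here is illustrative).
  auto grad_node = std::make_shared<GradNodeRelu>(1, 1);
  // ... SetAttributes / SetTensorWrappers / SetGradOutMeta etc. as above ...
}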
......@@ -765,9 +810,9 @@ def GenerateNodeCreationCodes(
if is_fwd_input:
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
if num_fwd_outputs > 1:
# Aligned with forward output position
......@@ -778,9 +823,9 @@ def GenerateNodeCreationCodes(
tw_name = f"api_result"
if is_optional:
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
......@@ -789,8 +834,8 @@ def GenerateNodeCreationCodes(
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta_list.append(set_grad_out_meta)
set_edges_list.append(set_edges)
set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
......@@ -804,14 +849,14 @@ def GenerateNodeCreationCodes(
num_outputs = len(forward_outputs_position_map.keys())
for name, (_, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
if num_outputs == 1:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
else:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_out_rank_list.append(set_out_rank)
set_history_list.append(set_history)
......@@ -823,55 +868,64 @@ def GenerateNodeCreationCodes(
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
set_retain_grad_str = "\n".join(set_retain_grad_list)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
"""
node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
NODE_CREATION_TEMPLATE = """
// Get AutoGradMeta
{}
{}
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
// Node Construction
{}
// SetAttributes
// Forward API Call
{}
{}
// SetTensorWrappers
{{
{}
// SetGradOutMeta & SetEdges
{}
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
// Node Construction
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
// SetAttributes
{}
// SetTensorWrappers
{}
// SetGradOutMeta & SetEdges
{}
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
{}
{}
{}
{}
}}
}}
"""
node_creation_str = NODE_CREATION_TEMPLATE.format(
inputs_autograd_meta_str, outputs_autograd_meta_str,
compute_require_grad_args_str, pass_stop_gradient_args_str,
node_construction_str, set_attributes_str, set_tensor_wrappers_str,
set_grad_out_meta_str, set_edges_str, set_out_rank_str, set_history_str,
set_grad_in_meta_str, set_retain_grad_str)
inputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, forward_call_str, bump_inplace_version_str,
node_creation_event_str, outputs_autograd_meta_str,
pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
set_out_rank_str, set_history_str, set_grad_in_meta_str,
set_retain_grad_str)
return node_creation_str
def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list,
optional_inputs, intermediate_outputs):
def GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -895,7 +949,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
......@@ -958,26 +1015,16 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
node_creation_str = GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """{{\n
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
{}\n
}}"""
node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name,
node_creation_str)
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map)
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
{}
// Forward API Call
{}
{}
......@@ -989,7 +1036,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_function_name = GetForwardFunctionName(fwd_api_name)
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, forward_call_str, node_creation_str, returns_str)
dygraph_event_str, node_creation_str, returns_str)
forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
return forward_function_str, forward_function_declaration_str
......@@ -1191,6 +1238,10 @@ if __name__ == "__main__":
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
inplace_map = {}
if 'inplace' in fwd_api.keys():
inplace_map = ParseInplaceInfo(fwd_api['inplace'])
bwd_api_name = fwd_api['backward']
assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
bwd_api = grad_api_dict[bwd_api_name]
......@@ -1287,7 +1338,7 @@ if __name__ == "__main__":
forward_outputs_position_map, orig_forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs)
intermediate_outputs, {})
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
yaml_forward_definition_str += definition_declaration_pair[0]
......@@ -1298,6 +1349,30 @@ if __name__ == "__main__":
forward_outputs_position_map,
orig_forward_attrs_list)
# Inplaced Version Dygraph Function Generation
if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name_inplaced, bwd_api_name,
forward_inputs_position_map, forward_outputs_position_map,
forward_attrs_list, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list, optional_inputs, intermediate_outputs,
inplace_map)
print("Generated Inplaced Forward Definition: ",
forward_definition_str)
print("Generated Inplaced Forward Declaration: ",
forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(
fwd_api_name_inplaced, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
{yaml_forward_definition_str}
......
......@@ -15,7 +15,7 @@
import os
import argparse
import logging
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
###########################
## Global Configurations ##
......@@ -43,7 +43,7 @@ atype_to_parsing_function = {
"std::vector<std::string>": "CastPyArg2Strings",
"paddle::experimental::Scalar": "CastPyArg2Scalar",
"paddle::experimental::ScalarArray": "CastPyArg2ScalarArray",
"paddle::experimental::Backend": "CastPyArg2Backend",
"paddle::experimental::Place": "CastPyArg2Place",
"paddle::experimental::DataType": "CastPyArg2DataType",
}
......@@ -71,6 +71,14 @@ RECORD_EVENT_TEMPLATE = \
" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
RETURN_INPLACE_PYOBJECT_TEMPLATE = \
"""
ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info, \"final_state_{}\", \"{}\");
ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info, \"final_state_{}\", \"{}\");
return ToPyObject(out, return_id, args, arg_id);
"""
PYTHON_C_FUNCTION_TEMPLATE = \
"""
static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
......@@ -94,7 +102,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
PyEval_RestoreThread(tstate);
tstate = nullptr;
return ToPyObject(out);
{}
}}
catch(...) {{
if (tstate) {{
......@@ -287,9 +295,10 @@ class PythonCSingleFunctionGenerator:
self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
def GeneratePythonCFunction(self):
def GeneratePythonCFunction(self, inplace_map):
namespace = self.namespace
forward_api_name = self.forward_api_name
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if inplace_map else self.forward_api_name
forward_attrs_list = self.forward_attrs_list
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
......@@ -341,19 +350,31 @@ class PythonCSingleFunctionGenerator:
fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
"::", namespace, GetForwardFunctionName(forward_api_name))
if inplace_map:
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
forward_api_name, inplace_input, forward_api_name,
inplace_output)
break
else:
return_str = " return ToPyObject(out);"
# Generate Record Event for performance profiling
pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
"pythonc_record_event", forward_api_name, "pybind_imperative_func")
self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
forward_api_name, pythonc_record_event_str, forward_api_name,
get_eager_tensor_str, parse_attributes_str, fwd_function_name,
dygraph_function_call_str)
dygraph_function_call_str, return_str)
# Generate Python-C Function Registration
self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
forward_api_name, namespace, forward_api_name, forward_api_name)
def run(self):
def run(self, inplace_map):
# Initialized is_forward_only
self.CollectIsForwardOnly()
......@@ -384,7 +405,7 @@ class PythonCSingleFunctionGenerator:
)
# Code Generation
self.GeneratePythonCFunction()
self.GeneratePythonCFunction(inplace_map)
logging.info(
f"Generated Python-C Function: {self.python_c_function_str}")
logging.info(
......@@ -416,12 +437,23 @@ class PythonCYamlGenerator:
for forward_api_content in forward_api_list:
f_generator = PythonCSingleFunctionGenerator(forward_api_content,
namespace)
status = f_generator.run()
status = f_generator.run({})
if status == True:
self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator.python_c_function_str + "\n"
if 'inplace' in forward_api_content.keys():
inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
f_generator_inplace = PythonCSingleFunctionGenerator(
forward_api_content, namespace)
status = f_generator_inplace.run(inplace_map)
if status == True:
self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
def InferNameSpace(self):
yaml_path = self.yaml_path
if "sparse" in yaml_path:
......
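python_c_gen.py now runs the single-function generator a second time for every API that carries an inplace entry. The inplaced wrapper replaces the plain return ToPyObject(out) with RETURN_INPLACE_PYOBJECT_TEMPLATE, so the Python-side object of the inplaced input can be handed back instead of a freshly wrapped output. With hypothetical names filled in (op relu_, pair x -> out), the emitted return path looks roughly like this:

ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info,
                                          "final_state_relu_", "x");
ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info,
                                             "final_state_relu_", "out");
return ToPyObject(out, return_id, args, arg_id);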
......@@ -90,7 +90,7 @@ TEST(Tensor, MemberFunction) {
VLOG(6) << "Set impl";
CHECK_EQ(et3.initialized(), true);
CHECK_EQ(et3.is_cpu(), true);
CHECK_EQ(et3.is_cuda(), false);
CHECK_EQ(et3.is_gpu(), false);
CHECK_EQ(et3.numel(), 2);
auto expected_dim = phi::make_ddim({1, 2});
CHECK_EQ(et3.dims(), expected_dim);
......
......@@ -38,7 +38,7 @@ class GraphSendRecvGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
......@@ -68,6 +68,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<int64_t>(
"out_size",
"(int64_t, default 0)"
"Define the first dimension of Output tensor."
"If set default 0, then the shape of Out is the same with X.")
.SetDefault(0);
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
......@@ -93,6 +99,7 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetType("graph_send_recv_grad");
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));
if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
......@@ -100,7 +107,6 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
op->SetInput("X", this->Input("X"));
op->SetInput("Out", this->Output("Out"));
}
......
......@@ -152,6 +152,21 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
}
}
template <typename T>
__device__ __forceinline__ void ReduceMeanAndVar(T* mean, T* var, T x_mean,
T x_var, int size) {
const int nc = blockIdx.x;
x_mean = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
x_mean, kps::AddFunctor<T>());
x_var = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
x_var, kps::AddFunctor<T>());
__syncthreads();
if (threadIdx.x == 0) {
mean[nc] = static_cast<T>(x_mean / size);
var[nc] = static_cast<T>(x_var / size);
}
}
template <typename T>
__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
int i = blockIdx.x;
......@@ -162,10 +177,7 @@ __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
x_mean += val;
x_var += val * val;
}
x_mean /= size;
x_var /= size;
CudaAtomicAddWithWarp(&mean[i], x_mean);
CudaAtomicAddWithWarp(&var[i], x_var);
ReduceMeanAndVar<T>(mean, var, x_mean, x_var, size);
}
template <typename T, typename AccT, int VecSize>
......@@ -174,21 +186,12 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
int i = blockIdx.x;
AccT x_mean = static_cast<AccT>(0);
AccT x_var = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * size;
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
phi::Array<const T*, 1> ins;
ins[0] = x;
ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_mean, kps::AddFunctor<AccT>());
x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_var, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
mean[i] = static_cast<T>(x_mean / size);
var[i] = static_cast<T>(x_var / size);
}
ReduceMeanAndVar<AccT>(mean, var, x_mean, x_var, size);
}
template <typename T, int flags>
......@@ -272,10 +275,6 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
Tensor temp_var;
temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, mean, static_cast<T>(0));
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
......@@ -319,7 +318,7 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 grids(x_dims[0] * groups);
dim3 blocks(block_size_nchw);
if (size < vec_size) {
if (size < vec_size * block_size_nchw) {
ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
} else {
......@@ -328,6 +327,8 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
x_data, mean_data, temp_var_data, size);
}
} else {
set_zero(dev_ctx, mean, static_cast<T>(0));
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
temp_var_data);
......@@ -424,24 +425,15 @@ __global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
int i = blockIdx.x;
AccT ds_sum = static_cast<AccT>(0);
AccT db_sum = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * imsize;
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
phi::Array<const T*, 2> ins;
ins[0] = x;
ins[1] = dy;
ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
&ds_sum);
ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
ds_sum, kps::AddFunctor<AccT>());
db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
db_sum, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
ds[i] = ds_sum;
db[i] = db_sum;
}
ReduceMeanAndVar<AccT>(db, ds, db_sum, ds_sum, 1);
}
template <typename T>
......@@ -455,8 +447,7 @@ __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
ds_sum += dy[index] * x[index];
db_sum += dy[index];
}
CudaAtomicAddWithWarp(&ds[nc], ds_sum);
CudaAtomicAddWithWarp(&db[nc], db_sum);
ReduceMeanAndVar<T>(db, ds, db_sum, ds_sum, 1);
}
template <typename T>
......@@ -641,13 +632,7 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
}
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 blocks(block_size_nchw);
if (imsize < vec_size) {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
}
if (d_bias) {
set_zero(dev_ctx, d_bias, static_cast<T>(0));
}
if (imsize < vec_size * block_size_nchw) {
ScalarGetDsDbCUDAKernel<
T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
imsize, x_data, dy_data, ds_data, db_data);
......@@ -687,7 +672,6 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
dy_data, d_x_data);
}
} else {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
......
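Two related changes in the group_norm CUDA kernels: the per-warp atomic accumulation is replaced by the shared ReduceMeanAndVar block reduction, which writes its result directly and therefore no longer needs the set_zero initialization on the NCHW fast path, and the scalar-versus-vectorized dispatch now compares the reduction size against a full block's worth of vector loads instead of against vec_size alone. The revised dispatch condition, restated as a standalone helper with a worked number:

// Chooses the scalar kernel only when one block cannot cover `size` elements
// with vector loads; previously the test was just `size < vec_size`.
bool UseScalarGetMeanAndVar(int size, int vec_size, int block_size_nchw) {
  return size < vec_size * block_size_nchw;
}
// e.g. vec_size = 4, block_size_nchw = 128: sizes below 512 take the scalar
// kernel, larger reductions take VectorizedGetMeanAndVarNCHW.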
......@@ -265,67 +265,6 @@ class ReduceKernel : public framework::OpKernel<T> {
framework::TransToPhiDataType(cast_out_dtype), output);
}
};
template <typename DeviceContext, typename OutT, typename Functor>
class BoolReduceKernel : public framework::OpKernel<OutT> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<OutT>(context.GetPlace());
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
// The dims has full dim, set the reduce_all is True
const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
std::set<int> dims_set(dims.begin(), dims.end());
bool full_dim = true;
for (auto i = 0; i < input_dim_size; i++) {
if (dims_set.find(i) == dims_set.end()) {
full_dim = false;
break;
}
}
reduce_all = (reduce_all || full_dim);
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto x = EigenVector<OutT>::Flatten(*input);
auto out = EigenScalar<OutT>::From(*output);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto reduce_dim = Eigen::array<int, 1>({{0}});
Functor functor;
functor(place, &x, &out, reduce_dim);
} else {
int ndim = input->dims().size();
int rdim = dims.size();
// comments for accelerating compiling temporarily.
if (ndim > 6) {
HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
dims, keep_dim);
} else {
HANDLE_DIM(6, 5);
HANDLE_DIM(6, 4);
HANDLE_DIM(6, 3);
HANDLE_DIM(6, 2);
HANDLE_DIM(6, 1);
HANDLE_DIM(5, 4);
HANDLE_DIM(5, 3);
HANDLE_DIM(5, 2);
HANDLE_DIM(5, 1);
HANDLE_DIM(4, 3);
HANDLE_DIM(4, 2);
HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
HANDLE_DIM(1, 1);
}
}
}
};
template <typename DeviceContext, typename T, typename Functor>
void LaunchReduceGradKernel(const framework::ExecutionContext& context,
......
......@@ -132,8 +132,7 @@ void InitTensorWithTensor(TensorObject* self,
self->tensor.set_impl(impl);
VLOG(4) << "Same place, do ShareDataWith";
} else {
self->tensor.set_impl(
src.copy_to(phi::TransToPhiBackend(place), true).impl());
self->tensor.set_impl(src.copy_to(place, true).impl());
VLOG(4) << "Different place, do TensorCopy";
}
if (src.get_autograd_meta()) {
......@@ -156,8 +155,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
} else {
auto temp =
paddle::experimental::Tensor(std::make_shared<phi::DenseTensor>(src));
self->tensor.set_impl(
temp.copy_to(phi::TransToPhiBackend(place), true).impl());
self->tensor.set_impl(temp.copy_to(place, true).impl());
VLOG(4) << "Different place, do TensorCopy";
}
egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
......
......@@ -159,7 +159,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2);
bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
dst = src.copy_to(phi::TransToPhiBackend(place), blocking);
dst = src.copy_to(place, blocking);
egr::EagerUtils::autograd_meta(&dst)->SetStopGradient(
egr::EagerUtils::autograd_meta(&(src))->StopGradient());
egr::EagerUtils::autograd_meta(&dst)->SetPersistable(
......
......@@ -175,7 +175,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr);
if (self->tensor.is_cpu()) {
if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
platform::CPUPlace place;
......@@ -184,7 +184,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::array_proxy(array)->data),
place, dense_tensor->data(), sizeof_dtype * numel);
#if defined(PADDLE_WITH_CUDA)
} else if (self->tensor.is_cuda()) {
} else if (self->tensor.is_gpu()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
......@@ -218,8 +218,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
EAGER_TRY
auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0);
bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
auto cp_tensor =
self->tensor.copy_to(phi::TransToPhiBackend(place), blocking);
auto cp_tensor = self->tensor.copy_to(place, blocking);
egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
egr::EagerUtils::autograd_meta(&cp_tensor)
->SetPersistable(
......@@ -231,8 +230,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto cp_tensor =
self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true);
auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true);
egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
egr::EagerUtils::autograd_meta(&cp_tensor)
->SetPersistable(
......
......@@ -929,28 +929,10 @@ std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
return result;
}
paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos) {
if (obj == Py_None) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be "
"int or place, but got %s",
op_type, arg_pos + 1,
((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT
}
PyTypeObject* type = obj->ob_type;
auto type_name = std::string(type->tp_name);
if (type_name == "int") {
int value = CastPyArg2Int(obj, op_type, arg_pos);
return static_cast<paddle::experimental::Backend>(value);
} else {
platform::Place place = CastPyArg2Place(obj, arg_pos);
return phi::TransToPhiBackend(place);
}
return paddle::experimental::Backend::CPU;
paddle::experimental::Place CastPyArg2Place(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos) {
return CastPyArg2Place(obj, arg_pos);
}
paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
......
......@@ -154,9 +154,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
paddle::experimental::ScalarArray CastPyArg2ScalarArray(
PyObject* obj, const std::string& op_type, ssize_t arg_pos);
paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos);
paddle::experimental::Place CastPyArg2Place(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos);
paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
const std::string& op_type,
......
......@@ -31,7 +31,6 @@ using gpuStream_t = hipStream_t;
#include "paddle/phi/api/ext/dll_decl.h"
#include "paddle/phi/api/ext/place.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/place.h"
......@@ -269,12 +268,20 @@ class PADDLE_API Tensor final {
bool is_cpu() const;
/**
* @brief Determine whether the tensor device is CUDA
* @brief Determine whether the tensor device is GPU
*
* @return true
* @return false
*/
bool is_cuda() const;
bool is_gpu() const;
/**
* @brief Determine whether the tensor device is GPU_PINNED
*
* @return true
* @return false
*/
bool is_gpu_pinned() const;
/* Part 4: Data Access methods */
......@@ -407,11 +414,11 @@ class PADDLE_API Tensor final {
/**
* @brief Transfer the current Tensor to the specified device and return.
*
* @param backend, The target backend of which the tensor will copy to.
* @param place, The target place of which the tensor will copy to.
* @param blocking, Should we copy this in sync way.
* @return Tensor
*/
Tensor copy_to(Backend backend, bool blocking) const;
Tensor copy_to(Place place, bool blocking) const;
/**
* @brief Transfer the source Tensor to current Tensor.
......
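tensor.h now declares the place-based surface used throughout this commit: is_gpu() and the new is_gpu_pinned() replace is_cuda(), and copy_to takes a phi::Place plus a blocking flag. A short usage fragment built only from calls declared in this diff (it assumes an existing paddle::experimental::Tensor t):

// Copy to host, waiting for the copy to finish.
paddle::experimental::Tensor host_copy = t.copy_to(phi::CPUPlace(), /*blocking=*/true);
// Copy to the GPU without blocking the caller.
paddle::experimental::Tensor dev_copy = t.copy_to(phi::GPUPlace(), /*blocking=*/false);
// Place predicates follow the place names.
bool on_gpu = t.is_gpu();
bool pinned = t.is_gpu_pinned();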
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/binary.h"
......@@ -31,9 +32,10 @@ limitations under the License. */
namespace paddle {
namespace experimental {
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
kernel_key_set.backend_set =
kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
......@@ -57,8 +59,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(
*dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out);
(*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
return out;
}
......
......@@ -15,15 +15,14 @@ limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
namespace paddle {
namespace experimental {
// TODO(chenweihang): Replace backend by place when place is ready
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking);
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking);
std::vector<Tensor> split_impl(const Tensor& x,
const ScalarArray& num_or_sections,
......
......@@ -35,7 +35,7 @@ class BackendSet final {
: bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
1)) {}
uint64_t bitset() const { return bitset_; }
inline uint64_t bitset() const { return bitset_; }
bool inline Has(Backend b) const {
PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED.");
......
......@@ -39,7 +39,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
const TransformFlag& transform_flag) {
bool ret = transform_flag.need_trans_backend() &&
target != Backend::ALL_BACKEND &&
!platform::is_same_place(input, phi::TransToPhiPlace(target));
phi::TransToPhiBackend(input) != target;
return ret;
}
......@@ -180,21 +180,20 @@ std::shared_ptr<phi::DenseTensor> PrepareData(
const phi::TensorArgDef& target_args_def,
const TransformFlag& transform_flag) {
const auto& tensor_in = input.impl();
VLOG(6) << tensor_in->dtype() << "\t" << target_args_def.dtype;
if (!transform_flag.NeedTransform() || !tensor_in->initialized() ||
phi::DenseTensor& dense_tensor =
*static_cast<phi::DenseTensor*>(tensor_in.get());
if (!transform_flag.NeedTransform() || !dense_tensor.initialized() ||
(!NeedTransformPlace(
tensor_in->place(), target_args_def.backend, transform_flag) &&
dense_tensor.place(), target_args_def.backend, transform_flag) &&
!NeedTransformDataType(
tensor_in->dtype(), target_args_def.dtype, transform_flag) &&
dense_tensor.dtype(), target_args_def.dtype, transform_flag) &&
!NeedTransformLayout(
tensor_in->layout(), target_args_def.layout, transform_flag))) {
dense_tensor.layout(), target_args_def.layout, transform_flag))) {
return std::static_pointer_cast<phi::DenseTensor>(tensor_in);
}
phi::DenseTensor out =
TransformData(*(static_cast<phi::DenseTensor*>(tensor_in.get())),
target_args_def,
transform_flag);
TransformData(dense_tensor, target_args_def, transform_flag);
return std::make_shared<phi::DenseTensor>(std::move(out));
}
......
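The data-transform predicate now compares the input's backend, obtained via TransToPhiBackend, against the kernel's target backend instead of comparing concrete places, and PrepareData casts the impl to a DenseTensor once up front. A toy restatement of the new condition (the enum is a stand-in, not phi's real Backend):

enum class Backend { UNDEFINED, CPU, GPU, ALL_BACKEND };  // toy stand-in

// Mirrors the revised NeedTransformPlace: only the backend kind matters now,
// so an input already on the target backend is passed through untouched.
bool NeedTransformPlace(Backend input_backend, Backend target,
                        bool need_trans_backend) {
  return need_trans_backend && target != Backend::ALL_BACKEND &&
         input_backend != target;
}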
......@@ -30,7 +30,7 @@ class DataTypeSet final {
? 0
: 1ULL << (static_cast<uint8_t>(dtype) - 1)) {}
uint64_t bitset() const { return bitset_; }
inline uint64_t bitset() const { return bitset_; }
bool inline Has(DataType dtype) const {
PD_CHECK(dtype != DataType::UNDEFINED,
......
......@@ -16,13 +16,16 @@ limitations under the License. */
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/core/compat/convert_utils.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace paddle {
namespace experimental {
namespace detail {
BackendSet GetTensorBackendSet(const Tensor& t) {
BackendSet backend_set(phi::TransToPhiBackend(t.inner_place()));
BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
BackendSet backend_set(phi::TransToPhiBackend(t.place()));
switch (t.layout()) {
case DataLayout::MKLDNN:
backend_set = backend_set | BackendSet(Backend::MKLDNN);
......@@ -35,6 +38,11 @@ BackendSet GetTensorBackendSet(const Tensor& t) {
}
std::size_t CountLeadingZeros(uint64_t val) {
#if defined(__clang__) || defined(__GNUC__)
return __builtin_clzl(val);
#elif defined(_MSC_VER)
return __lzcnt64(val);
#else
if (val == 0) {
return 64;
}
......@@ -48,6 +56,7 @@ std::size_t CountLeadingZeros(uint64_t val) {
}
}
return zero_bits;
#endif
}
} // namespace detail
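CountLeadingZeros now prefers compiler intrinsics (__builtin_clzl on GCC/Clang, __lzcnt64 on MSVC) and keeps the bit loop only as a portable fallback; ParseBackend later recovers the highest-priority backend from a BackendSet bitset as 64 minus this count. A standalone mirror with a small check (the GPU enum value used in main is an assumption for illustration):

#include <cassert>
#include <cstddef>
#include <cstdint>

std::size_t CountLeadingZeros(std::uint64_t val) {
#if defined(__clang__) || defined(__GNUC__)
  return val == 0 ? 64 : __builtin_clzll(val);
#else
  std::size_t zero_bits = 0;
  for (std::uint64_t mask = 1ULL << 63; mask != 0 && !(val & mask); mask >>= 1) {
    ++zero_bits;
  }
  return zero_bits;
#endif
}

int main() {
  // A BackendSet stores backend b at bit (b - 1); 64 - clz recovers b.
  std::uint64_t bitset = 1ULL << (2 - 1);       // assume Backend::GPU == 2
  assert(64 - CountLeadingZeros(bitset) == 2);  // highest-priority backend
}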
......@@ -82,13 +91,17 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) {
return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor);
}
Backend ParseBackend(Backend backend) { return backend; }
Backend ParseBackend(const Place& place) {
return phi::TransToPhiBackend(place);
}
Backend ParseBackend(const Tensor& tensor) {
return phi::TransToPhiBackend(tensor.inner_place());
}
Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) {
return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor);
Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) {
return place.GetType() != phi::AllocationType::UNDEFINED
? ParseBackend(place)
: ParseBackend(tensor);
}
DataLayout ParseLayout(DataLayout layout) { return layout; }
......
......@@ -33,7 +33,7 @@ namespace paddle {
namespace experimental {
namespace detail {
BackendSet GetTensorBackendSet(const Tensor& t);
BackendSet GetTensorBackendSet(const phi::TensorBase& t);
std::size_t CountLeadingZeros(uint64_t val);
} // namespace detail
......@@ -93,11 +93,13 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
// TODO(chenweihang): deal with multiple diff input Tensors
// TODO(chenweihang): add global device guard method to set backend
void operator()(const Tensor& x) {
key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
// TODO(chenweihang): selecte multi layout and dtype
key_set.layout = x.layout();
key_set.dtype = x.type();
dtype_set = dtype_set | DataTypeSet(x.dtype());
const phi::TensorBase& tensor = *x.impl();
key_set.backend_set =
key_set.backend_set | detail::GetTensorBackendSet(tensor);
// TODO(chenweihang): select multi layout and dtype
key_set.layout = tensor.layout();
key_set.dtype = tensor.dtype();
dtype_set = dtype_set | DataTypeSet(key_set.dtype);
auto promote_result = PromoteTypes(dtype_set);
if (promote_result != DataType::UNDEFINED) {
key_set.dtype = promote_result;
......@@ -105,11 +107,12 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
}
void operator()(const std::vector<Tensor>& x) {
const phi::TensorBase& tensor = *x.at(0).impl();
key_set.backend_set =
key_set.backend_set | detail::GetTensorBackendSet(x[0]);
// TODO(chenweihang): selecte multi layout and dtype
key_set.layout = x[0].layout();
key_set.dtype = x[0].type();
key_set.backend_set | detail::GetTensorBackendSet(tensor);
// TODO(chenweihang): select multi layout and dtype
key_set.layout = tensor.layout();
key_set.dtype = tensor.dtype();
}
// skip other type args, these args don't used in kernel selection
......@@ -154,7 +157,7 @@ DataType ParseDataType(const Tensor& tensor);
DataType ParseDataType(const std::vector<Tensor>& tensors);
DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor);
Backend ParseBackend(Backend backend);
Backend ParseBackend(const Place& place);
Backend ParseBackend(const Tensor& tensor);
template <typename T, typename... Args>
Backend ParseBackend(T t, Args... args) {
......@@ -163,7 +166,7 @@ Backend ParseBackend(T t, Args... args) {
return static_cast<Backend>(64 -
detail::CountLeadingZeros(backend_set.bitset()));
}
Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor);
Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor);
DataLayout ParseLayout(DataLayout layout);
DataLayout ParseLayout(const Tensor& tensor);
......
......@@ -163,10 +163,14 @@ bool Tensor::is_cpu() const {
return paddle::platform::is_cpu_place(inner_place());
}
bool Tensor::is_cuda() const {
bool Tensor::is_gpu() const {
return paddle::platform::is_gpu_place(inner_place());
}
bool Tensor::is_gpu_pinned() const {
return paddle::platform::is_cuda_pinned_place(inner_place());
}
/* Part 4: Data Access methods */
template <typename T>
......
......@@ -27,14 +27,14 @@ namespace paddle {
namespace experimental {
// declare cast api
Tensor cast(const Tensor &x, DataType out_dtype);
Tensor copy_to(const Tensor &x, Backend backend, bool blocking);
Tensor copy_to(const Tensor &x, Place place, bool blocking);
Tensor Tensor::cast(DataType target_type) const {
return experimental::cast(*this, target_type);
}
Tensor Tensor::copy_to(Backend backend, bool blocking) const {
return experimental::copy_to(*this, backend, blocking);
Tensor Tensor::copy_to(Place place, bool blocking) const {
return experimental::copy_to(*this, place, blocking);
}
template <typename T>
......@@ -44,7 +44,7 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
"`copy_to` method without template argument instead. "
"reason: copying a Tensor to another device does not need "
"to specify the data type template argument.";
return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false);
return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false);
}
template PADDLE_API Tensor
......
......@@ -203,5 +203,10 @@ namespace paddle {
namespace experimental {
using AllocationType = phi::AllocationType;
using Place = phi::Place;
using CPUPlace = phi::CPUPlace;
using GPUPlace = phi::GPUPlace;
using GPUPinnedPlace = phi::GPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using NPUPlace = phi::NPUPlace;
} // namespace experimental
} // namespace paddle
......@@ -26,13 +26,14 @@ limitations under the License. */
namespace phi {
Backend TransToPhiBackend(const phi::Place& place) {
if (place.GetType() == phi::AllocationType::CPU) {
auto allocation_type = place.GetType();
if (allocation_type == phi::AllocationType::CPU) {
return Backend::CPU;
} else if (place.GetType() == phi::AllocationType::GPU) {
} else if (allocation_type == phi::AllocationType::GPU) {
return Backend::GPU;
} else if (place.GetType() == phi::AllocationType::XPU) {
} else if (allocation_type == phi::AllocationType::XPU) {
return Backend::XPU;
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
} else if (allocation_type == phi::AllocationType::CUSTOM) {
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
......
......@@ -145,6 +145,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
const MetaTensor& src_index,
const MetaTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
MetaTensor* out,
MetaTensor* dst_count) {
auto src_index_dims = src_index.dims();
......@@ -187,11 +188,23 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
"Src_index and Dst_index should have the same shape."));
auto dims = x.dims();
out->set_dims(dims);
if (out_size <= 0) {
out->set_dims(dims);
} else {
std::vector<int64_t> dims_ = phi::vectorize(dims);
if (dims_.size() > 0) {
dims_[0] = out_size;
}
out->set_dims(phi::make_ddim(dims_));
}
out->set_dtype(x.dtype());
if (pool_type == "MEAN") {
dst_count->set_dims({dims[0]});
if (out_size <= 0) {
dst_count->set_dims({dims[0]});
} else {
dst_count->set_dims({out_size});
}
dst_count->set_dtype(DataType::INT32);
}
}
......
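GraphSendRecvInferMeta threads the new out_size attribute through shape inference: a non-positive out_size keeps the input shape, otherwise the leading dimension is replaced by out_size, and Dst_count shrinks to out_size for MEAN pooling. The shape rule as a small standalone function:

#include <cassert>
#include <cstdint>
#include <vector>

// Output shape rule introduced above: out_size <= 0 keeps x's shape,
// otherwise the leading dimension becomes out_size.
std::vector<int64_t> GraphSendRecvOutDims(std::vector<int64_t> x_dims,
                                          int64_t out_size) {
  if (out_size > 0 && !x_dims.empty()) {
    x_dims[0] = out_size;
  }
  return x_dims;
}

int main() {
  assert((GraphSendRecvOutDims({10, 16}, 0) == std::vector<int64_t>{10, 16}));
  assert((GraphSendRecvOutDims({10, 16}, 4) == std::vector<int64_t>{4, 16}));
}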
......@@ -51,6 +51,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
const MetaTensor& src_index,
const MetaTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
MetaTensor* out,
MetaTensor* dst_count);
......
......@@ -23,15 +23,14 @@
namespace phi {
template <typename T, typename IndexT, typename Functor>
void GraphSendRecvCpuGradLoop(const int& input_size,
const int& index_size,
void GraphSendRecvCpuGradLoop(const int& index_size,
const IndexT* s_index,
const IndexT* d_index,
const DenseTensor& src,
const DenseTensor& input,
DenseTensor* dst,
const std::string& pool_type,
const int* dst_count = nullptr,
const DenseTensor* input = nullptr,
const DenseTensor* output = nullptr) {
if (pool_type == "SUM") {
Functor functor;
......@@ -55,7 +54,7 @@ void GraphSendRecvCpuGradLoop(const int& input_size,
for (int i = 0; i < index_size; ++i) {
const IndexT& forward_src_idx = d_index[i];
const IndexT& forward_dst_idx = s_index[i];
auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
auto input_slice = input.Slice(forward_src_idx, forward_src_idx + 1);
auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
auto eigen_input = phi::EigenVector<T>::Flatten(input_slice);
auto eigen_output = phi::EigenVector<T>::Flatten(output_slice);
......@@ -73,18 +72,18 @@ template <typename Context, typename T, typename IndexT>
void GraphSendRecvGradOpKernelLaunchHelper(
const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
DenseTensor* x_grad,
const DenseTensor* dst_count = nullptr,
const DenseTensor* x = nullptr,
const DenseTensor* out = nullptr) {
const int& index_size = dst_index.dims()[0];
ctx.template Alloc<T>(x_grad);
T* p_output = x_grad->data<T>();
const auto& src_dims = out_grad.dims();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
const size_t& memset_bytes = memset_size * sizeof(T);
......@@ -97,29 +96,22 @@ void GraphSendRecvGradOpKernelLaunchHelper(
if (pool_type == "SUM") {
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type);
index_size, d_index, s_index, out_grad, x, x_grad, pool_type);
} else if (pool_type == "MEAN") {
const int* s_count = dst_count->data<int>();
// Functor not used here.
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
index_size,
d_index,
s_index,
out_grad,
x_grad,
pool_type,
s_count);
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count);
} else if (pool_type == "MIN" || pool_type == "MAX") {
// Functor not used here.
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(src_dims[0],
index_size,
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(index_size,
d_index,
s_index,
out_grad,
x,
x_grad,
pool_type,
nullptr,
x,
out);
}
}
......@@ -127,7 +119,7 @@ void GraphSendRecvGradOpKernelLaunchHelper(
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......@@ -139,23 +131,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
GraphSendRecvGradOpKernelLaunchHelper<Context, T, int32_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvGradOpKernelLaunchHelper<Context, T, int64_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
}
}
......
......@@ -83,6 +83,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count = nullptr) {
const int& index_size = src_index.dims()[0];
......@@ -91,7 +92,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
T* p_output = out->data<T>();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
if (out_size <= 0) {
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
} else {
memset_size = out_size;
for (int i = 1; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
}
const size_t& memset_bytes = memset_size * sizeof(T);
memset(p_output, 0, memset_bytes);
......@@ -129,15 +139,16 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count) {
auto index_type = src_index.dtype();
if (index_type == phi::DataType::INT32) {
GraphSendRecvOpKernelLaunchHelper<Context, T, int32_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvOpKernelLaunchHelper<Context, T, int64_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
}
}
......
......@@ -99,8 +99,8 @@ void ReduceSumGradKernel(const Context& dev_ctx,
ReduceGradKernel<Context, T, funcs::SumGradFunctor, true>(dev_ctx,
x,
out_grad,
paddle::none,
out_grad,
dims,
keep_dim,
reduce_all,
......@@ -121,8 +121,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MeanGradFunctor, true>(dev_ctx,
x,
out_grad,
paddle::none,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -14,7 +14,7 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
......@@ -143,7 +143,7 @@ static void ModeAssign(const Type& input_height,
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T>
static void GetModebySort(const phi::GPUContext& dev_ctx,
const DenseTensor* input_tensor,
......
......@@ -168,57 +168,82 @@ __global__ void CumsumOneBlock(
}
}
// where_index
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary,
int IsMaskData>
int MaskData>
struct SelectCaller {
__device__ void inline operator()(OutT *store_data,
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
// where_index op
IdT index_reg[VecSize];
// Set data index of global
kps::InitWithDataIndex<IdT, VecSize, 1, 1>(&index_reg[0], data_offset);
int data_offset,
int store_num,
int thread_fix,
int num) {
int64_t in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
// set index
kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
// Get store data according to mask_idt
kps::OperatorTernary<MT, IdT, OutT, Functor>(
store_data, mask_data, &index_reg[0], func, VecSize);
kps::OperatorTernary<MT, int64_t, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
}
};
// masked_select
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary>
struct SelectCaller<OutT,
MT,
InT,
IdT,
Functor,
VecSize,
IsBoundary,
1> { // masked_select
__device__ void inline operator()(OutT *store_data,
struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
int data_offset,
int store_num,
int thread_fix,
int num) {
InT in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
// Get store data according to mask_idt
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
}
};
// masked_select_grad
template <typename OutT,
typename MT,
typename InT,
typename Functor,
int VecSize,
int IsBoundary>
struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int data_offset,
int store_num,
int thread_fix,
int num) {
InT in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
}
};
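The MaskData template parameter now selects among three behaviours: 0 for where_index, 1 for masked_select, and 2 for the new masked_select_grad path. A rough NumPy sketch of the semantics each specialization is expected to produce:

import numpy as np

mask = np.array([True, False, True, True])
x = np.array([1., 2., 3., 4.])

# MaskData == 0 (where_index): indices of the true entries
idx = np.flatnonzero(mask)            # [0, 2, 3]

# MaskData == 1 (masked_select): gather values at the true positions
sel = x[mask]                         # [1., 3., 4.]

# MaskData == 2 (masked_select_grad): scatter out_grad back, zeros elsewhere
out_grad = np.array([10., 30., 40.])
x_grad = np.zeros_like(x)
x_grad[mask] = out_grad               # [10., 0., 30., 40.]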
......@@ -253,7 +278,6 @@ SelectKernelImpl(OutT *out,
IdT num_thread[kCVecSize];
IdT cumsum_thread[kCVecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
MT mask_data[VecSize];
IdT mask_idt[VecSize];
// init data_pr
......@@ -271,17 +295,15 @@ SelectKernelImpl(OutT *out,
// Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
// thread_fix
kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
// Get store data(index) according to mask_idt
SelectCaller<OutT, MT, InT, IdT, Functor, VecSize, IsBoundary, MaskData>
compute;
compute(&store_data[0], &mask_data[0], in, func, num, data_offset);
// get thread_fix
int thread_fix =
(static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
// get how many data need to store
int store_num = static_cast<int>(num_thread[0]) * store_rank;
// thread store num data, each thread may has different num
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
// Get store data(index) according to mask_idt
SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, MaskData> select;
select(out, mask_data, in, func, data_offset, store_num, thread_fix, num);
}
template <typename MT,
......@@ -303,15 +325,17 @@ __global__ void SelectKernel(OutT *out,
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
int repeat = 0;
int size = VecSize * BLOCK_ID_X;
CT block_store_offset = 0;
for (; data_offset < main_offset; data_offset += stride) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
// niuliling todo: use ReadData API
int block_store_offset = cumsum[idx_cumsum];
kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, false>(
out + block_store_offset * store_rank,
out + out_fix,
mask + data_offset,
in + data_offset,
in + in_fix,
func,
size,
data_offset,
......@@ -323,12 +347,13 @@ __global__ void SelectKernel(OutT *out,
if (num > 0) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
// niuliling todo: use ReadData API
int block_store_offset = static_cast<int>(cumsum[idx_cumsum]);
kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, true>(
out + block_store_offset * store_rank,
out + out_fix,
mask + data_offset,
in + data_offset,
in + in_fix,
func,
num,
data_offset,
......@@ -402,6 +427,7 @@ void SelectKernel(const KPDevice &dev_ctx,
const int kCumVesize = 2;
const int block_c = 256;
const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c));
using Add = kps::AddFunctor<CT>;
CumsumOneBlock<CT, CT, Add, kCumVesize><<<1, block_c, 0, stream>>>(
count_data, cumsum_data, size_count_block, main_offset_c, Add());
......@@ -418,10 +444,13 @@ void SelectKernel(const KPDevice &dev_ctx,
dev_ctx.Wait();
// 3.1.2 allock for out with total_true_num
std::vector<int64_t> out_dim = {static_cast<int64_t>(total_true_num)};
if (SelectData == 0) { // where_index
if (SelectData == 1) {
out->Resize(phi::make_ddim(out_dim));
} else if (SelectData == 0) { // == 0 where_index
out_dim.push_back(rank);
out->Resize(phi::make_ddim(out_dim));
}
out->Resize(phi::make_ddim(out_dim));
auto out_data = out->mutable_data<OutT>(cuda_place);
// 3.2 get true data's index according to cond_data and cumsum_data
if (total_true_num <= 0) return;
......
......@@ -87,7 +87,8 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_gpu_place = dst_place;
......
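The branch above now also accepts CUDA-pinned sources for host-to-device copies; a small dygraph sketch of the path this enables (mirroring the eager-mode test further below):

import paddle

if paddle.is_compiled_with_cuda():
    # tensor backed by page-locked host memory
    t = paddle.to_tensor([1, 2, 3], place=paddle.CUDAPinnedPlace())
    g = t.cuda()  # pinned -> GPU copy; previously only plain CPU -> GPU was routed here
    print(g.place)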
......@@ -28,19 +28,19 @@ template <typename Context, typename T, typename IndexT>
void GraphSendRecvGradOpCUDAKernelLaunchHelper(
const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
DenseTensor* x_grad,
const DenseTensor* dst_count = nullptr,
const DenseTensor* x = nullptr,
const DenseTensor* out = nullptr) {
const int& index_size = dst_index.dims()[0];
ctx.template Alloc<T>(x_grad);
T* p_output = x_grad->data<T>();
const auto& src_dims = out_grad.dims();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
......@@ -86,7 +86,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_src, d_index, s_index, p_output, index_size, slice_size, s_count);
} else if (pool_type == "MAX" || pool_type == "MIN") {
const T* ptr_input = x->data<T>();
const T* ptr_input = x.data<T>();
const T* ptr_output = out->data<T>();
ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_src,
......@@ -103,7 +103,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......@@ -115,23 +115,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int64_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
}
}
......
......@@ -32,6 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count = nullptr) {
const int& index_size = src_index.dims()[0];
......@@ -39,8 +40,15 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
T* p_output = out->data<T>();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
if (out_size <= 0) {
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
} else {
memset_size = out_size;
for (int i = 1; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
}
const size_t& memset_bytes = memset_size * sizeof(T);
if (pool_type == "SUM" || pool_type == "MEAN") {
......@@ -100,6 +108,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
IndexT>><<<grid, block, 0, ctx.stream()>>>(
p_src, s_index, d_index, p_output, index_size, slice_size, functor);
if (out_size > 0) {
input_size = out_size;
}
int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
int64_t grid_max =
grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
......@@ -114,6 +125,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
IndexT>><<<grid, block, 0, ctx.stream()>>>(
p_src, s_index, d_index, p_output, index_size, slice_size, functor);
if (out_size > 0) {
input_size = out_size;
}
int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
int64_t grid_min =
grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
......@@ -130,6 +144,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
ctx.template Alloc<int32_t>(dst_count);
int32_t* p_dst_count = dst_count->data<int32_t>();
if (out_size > 0) {
input_size = out_size;
}
#ifdef PADDLE_WITH_HIP
hipMemset(p_dst_count, 0, input_size * sizeof(int));
......@@ -155,15 +172,16 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count) {
auto index_type = src_index.dtype();
if (index_type == phi::DataType::INT32) {
GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
}
}
......
......@@ -17,38 +17,31 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx])
mask_array[idx] = 1;
else
mask_array[idx] = 0;
}
}
template <typename MT, typename InT, typename OutT>
struct MaskedSelectGradFunctor {
HOSTDEVICE MaskedSelectGradFunctor() {}
template <typename T>
__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
const bool* mask,
const T* input,
T* out,
int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx]) {
int index = mask_prefix_sum[idx];
out[idx] = input[index];
} else {
out[idx] = 0;
HOSTDEVICE inline void operator()(OutT* out,
const MT* mask,
const InT* value,
int num) {
int read_fix = 0;
for (int idx = 0; idx < num; idx++) {
if (mask[idx]) {
out[idx] = value[read_fix++];
} else {
out[idx] = 0;
}
}
}
}
};
template <typename T, typename Context>
void MaskedSelectGradKernel(const Context& dev_ctx,
......@@ -56,42 +49,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& mask,
DenseTensor* x_grad) {
auto* mask_data = mask.data<bool>();
auto* input_data = out_grad.data<T>();
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
auto input_size = out_grad.numel();
auto mask_size = mask.numel();
auto mask_dim = mask.dims();
auto out_size = mask_size;
DenseTensor mask_array;
DenseTensor mask_prefix_sum;
mask_array.Resize(mask_dim);
mask_prefix_sum.Resize(mask_dim);
int32_t* mask_array_data =
mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
int32_t* mask_prefix_sum_data =
mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
int threads = 512;
int grid = (mask_size + threads - 1) / threads;
auto stream = dev_ctx.stream();
SetMaskArrayT<<<grid, threads, 0, stream>>>(
mask_data, mask_array_data, mask_size);
thrust::device_ptr<int32_t> mask_array_dev_ptr =
thrust::device_pointer_cast(mask_array_data);
thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
mask_array_dev_ptr + mask_size);
thrust::exclusive_scan(thrust::device,
mask_array_vec.begin(),
mask_array_vec.end(),
mask_prefix_sum_data);
SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
if (mask_size <= 0) return;
using Functor = MaskedSelectGradFunctor<bool, T, T>;
phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
dev_ctx, mask, out_grad, x_grad, Functor());
}
} // namespace phi
......
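As a quick end-to-end check of the rewritten grad path, the backward of masked_select should place ones at masked positions and zeros elsewhere; an illustrative sketch:

import paddle

x = paddle.to_tensor([1., 2., 3., 4.], stop_gradient=False)
mask = paddle.to_tensor([True, False, True, True])
y = paddle.masked_select(x, mask)
y.sum().backward()
print(x.grad)  # expected: [1., 0., 1., 1.]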
......@@ -17,11 +17,12 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename MT, typename InT, typename OutT>
......
......@@ -20,13 +20,14 @@
namespace cub = hipcub;
#endif
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/where_index_kernel.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T1, typename T2, typename OutT>
struct IndexFunctor {
......
......@@ -23,7 +23,7 @@ namespace phi {
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......
......@@ -25,6 +25,7 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count);
......
......@@ -33,7 +33,7 @@ void FrobeniusNormGradKernel(const Context& ctx,
DataType out_dtype,
DenseTensor* dx) {
ReduceGradKernel<Context, T, funcs::FrobeniusNormGradFunctor>(
ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
ctx, x, out, dout, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
}
} // namespace phi
......@@ -87,8 +87,8 @@ template <typename Context,
bool kNoNeedBufferY = false>
void ReduceGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceMinGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceMinGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceProdGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::ProdGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -123,6 +123,15 @@ __device__ __forceinline__ void WriteData(T* dst,
dst[i] = src[i];
}
}
template <typename T>
__device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num) {
for (int i = 0; i < num; i++) {
dst[i] = src[i];
}
}
#undef INT_BITS
} // namespace details
......
......@@ -43,8 +43,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceProdGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -55,8 +55,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -67,8 +67,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceMinGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......
......@@ -18,10 +18,17 @@ namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
bool is_test = paddle::any_cast<bool>(ctx.Attr("is_test"));
bool use_global_stats = paddle::any_cast<bool>(ctx.Attr("use_global_stats"));
bool use_global_stats =
ctx.HasAttr("use_global_stats")
? paddle::any_cast<bool>(ctx.Attr("use_global_stats"))
: false;
bool trainable_statistics =
paddle::any_cast<bool>(ctx.Attr("trainable_statistics"));
bool fuse_with_relu = paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"));
ctx.HasAttr("trainable_statistics")
? paddle::any_cast<bool>(ctx.Attr("trainable_statistics"))
: false;
bool fuse_with_relu = ctx.HasAttr("fuse_with_relu")
? paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"))
: false;
// Dispenable `MomentumTensor` is useless now
if (is_test && !use_global_stats && !trainable_statistics &&
!fuse_with_relu) {
......
......@@ -16,6 +16,14 @@ limitations under the License. */
namespace phi {
KernelSignature GraphSendRecvOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("graph_send_recv",
{"X", "Src_index", "Dst_index"},
{"pool_type", "out_size"},
{"Out", "Dst_count"});
}
KernelSignature GraphSendRecvGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
......@@ -27,5 +35,8 @@ KernelSignature GraphSendRecvGradOpArgumentMapping(
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(graph_send_recv,
phi::GraphSendRecvOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad,
phi::GraphSendRecvGradOpArgumentMapping);
......@@ -149,7 +149,7 @@ KernelSignature ReduceMaxGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"max_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......@@ -158,7 +158,7 @@ KernelSignature ReduceMinGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"min_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......@@ -167,7 +167,7 @@ KernelSignature ReduceProdGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"prod_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -39,10 +40,10 @@ TEST(API, data_transform_same_place) {
auto x = paddle::experimental::full({3, 3},
1.0,
experimental::DataType::COMPLEX128,
experimental::Backend::CPU);
experimental::CPUPlace());
auto y = paddle::experimental::full(
{3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
{3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
std::vector<phi::dtype::complex<double>> sum(9, 6.0);
......@@ -74,10 +75,10 @@ TEST(API, data_transform_same_place) {
TEST(Tensor, data_transform_diff_place) {
// 1. create tensor
auto x = paddle::experimental::full(
{3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU);
{3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace());
auto y = paddle::experimental::full(
{3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU);
{3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace());
std::vector<float> sum(9, 6.0);
......@@ -95,7 +96,7 @@ TEST(Tensor, data_transform_diff_place) {
ASSERT_EQ(out.impl()->place(),
phi::TransToPhiPlace(experimental::Backend::GPU));
auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true);
auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true);
auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(ref_out.impl());
for (size_t i = 0; i < 9; i++) {
......
......@@ -30,7 +30,7 @@ namespace tests {
TEST(API, scale) {
auto x = experimental::full(
{3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
{3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
const size_t cycles = 300;
phi::tests::Timer timer;
......
......@@ -69,10 +69,10 @@ TEST(API, copy_to) {
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = paddle::experimental::copy_to(x, phi::Backend::GPU, false);
auto out = paddle::experimental::copy_to(tmp, phi::Backend::CPU, true);
auto tmp = paddle::experimental::copy_to(x, phi::GPUPlace(), false);
auto out = paddle::experimental::copy_to(tmp, phi::CPUPlace(), true);
#else
auto out = paddle::experimental::copy_to(x, phi::Backend::CPU, false);
auto out = paddle::experimental::copy_to(x, phi::CPUPlace(), false);
#endif
// 3. check result
......@@ -85,10 +85,10 @@ TEST(Tensor, copy_to) {
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = x.copy_to(phi::Backend::GPU, false);
auto out = tmp.copy_to(phi::Backend::CPU, true);
auto tmp = x.copy_to(phi::GPUPlace(), false);
auto out = tmp.copy_to(phi::CPUPlace(), true);
#else
auto out = x.copy_to(phi::Backend::CPU, false);
auto out = x.copy_to(phi::CPUPlace(), false);
#endif
// 3. check result
......
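These C++ API tests now take Place objects (CPUPlace/GPUPlace) instead of Backend enums; at the Python layer the same device round trip looks roughly like this (sketch only, using the dygraph convenience methods rather than copy_to itself):

import paddle

x = paddle.full([3, 4], 1.0, dtype='float32')
if paddle.is_compiled_with_cuda():
    tmp = x.cuda()   # analogous to copy_to(GPUPlace(), blocking=False)
    out = tmp.cpu()  # analogous to copy_to(CPUPlace(), blocking=True)
else:
    out = x.cpu()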
......@@ -12,74 +12,69 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .job.container import Container
from .job.pod import Pod
from .job.job import Job
from . import plugins
#__all__ = [Container, Pod, Job]
__all__ = []
'''
Paddle distribution training entry ``python -m paddle.distributed.run``.
Paddle distributed training entry ``python -m paddle.distributed.launch``.
Help
# for arg usage and explanation, try the following command
# python -m paddle.distributed.run -h
# python -m paddle.distributed.launch -h
Collective Mode
Case 1: 1 node
use all visible devices
# python -m paddle.distributed.run train.py
# python -m paddle.distributed.launch train.py
use specified devices
# python -m paddle.distributed.run --devices=0,1,2,3 train.py
# python -m paddle.distributed.launch --devices=0,1,2,3 train.py
Case 2: multi-node, auto detect ip/port
# python -m paddle.distributed.run --np 2 train.py
# python -m paddle.distributed.launch --nnodes 2 train.py
# auto print following command
# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py
# then copy and paste above command to other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py
# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py
# the master ip must be the ip of one of the nodes and the port must be available
Parameter Server Mode
Case 1.1: 1 node, 1 ps, 1 trainer
# python -m paddle.distributed.run --mode ps train.py
# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py
# python -m paddle.distributed.launch --mode ps train.py
# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py
Case 1.2: 1 node, 2 ps, 2 trainer
# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py
Case 2: 2 node, 2 ps, 2 trainer per node
# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py
# auto print following command
# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# then copy and paste above command to other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# the master ip must be the ip of one of the nodes and the port must be available
Case 4: specified servers and trainers in each node
python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
Elastic Mode
# run the following command on 3 nodes to start immediately, or on 2 nodes to start after elastic_timeout
# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py
# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py
# once the peer number changes between 2:3, the strategy holds
......
......@@ -15,14 +15,28 @@
from .context import Context
from . import controllers
# initialize the context to run
ctx = Context()
# initialize the selected controller
c = controllers.init(ctx)
def launch():
# initialize the context to run
ctx = Context()
# run the pods
c.run()
if ctx.is_legacy_mode():
# manager or just wait pod
c.finalize()
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
# manager or just wait pod
c.finalize()
if __name__ == "__main__":
launch()
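The new entry point chooses between the refactored controllers and the legacy fleet launcher via Context.is_legacy_mode(); a small sketch of probing that decision (module path and argv handling assumed):

import sys
from paddle.distributed.launch.context import Context  # path assumed

sys.argv = ["launch", "train.py"]  # satisfy the required training_script positional
ctx = Context(enable_plugin=False)
print(ctx.is_legacy_mode())        # True with --legacy, unknown args, or legacy env vars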
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.launch import plugins
from .node import Node
from .status import Status
from .args_envs import parse_args, fetch_envs, env_args_mapping
import logging
class Context(object):
def __init__(self, enable_plugin=True):
self.args, self.unknown_args = parse_args()
self.envs = fetch_envs()
self.logger = self.get_logger()
self.node = Node()
self.status = Status()
self.set_env_in_args()
# design for event queue, later
self.events = []
if enable_plugin:
self._enable_plugin()
def is_legacy_mode(self):
if self.args.legacy:
return True
if len(self.unknown_args) > 0:
self.logger.warning("Compatible mode enable with args {}".format(
self.unknown_args))
return True
legacy_env_list = [
'DISTRIBUTED_TRAINER_ENDPOINTS',
'PADDLE_ELASTIC_JOB_ID',
'PADDLE_DISTRI_BACKEND',
'FLAGS_START_PORT',
]
for env in legacy_env_list:
if env in self.envs:
self.logger.warning(
"ENV {} is deprecated, legacy launch enable".format(env))
return True
if self.args.master:
return False
return False
def get_envs(self):
return self.envs.copy()
def _enable_plugin(self):
for pl in plugins.enabled_plugins:
pl(self)
def get_logger(self, level=logging.INFO):
logger = logging.getLogger("LAUNCH")
logger.setLevel(self.args.log_level.upper() or level)
formatter = logging.Formatter(
fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
def set_env_in_args(self):
for k, v in env_args_mapping.items():
if k in self.envs:
setattr(self.args, v, self.envs[k])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from argparse import ArgumentParser, REMAINDER
env_args_mapping = {
'POD_IP': 'host',
'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices',
'PADDLE_NNODES': 'nnodes',
'PADDLE_MODE': 'mode',
'PADDLE_LOG_LEVEL': 'log_level',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'job_id',
'PADDLE_RANK': 'rank',
'PADDLE_LOG_DIR': 'log_dir',
'PADDLE_MAX_RESTART': 'max_restart',
'PADDLE_ELASTIC_LEVEL': 'elastic_level',
'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
'PADDLE_SERVER_NUM': 'server_num',
'PADDLE_TRAINER_NUM': 'trainer_num',
'PADDLE_SERVERS_ENDPOINTS': 'servers',
'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
'PADDLE_GLOO_PORT': 'gloo_port',
'PADDLE_WITH_GLOO': 'with_gloo',
}
def fetch_envs():
os.environ.pop('http_proxy', None)
os.environ.pop('https_proxy', None)
return os.environ.copy()
def parse_args():
parser = ArgumentParser()
base_group = parser.add_argument_group("Base Parameters")
base_group.add_argument(
"--master",
type=str,
default=None,
help="the master/rendezvous server, ip:port")
base_group.add_argument(
"--legacy", type=bool, default=False, help="use legacy launch")
base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank")
base_group.add_argument(
"--log_level", type=str, default="INFO", help="log level. Default INFO")
base_group.add_argument(
"--nnodes",
type=str,
default="1",
help="the number of peers, i.e. pod/node number")
base_group.add_argument(
"--nproc_per_node",
type=int,
default=None,
help="the number of processes in a pod")
base_group.add_argument(
"--log_dir",
type=str,
default="log",
help="the path for each process's log. Default ./log")
base_group.add_argument(
"--mode",
type=str,
default="collective",
help="run mode of the job, collective/ps/ps-heter")
base_group.add_argument(
"--job_id",
type=str,
default="default",
help="unique id of the job. Default default")
base_group.add_argument(
"--devices",
type=str,
default=None,
help="accelerate devices. as --gpus,npus,xps")
base_group.add_argument("--host", type=str, default=None, help="host ip")
base_group.add_argument(
"training_script",
type=str,
help="the full path of py script,"
"followed by arguments for the "
"training script")
base_group.add_argument('training_script_args', nargs=REMAINDER)
ps_group = parser.add_argument_group("Parameter-Server Parameters")
# for parameter server
ps_group.add_argument(
"--servers", type=str, default='', help="servers endpoints full list")
ps_group.add_argument(
"--trainers", type=str, default='', help="trainers endpoints full list")
ps_group.add_argument(
"--trainer_num", type=int, default=None, help="number of trainers")
ps_group.add_argument(
"--server_num", type=int, default=None, help="number of servers")
ps_group.add_argument(
"--gloo_port", type=int, default=6767, help="gloo http port")
ps_group.add_argument(
"--with_gloo", type=str, default="0", help="use gloo or not")
# parameter elastic mode
elastic_group = parser.add_argument_group("Elastic Parameters")
elastic_group.add_argument(
"--max_restart",
type=int,
default=3,
help="the times can restart. Default 3")
elastic_group.add_argument(
"--elastic_level",
type=int,
default=-1,
help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
)
elastic_group.add_argument(
"--elastic_timeout",
type=int,
default=30,
help="seconds to wait before elastic perform training")
return parser.parse_known_args()
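Every key in env_args_mapping can stand in for the corresponding command-line flag; a sketch of that equivalence (module path assumed):

import os, sys
from paddle.distributed.launch.context import Context  # path assumed

os.environ["PADDLE_NNODES"] = "2"   # behaves like passing --nnodes 2
sys.argv = ["launch", "train.py"]
ctx = Context(enable_plugin=False)
print(ctx.args.nnodes)              # '2'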
......@@ -20,36 +20,90 @@ class DeviceType:
GPU = 'gpu'
XPU = 'xpu'
NPU = 'npu'
MLU = 'mlu'
class Device(object):
def __init__(self, dtype=None, count=1, memory="", labels=""):
self.dtype = dtype
self.count = count
self.memory = memory
self.labels = labels
def __init__(self, dtype=None, memory="", labels=""):
self._dtype = dtype
self._memory = memory
self._labels = labels
def __str__(self):
return ",".join(self.labels)
return ",".join(self._labels)
@property
def dtype(self):
return self._dtype
@property
def count(self):
return len(self._labels) or 1
@property
def memory(self):
return self._memory
@property
def labels(self):
return self._labels
@labels.setter
def labels(self, lbs):
if isinstance(lbs, str):
self._labels = lbs.split(',')
elif isinstance(lbs, list):
self._labels = lbs
else:
self._labels = []
def get_selected_flag_key(self):
if self._dtype == DeviceType.CPU:
return 'FLAGS_selected_cpus'
if self._dtype == DeviceType.GPU:
return 'FLAGS_selected_gpus'
if self._dtype == DeviceType.NPU:
return 'FLAGS_selected_npus'
if self._dtype == DeviceType.XPU:
return 'FLAGS_selected_xpus'
if self._dtype == DeviceType.MLU:
return 'FLAGS_selected_mlus'
return 'FLAGS_selected_devices'
def get_selected_flag_label(self, idx):
if idx < len(self._labels):
return self._labels[idx]
else:
return '0'
def selected_flags(self, idx=None):
if idx is None:
return {self.get_selected_flag_key(): ','.join(self._labels)}
else:
return {
self.get_selected_flag_key(): self.get_selected_flag_label(idx)
}
@classmethod
def parse_device(self):
dev = Device()
visible_devices = None
if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.GPU
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
"NVIDIA_VISIBLE_DEVICES")
elif 'XPU_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.XPU
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.NPU
dev._dtype = DeviceType.NPU
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif 'MLU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.MLU
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if visible_devices and visible_devices != 'all':
dev.labels = visible_devices.split(',')
dev.count = len(dev.labels)
if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
else:
return self.detect_device()
......@@ -63,26 +117,33 @@ class Device(object):
num = 0
visible_devices = None
if fluid.core.is_compiled_with_cuda():
dev.dtype = DeviceType.GPU
dev._dtype = DeviceType.GPU
num = fluid.core.get_cuda_device_count()
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
"NVIDIA_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_xpu():
dev.dtype = DeviceType.XPU
dev._dtype = DeviceType.XPU
num = fluid.core.get_xpu_device_count()
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_npu():
dev.dtype = DeviceType.NPU
dev._dtype = DeviceType.NPU
num = fluid.core.get_npu_device_count()
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_mlu():
dev._dtype = DeviceType.MLU
num = fluid.core.get_mlu_device_count()
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if num == 0:
dev.dtype = DeviceType.CPU
elif visible_devices is None or visible_devices == "all" or visible_devices == "":
dev.labels = [str(x) for x in range(0, num)]
dev.count = num
dev._dtype = DeviceType.CPU
elif visible_devices is None or visible_devices == "all":
dev._labels = [str(x) for x in range(0, num)]
else:
dev.labels = visible_devices.split(',')
dev.count = len(dev.labels)
dev._labels = visible_devices.split(',')
return dev
if __name__ == '__main__':
d = Device.parse_device()
print(d.get_selected_flag())
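Device now derives count from its labels and exposes the per-rank FLAGS_selected_* environment through selected_flags; roughly (import path assumed):

from paddle.distributed.launch.context.device import Device, DeviceType  # path assumed

dev = Device(dtype=DeviceType.GPU)
dev.labels = "0,1,2,3"          # the setter splits the string
print(dev.count)                # 4, derived from the labels
print(dev.selected_flags())     # {'FLAGS_selected_gpus': '0,1,2,3'}
print(dev.selected_flags(2))    # {'FLAGS_selected_gpus': '2'}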
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["init"]
__all__ = []
from .collective import CollectiveController
from .collective import CollectiveElasticController
......
......@@ -89,6 +89,10 @@ class CollectiveController(Controller):
"PADDLE_TRAINERS_NUM": "{}".format(global_size),
"PADDLE_RANK_IN_NODE": str(i),
}
if self.pod.replicas == 1:
e.update(self.ctx.node.device.selected_flags())
else:
e.update(self.ctx.node.device.selected_flags(i))
self.add_container(envs=e, log_tag=i)
return True
......@@ -106,7 +110,8 @@ class CollectiveElasticController(CollectiveController):
def register(self):
if self.job.id == 'default':
self.ctx.logger.warning(
'Using default job name may cause conflict, add --id in args')
'Using default job name may cause conflict, add --job_id in args'
)
self.master.register_heartbeat(self.job.id, self.pod.name)
......@@ -114,6 +119,8 @@ class CollectiveElasticController(CollectiveController):
'''
watch self and peer status, return true to exit
'''
self.ctx.logger.info("Watching {}".format(self.pod))
while not self.ctx.status.is_done():
# self status
status = self.pod.watch(timeout=2)
......@@ -171,13 +178,8 @@ class CollectiveElasticController(CollectiveController):
continue
self.master.set_status(self.ctx.status.RUNNING)
self.ctx.status.run()
assert len(self.pod.containers) > 0, "No container in the pod"
self.ctx.logger.debug("Run {}".format(self.pod))
self.ctx.logger.debug("Run {}".format(self.pod.containers[0]))
self.pod.deploy()
self.deploy_pod()
if self.watch():
break
......
......@@ -16,9 +16,9 @@ import sys
import os
import signal
from paddle.distributed.run.job import Job
from paddle.distributed.run.job import Pod
from paddle.distributed.run.job import Container
from paddle.distributed.launch.job.job import Job
from paddle.distributed.launch.job.pod import Pod
from paddle.distributed.launch.job.container import Container
from .master import Master
......@@ -39,38 +39,43 @@ class ControllerBase(object):
self.ctx = ctx
self.master = Master.factory(self.ctx)
self.job = Job(np=self.ctx.args.np,
self.job = Job(nnodes=self.ctx.args.nnodes,
mode=self.ctx.args.mode,
id=self.ctx.args.id)
jid=self.ctx.args.job_id)
self.pod = Pod()
self.join_server = None
def run(self):
self.build_job()
self.build_pod()
def deploy_pod(self):
if len(self.pod.containers) < 1:
self.ctx.logger.error("No container in the pod {}".format(self.pod))
return
assert len(self.pod.containers) > 0, "No container in the pod"
self.ctx.logger.info("Run {}".format(self.pod))
self.ctx.logger.debug(self.pod.containers[0])
self.ctx.status.run()
self.pod.deploy()
def run(self):
self.build_job()
self.build_pod()
self.deploy_pod()
self.watch()
def watch(self) -> bool:
self.ctx.logger.info("Watching {}".format(self.pod))
status = self.pod.watch()
if status == self.ctx.status.COMPLETED:
self.ctx.logger.info("Pod {}".format(status))
elif status == self.ctx.status.FAILED:
fc = self.pod.failed_container()
self.ctx.logger.info("Pod {}".format(status))
self.ctx.logger.error("Container failed !!!\n{}".format(
self.pod.failed_container()))
self.pod.tail()
self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
fc[0].tail()
self.pod.stop()
def stop(self, sigint=None):
......
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.run.utils.kv_client import KVClient
from paddle.distributed.run.utils.kv_server import KVServer
from paddle.distributed.launch.utils.kv_client import KVClient
from paddle.distributed.launch.utils.kv_server import KVServer
import time
import sys
......@@ -84,7 +84,7 @@ class HTTPMaster(Master):
print("Copy the following command to other nodes to run.")
cmd = [
sys.executable.split('/')[-1], "-m", "paddle.distributed.run"
sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"
]
cmd.extend(["--master", self.endpoint])
cmd.extend(sys.argv[1:])
......@@ -118,9 +118,12 @@ class HTTPMaster(Master):
self._stop_server()
def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
self.lazy_init()
while not self.ctx.status.is_done():
......@@ -130,7 +133,7 @@ class HTTPMaster(Master):
self.ctx.logger.warning("master not ready")
time.sleep(0.1)
# 'aaaaaa' make suer main pod (master server) as rank 0
# 'aaaaaa' make sure main pod (master server) as rank 0
ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key
k = "{}/{}/{}".format(prefix, ky, rank)
......@@ -177,6 +180,12 @@ class ETCDMaster(Master):
sync_peers gathers all values for the key under the scope prefix
the result is always sorted either by rank or alphabetically by pod.name
'''
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
path = "{}/{}/{}".format(prefix, key, rank)
self.client.delete_prefix(prefix)
......
......@@ -22,7 +22,8 @@ class PSController(Controller):
@classmethod
def enable(cls, ctx):
if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len(
ctx.args.servers) > 0:
ctx.args.servers) > 0 or ctx.args.trainer_num or len(
ctx.args.trainers) > 0:
ctx.logger.debug("{} enabled".format(cls.__name__))
ctx.args.mode = ControleMode.PS
return True
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.fleet import launch
launch.launch()
__all__ = []
......@@ -13,12 +13,11 @@
# limitations under the License.
from collections import OrderedDict
from paddle.distributed.run.utils.process_context import ProcessContext
from paddle.distributed.launch.utils.process_context import ProcessContext
from .status import Status
import os, copy, sys
import time
class Container(object):
......@@ -78,6 +77,11 @@ class Container(object):
kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)}
self._env.update(kwargs)
def _valide_env(self):
for k, v in self._env.items():
assert isinstance(k, str) and isinstance(
v, str), 'env {}:{} must be str'.format(k, v)
def _get_fd(self, pth):
if not pth:
return None
......@@ -90,12 +94,12 @@ class Container(object):
except:
return None
def start(self, timeout=-1):
end = time.time() + timeout
def start(self):
if self._proc and self._proc.alive():
return True
self._valide_env()
self._stdout = self._get_fd(self._out) or sys.stdout
if self._out == self._err:
self._stderr = self._stdout
......@@ -106,14 +110,6 @@ class Container(object):
self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
self._proc.start()
while timeout > 0 and time.time() < end:
if self._proc.alive():
time.sleep(0.1)
continue
if self._proc.exit_code() == 0:
return True
return False
def terminate(self, force=False):
if self._log_handler:
self._log_handler.close()
......@@ -125,9 +121,11 @@ class Container(object):
def wait(self, timeout=None):
self._proc.wait(timeout)
@property
def exit_code(self):
return self._proc.exit_code() if self._proc else -1
@property
def status(self):
if not self._proc:
return Status.UNINIT
......@@ -141,9 +139,9 @@ class Container(object):
def __str__(self):
return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format(
self._rank,
self.status(),
self.status,
self._entrypoint,
self.exit_code(),
self.exit_code,
self.errfile,
self._env, )
......
......@@ -20,16 +20,16 @@ class JobMode:
class Job(object):
def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"):
def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"):
self._mode = mode
self._id = id
self._id = jid
self._replicas = 0
self._replicas_min = self._replicas
self._replicas_max = self._replicas
self._elastic = False
self.set_replicas(str(np))
self.set_replicas(str(nnodes))
def __str__(self):
return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format(
......@@ -64,8 +64,8 @@ class Job(object):
def replicas(self, replicas):
self._replicas = replicas
def set_replicas(self, np: str):
np = str(np) if np else '1'
def set_replicas(self, nnodes: str):
np = str(nnodes) if nnodes else '1'
if ':' in np:
nps = np.split(':')
......
......@@ -34,7 +34,7 @@ class PodSepc(object):
#self.status: Status = None
self._rank = -1
self._init_timeout = 120 # 2 min timeout for each init container
self._init_timeout = None
self._restart = -1
self._replicas = 0 # number of containers
self._exit_code = 0
......@@ -45,15 +45,15 @@ class Pod(PodSepc):
super().__init__()
def __str__(self):
return "Pod: {}, replicas {}, status {}".format(self.name,
self.replicas,
self.status())
return "Pod: {}, replicas {}, status {}".format(
self.name, self.replicas, self.status)
def failed_container(self):
cs = []
for c in self._containers:
if c.status() == Status.FAILED:
return c
return None
if c.status == Status.FAILED:
cs.append(c)
return cs
@property
def name(self):
......@@ -65,7 +65,7 @@ class Pod(PodSepc):
@replicas.setter
def replicas(self, r):
self._replicas = r
self._replicas = max(r, 1)
@property
def rank(self):
......@@ -98,13 +98,15 @@ class Pod(PodSepc):
@property
def exit_code(self):
for c in self._containers:
if c.exit_code() != 0:
return c.exit_code()
if c.exit_code != 0:
return c.exit_code
return 0
def deploy(self):
# init container should stop before run containers
for i in self._init_containers:
i.start(self._init_timeout)
i.start()
i.wait(self._init_timeout)
for c in self._containers:
c.start()
......@@ -120,6 +122,7 @@ class Pod(PodSepc):
for c in self._containers:
c.wait(None)
@property
def status(self):
if self.is_failed():
return Status.FAILED
......@@ -127,6 +130,9 @@ class Pod(PodSepc):
if self.is_completed():
return Status.COMPLETED
if self.is_running():
return Status.RUNNING
return Status.READY
def reset(self):
......@@ -135,31 +141,31 @@ class Pod(PodSepc):
def is_failed(self):
for c in self._containers:
if c.status() == Status.FAILED:
if c.status == Status.FAILED:
return True
return False
def is_completed(self):
for c in self._containers:
if c.status() != Status.COMPLETED:
if c.status != Status.COMPLETED:
return False
return True
def is_running(self):
for c in self._containers:
if c.status != Status.RUNNING:
return False
return True
def logs(self, idx=None):
if idx is None:
if self.failed_container():
self.failed_container().logs()
else:
self._containers[0].logs()
self._containers[0].logs()
else:
self._containers[idx].logs()
def tail(self, idx=None):
if idx is None:
if self.failed_container():
self.failed_container().tail()
else:
self._containers[0].tail()
self._containers[0].tail()
else:
self._containers[idx].tail()
......@@ -175,10 +181,10 @@ class Pod(PodSepc):
end = time.time() + timeout
while timeout < 0 or time.time() < end:
for c in self._containers:
if c.status() in any_list:
return c.status()
if c.status in any_list:
return c.status
s = [c.status() for c in self._containers]
s = [c.status for c in self._containers]
if len(set(s)) == 1 and s[0] in all_list:
return s[0]
......
......@@ -30,15 +30,26 @@ def process_args(ctx):
argdev = ctx.args.devices
if argdev:
ctx.node.device.labels = argdev.split(',')
ctx.node.device.count = len(ctx.node.device.labels)
ctx.logger.debug('Device reset by args {}'.format(argdev))
def collective_compatible(ctx):
if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0]
eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')
hosts = set([h.split(':')[0] for h in eps])
ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
ctx.args.nnodes = len(hosts)
ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
eps))
'''
if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0]
eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
hosts = set([h.split(':')[0] for h in eps])
ctx.args.master = eps[0]
ctx.args.nnodes = len(hosts)
ctx.logger.info(
'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps))
'''
def rewrite_host_ip(ctx):
......
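For reference, the endpoint-compat shim above boils down to this small transformation (self-contained repro of the new logic):

eps = "10.0.0.1:6170,10.0.0.2:6170".split(',')
hosts = set(h.split(':')[0] for h in eps)
master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
nnodes = len(hosts)
print(master, nnodes)   # 10.0.0.1:6170 2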
......@@ -11,15 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .pod import Pod
from .job import Job
from .container import Container
from .status import Status
__all__ = [
'Pod',
'Job',
'Container',
'Status',
]
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser, REMAINDER
import os, copy
from paddle.distributed.run import plugins
from .node import Node
from .status import Status
import logging
class Context(object):
def __init__(self, enable_plugin=True):
os.environ.pop('http_proxy', None)
os.environ.pop('https_proxy', None)
self.args = self.parse_args()
self.envs = self.fetch_envs()
self.logger = self.get_logger()
self.node = Node()
self.status = Status()
self.set_env_in_args()
# design for event queue, later
self.events = []
if enable_plugin:
self._enable_plugin()
def get_envs(self):
return self.envs.copy()
def _enable_plugin(self):
for pl in plugins.enabled_plugins:
pl(self)
def parse_args(self):
parser = ArgumentParser()
base_group = parser.add_argument_group("Base Parameters")
base_group.add_argument(
"--master",
type=str,
default=None,
help="the master/rendezvous server, ip:port")
base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank")
base_group.add_argument(
"--log", type=str, default="INFO", help="log level. Default INFO")
base_group.add_argument(
"--np",
type=str,
default="1",
help="the number of peers, i.e. pod/node number")
base_group.add_argument(
"--nproc_per_node",
type=int,
default=None,
help="the number of processes in a pod")
base_group.add_argument(
"--log_dir",
type=str,
default="log",
help="the path for each process's log. Default ./log")
base_group.add_argument(
"--mode",
type=str,
default="collective",
help="run mode of the job, collective/ps/ps-heter")
base_group.add_argument(
"--id",
type=str,
default="default",
help="unique id of the job. Default default")
base_group.add_argument(
"--devices",
type=str,
default=None,
help="accelerate devices. as --gpus,npus,xps")
base_group.add_argument(
"--host", type=str, default=None, help="host ip")
base_group.add_argument(
"training_script",
type=str,
help="the full path of py script,"
"followed by arguments for the "
"training script")
base_group.add_argument('training_script_args', nargs=REMAINDER)
ps_group = parser.add_argument_group("Parameter-Server Parameters")
# for parameter server
ps_group.add_argument(
"--servers",
type=str,
default='',
help="servers endpoints full list")
ps_group.add_argument(
"--trainers",
type=str,
default='',
help="trainers endpoints full list")
ps_group.add_argument(
"--trainer_num", type=int, default=None, help="number of trainers")
ps_group.add_argument(
"--server_num", type=int, default=None, help="number of servers")
ps_group.add_argument(
"--gloo_port", type=int, default=6767, help="gloo http port")
ps_group.add_argument(
"--with_gloo", type=str, default="0", help="use gloo or not")
# parameter elastic mode
elastic_group = parser.add_argument_group("Elastic Parameters")
elastic_group.add_argument(
"--max_restart",
type=int,
default=3,
help="the times can restart. Default 3")
elastic_group.add_argument(
"--elastic_level",
type=int,
default=-1,
help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
)
elastic_group.add_argument(
"--elastic_timeout",
type=int,
default=30,
help="seconds to wait before elastic perform training")
return parser.parse_args()
def _valide_env(self, key):
if key in ['POD_IP']:
return True
if key.endswith('_VISIBLE_DEVICES'):
return True
if key.startswith('PADDLE_'):
return True
return False
def fetch_envs(self):
ge = os.environ.copy()
black_env_list = ['http_proxy', 'https_proxy']
for key in black_env_list:
ge.pop(key, None)
return ge
'''
# use black list instead white list
return {k: ge[k] for k in ge if self._valide_env(k)}
'''
def get_logger(self, level=logging.INFO):
logger = logging.getLogger("PADDLERUN")
logger.setLevel(self.args.log.upper() or level)
formatter = logging.Formatter(
fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
def set_env_in_args(self):
env_args = {
'POD_IP': 'host',
'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices',
'PADDLE_NP': 'np',
'PADDLE_MODE': 'mode',
'PADDLE_LOG': 'log',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'id',
'PADDLE_RANK': 'rank',
'PADDLE_LOG_DIR': 'log_dir',
'PADDLE_MAX_RESTART': 'max_restart',
'PADDLE_ELASTIC_LEVEL': 'elastic_level',
'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
'PADDLE_SERVER_NUM': 'server_num',
'PADDLE_TRAINER_NUM': 'trainer_num',
'PADDLE_SERVERS_ENDPOINTS': 'servers',
'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
'PADDLE_GLOO_PORT': 'gloo_port',
'PADDLE_WITH_GLOO': 'with_gloo',
}
for k, v in env_args.items():
if k in self.envs:
setattr(self.args, v, self.envs[k])
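For illustration, a minimal, self-contained sketch of the override rule implemented above: any recognised PADDLE_* variable found in the captured environment replaces the corresponding CLI default. The Namespace below stands in for the parsed arguments and only two mappings from the table are reproduced; this is a hedged sketch, not the launcher's real Context class.

import os
from argparse import Namespace

# Environment capture as in fetch_envs: everything except the proxy variables.
envs = os.environ.copy()
for key in ('http_proxy', 'https_proxy'):
    envs.pop(key, None)

# Two entries from the env_args table above, applied the same way.
env_args = {'PADDLE_JOB_ID': 'id', 'PADDLE_NPROC_PER_NODE': 'nproc_per_node'}
args = Namespace(id='default', nproc_per_node=None)
for k, v in env_args.items():
    if k in envs:
        setattr(args, v, envs[k])   # the environment wins over the CLI default
                                    # (values taken from the environment stay strings)

print(args.id, args.nproc_per_node)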
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import socket
def get_local_ip(ctx):
_, ip = _get_host_name_ip()
ctx.args.host = ip
ctx.envs["POD_IP"] = ip
def _get_host_name_ip():
try:
host_name = socket.gethostname()
host_ip = socket.gethostbyname(host_name)
return host_name, host_ip
except socket.error:
return None, None
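A hedged sketch of how a plugin such as get_local_ip above could be exercised in isolation; the SimpleNamespace object is a stand-in carrying only the two fields the plugin touches, not the real launcher context.

from types import SimpleNamespace

# Minimal stand-in context: just an args holder and an envs dict.
ctx = SimpleNamespace(args=SimpleNamespace(host=None), envs={})
get_local_ip(ctx)                      # fills in the local host IP
print(ctx.args.host, ctx.envs.get("POD_IP"))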
......@@ -1155,7 +1155,8 @@ class Layer(object):
layers[name] = None
else:
_buffers = self.__dict__.get('_buffers', None)
if type(value) == core.VarBase:
if type(value) == core.VarBase or \
type(value) == core.eager.Tensor:
if _buffers is None:
raise ValueError(
"super(YourLayer, self).__init__() should be called first"
......
......@@ -949,7 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
endif()
# setting timeout value as 15S
set_tests_properties(test_run PROPERTIES TIMEOUT 200)
set_tests_properties(test_run PROPERTIES TIMEOUT 120)
set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
......
......@@ -17,5 +17,4 @@
set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op.py
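The same launch can also be driven from Python, which is how the unit tests further below spawn the module entry point. A hedged equivalent of the shell line above:

import os
import subprocess
import sys

# Mirror of the shell invocation; c_comm_init_op.py is the script used in the test above.
env = {**os.environ, "CUDA_VISIBLE_DEVICES": "0,1"}
cmd = [sys.executable, "-m", "paddle.distributed.launch", "c_comm_init_op.py"]
proc = subprocess.Popen(cmd, env=env)
proc.wait()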
......@@ -637,6 +637,10 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(tensor3.persistable, True)
self.assertTrue(tensor3.stop_gradient, True)
self.assertTrue(tensor3.place.is_gpu_place())
tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
tensor5 = tensor4._copy_to(core.CUDAPlace(0), True)
self.assertTrue(
np.array_equal(tensor4.numpy(), tensor5.numpy()))
else:
tensor3 = tensor2._copy_to(core.CPUPlace(), True)
self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
......
......@@ -304,6 +304,35 @@ class API_GraphSendRecvOpTest(unittest.TestCase):
"two value is\
{}\n{}, check diff!".format(np_res, ret_res))
def test_set_outsize_gpu(self):
if paddle.fluid.core.is_compiled_with_cuda():
x = paddle.to_tensor(
np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32")
src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32")
dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32")
res = paddle.incubate.graph_send_recv(x, src_index, dst_index,
"sum")
out_size = paddle.max(dst_index) + 1
res_set_outsize = paddle.incubate.graph_send_recv(
x, src_index, dst_index, "sum", out_size)
np_res = np.array(
[[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32")
np_res_set_outsize = np.array(
[[0, 2, 3], [1, 6, 8]], dtype="float32")
self.assertTrue(
np.allclose(
np_res, res, atol=1e-6),
"two value is\
{}\n{}, check diff!".format(np_res, res))
self.assertTrue(
np.allclose(
np_res_set_outsize, res_set_outsize, atol=1e-6),
"two value is\
{}\n{}, check diff!"
.format(np_res_set_outsize, res_set_outsize))
if __name__ == '__main__':
unittest.main()
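The expected arrays in test_set_outsize_gpu follow directly from a sum-pool scatter: rows of x gathered at src_index are accumulated onto dst_index, and out_size = max(dst_index) + 1 simply drops the unused trailing rows. A NumPy re-derivation for reference:

import numpy as np

x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]], dtype="float32")
src_index = np.array([0, 0, 1])
dst_index = np.array([0, 1, 1])

out = np.zeros_like(x)                      # default output: same first dim as x
np.add.at(out, dst_index, x[src_index])     # "sum" pooling
# out == [[0, 2, 3], [1, 6, 8], [0, 0, 0]]

out_size = dst_index.max() + 1              # == 2
# out[:out_size] == [[0, 2, 3], [1, 6, 8]]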
......@@ -195,6 +195,8 @@ class TestDygraphInplaceFlatten(TestDygraphInplace):
return var.flatten_()
"""
# This case will fail while using `_C_ops.final_state_scatter`.
class TestDygraphInplaceScatter(TestDygraphInplace):
def init_data(self):
self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
......@@ -213,6 +215,7 @@ class TestDygraphInplaceScatter(TestDygraphInplace):
[[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
return paddle.scatter_(var, index, updates, overwrite=False)
"""
class TestDygraphInplaceElu(TestDygraphInplace):
......
......@@ -39,9 +39,7 @@ import os
env = os.environ.copy()
assert "PADDLE_PSERVERS_IP_PORT_LIST" in env
assert "PADDLE_TRAINER_ENDPOINTS" in env
#assert "PADDLE_PSERVER_ENDPOINTS" in env
#assert "PADDLE_TRAINER_ENDPOINTS" in env
#assert "PADDLE_ROLE" in env
assert "PADDLE_ROLE" in env
#assert "PADDLE_RANK" in env
'''
......@@ -62,27 +60,24 @@ class Collective_Test(unittest.TestCase):
write_file(pyname, colpyfile)
def pdrun(self, args, env=None):
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
if args:
cmd.extend(args.split(" "))
cmd.extend([pyname])
proc = subprocess.Popen(cmd, env=env)
return proc
'''
def test_collective_1(self):
args = "--id test1"
args = "--job_id test1"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
'''
def test_collective_2(self):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id test2 --devices 0,1,2"
args = "--job_id test2 --devices 0,1,2"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
......@@ -95,7 +90,7 @@ class Collective_Test(unittest.TestCase):
shutil.rmtree('./log')
port = random.randrange(6000, 8000)
args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
port)
p1 = self.pdrun(args)
p2 = self.pdrun(args)
......@@ -113,14 +108,13 @@ class PS_Test(unittest.TestCase):
write_file(pyname, pspyfile)
def pdrun(self, args, env=None):
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
if args:
cmd.extend(args.split(" "))
cmd.extend([pyname])
proc = subprocess.Popen(cmd, env=env)
return proc
'''
def test_ps_1(self):
args = "--mode ps"
p = self.pdrun(args)
......@@ -131,21 +125,20 @@ class PS_Test(unittest.TestCase):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id ps2 --server_num=2 --trainer_num=2"
args = "--job_id ps2 --server_num=2 --trainer_num=2"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
c = get_files('log', 'ps2')
self.assertTrue(len(c) == 5)
'''
def test_ps_3(self):
if os.path.exists('./log'):
shutil.rmtree('./log')
port = random.randrange(6000, 8000)
args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
port)
p1 = self.pdrun(args)
p2 = self.pdrun(args)
......@@ -161,7 +154,7 @@ class PS_Test(unittest.TestCase):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
p1 = self.pdrun(args)
p1.wait()
self.assertTrue(p1.poll() == 0)
......
......@@ -19,7 +19,12 @@ from paddle.fluid import core
from paddle import _C_ops
def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
def graph_send_recv(x,
src_index,
dst_index,
pool_type="sum",
out_size=None,
name=None):
r"""
Graph Learning Send_Recv combine operator.
......@@ -27,7 +32,7 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory
consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index`
to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor
in different pooling types, like sum, mean, max, or min.
in different pooling types, like sum, mean, max, or min. In addition, `out_size` can be set to control the first dimension of the output.
.. code-block:: text
......@@ -43,6 +48,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
pool_type = "sum"
out_size = None
Then:
Out = [[0, 2, 3],
......@@ -56,6 +63,9 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
The available data type is int32, int64.
pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`.
Default value is `sum`.
out_size (int64|None): Controls the first dimension of the output. If not set, this
attribute is ignored. If set, it should be equal to or larger than
max(dst_index) + 1.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -75,6 +85,21 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
# Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]
x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
src_index = indexes[:, 0]
dst_index = indexes[:, 1]
out_size = paddle.max(dst_index) + 1
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size)
# Outputs: [[0., 2., 3.], [2., 8., 10.]]
x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
src_index = indexes[:, 0]
dst_index = indexes[:, 1]
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
# Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]]
"""
if pool_type not in ["sum", "mean", "max", "min"]:
......@@ -82,9 +107,16 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
"pool_type should be `sum`, `mean`, `max` or `min`, but received %s"
% pool_type)
# TODO(daisiming): Should we add a check that out_size >= max(dst_index) + 1?
if in_dygraph_mode():
out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type',
pool_type.upper())
if out_size is None or out_size <= 0:
out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index,
'pool_type', pool_type.upper())
else:
out, tmp = _C_ops.graph_send_recv(
x, src_index, dst_index, 'pool_type',
pool_type.upper(), 'out_size', out_size)
return out
check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"),
......@@ -105,5 +137,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
"Dst_index": dst_index},
outputs={"Out": out,
"Dst_count": dst_count},
attrs={"pool_type": pool_type.upper()})
attrs={
"pool_type": pool_type.upper(),
"out_size": 0 if out_size is None or out_size <= 0 else out_size
})
return out
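In both branches above, 0 acts as the sentinel meaning "out_size was not provided"; the kernel then sizes the first output dimension from x. A hedged helper capturing that convention (normalize_out_size is a hypothetical name, not part of the API):

def normalize_out_size(out_size):
    # 0 -> let the kernel infer the first output dimension from x;
    # any positive value is passed through as the requested size.
    if out_size is None or out_size <= 0:
        return 0
    return int(out_size)

assert normalize_out_size(None) == 0
assert normalize_out_size(2) == 2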
......@@ -578,6 +578,8 @@ def relu_(x, name=None):
Inplace version of ``relu`` API, the output Tensor will be inplaced with input ``x``.
Please refer to :ref:`api_nn_cn_relu`.
"""
if _in_eager_mode():
return _C_ops.final_state_relu_(x)
return _C_ops.relu_(x)
......
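A short usage sketch of the inplace API dispatched above; the observable result is the same whichever branch (eager final-state op or legacy op) is taken:

import paddle

x = paddle.to_tensor([-1.0, 0.0, 2.0])
paddle.nn.functional.relu_(x)    # inplace: x becomes [0., 0., 2.]
print(x.numpy())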
......@@ -127,7 +127,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
"\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
"this means the input data contains nested lists with different lengths. "
)
elif isinstance(data, paddle.Tensor):
elif isinstance(data, (paddle.Tensor, core.eager.Tensor)):
data = data._copy_to(place, False)
data = _handle_dtype(data, dtype)
data.stop_gradient = stop_gradient
......
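For context, a minimal sketch of the behaviour the widened isinstance check preserves: handing an existing Tensor to to_tensor copies it to the requested place and applies the requested stop_gradient, whether the input is a legacy VarBase or an eager Tensor.

import paddle

a = paddle.to_tensor([1.0, 2.0, 3.0])
b = paddle.to_tensor(a, place=paddle.CPUPlace(), stop_gradient=False)
print(b.place, b.stop_gradient)   # a CPU copy with gradients enabled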
......@@ -1735,12 +1735,12 @@ def tile(x, repeat_times, name=None):
Args:
x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
N-D Tensor. The data type is the same as ``x``.
N-D Tensor. The data type is the same as ``x``. The size of the i-th dimension is equal to ``x.shape[i] * repeat_times[i]``.
Examples:
.. code-block:: python
......@@ -1750,16 +1750,18 @@ def tile(x, repeat_times, name=None):
data = paddle.to_tensor([1, 2, 3], dtype='int32')
out = paddle.tile(data, repeat_times=[2, 1])
np_out = out.numpy()
# [[1, 2, 3], [1, 2, 3]]
# [[1, 2, 3]
# [1, 2, 3]]
out = paddle.tile(data, repeat_times=[2, 2])
out = paddle.tile(data, repeat_times=(2, 2))
np_out = out.numpy()
# [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
# [[1, 2, 3, 1, 2, 3]
# [1, 2, 3, 1, 2, 3]]
repeat_times = paddle.to_tensor([2, 1], dtype='int32')
repeat_times = paddle.to_tensor([1, 2], dtype='int32')
out = paddle.tile(data, repeat_times=repeat_times)
np_out = out.numpy()
# [[1, 2, 3], [1, 2, 3]]
# [[1, 2, 3, 1, 2, 3]]
"""
if paddle.in_dynamic_mode():
if _in_eager_mode():
......
......@@ -36,9 +36,9 @@
func : conj
- api : copy_to
args : (Tensor x, Backend backend, bool blocking)
args : (Tensor x, Place place, bool blocking)
output : Tensor
invoke : copy_to_impl(x, backend, blocking)
invoke : copy_to_impl(x, place, blocking)
- api : divide
args : (Tensor x, Tensor y)
......@@ -57,7 +57,7 @@
func : dot
- api : empty
args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
output: Tensor
infer_meta :
func : CreateInferMeta
......@@ -69,7 +69,7 @@
backend : place
- api : empty_like
args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {})
output: Tensor
infer_meta :
func : CreateLikeInferMeta
......@@ -89,7 +89,7 @@
func : flatten
- api : full
args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
output: Tensor
infer_meta :
func : CreateInferMeta
......@@ -101,7 +101,7 @@
backend : place
- api : full_like
args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {})
output: Tensor
infer_meta :
func : CreateLikeInferMeta
......@@ -138,7 +138,7 @@
func : multiply
- api : ones_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
output : Tensor
invoke : full_like(x, 1, dtype, place)
......@@ -219,7 +219,7 @@
data_type : x
- api : zeros_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {})
output : Tensor
invoke : full_like(x, 0, dtype, place)
......@@ -231,7 +231,7 @@
func : OneHotInferMeta
kernel :
func : one_hot
- api : digamma
args : (Tensor x)
output : Tensor
......@@ -1484,3 +1484,16 @@
kernel :
func : mv
backward : mv_grad
# =================================== sep0
# =================================== sep1
# =================================== sep2
# =================================== sep3
......@@ -99,7 +99,7 @@ class BaseAPI(object):
'double': 'double',
'bool': 'bool',
'str': 'const std::string&',
'Backend': 'Backend',
'Place': 'Place',
'DataLayout': 'DataLayout',
'DataType': 'DataType',
'int64[]': 'const std::vector<int64_t>&',
......@@ -118,7 +118,7 @@ class BaseAPI(object):
'float': 'paddle::optional<float>',
'double': 'paddle::optional<double>',
'bool': 'paddle::optional<bool>',
'Backend': 'paddle::optional<Backend>',
'Place': 'paddle::optional<Place>',
'DataLayout': 'paddle::optional<DataLayout>',
'DataType': 'paddle::optional<DataType>',
'int64[]': 'paddle::optional<std::vector<int64_t>>',
......@@ -327,9 +327,9 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
attr_layout_count = 0
attr_data_type_count = 0
for attr_name in attrs['names']:
if attrs['attr_info'][attr_name][0] == 'Backend':
if attrs['attr_info'][attr_name][0] == 'Place':
assert kernel['backend'] is not None, \
f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually."
attr_backend_count = attr_backend_count + 1
if attrs['attr_info'][attr_name][0] == 'DataLayout':
assert kernel['layout'] is not None, \
......@@ -348,8 +348,8 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
assert len(
vars_list
) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \
f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type."
kernel_select_code = kernel_select_code + f"""
kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
"""
......
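A hedged, stand-alone sketch of the rule enforced above (the function name and the simplified dict shapes are assumptions; the real BaseAPI class carries far more state):

def check_place_attr_backend(api, attrs, kernel):
    # If any attribute is declared with 'Place' type, the YAML entry must
    # also pin the kernel backend explicitly, as in `backend : place`.
    has_place_attr = any(
        attrs['attr_info'][name][0] == 'Place' for name in attrs['names'])
    if has_place_attr and kernel.get('backend') is None:
        raise AssertionError(
            f"{api} api: When there is a parameter with 'Place' type in "
            "attributes, you must set the kernel backend manually.")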
......@@ -56,6 +56,16 @@
kernel :
func : abs_grad
- backward_api : relu_grad
forward : relu (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : relu_grad
- backward_api : trunc_grad
forward : trunc (Tensor x) -> Tensor(out)
args : (Tensor out_grad)
......@@ -868,3 +878,16 @@
param : [x, vec]
kernel :
func : mv_grad
# =================================== sep0
# =================================== sep1
# =================================== sep2
# =================================== sep3
......@@ -282,6 +282,12 @@ packages=['paddle',
'paddle.distribution',
'paddle.distributed.sharding',
'paddle.distributed.fleet',
'paddle.distributed.launch',
'paddle.distributed.launch.context',
'paddle.distributed.launch.controllers',
'paddle.distributed.launch.job',
'paddle.distributed.launch.plugins',
'paddle.distributed.launch.utils',
'paddle.distributed.fleet.base',
'paddle.distributed.fleet.elastic',
'paddle.distributed.fleet.meta_optimizers',
......@@ -727,7 +733,7 @@ with redirect_stdout():
},
entry_points={
'console_scripts': [
'fleetrun = paddle.distributed.fleet.launch:launch'
'fleetrun = paddle.distributed.launch.__main__:launch'
]
},
classifiers=[
......