diff --git a/README.md b/README.md index cdbf2d9f3bf9973fb6c7fe2365ea61f05ce998c1..c4c5decec54300b2fc08cd64461cf371b3d2ce91 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,8 @@ English | [简体中文](./README_cn.md) Welcome to the PaddlePaddle GitHub. -PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. -PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. - +PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms. +PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4 million developers, 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. ## Installation diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 7f21bcee87ab705097d3c2beaf799e5f2d93b833..2af407f711ec10097e5692d7e189cf847bcfeaf1 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -395,7 +395,7 @@ std::shared_ptr ProcessGroupNCCL::Barrier( platform::CUDADeviceGuard gpuGuard; for (auto& place : places) { gpuGuard.SetDeviceIndex(place.GetDeviceId()); - auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU); + auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace()); barrierTensors.push_back(dt); } auto task = ProcessGroupNCCL::AllReduce(barrierTensors); @@ -417,7 +417,7 @@ void CheckTensorsInDifferentDevices(const std::vector& tensors, std::set used_devices; for (const auto& t : tensors) { - PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true, + PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true, platform::errors::InvalidArgument( "Tensors must be CUDA and dense tensor.")); diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index be4c5423943f5076201b75e307094c75d3d9c103..79961cca85ae03d654ea59b4e58c814dd2c4c543 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -321,7 +321,7 @@ EagerReducer::EagerReducer( if (find_unused_vars_each_step_) { global_used_vars_ = paddle::experimental::empty( ScalarArray({static_cast(tensors_.size())}), DataType::INT32, - TransToBackend(inner_place_)); + inner_place_); } } @@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups( } else { // process the dense gradient. InitializeDenseGroups(tensor_indices_, &group); - // experimental::Backend backend = TransToBackend(inner_place_); group.dense_contents_ = paddle::experimental::empty( - ScalarArray({group.all_length_}), group.dtype_, - TransToBackend(inner_place_)); + ScalarArray({group.all_length_}), group.dtype_, inner_place_); } // map tensors to this group by VariableLocator diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index b485beca57a214bc00cb813e9de6a53eca1e67ea..81ea92d1c3c4848ceb6ff3b333030a90eccc0b79 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -43,8 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue( const phi::DataType& dtype, const phi::DataLayout& layout, float value, bool is_leaf) { paddle::experimental::Tensor out = paddle::experimental::full( - phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, - phi::TransToPhiBackend(place)); + phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place); auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 5b7315e1ee7b1a2f67b1025bb4f47c0ea051f5d7..97114b9fe1189a9d9774eeb223e69926c3e11cbc 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -29,7 +29,7 @@ yaml_types_mapping = { 'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'str' : 'std::string', \ - 'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', @@ -56,6 +56,14 @@ def ParseArguments(): ################# ### Helpers ### ################# +def RecoverBaseNameOfInplaceFunction(function_name): + return function_name[:-1] + + +def GetInplacedFunctionName(function_name): + return function_name + "_" + + def FindGradName(string): return string + "_grad" @@ -149,6 +157,24 @@ def ReadBwdFile(filepath): ###################### ### Yaml Parsers ### ###################### +def ParseInplaceInfo(string): + # string: "(x -> out0), (y -> out2)" + inplace_map = {} + for pair in string.split(","): + pair = pair.strip() + if pair.startswith("("): + pair = pair[1:] + + if pair.endswith(")"): + pair = pair[:-1] + + key = pair.split("->")[0].strip() + val = pair.split("->")[1].strip() + inplace_map[key] = val + + return inplace_map + + def RemoveSpecialSymbolsInName(string): # Remove any name after '@' ret = string.split("@")[0] @@ -684,9 +710,10 @@ std::vector> {}::operator()(const std: def GenerateNodeCreationCodes( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, forward_attrs_list, forward_call_str, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs): + backward_grad_output_map, backward_attrs_list, optional_inputs, + inplace_map): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -724,19 +751,19 @@ def GenerateNodeCreationCodes( output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name) if num_fwd_outputs == 1: if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);" else: assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" else: # Tuple api_result if IsPlainTensorType(rtype): - output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" + output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));" else: assert IsVectorTensorType(rtype) - output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n" - output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + output_autograd_meta = f" std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n" + output_autograd_meta += f" std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" outputs_autograd_meta_list.append(output_autograd_meta) pass_stop_gradient_args_list.append(output_autograd_meta_name) @@ -745,16 +772,34 @@ def GenerateNodeCreationCodes( outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) + # Check Inplace + check_inplace_str = "" + bump_inplace_version_str = "" + for inplace_name in inplace_map.keys(): + inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name) + check_inplace_str += f""" + // Check Inplace + egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n +""" + + bump_inplace_version_str += f""" + // Bump Inplace Version + {inplace_name}.bump_inplace_version(); + VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n +""" + # Node Construction num_bwd_inputs = len(backward_grad_input_map.keys()) num_bwd_outputs = len(backward_grad_output_map.keys()) - grad_node_name = GetGradNodeName(fwd_api_name) - node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});" + grad_node_name = GetGradNodeName( + RecoverBaseNameOfInplaceFunction( + fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name) + node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});" # SetAttributes set_attributes_list = [] for name, _, _, _ in backward_attrs_list: - set_attributes = f" grad_node->SetAttribute{name}({name});" + set_attributes = f" grad_node->SetAttribute{name}({name});" set_attributes_list.append(set_attributes) set_attributes_str = "\n".join(set_attributes_list) @@ -765,9 +810,9 @@ def GenerateNodeCreationCodes( if is_fwd_input: if is_optional: - set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: if num_fwd_outputs > 1: # Aligned with forward output position @@ -778,9 +823,9 @@ def GenerateNodeCreationCodes( tw_name = f"api_result" if is_optional: - set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" + set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -789,8 +834,8 @@ def GenerateNodeCreationCodes( set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): input_autograd_meta_name = GetAutoGradMetaName(name) - set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" - set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" + set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});" + set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});" set_grad_out_meta_list.append(set_grad_out_meta) set_edges_list.append(set_edges) set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) @@ -804,14 +849,14 @@ def GenerateNodeCreationCodes( num_outputs = len(forward_outputs_position_map.keys()) for name, (_, pos) in forward_outputs_position_map.items(): output_autograd_meta_name = GetAutoGradMetaName(name) - set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" - set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" + set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});" + set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);" if num_outputs == 1: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" - set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);" + set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});" else: - set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" - set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" + set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));" + set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});" set_out_rank_list.append(set_out_rank) set_history_list.append(set_history) @@ -823,55 +868,64 @@ def GenerateNodeCreationCodes( set_grad_in_meta_str = "\n".join(set_grad_in_meta_list) set_retain_grad_str = "\n".join(set_retain_grad_list) + node_event_name = fwd_api_name + " node_creation" + NODE_CREATION_TEMPLATE = """ + paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n + """ + node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name) + NODE_CREATION_TEMPLATE = """ // Get AutoGradMeta -{} {} bool trace_backward = egr::Controller::Instance().HasGrad(); - bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({}); - if(require_any_grad) {{ - egr::EagerUtils::PassStopGradient({}); - - // Node Construction {} - - // SetAttributes + // Forward API Call + {} {} - - // SetTensorWrappers + {{ {} - - // SetGradOutMeta & SetEdges {} + if(require_any_grad) {{ + egr::EagerUtils::PassStopGradient({}); + + // Node Construction {} - - // SetOutRank & SetHistory & SetGradInMeta & RetainGrad + // SetAttributes {} + // SetTensorWrappers {} + // SetGradOutMeta & SetEdges {} {} - + // SetOutRank & SetHistory & SetGradInMeta & RetainGrad +{} +{} +{} +{} + }} }} """ node_creation_str = NODE_CREATION_TEMPLATE.format( - inputs_autograd_meta_str, outputs_autograd_meta_str, - compute_require_grad_args_str, pass_stop_gradient_args_str, - node_construction_str, set_attributes_str, set_tensor_wrappers_str, - set_grad_out_meta_str, set_edges_str, set_out_rank_str, set_history_str, - set_grad_in_meta_str, set_retain_grad_str) + inputs_autograd_meta_str, compute_require_grad_args_str, + check_inplace_str, forward_call_str, bump_inplace_version_str, + node_creation_event_str, outputs_autograd_meta_str, + pass_stop_gradient_args_str, node_construction_str, set_attributes_str, + set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str, + set_out_rank_str, set_history_str, set_grad_in_meta_str, + set_retain_grad_str) return node_creation_str -def GenerateForwardDefinition(fwd_api_name, bwd_api_name, - forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, - optional_inputs, intermediate_outputs): +def GenerateForwardDefinition( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs, inplace_map): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -895,7 +949,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, if is_optional: arg_str = f"const paddle::optional& {name}" else: - arg_str = f"const paddle::experimental::Tensor& {name}" + if inplace_map and name in inplace_map.keys(): + arg_str = f"paddle::experimental::Tensor& {name}" + else: + arg_str = f"const paddle::experimental::Tensor& {name}" else: assert IsVectorTensorType(ttype) arg_str = f"const std::vector& {name}" @@ -958,26 +1015,16 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, node_creation_str = GenerateNodeCreationCodes( fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, + forward_outputs_position_map, forward_attrs_list, forward_call_str, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, optional_inputs) - - node_event_name = fwd_api_name + " node_creation" - NODE_CREATION_TEMPLATE = """{{\n - paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n - {}\n - }}""" - node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name, - node_creation_str) + backward_grad_output_map, backward_attrs_list, optional_inputs, + inplace_map) dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);" FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ {} - - // Forward API Call - {} {} @@ -989,7 +1036,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name, forward_function_name = GetForwardFunctionName(fwd_api_name) forward_function_str = FORWARD_FUNCTION_TEMPLATE.format( returns_type_str, forward_function_name, inputs_args_definition_str, - dygraph_event_str, forward_call_str, node_creation_str, returns_str) + dygraph_event_str, node_creation_str, returns_str) forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});" return forward_function_str, forward_function_declaration_str @@ -1191,6 +1238,10 @@ if __name__ == "__main__": fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + inplace_map = {} + if 'inplace' in fwd_api.keys(): + inplace_map = ParseInplaceInfo(fwd_api['inplace']) + bwd_api_name = fwd_api['backward'] assert bwd_api_name in grad_api_dict.keys(), bwd_api_name bwd_api = grad_api_dict[bwd_api_name] @@ -1287,7 +1338,7 @@ if __name__ == "__main__": forward_outputs_position_map, orig_forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, backward_grad_output_map, backward_attrs_list, optional_inputs, - intermediate_outputs) + intermediate_outputs, {}) print("Generated Forward Definition: ", forward_definition_str) print("Generated Forward Declaration: ", forward_declaration_str) yaml_forward_definition_str += definition_declaration_pair[0] @@ -1298,6 +1349,30 @@ if __name__ == "__main__": forward_outputs_position_map, orig_forward_attrs_list) + # Inplaced Version Dygraph Function Generation + if fwd_api_name != "sum" and "inplace" in fwd_api.keys(): + fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name) + + # Node Definition Generation + definition_declaration_pair = GenerateForwardDefinition( + fwd_api_name_inplaced, bwd_api_name, + forward_inputs_position_map, forward_outputs_position_map, + forward_attrs_list, backward_fwd_input_map, + backward_grad_input_map, backward_grad_output_map, + backward_attrs_list, optional_inputs, intermediate_outputs, + inplace_map) + print("Generated Inplaced Forward Definition: ", + forward_definition_str) + print("Generated Inplaced Forward Declaration: ", + forward_declaration_str) + forward_definition_str += definition_declaration_pair[0] + forward_declaration_str += definition_declaration_pair[1] + + # For python-level API dispatch + CollectCoreOpsInformation( + fwd_api_name_inplaced, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list) + if len(namespace) > 0: forward_definition_str += f"""namespace {namespace} {{ {yaml_forward_definition_str} diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 203e0098a64d4aec79aff72454d1d11461ea068e..5a732212a5649b6899394b80399d3af32707197d 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -15,7 +15,7 @@ import os import argparse import logging -from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo ########################### ## Global Configurations ## @@ -43,7 +43,7 @@ atype_to_parsing_function = { "std::vector": "CastPyArg2Strings", "paddle::experimental::Scalar": "CastPyArg2Scalar", "paddle::experimental::ScalarArray": "CastPyArg2ScalarArray", - "paddle::experimental::Backend": "CastPyArg2Backend", + "paddle::experimental::Place": "CastPyArg2Place", "paddle::experimental::DataType": "CastPyArg2DataType", } @@ -71,6 +71,14 @@ RECORD_EVENT_TEMPLATE = \ " paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);" +RETURN_INPLACE_PYOBJECT_TEMPLATE = \ +""" + ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info, \"final_state_{}\", \"{}\"); + ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info, \"final_state_{}\", \"{}\"); + return ToPyObject(out, return_id, args, arg_id); +""" + + PYTHON_C_FUNCTION_TEMPLATE = \ """ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs) @@ -94,7 +102,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj PyEval_RestoreThread(tstate); tstate = nullptr; - return ToPyObject(out); +{} }} catch(...) {{ if (tstate) {{ @@ -287,9 +295,10 @@ class PythonCSingleFunctionGenerator: self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap( forward_inputs_list, forward_returns_list) - def GeneratePythonCFunction(self): + def GeneratePythonCFunction(self, inplace_map): namespace = self.namespace - forward_api_name = self.forward_api_name + forward_api_name = GetInplacedFunctionName( + self.forward_api_name) if inplace_map else self.forward_api_name forward_attrs_list = self.forward_attrs_list forward_inputs_position_map = self.forward_inputs_position_map forward_outputs_position_map = self.forward_outputs_position_map @@ -341,19 +350,31 @@ class PythonCSingleFunctionGenerator: fwd_function_name = FUNCTION_NAME_TEMPLATE.format( "::", namespace, GetForwardFunctionName(forward_api_name)) + if inplace_map: + assert len( + inplace_map + ) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}" + for inplace_input, inplace_output in inplace_map.items(): + return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format( + forward_api_name, inplace_input, forward_api_name, + inplace_output) + break + else: + return_str = " return ToPyObject(out);" + # Generate Record Event for performance profiling pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format( "pythonc_record_event", forward_api_name, "pybind_imperative_func") self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format( forward_api_name, pythonc_record_event_str, forward_api_name, get_eager_tensor_str, parse_attributes_str, fwd_function_name, - dygraph_function_call_str) + dygraph_function_call_str, return_str) # Generate Python-C Function Registration self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format( forward_api_name, namespace, forward_api_name, forward_api_name) - def run(self): + def run(self, inplace_map): # Initialized is_forward_only self.CollectIsForwardOnly() @@ -384,7 +405,7 @@ class PythonCSingleFunctionGenerator: ) # Code Generation - self.GeneratePythonCFunction() + self.GeneratePythonCFunction(inplace_map) logging.info( f"Generated Python-C Function: {self.python_c_function_str}") logging.info( @@ -416,12 +437,23 @@ class PythonCYamlGenerator: for forward_api_content in forward_api_list: f_generator = PythonCSingleFunctionGenerator(forward_api_content, namespace) - status = f_generator.run() + status = f_generator.run({}) if status == True: self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n" self.python_c_functions_str += f_generator.python_c_function_str + "\n" + if 'inplace' in forward_api_content.keys(): + inplace_map = ParseInplaceInfo(forward_api_content['inplace']) + + f_generator_inplace = PythonCSingleFunctionGenerator( + forward_api_content, namespace) + status = f_generator_inplace.run(inplace_map) + + if status == True: + self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n" + self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n" + def InferNameSpace(self): yaml_path = self.yaml_path if "sparse" in yaml_path: diff --git a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc index c8b2d22dcf95139db47704be86a6f64554f7c0ba..5fec38bf25a433542e09e0eec0cb52c61cff7167 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/eager_tensor_test.cc @@ -90,7 +90,7 @@ TEST(Tensor, MemberFunction) { VLOG(6) << "Set impl"; CHECK_EQ(et3.initialized(), true); CHECK_EQ(et3.is_cpu(), true); - CHECK_EQ(et3.is_cuda(), false); + CHECK_EQ(et3.is_gpu(), false); CHECK_EQ(et3.numel(), 2); auto expected_dim = phi::make_ddim({1, 2}); CHECK_EQ(et3.dims(), expected_dim); diff --git a/paddle/fluid/operators/graph_send_recv_op.cc b/paddle/fluid/operators/graph_send_recv_op.cc index f7c006dbcb1a9a23ec619c8d790df8a093530eee..f67dea7402864b324756914e0ad23c7d87b8e867 100644 --- a/paddle/fluid/operators/graph_send_recv_op.cc +++ b/paddle/fluid/operators/graph_send_recv_op.cc @@ -38,7 +38,7 @@ class GraphSendRecvGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto in_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), in_dims); } @@ -68,6 +68,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker { "tensors of Dst_index.") .SetDefault("SUM") .InEnum({"SUM", "MEAN", "MIN", "MAX"}); + AddAttr( + "out_size", + "(int64_t, default 0)" + "Define the first dimension of Output tensor." + "If set default 0, then the shape of Out is the same with X.") + .SetDefault(0); AddComment(R"DOC( Graph Learning Send_Recv combine operator. @@ -93,6 +99,7 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { op->SetType("graph_send_recv_grad"); op->SetInput("Src_index", this->Input("Src_index")); op->SetInput("Dst_index", this->Input("Dst_index")); + op->SetInput("X", this->Input("X")); if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") { op->SetInput("Dst_count", this->Output("Dst_count")); @@ -100,7 +107,6 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker { if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" || BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") { - op->SetInput("X", this->Input("X")); op->SetInput("Out", this->Output("Out")); } diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index ab8c50d90b8ece68b8e4e05d46cecd13fa84d7e0..c08f1920205daa7f6d5d3f032afcf5d832230ced 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -152,6 +152,21 @@ __device__ __forceinline__ void ThreadReduce(phi::Array arrs, } } +template +__device__ __forceinline__ void ReduceMeanAndVar(T* mean, T* var, T x_mean, + T x_var, int size) { + const int nc = blockIdx.x; + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[nc] = static_cast(x_mean / size); + var[nc] = static_cast(x_var / size); + } +} + template __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { int i = blockIdx.x; @@ -162,10 +177,7 @@ __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { x_mean += val; x_var += val * val; } - x_mean /= size; - x_var /= size; - CudaAtomicAddWithWarp(&mean[i], x_mean); - CudaAtomicAddWithWarp(&var[i], x_var); + ReduceMeanAndVar(mean, var, x_mean, x_var, size); } template @@ -174,21 +186,12 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, int i = blockIdx.x; AccT x_mean = static_cast(0); AccT x_var = static_cast(0); - const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * size; + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); phi::Array ins; ins[0] = x; ThreadReduce(ins, size, input_offset, &x_mean, &x_var); - - x_mean = kps::details::BlockXReduce>( - x_mean, kps::AddFunctor()); - x_var = kps::details::BlockXReduce>( - x_var, kps::AddFunctor()); - __syncthreads(); - if (threadIdx.x == 0) { - mean[i] = static_cast(x_mean / size); - var[i] = static_cast(x_var / size); - } + ReduceMeanAndVar(mean, var, x_mean, x_var, size); } template @@ -272,10 +275,6 @@ class GroupNormKernel auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); - - set_zero(dev_ctx, mean, static_cast(0)); - set_zero(dev_ctx, &temp_var, static_cast(0)); - auto* x_data = x->data(); auto* y_data = y->data(); auto* mean_data = mean->data(); @@ -319,7 +318,7 @@ class GroupNormKernel block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); dim3 grids(x_dims[0] * groups); dim3 blocks(block_size_nchw); - if (size < vec_size) { + if (size < vec_size * block_size_nchw) { ScalarGetMeanAndVarNCHW<<>>( x_data, mean_data, temp_var_data, size); } else { @@ -328,6 +327,8 @@ class GroupNormKernel x_data, mean_data, temp_var_data, size); } } else { + set_zero(dev_ctx, mean, static_cast(0)); + set_zero(dev_ctx, &temp_var, static_cast(0)); GroupNormForwardGetMeanAndVar<<>>( x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, temp_var_data); @@ -424,24 +425,15 @@ __global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, int i = blockIdx.x; AccT ds_sum = static_cast(0); AccT db_sum = static_cast(0); - const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); x += i * imsize; + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); phi::Array ins; ins[0] = x; ins[1] = dy; ThreadReduce(ins, imsize, input_offset, &db_sum, &ds_sum); - - ds_sum = kps::details::BlockXReduce>( - ds_sum, kps::AddFunctor()); - db_sum = kps::details::BlockXReduce>( - db_sum, kps::AddFunctor()); - __syncthreads(); - if (threadIdx.x == 0) { - ds[i] = ds_sum; - db[i] = db_sum; - } + ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); } template @@ -455,8 +447,7 @@ __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy, ds_sum += dy[index] * x[index]; db_sum += dy[index]; } - CudaAtomicAddWithWarp(&ds[nc], ds_sum); - CudaAtomicAddWithWarp(&db[nc], db_sum); + ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); } template @@ -641,13 +632,7 @@ class GroupNormGradKernel } block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); dim3 blocks(block_size_nchw); - if (imsize < vec_size) { - if (d_scale) { - set_zero(dev_ctx, d_scale, static_cast(0)); - } - if (d_bias) { - set_zero(dev_ctx, d_bias, static_cast(0)); - } + if (imsize < vec_size * block_size_nchw) { ScalarGetDsDbCUDAKernel< T><<>>( imsize, x_data, dy_data, ds_data, db_data); @@ -687,7 +672,6 @@ class GroupNormGradKernel imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data, dy_data, d_x_data); } - } else { if (d_scale) { set_zero(dev_ctx, d_scale, static_cast(0)); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 65cca94814e88111239aef3559285d6fe321a72d..ff1ddb4175feff0b059c7ba43e77b306197a11a0 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -265,67 +265,6 @@ class ReduceKernel : public framework::OpKernel { framework::TransToPhiDataType(cast_out_dtype), output); } }; -template -class BoolReduceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - // The dims has full dim, set the reduce_all is True - const auto& input_dim_size = context.Input("X")->dims().size(); - std::set dims_set(dims.begin(), dims.end()); - bool full_dim = true; - for (auto i = 0; i < input_dim_size; i++) { - if (dims_set.find(i) == dims_set.end()) { - full_dim = false; - break; - } - } - reduce_all = (reduce_all || full_dim); - - if (reduce_all) { - // Flatten and reduce 1-D tensor - auto x = EigenVector::Flatten(*input); - auto out = EigenScalar::From(*output); - auto& place = - *context.template device_context().eigen_device(); - auto reduce_dim = Eigen::array({{0}}); - Functor functor; - functor(place, &x, &out, reduce_dim); - } else { - int ndim = input->dims().size(); - int rdim = dims.size(); - // comments for accelerating compiling temporarily. - if (ndim > 6) { - HandleLargeDim(context, input, output, - dims, keep_dim); - } else { - HANDLE_DIM(6, 5); - HANDLE_DIM(6, 4); - HANDLE_DIM(6, 3); - HANDLE_DIM(6, 2); - HANDLE_DIM(6, 1); - HANDLE_DIM(5, 4); - HANDLE_DIM(5, 3); - HANDLE_DIM(5, 2); - HANDLE_DIM(5, 1); - HANDLE_DIM(4, 3); - HANDLE_DIM(4, 2); - HANDLE_DIM(4, 1); - HANDLE_DIM(3, 2); - HANDLE_DIM(3, 1); - HANDLE_DIM(2, 1); - HANDLE_DIM(1, 1); - } - } - } -}; template void LaunchReduceGradKernel(const framework::ExecutionContext& context, diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 1052f93d32ec3cb626577c4b584cc6172c83da2e..75f0babdfe85b61ae2a7fef7f026dc924dc9e511 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -132,8 +132,7 @@ void InitTensorWithTensor(TensorObject* self, self->tensor.set_impl(impl); VLOG(4) << "Same place, do ShareDataWith"; } else { - self->tensor.set_impl( - src.copy_to(phi::TransToPhiBackend(place), true).impl()); + self->tensor.set_impl(src.copy_to(place, true).impl()); VLOG(4) << "Different place, do TensorCopy"; } if (src.get_autograd_meta()) { @@ -156,8 +155,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self, } else { auto temp = paddle::experimental::Tensor(std::make_shared(src)); - self->tensor.set_impl( - temp.copy_to(phi::TransToPhiBackend(place), true).impl()); + self->tensor.set_impl(temp.copy_to(place, true).impl()); VLOG(4) << "Different place, do TensorCopy"; } egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 528bd75eb0013b95057d7549e083b2fa1318cac1..f3c48309e69fe8b40099b076f673e4ba8c8bcabd 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -159,7 +159,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - dst = src.copy_to(phi::TransToPhiBackend(place), blocking); + dst = src.copy_to(place, blocking); egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 49745e5679d9af3c8cf07ba0cac679217a691052..cce663e410ddb5d4d987e58d16d578ec85ee6142 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -175,7 +175,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, nullptr); - if (self->tensor.is_cpu()) { + if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) { auto dense_tensor = std::dynamic_pointer_cast(self->tensor.impl()); platform::CPUPlace place; @@ -184,7 +184,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, pybind11::detail::array_proxy(array)->data), place, dense_tensor->data(), sizeof_dtype * numel); #if defined(PADDLE_WITH_CUDA) - } else if (self->tensor.is_cuda()) { + } else if (self->tensor.is_gpu()) { auto dense_tensor = std::dynamic_pointer_cast(self->tensor.impl()); @@ -218,8 +218,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, EAGER_TRY auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); - auto cp_tensor = - self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); + auto cp_tensor = self->tensor.copy_to(place, blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( @@ -231,8 +230,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY - auto cp_tensor = - self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true); + auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 24bb2c9e65cde6909d5f934eb41b0a363044dde9..2e884b212aff307d8f83815f8edac37cb884f3f0 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -929,28 +929,10 @@ std::vector GetScopePtrListFromArgs( return result; } -paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos) { - if (obj == Py_None) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s(): argument (position %d) must be " - "int or place, but got %s", - op_type, arg_pos + 1, - ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT - } - - PyTypeObject* type = obj->ob_type; - auto type_name = std::string(type->tp_name); - if (type_name == "int") { - int value = CastPyArg2Int(obj, op_type, arg_pos); - return static_cast(value); - } else { - platform::Place place = CastPyArg2Place(obj, arg_pos); - return phi::TransToPhiBackend(place); - } - - return paddle::experimental::Backend::CPU; +paddle::experimental::Place CastPyArg2Place(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos) { + return CastPyArg2Place(obj, arg_pos); } paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index fba1485bcf44ea70db286225fbbe3c70caceb4bd..3500082ba645f35b0f9587985af63a311d12128f 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -154,9 +154,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj, paddle::experimental::ScalarArray CastPyArg2ScalarArray( PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::experimental::Backend CastPyArg2Backend(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos); +paddle::experimental::Place CastPyArg2Place(PyObject* obj, + const std::string& op_type, + ssize_t arg_pos); paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index c58ebe69523eb97a4357bea061de64e5e01ec181..6fab6643f398dca696640b184c76ec90dec342cd 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -31,7 +31,6 @@ using gpuStream_t = hipStream_t; #include "paddle/phi/api/ext/dll_decl.h" #include "paddle/phi/api/ext/place.h" -#include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/common/place.h" @@ -269,12 +268,20 @@ class PADDLE_API Tensor final { bool is_cpu() const; /** - * @brief Determine whether the tensor device is CUDA + * @brief Determine whether the tensor device is GPU * * @return true * @return false */ - bool is_cuda() const; + bool is_gpu() const; + + /** + * @brief Determine whether the tensor device is GPU_PINNED + * + * @return true + * @return false + */ + bool is_gpu_pinned() const; /* Part 4: Data Access methods */ @@ -407,11 +414,11 @@ class PADDLE_API Tensor final { /** * @brief Transfer the current Tensor to the specified device and return. * - * @param backend, The target backend of which the tensor will copy to. + * @param place, The target place of which the tensor will copy to. * @param blocking, Should we copy this in sync way. * @return Tensor */ - Tensor copy_to(Backend backend, bool blocking) const; + Tensor copy_to(Place place, bool blocking) const; /** * @brief Transfer the source Tensor to current Tensor. diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index fc1afb26bf4143e5c75398b3dc1042581e1f1546..98f28ddcbdb3362cfbde9dc53176a78b9b952576 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/infermeta/binary.h" @@ -31,9 +32,10 @@ limitations under the License. */ namespace paddle { namespace experimental { -Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { +Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); + kernel_key_set.backend_set = + kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); @@ -57,8 +59,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { phi::DenseTensor*); auto* kernel_fn = kernel.GetVariadicKernelFn(); - (*kernel_fn)( - *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out); + (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out); return out; } diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 5acb68a3281332565d0b094a37fc8ee38c4904ab..48eda2d954647e8c2786b29b615cfc6166734bca 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -15,15 +15,14 @@ limitations under the License. */ #pragma once #include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" namespace paddle { namespace experimental { -// TODO(chenweihang): Replace backend by place when place is ready -Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking); +Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); std::vector split_impl(const Tensor& x, const ScalarArray& num_or_sections, diff --git a/paddle/phi/api/lib/backend_set.h b/paddle/phi/api/lib/backend_set.h index 88f7b086715d6535be1d5b30e02497c89026fb6f..2aa4f969221d9c0f8936e7fc7c3a41cc04a63454 100644 --- a/paddle/phi/api/lib/backend_set.h +++ b/paddle/phi/api/lib/backend_set.h @@ -35,7 +35,7 @@ class BackendSet final { : bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast(b) - 1)) {} - uint64_t bitset() const { return bitset_; } + inline uint64_t bitset() const { return bitset_; } bool inline Has(Backend b) const { PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED."); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 8bf5f3b481a0e041b439ffd99a8ac017f4aae50e..7d886e50dbc23ef20557956ae1ae9503fbefbb45 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -39,7 +39,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - !platform::is_same_place(input, phi::TransToPhiPlace(target)); + phi::TransToPhiBackend(input) != target; return ret; } @@ -180,21 +180,20 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { const auto& tensor_in = input.impl(); - VLOG(6) << tensor_in->dtype() << "\t" << target_args_def.dtype; - if (!transform_flag.NeedTransform() || !tensor_in->initialized() || + phi::DenseTensor& dense_tensor = + *static_cast(tensor_in.get()); + if (!transform_flag.NeedTransform() || !dense_tensor.initialized() || (!NeedTransformPlace( - tensor_in->place(), target_args_def.backend, transform_flag) && + dense_tensor.place(), target_args_def.backend, transform_flag) && !NeedTransformDataType( - tensor_in->dtype(), target_args_def.dtype, transform_flag) && + dense_tensor.dtype(), target_args_def.dtype, transform_flag) && !NeedTransformLayout( - tensor_in->layout(), target_args_def.layout, transform_flag))) { + dense_tensor.layout(), target_args_def.layout, transform_flag))) { return std::static_pointer_cast(tensor_in); } phi::DenseTensor out = - TransformData(*(static_cast(tensor_in.get())), - target_args_def, - transform_flag); + TransformData(dense_tensor, target_args_def, transform_flag); return std::make_shared(std::move(out)); } diff --git a/paddle/phi/api/lib/data_type_set.h b/paddle/phi/api/lib/data_type_set.h index ecc1b37c3a6af0d55f1cf71e860fd87657ed759b..4b5e6bde247002ca96a964f8739ea06a2036d580 100644 --- a/paddle/phi/api/lib/data_type_set.h +++ b/paddle/phi/api/lib/data_type_set.h @@ -30,7 +30,7 @@ class DataTypeSet final { ? 0 : 1ULL << (static_cast(dtype) - 1)) {} - uint64_t bitset() const { return bitset_; } + inline uint64_t bitset() const { return bitset_; } bool inline Has(DataType dtype) const { PD_CHECK(dtype != DataType::UNDEFINED, diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 5e334b9b727dc19f6bba7dc8c9c15b2414d13abb..c2f7a7981f001119608c276a643a68199d0df0c4 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -16,13 +16,16 @@ limitations under the License. */ #include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/core/compat/convert_utils.h" +#ifdef _MSC_VER +#include +#endif namespace paddle { namespace experimental { namespace detail { -BackendSet GetTensorBackendSet(const Tensor& t) { - BackendSet backend_set(phi::TransToPhiBackend(t.inner_place())); +BackendSet GetTensorBackendSet(const phi::TensorBase& t) { + BackendSet backend_set(phi::TransToPhiBackend(t.place())); switch (t.layout()) { case DataLayout::MKLDNN: backend_set = backend_set | BackendSet(Backend::MKLDNN); @@ -35,6 +38,11 @@ BackendSet GetTensorBackendSet(const Tensor& t) { } std::size_t CountLeadingZeros(uint64_t val) { +#if defined(__clang__) || defined(__GNUC__) + return __builtin_clzl(val); +#elif defined(_MSC_VER) + return __lzcnt64(val); +#else if (val == 0) { return 64; } @@ -48,6 +56,7 @@ std::size_t CountLeadingZeros(uint64_t val) { } } return zero_bits; +#endif } } // namespace detail @@ -82,13 +91,17 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) { return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor); } -Backend ParseBackend(Backend backend) { return backend; } +Backend ParseBackend(const Place& place) { + return phi::TransToPhiBackend(place); +} Backend ParseBackend(const Tensor& tensor) { return phi::TransToPhiBackend(tensor.inner_place()); } -Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) { - return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor); +Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) { + return place.GetType() != phi::AllocationType::UNDEFINED + ? ParseBackend(place) + : ParseBackend(tensor); } DataLayout ParseLayout(DataLayout layout) { return layout; } diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index 9a09bc2183ad73857d5afee8909d957e65c5a664..25b74e7fe31b955a8b25529871c503179b302629 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -33,7 +33,7 @@ namespace paddle { namespace experimental { namespace detail { -BackendSet GetTensorBackendSet(const Tensor& t); +BackendSet GetTensorBackendSet(const phi::TensorBase& t); std::size_t CountLeadingZeros(uint64_t val); } // namespace detail @@ -93,11 +93,13 @@ struct KernelKeyParser : ArgsIterator { // TODO(chenweihang): deal with multiple diff input Tensors // TODO(chenweihang): add global device guard method to set backend void operator()(const Tensor& x) { - key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x); - // TODO(chenweihang): selecte multi layout and dtype - key_set.layout = x.layout(); - key_set.dtype = x.type(); - dtype_set = dtype_set | DataTypeSet(x.dtype()); + const phi::TensorBase& tensor = *x.impl(); + key_set.backend_set = + key_set.backend_set | detail::GetTensorBackendSet(tensor); + // TODO(chenweihang): select multi layout and dtype + key_set.layout = tensor.layout(); + key_set.dtype = tensor.dtype(); + dtype_set = dtype_set | DataTypeSet(key_set.dtype); auto promote_result = PromoteTypes(dtype_set); if (promote_result != DataType::UNDEFINED) { key_set.dtype = promote_result; @@ -105,11 +107,12 @@ struct KernelKeyParser : ArgsIterator { } void operator()(const std::vector& x) { + const phi::TensorBase& tensor = *x.at(0).impl(); key_set.backend_set = - key_set.backend_set | detail::GetTensorBackendSet(x[0]); - // TODO(chenweihang): selecte multi layout and dtype - key_set.layout = x[0].layout(); - key_set.dtype = x[0].type(); + key_set.backend_set | detail::GetTensorBackendSet(tensor); + // TODO(chenweihang): select multi layout and dtype + key_set.layout = tensor.layout(); + key_set.dtype = tensor.dtype(); } // skip other type args, these args don't used in kernel selection @@ -154,7 +157,7 @@ DataType ParseDataType(const Tensor& tensor); DataType ParseDataType(const std::vector& tensors); DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor); -Backend ParseBackend(Backend backend); +Backend ParseBackend(const Place& place); Backend ParseBackend(const Tensor& tensor); template Backend ParseBackend(T t, Args... args) { @@ -163,7 +166,7 @@ Backend ParseBackend(T t, Args... args) { return static_cast(64 - detail::CountLeadingZeros(backend_set.bitset())); } -Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor); +Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor); DataLayout ParseLayout(DataLayout layout); DataLayout ParseLayout(const Tensor& tensor); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 066287d4244797e316a34d524eca171e94afcfdf..b9b6ca36f673b03ba0b548424711833f21e612a9 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -163,10 +163,14 @@ bool Tensor::is_cpu() const { return paddle::platform::is_cpu_place(inner_place()); } -bool Tensor::is_cuda() const { +bool Tensor::is_gpu() const { return paddle::platform::is_gpu_place(inner_place()); } +bool Tensor::is_gpu_pinned() const { + return paddle::platform::is_cuda_pinned_place(inner_place()); +} + /* Part 4: Data Access methods */ template diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index cc797507e68ec11005d6ac35d5dca2d19418598d..c6214052f7bc30ef8bf801fe20171256d8d0b142 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -27,14 +27,14 @@ namespace paddle { namespace experimental { // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); -Tensor copy_to(const Tensor &x, Backend backend, bool blocking); +Tensor copy_to(const Tensor &x, Place place, bool blocking); Tensor Tensor::cast(DataType target_type) const { return experimental::cast(*this, target_type); } -Tensor Tensor::copy_to(Backend backend, bool blocking) const { - return experimental::copy_to(*this, backend, blocking); +Tensor Tensor::copy_to(Place place, bool blocking) const { + return experimental::copy_to(*this, place, blocking); } template @@ -44,7 +44,7 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const { "`copy_to` method without template argument instead. " "reason: copying a Tensor to another device does not need " "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); + return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false); } template PADDLE_API Tensor diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index 53ddd499a7e24e09ded721619407b8679cab7628..4c6d47597bd2cc12c397794eb575a74960542a6d 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -203,5 +203,10 @@ namespace paddle { namespace experimental { using AllocationType = phi::AllocationType; using Place = phi::Place; +using CPUPlace = phi::CPUPlace; +using GPUPlace = phi::GPUPlace; +using GPUPinnedPlace = phi::GPUPinnedPlace; +using XPUPlace = phi::XPUPlace; +using NPUPlace = phi::NPUPlace; } // namespace experimental } // namespace paddle diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 67245f1da5a6b65cdf81d2cd329d7d6b3828852e..667cee10675d8a6b756a6af701b65c63b3b359de 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -26,13 +26,14 @@ limitations under the License. */ namespace phi { Backend TransToPhiBackend(const phi::Place& place) { - if (place.GetType() == phi::AllocationType::CPU) { + auto allocation_type = place.GetType(); + if (allocation_type == phi::AllocationType::CPU) { return Backend::CPU; - } else if (place.GetType() == phi::AllocationType::GPU) { + } else if (allocation_type == phi::AllocationType::GPU) { return Backend::GPU; - } else if (place.GetType() == phi::AllocationType::XPU) { + } else if (allocation_type == phi::AllocationType::XPU) { return Backend::XPU; - } else if (place.GetType() == phi::AllocationType::CUSTOM) { + } else if (allocation_type == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType())); diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index 556fb874470dd248dcf7c77e8a8ac3510bd6f63e..a72b8d913f81d721bb02116b84306fff401a8873 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -145,6 +145,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& pool_type, + int64_t out_size, MetaTensor* out, MetaTensor* dst_count) { auto src_index_dims = src_index.dims(); @@ -187,11 +188,23 @@ void GraphSendRecvInferMeta(const MetaTensor& x, "Src_index and Dst_index should have the same shape.")); auto dims = x.dims(); - out->set_dims(dims); + if (out_size <= 0) { + out->set_dims(dims); + } else { + std::vector dims_ = phi::vectorize(dims); + if (dims_.size() > 0) { + dims_[0] = out_size; + } + out->set_dims(phi::make_ddim(dims_)); + } out->set_dtype(x.dtype()); if (pool_type == "MEAN") { - dst_count->set_dims({dims[0]}); + if (out_size <= 0) { + dst_count->set_dims({dims[0]}); + } else { + dst_count->set_dims({out_size}); + } dst_count->set_dtype(DataType::INT32); } } diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 42a0f35dc1d8d6aef13b631d355a4cee951a4ed1..8521a1ee855c66aa0f2adcf22dcdbe2ca7218cbe 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -51,6 +51,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x, const MetaTensor& src_index, const MetaTensor& dst_index, const std::string& pool_type, + int64_t out_size, MetaTensor* out, MetaTensor* dst_count); diff --git a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc index 8538461b1b83b887223224bac231600221547f11..6a83cee1ae40d1dec6c3d7d2f4499d7eb71390f9 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_grad_kernel.cc @@ -23,15 +23,14 @@ namespace phi { template -void GraphSendRecvCpuGradLoop(const int& input_size, - const int& index_size, +void GraphSendRecvCpuGradLoop(const int& index_size, const IndexT* s_index, const IndexT* d_index, const DenseTensor& src, + const DenseTensor& input, DenseTensor* dst, const std::string& pool_type, const int* dst_count = nullptr, - const DenseTensor* input = nullptr, const DenseTensor* output = nullptr) { if (pool_type == "SUM") { Functor functor; @@ -55,7 +54,7 @@ void GraphSendRecvCpuGradLoop(const int& input_size, for (int i = 0; i < index_size; ++i) { const IndexT& forward_src_idx = d_index[i]; const IndexT& forward_dst_idx = s_index[i]; - auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1); + auto input_slice = input.Slice(forward_src_idx, forward_src_idx + 1); auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1); auto eigen_input = phi::EigenVector::Flatten(input_slice); auto eigen_output = phi::EigenVector::Flatten(output_slice); @@ -73,18 +72,18 @@ template void GraphSendRecvGradOpKernelLaunchHelper( const Context& ctx, const DenseTensor& out_grad, + const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, DenseTensor* x_grad, const DenseTensor* dst_count = nullptr, - const DenseTensor* x = nullptr, const DenseTensor* out = nullptr) { const int& index_size = dst_index.dims()[0]; ctx.template Alloc(x_grad); T* p_output = x_grad->data(); - const auto& src_dims = out_grad.dims(); + const auto& src_dims = x.dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; const size_t& memset_bytes = memset_size * sizeof(T); @@ -97,29 +96,22 @@ void GraphSendRecvGradOpKernelLaunchHelper( if (pool_type == "SUM") { GraphSendRecvCpuGradLoop>( - src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type); + index_size, d_index, s_index, out_grad, x, x_grad, pool_type); } else if (pool_type == "MEAN") { const int* s_count = dst_count->data(); // Functor not used here. - GraphSendRecvCpuGradLoop>(src_dims[0], - index_size, - d_index, - s_index, - out_grad, - x_grad, - pool_type, - s_count); + GraphSendRecvCpuGradLoop>( + index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count); } else if (pool_type == "MIN" || pool_type == "MAX") { // Functor not used here. - GraphSendRecvCpuGradLoop>(src_dims[0], - index_size, + GraphSendRecvCpuGradLoop>(index_size, d_index, s_index, out_grad, + x, x_grad, pool_type, nullptr, - x, out); } } @@ -127,7 +119,7 @@ void GraphSendRecvGradOpKernelLaunchHelper( template void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& out_grad, - paddle::optional x, + const DenseTensor& x, paddle::optional out, const DenseTensor& src_index, const DenseTensor& dst_index, @@ -139,23 +131,23 @@ void GraphSendRecvGradKernel(const Context& ctx, GraphSendRecvGradOpKernelLaunchHelper( ctx, out_grad, + x, src_index, dst_index, pool_type, x_grad, dst_count.get_ptr(), - x.get_ptr(), out.get_ptr()); } else if (index_type == phi::DataType::INT64) { GraphSendRecvGradOpKernelLaunchHelper( ctx, out_grad, + x, src_index, dst_index, pool_type, x_grad, dst_count.get_ptr(), - x.get_ptr(), out.get_ptr()); } } diff --git a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc index fecbd4b1d7aa05c94d2cb713f302abd6f2aeb1c4..8f71ba12cc4fa25bc20e337bf3986518e3ce3672 100644 --- a/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_recv_kernel.cc @@ -83,6 +83,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, + int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; @@ -91,7 +92,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx, T* p_output = out->data(); const auto& src_dims = x.dims(); int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i]; + if (out_size <= 0) { + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + } else { + memset_size = out_size; + for (int i = 1; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + } const size_t& memset_bytes = memset_size * sizeof(T); memset(p_output, 0, memset_bytes); @@ -129,15 +139,16 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, + int64_t out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendRecvOpKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out, dst_count); + ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); } else if (index_type == phi::DataType::INT64) { GraphSendRecvOpKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out, dst_count); + ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); } } diff --git a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc index 78a7ae8d415b5d4b18fdf8e469576db50f739e38..4b3b1fc16e9c426daccb6364f926a7c44ab63e59 100644 --- a/paddle/phi/kernels/cpu/reduce_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_grad_kernel.cc @@ -99,8 +99,8 @@ void ReduceSumGradKernel(const Context& dev_ctx, ReduceGradKernel(dev_ctx, x, - out_grad, paddle::none, + out_grad, dims, keep_dim, reduce_all, @@ -121,8 +121,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { ReduceGradKernel(dev_ctx, x, - out_grad, paddle::none, + out_grad, dims, keep_dim, reduce_all, diff --git a/paddle/phi/kernels/funcs/mode.h b/paddle/phi/kernels/funcs/mode.h index 1b7641762e2639acf3db540280891b518f22eed2..3bd6c19545e16c1bca2881c4804db43af2b0b227 100644 --- a/paddle/phi/kernels/funcs/mode.h +++ b/paddle/phi/kernels/funcs/mode.h @@ -14,7 +14,7 @@ #pragma once -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(__NVCC__) || defined(__HIPCC__) #include #include #include @@ -143,7 +143,7 @@ static void ModeAssign(const Type& input_height, } } -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(__NVCC__) || defined(__HIPCC__) template static void GetModebySort(const phi::GPUContext& dev_ctx, const DenseTensor* input_tensor, diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 3a1d9b8ea7a7a36c31f31f7fc60dffc1f827d34e..16e00414ad772a162d33fb18c0dfafc889e53951 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -168,57 +168,82 @@ __global__ void CumsumOneBlock( } } +// where_index template + int MaskData> struct SelectCaller { - __device__ void inline operator()(OutT *store_data, + __device__ void inline operator()(OutT *out, const MT *mask_data, const InT *in, Functor func, - int num, - int data_offset) { - // where_index op - IdT index_reg[VecSize]; - // Set data index of global - kps::InitWithDataIndex(&index_reg[0], data_offset); + int data_offset, + int store_num, + int thread_fix, + int num) { + int64_t in_data[VecSize]; + OutT store_data[VecSize * phi::DDim::kMaxRank]; + // set index + kps::InitWithDataIndex(&in_data[0], data_offset); // Get store data according to mask_idt - kps::OperatorTernary( - store_data, mask_data, &index_reg[0], func, VecSize); + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); } }; +// masked_select template -struct SelectCaller { // masked_select - __device__ void inline operator()(OutT *store_data, +struct SelectCaller { + __device__ void inline operator()(OutT *out, const MT *mask_data, const InT *in, Functor func, - int num, - int data_offset) { + int data_offset, + int store_num, + int thread_fix, + int num) { InT in_data[VecSize]; + OutT store_data[VecSize * phi::DDim::kMaxRank]; kps::ReadData(&in_data[0], in, num); // Get store data according to mask_idt kps::OperatorTernary( store_data, mask_data, &in_data[0], func, VecSize); + kps::details::WriteData(out + thread_fix, &store_data[0], store_num); + } +}; + +// masked_select_grad +template +struct SelectCaller { + __device__ void inline operator()(OutT *out, + const MT *mask_data, + const InT *in, + Functor func, + int data_offset, + int store_num, + int thread_fix, + int num) { + InT in_data[VecSize]; + OutT store_data[VecSize * phi::DDim::kMaxRank]; + kps::details::ReadData(&in_data[0], in + thread_fix, store_num); + kps::OperatorTernary( + store_data, mask_data, &in_data[0], func, VecSize); + kps::WriteData(out, &store_data[0], num); } }; @@ -253,7 +278,6 @@ SelectKernelImpl(OutT *out, IdT num_thread[kCVecSize]; IdT cumsum_thread[kCVecSize]; - OutT store_data[VecSize * phi::DDim::kMaxRank]; MT mask_data[VecSize]; IdT mask_idt[VecSize]; // init data_pr @@ -271,17 +295,15 @@ SelectKernelImpl(OutT *out, // Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the // thread_fix kps::Cumsum(&cumsum_thread[0], &num_thread[0], Add()); - // Get store data(index) according to mask_idt - SelectCaller - compute; - compute(&store_data[0], &mask_data[0], in, func, num, data_offset); // get thread_fix int thread_fix = (static_cast(cumsum_thread[0] - num_thread[0]) * store_rank); // get how many data need to store int store_num = static_cast(num_thread[0]) * store_rank; // thread store num data, each thread may has different num - kps::details::WriteData(out + thread_fix, &store_data[0], store_num); + // Get store data(index) according to mask_idt + SelectCaller select; + select(out, mask_data, in, func, data_offset, store_num, thread_fix, num); } template (&block_store_offset, cumsum + idx_cumsum, 1); + int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset; + int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank; SelectKernelImpl( - out + block_store_offset * store_rank, + out + out_fix, mask + data_offset, - in + data_offset, + in + in_fix, func, size, data_offset, @@ -323,12 +347,13 @@ __global__ void SelectKernel(OutT *out, if (num > 0) { // Cumsum index int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X; - // niuliling todo: us ReadData API - int block_store_offset = static_cast(cumsum[idx_cumsum]); + kps::details::ReadData(&block_store_offset, cumsum + idx_cumsum, 1); + int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset; + int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank; SelectKernelImpl( - out + block_store_offset * store_rank, + out + out_fix, mask + data_offset, - in + data_offset, + in + in_fix, func, num, data_offset, @@ -402,6 +427,7 @@ void SelectKernel(const KPDevice &dev_ctx, const int kCumVesize = 2; const int block_c = 256; const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c)); + using Add = kps::AddFunctor; CumsumOneBlock<<<1, block_c, 0, stream>>>( count_data, cumsum_data, size_count_block, main_offset_c, Add()); @@ -418,10 +444,13 @@ void SelectKernel(const KPDevice &dev_ctx, dev_ctx.Wait(); // 3.1.2 allock for out with total_true_num std::vector out_dim = {static_cast(total_true_num)}; - if (SelectData == 0) { // where_index + + if (SelectData == 1) { + out->Resize(phi::make_ddim(out_dim)); + } else if (SelectData == 0) { // == 0 where_index out_dim.push_back(rank); + out->Resize(phi::make_ddim(out_dim)); } - out->Resize(phi::make_ddim(out_dim)); auto out_data = out->mutable_data(cuda_place); // 3.2 get true data's index according to cond_data and cumsum_data if (total_true_num <= 0) return; diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 4545f9ce436ea4028d43d3a91ae46a21cde41bb5..a16c8369cc9e5b71ab483b5c2eb9a3d2e61f900a 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -87,7 +87,8 @@ void Copy(const Context& dev_ctx, : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT + } else if ((paddle::platform::is_cpu_place(src_place) || + paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { auto src_cpu_place = src_place; auto dst_gpu_place = dst_place; diff --git a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu index 75692966b4662c12feaefdf5bfc7311ed9d628b8..8bd3337280d7596ebd7a638b1cc86cf37b70632d 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_grad_kernel.cu @@ -28,19 +28,19 @@ template void GraphSendRecvGradOpCUDAKernelLaunchHelper( const Context& ctx, const DenseTensor& out_grad, + const DenseTensor& x, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, DenseTensor* x_grad, const DenseTensor* dst_count = nullptr, - const DenseTensor* x = nullptr, const DenseTensor* out = nullptr) { const int& index_size = dst_index.dims()[0]; ctx.template Alloc(x_grad); T* p_output = x_grad->data(); - const auto& src_dims = out_grad.dims(); + const auto& src_dims = x.dims(); int64_t memset_size = 1; for (int i = 0; i < src_dims.size(); ++i) { memset_size *= src_dims[i]; @@ -86,7 +86,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( ManipulateMeanGradCUDAKernel<<>>( p_src, d_index, s_index, p_output, index_size, slice_size, s_count); } else if (pool_type == "MAX" || pool_type == "MIN") { - const T* ptr_input = x->data(); + const T* ptr_input = x.data(); const T* ptr_output = out->data(); ManipulateMinMaxGradCUDAKernel<<>>( p_src, @@ -103,7 +103,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper( template void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& out_grad, - paddle::optional x, + const DenseTensor& x, paddle::optional out, const DenseTensor& src_index, const DenseTensor& dst_index, @@ -115,23 +115,23 @@ void GraphSendRecvGradKernel(const Context& ctx, GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, out_grad, + x, src_index, dst_index, pool_type, x_grad, dst_count.get_ptr(), - x.get_ptr(), out.get_ptr()); } else if (index_type == phi::DataType::INT64) { GraphSendRecvGradOpCUDAKernelLaunchHelper( ctx, out_grad, + x, src_index, dst_index, pool_type, x_grad, dst_count.get_ptr(), - x.get_ptr(), out.get_ptr()); } } diff --git a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu index fab306f831a6f401b858b0dfe11011463827a50b..2826c071d6ec3e21e42f4a8f3947cde78ab85e2d 100644 --- a/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_recv_kernel.cu @@ -32,6 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, + int64_t out_size, DenseTensor* out, DenseTensor* dst_count = nullptr) { const int& index_size = src_index.dims()[0]; @@ -39,8 +40,15 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, T* p_output = out->data(); const auto& src_dims = x.dims(); int64_t memset_size = 1; - for (int i = 0; i < src_dims.size(); ++i) { - memset_size *= src_dims[i]; + if (out_size <= 0) { + for (int i = 0; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } + } else { + memset_size = out_size; + for (int i = 1; i < src_dims.size(); ++i) { + memset_size *= src_dims[i]; + } } const size_t& memset_bytes = memset_size * sizeof(T); if (pool_type == "SUM" || pool_type == "MEAN") { @@ -100,6 +108,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, IndexT>><<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); + if (out_size > 0) { + input_size = out_size; + } int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block; int64_t grid_max = grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx; @@ -114,6 +125,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, IndexT>><<>>( p_src, s_index, d_index, p_output, index_size, slice_size, functor); + if (out_size > 0) { + input_size = out_size; + } int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block; int64_t grid_min = grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx; @@ -130,6 +144,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx, ctx.template Alloc(dst_count); int32_t* p_dst_count = dst_count->data(); + if (out_size > 0) { + input_size = out_size; + } #ifdef PADDLE_WITH_HIP hipMemset(p_dst_count, 0, input_size * sizeof(int)); @@ -155,15 +172,16 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, + int64_t out_size, DenseTensor* out, DenseTensor* dst_count) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out, dst_count); + ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); } else if (index_type == phi::DataType::INT64) { GraphSendRecvOpCUDAKernelLaunchHelper( - ctx, x, src_index, dst_index, pool_type, out, dst_count); + ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count); } } diff --git a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu index 71b7cd8750462fdf0dad20b2b221bd18cc6dbbe6..5d0097af2ca9abe4c7a4feb2d312068a5150ae1b 100644 --- a/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_grad_kernel.cu @@ -17,38 +17,31 @@ #include #include -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_grad_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" namespace phi { -__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) - mask_array[idx] = 1; - else - mask_array[idx] = 0; - } -} +template +struct MaskedSelectGradFunctor { + HOSTDEVICE MaskedSelectGradFunctor() {} -template -__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum, - const bool* mask, - const T* input, - T* out, - int size) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < size; idx += blockDim.x * gridDim.x) { - if (mask[idx]) { - int index = mask_prefix_sum[idx]; - out[idx] = input[index]; - } else { - out[idx] = 0; + HOSTDEVICE inline void operator()(OutT* out, + const MT* mask, + const InT* value, + int num) { + int read_fix = 0; + for (int idx = 0; idx < num; idx++) { + if (mask[idx]) { + out[idx] = value[read_fix++]; + } else { + out[idx] = 0; + } } } -} +}; template void MaskedSelectGradKernel(const Context& dev_ctx, @@ -56,42 +49,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx, const DenseTensor& x, const DenseTensor& mask, DenseTensor* x_grad) { - auto* mask_data = mask.data(); - auto* input_data = out_grad.data(); - auto* out_data = x_grad->mutable_data(dev_ctx.GetPlace()); - - auto input_size = out_grad.numel(); auto mask_size = mask.numel(); - auto mask_dim = mask.dims(); - - auto out_size = mask_size; - - DenseTensor mask_array; - DenseTensor mask_prefix_sum; - mask_array.Resize(mask_dim); - mask_prefix_sum.Resize(mask_dim); - - int32_t* mask_array_data = - mask_array.mutable_data(dev_ctx.GetPlace()); - int32_t* mask_prefix_sum_data = - mask_prefix_sum.mutable_data(dev_ctx.GetPlace()); - int threads = 512; - int grid = (mask_size + threads - 1) / threads; - auto stream = dev_ctx.stream(); - SetMaskArrayT<<>>( - mask_data, mask_array_data, mask_size); - - thrust::device_ptr mask_array_dev_ptr = - thrust::device_pointer_cast(mask_array_data); - thrust::device_vector mask_array_vec(mask_array_dev_ptr, - mask_array_dev_ptr + mask_size); - thrust::exclusive_scan(thrust::device, - mask_array_vec.begin(), - mask_array_vec.end(), - mask_prefix_sum_data); - - SelectGradWithPrefixMask<<>>( - mask_prefix_sum_data, mask_data, input_data, out_data, mask_size); + auto* out_data = x_grad->mutable_data(dev_ctx.GetPlace()); + if (mask_size <= 0) return; + using Functor = MaskedSelectGradFunctor; + phi::funcs::SelectKernel( + dev_ctx, mask, out_grad, x_grad, Functor()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/masked_select_kernel.cu b/paddle/phi/kernels/gpu/masked_select_kernel.cu index b443ae6b8fb5e6c3bf5264a50d25205a419f22ad..8986c97583e2086eb9ba53aee34745533dbc91c5 100644 --- a/paddle/phi/kernels/gpu/masked_select_kernel.cu +++ b/paddle/phi/kernels/gpu/masked_select_kernel.cu @@ -17,11 +17,12 @@ #include #include -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/masked_select_kernel.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + namespace phi { template diff --git a/paddle/phi/kernels/gpu/where_index_kernel.cu b/paddle/phi/kernels/gpu/where_index_kernel.cu index 9538533f70d597e21b393d2650d56bebd823c360..616679057ffce29b8d911d56d5cf428801138589 100644 --- a/paddle/phi/kernels/gpu/where_index_kernel.cu +++ b/paddle/phi/kernels/gpu/where_index_kernel.cu @@ -20,13 +20,14 @@ namespace cub = hipcub; #endif -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/where_index_kernel.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + namespace phi { template struct IndexFunctor { diff --git a/paddle/phi/kernels/graph_send_recv_grad_kernel.h b/paddle/phi/kernels/graph_send_recv_grad_kernel.h index d163e6e278a0755c63be3192ae749ddbac5262d4..3694c8f1e6c9905586ea82d49a87b6b1a12e1e71 100644 --- a/paddle/phi/kernels/graph_send_recv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_grad_kernel.h @@ -23,7 +23,7 @@ namespace phi { template void GraphSendRecvGradKernel(const Context& ctx, const DenseTensor& out_grad, - paddle::optional x, + const DenseTensor& x, paddle::optional out, const DenseTensor& src_index, const DenseTensor& dst_index, diff --git a/paddle/phi/kernels/graph_send_recv_kernel.h b/paddle/phi/kernels/graph_send_recv_kernel.h index 95dbdc4443ad003c3b9e7e584976cbe5c0503e20..51768fbc18f019da7713f2f95334e445620da91b 100644 --- a/paddle/phi/kernels/graph_send_recv_kernel.h +++ b/paddle/phi/kernels/graph_send_recv_kernel.h @@ -25,6 +25,7 @@ void GraphSendRecvKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const std::string& pool_type, + int64_t out_size, DenseTensor* out, DenseTensor* dst_count); diff --git a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h index 65d903a7fe426c6eed6cba6f38e8c636001d47b0..1877a4ecc227e36cad474d8a5c11de5ff7a1946c 100644 --- a/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/frobenius_norm_grad_kernel_impl.h @@ -33,7 +33,7 @@ void FrobeniusNormGradKernel(const Context& ctx, DataType out_dtype, DenseTensor* dx) { ReduceGradKernel( - ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx); + ctx, x, out, dout, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx); } } // namespace phi diff --git a/paddle/phi/kernels/impl/reduce_grad.h b/paddle/phi/kernels/impl/reduce_grad.h index f56d3d3ed50f7e72910115f7ec28914a5eade2e8..0b1c43b5f040227b691fca6bdc257fbf8c697957 100644 --- a/paddle/phi/kernels/impl/reduce_grad.h +++ b/paddle/phi/kernels/impl/reduce_grad.h @@ -87,8 +87,8 @@ template void ReduceGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const paddle::optional& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, diff --git a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h index 4a74416e3916492e6d3a40e09ca347db485fff7c..0a0c1abac808646db157b39ef929b007a25e6985 100644 --- a/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_max_grad_kernel_impl.h @@ -24,8 +24,8 @@ namespace phi { template void ReduceMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -34,8 +34,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { ReduceGradKernel(dev_ctx, x, - out_grad, out, + out_grad, dims, keep_dim, reduce_all, diff --git a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h index baaa544f137366f1e0343c25bc373cc08350f7fd..965fc686e27838ccc3ec73b7ae7d8c5267cd59bf 100644 --- a/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h @@ -24,8 +24,8 @@ namespace phi { template void ReduceMinGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -34,8 +34,8 @@ void ReduceMinGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { ReduceGradKernel(dev_ctx, x, - out_grad, out, + out_grad, dims, keep_dim, reduce_all, diff --git a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h index 6b93e98cec0168ab55e15e3401a72738f79d3a07..fb361e34205582bd3478696ade10602240b7d66b 100644 --- a/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/reduce_prod_grad_kernel_impl.h @@ -24,8 +24,8 @@ namespace phi { template void ReduceProdGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -34,8 +34,8 @@ void ReduceProdGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { ReduceGradKernel(dev_ctx, x, - out_grad, out, + out_grad, dims, keep_dim, reduce_all, diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 1d4181f3b9a89509ada2a8fe27d584a9b5aa039c..993349f2d9e14112e2acbd0bc098f3a00351232b 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -123,6 +123,15 @@ __device__ __forceinline__ void WriteData(T* dst, dst[i] = src[i]; } } + +template +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, + int num) { + for (int i = 0; i < num; i++) { + dst[i] = src[i]; + } +} #undef INT_BITS } // namespace details diff --git a/paddle/phi/kernels/reduce_grad_kernel.h b/paddle/phi/kernels/reduce_grad_kernel.h index ee6f3d19a094d29546e82e7138933eceb96459d0..a4b472c445888fd45ff497884d309778b7447151 100644 --- a/paddle/phi/kernels/reduce_grad_kernel.h +++ b/paddle/phi/kernels/reduce_grad_kernel.h @@ -43,8 +43,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx, template void ReduceProdGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -55,8 +55,8 @@ void ReduceProdGradKernel(const Context& dev_ctx, template void ReduceMaxGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, @@ -67,8 +67,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx, template void ReduceMinGradKernel(const Context& dev_ctx, const DenseTensor& x, - const DenseTensor& out_grad, const DenseTensor& out, + const DenseTensor& out_grad, const std::vector& dims, bool keep_dim, bool reduce_all, diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc index fa1fac5d23779597fee7f8a6e4e467c02d6d4c94..803bb50b438a58eb49beb50caa43f8fb0f408f8c 100644 --- a/paddle/phi/ops/compat/batch_norm_sig.cc +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -18,10 +18,17 @@ namespace phi { KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { bool is_test = paddle::any_cast(ctx.Attr("is_test")); - bool use_global_stats = paddle::any_cast(ctx.Attr("use_global_stats")); + bool use_global_stats = + ctx.HasAttr("use_global_stats") + ? paddle::any_cast(ctx.Attr("use_global_stats")) + : false; bool trainable_statistics = - paddle::any_cast(ctx.Attr("trainable_statistics")); - bool fuse_with_relu = paddle::any_cast(ctx.Attr("fuse_with_relu")); + ctx.HasAttr("trainable_statistics") + ? paddle::any_cast(ctx.Attr("trainable_statistics")) + : false; + bool fuse_with_relu = ctx.HasAttr("fuse_with_relu") + ? paddle::any_cast(ctx.Attr("fuse_with_relu")) + : false; // Dispenable `MomentumTensor` is useless now if (is_test && !use_global_stats && !trainable_statistics && !fuse_with_relu) { diff --git a/paddle/phi/ops/compat/graph_send_recv_sig.cc b/paddle/phi/ops/compat/graph_send_recv_sig.cc index dacb8b25a89f9c151198f573c3fc4fda37e2939e..fa4da0704c9871f8b5969fdf79da6dcb8617cae9 100644 --- a/paddle/phi/ops/compat/graph_send_recv_sig.cc +++ b/paddle/phi/ops/compat/graph_send_recv_sig.cc @@ -16,6 +16,14 @@ limitations under the License. */ namespace phi { +KernelSignature GraphSendRecvOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("graph_send_recv", + {"X", "Src_index", "Dst_index"}, + {"pool_type", "out_size"}, + {"Out", "Dst_count"}); +} + KernelSignature GraphSendRecvGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -27,5 +35,8 @@ KernelSignature GraphSendRecvGradOpArgumentMapping( } // namespace phi +PD_REGISTER_ARG_MAPPING_FN(graph_send_recv, + phi::GraphSendRecvOpArgumentMapping); + PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad, phi::GraphSendRecvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/reduce_sig.cc b/paddle/phi/ops/compat/reduce_sig.cc index 4bca0523801c1a94f90197c93cc495c2c4f56eeb..273badee623816ee32d25b52f8951dd0f854d8a4 100644 --- a/paddle/phi/ops/compat/reduce_sig.cc +++ b/paddle/phi/ops/compat/reduce_sig.cc @@ -149,7 +149,7 @@ KernelSignature ReduceMaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "max_grad", - {"X", GradVarName("Out"), "Out"}, + {"X", "Out", GradVarName("Out")}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {GradVarName("X")}); } @@ -158,7 +158,7 @@ KernelSignature ReduceMinGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "min_grad", - {"X", GradVarName("Out"), "Out"}, + {"X", "Out", GradVarName("Out")}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {GradVarName("X")}); } @@ -167,7 +167,7 @@ KernelSignature ReduceProdGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( "prod_grad", - {"X", GradVarName("Out"), "Out"}, + {"X", "Out", GradVarName("Out")}, {"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"}, {GradVarName("X")}); } diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index dd008ff36d50a8e30166d478b916ac180d564c7c..a2bd1f2cad9fcfa2e29b6fd433a3065a3bcb4c10 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/api/include/api.h" #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/place.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" @@ -39,10 +40,10 @@ TEST(API, data_transform_same_place) { auto x = paddle::experimental::full({3, 3}, 1.0, experimental::DataType::COMPLEX128, - experimental::Backend::CPU); + experimental::CPUPlace()); auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU); + {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); std::vector> sum(9, 6.0); @@ -74,10 +75,10 @@ TEST(API, data_transform_same_place) { TEST(Tensor, data_transform_diff_place) { // 1. create tensor auto x = paddle::experimental::full( - {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU); + {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace()); auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU); + {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace()); std::vector sum(9, 6.0); @@ -95,7 +96,7 @@ TEST(Tensor, data_transform_diff_place) { ASSERT_EQ(out.impl()->place(), phi::TransToPhiPlace(experimental::Backend::GPU)); - auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true); + auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true); auto dense_out = std::dynamic_pointer_cast(ref_out.impl()); for (size_t i = 0; i < 9; i++) { diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc index 05a5563344966e4d0664a83051dc84b3f2da499a..ca4a264e511bd574b7f4640bab84862df3f676b0 100644 --- a/paddle/phi/tests/api/test_scale_benchmark.cc +++ b/paddle/phi/tests/api/test_scale_benchmark.cc @@ -30,7 +30,7 @@ namespace tests { TEST(API, scale) { auto x = experimental::full( - {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU); + {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); const size_t cycles = 300; phi::tests::Timer timer; diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc index 66c478e4c000150b8bfd50d6cb5b79e90d7a5d80..4e8755be0c773ec38f454aad03b924e4daee532d 100644 --- a/paddle/phi/tests/api/test_to_api.cc +++ b/paddle/phi/tests/api/test_to_api.cc @@ -69,10 +69,10 @@ TEST(API, copy_to) { // 2. test API #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto tmp = paddle::experimental::copy_to(x, phi::Backend::GPU, false); - auto out = paddle::experimental::copy_to(tmp, phi::Backend::CPU, true); + auto tmp = paddle::experimental::copy_to(x, phi::GPUPlace(), false); + auto out = paddle::experimental::copy_to(tmp, phi::CPUPlace(), true); #else - auto out = paddle::experimental::copy_to(x, phi::Backend::CPU, false); + auto out = paddle::experimental::copy_to(x, phi::CPUPlace(), false); #endif // 3. check result @@ -85,10 +85,10 @@ TEST(Tensor, copy_to) { // 2. test API #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto tmp = x.copy_to(phi::Backend::GPU, false); - auto out = tmp.copy_to(phi::Backend::CPU, true); + auto tmp = x.copy_to(phi::GPUPlace(), false); + auto out = tmp.copy_to(phi::CPUPlace(), true); #else - auto out = x.copy_to(phi::Backend::CPU, false); + auto out = x.copy_to(phi::CPUPlace(), false); #endif // 3. check result diff --git a/python/paddle/distributed/run/__init__.py b/python/paddle/distributed/launch/__init__.py similarity index 55% rename from python/paddle/distributed/run/__init__.py rename to python/paddle/distributed/launch/__init__.py index f25ddb794cc4d573429ac960e646bd8125c48d16..f39bb76114345d7144e1f5b005eb42792a84cf14 100644 --- a/python/paddle/distributed/run/__init__.py +++ b/python/paddle/distributed/launch/__init__.py @@ -12,74 +12,69 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .job.container import Container -from .job.pod import Pod -from .job.job import Job -from . import plugins - -#__all__ = [Container, Pod, Job] +__all__ = [] ''' -Paddle distribution training entry ``python -m paddle.distributed.run``. +Paddle distributed training entry ``python -m paddle.distributed.launch``. Help # for arg usage and explanation, try the following command -# python -m paddle.distributed.run -h +# python -m paddle.distributed.launch -h Collective Mode Case 1: 1 node use all visible devices -# python -m paddle.distributed.run train.py +# python -m paddle.distributed.launch train.py use specified devices -# python -m paddle.distributed.run --devices=0,1,2,3 train.py +# python -m paddle.distributed.launch --devices=0,1,2,3 train.py Case 2: multi-node, auto detect ip/port -# python -m paddle.distributed.run --np 2 train.py +# python -m paddle.distributed.launch --nnodes 2 train.py # auto print following command -# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py +# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py # then copy and paste above command to other nodes Case 3: multi-node, specified master/rendezvous server -# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py +# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py # the master ip must be one of the node and the port must available Parameter Server Mode Case 1.1: 1 node, 1 ps, 1 trainer -# python -m paddle.distributed.run --mode ps train.py -# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py +# python -m paddle.distributed.launch --mode ps train.py +# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py Case 1.2: 1 node, 2 ps, 2 trainer -# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py +# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py Case 2: 2 node, 2 ps, 2 trainer per node -# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py +# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py # auto print following command -# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py # then copy and paste above command to other nodes Case 3: multi-node, specified master/rendezvous server -# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py +# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py # the master ip must be one of the node and the port must available Case 4: specified servers and trainers in each node -python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py +python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py Elastic Mode # run following command in 3 node to run immediately, or in 2 node to run after elastic_timeout -# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py +# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py # once the peer number changes between 2:3, the strategy holds diff --git a/python/paddle/distributed/run/__main__.py b/python/paddle/distributed/launch/__main__.py similarity index 61% rename from python/paddle/distributed/run/__main__.py rename to python/paddle/distributed/launch/__main__.py index e32df59a328081e33aa86b42ed9b8e489ac399e8..9cd6f4408c9897d5eaeaa0fd31602c4a0f6de09f 100644 --- a/python/paddle/distributed/run/__main__.py +++ b/python/paddle/distributed/launch/__main__.py @@ -15,14 +15,28 @@ from .context import Context from . import controllers -# initialize the context to run -ctx = Context() -# initialize the selected controller -c = controllers.init(ctx) +def launch(): + # initialize the context to run + ctx = Context() -# run the pods -c.run() + if ctx.is_legacy_mode(): -# manager or just wait pod -c.finalize() + # legacy mode + from paddle.distributed.fleet import launch + launch.launch() + + else: + + # initialize the selected controller + c = controllers.init(ctx) + + # run the pods + c.run() + + # manager or just wait pod + c.finalize() + + +if __name__ == "__main__": + launch() diff --git a/python/paddle/distributed/launch/context/__init__.py b/python/paddle/distributed/launch/context/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e13bb2a5f0ba720037e488e92d1777a9a9111b68 --- /dev/null +++ b/python/paddle/distributed/launch/context/__init__.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.launch import plugins + +from .node import Node +from .status import Status +from .args_envs import parse_args, fetch_envs, env_args_mapping + +import logging + + +class Context(object): + def __init__(self, enable_plugin=True): + self.args, self.unknown_args = parse_args() + self.envs = fetch_envs() + self.logger = self.get_logger() + + self.node = Node() + self.status = Status() + + self.set_env_in_args() + + # design for event queue, later + self.events = [] + + if enable_plugin: + self._enable_plugin() + + def is_legacy_mode(self): + if self.args.legacy: + return True + + if len(self.unknown_args) > 0: + self.logger.warning("Compatible mode enable with args {}".format( + self.unknown_args)) + return True + + legacy_env_list = [ + 'DISTRIBUTED_TRAINER_ENDPOINTS', + 'PADDLE_ELASTIC_JOB_ID', + 'PADDLE_DISTRI_BACKEND', + 'FLAGS_START_PORT', + ] + + for env in legacy_env_list: + if env in self.envs: + self.logger.warning( + "ENV {} is deprecated, legacy launch enable".format(env)) + return True + + if self.args.master: + return False + + return False + + def get_envs(self): + return self.envs.copy() + + def _enable_plugin(self): + for pl in plugins.enabled_plugins: + pl(self) + + def get_logger(self, level=logging.INFO): + logger = logging.getLogger("LAUNCH") + logger.setLevel(self.args.log_level.upper() or level) + formatter = logging.Formatter( + fmt='%(name)s %(levelname)s %(asctime)s %(message)s') + ch = logging.StreamHandler() + ch.setFormatter(formatter) + logger.addHandler(ch) + return logger + + def set_env_in_args(self): + for k, v in env_args_mapping.items(): + if k in self.envs: + setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/launch/context/args_envs.py b/python/paddle/distributed/launch/context/args_envs.py new file mode 100644 index 0000000000000000000000000000000000000000..d504a11e5f3d1cfc06c73cbb4b723e8758de52ce --- /dev/null +++ b/python/paddle/distributed/launch/context/args_envs.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from argparse import ArgumentParser, REMAINDER + +env_args_mapping = { + 'POD_IP': 'host', + 'PADDLE_MASTER': 'master', + 'PADDLE_DEVICES': 'devices', + 'PADDLE_NNODES': 'nnodes', + 'PADDLE_MODE': 'mode', + 'PADDLE_LOG_LEVEL': 'log_level', + 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', + 'PADDLE_JOB_ID': 'job_id', + 'PADDLE_RANK': 'rank', + 'PADDLE_LOG_DIR': 'log_dir', + 'PADDLE_MAX_RESTART': 'max_restart', + 'PADDLE_ELASTIC_LEVEL': 'elastic_level', + 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', + 'PADDLE_SERVER_NUM': 'server_num', + 'PADDLE_TRAINER_NUM': 'trainer_num', + 'PADDLE_SERVERS_ENDPOINTS': 'servers', + 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', + 'PADDLE_GLOO_PORT': 'gloo_port', + 'PADDLE_WITH_GLOO': 'with_gloo', +} + + +def fetch_envs(): + os.environ.pop('http_proxy', None) + os.environ.pop('https_proxy', None) + + return os.environ.copy() + + +def parse_args(): + parser = ArgumentParser() + + base_group = parser.add_argument_group("Base Parameters") + + base_group.add_argument( + "--master", + type=str, + default=None, + help="the master/rendezvous server, ip:port") + + base_group.add_argument( + "--legacy", type=bool, default=False, help="use legacy launch") + + base_group.add_argument( + "--rank", type=int, default=-1, help="the peer rank") + + base_group.add_argument( + "--log_level", type=str, default="INFO", help="log level. Default INFO") + + base_group.add_argument( + "--nnodes", + type=str, + default="1", + help="the number of peers, i.e. pod/node number") + + base_group.add_argument( + "--nproc_per_node", + type=int, + default=None, + help="the number of processes in a pod") + + base_group.add_argument( + "--log_dir", + type=str, + default="log", + help="the path for each process's log. Default ./log") + base_group.add_argument( + "--mode", + type=str, + default="collective", + help="run mode of the job, collective/ps/ps-heter") + + base_group.add_argument( + "--job_id", + type=str, + default="default", + help="unique id of the job. Default default") + + base_group.add_argument( + "--devices", + type=str, + default=None, + help="accelerate devices. as --gpus,npus,xps") + + base_group.add_argument("--host", type=str, default=None, help="host ip") + + base_group.add_argument( + "training_script", + type=str, + help="the full path of py script," + "followed by arguments for the " + "training script") + + base_group.add_argument('training_script_args', nargs=REMAINDER) + + ps_group = parser.add_argument_group("Parameter-Server Parameters") + # for parameter server + ps_group.add_argument( + "--servers", type=str, default='', help="servers endpoints full list") + ps_group.add_argument( + "--trainers", type=str, default='', help="trainers endpoints full list") + + ps_group.add_argument( + "--trainer_num", type=int, default=None, help="number of trainers") + ps_group.add_argument( + "--server_num", type=int, default=None, help="number of servers") + ps_group.add_argument( + "--gloo_port", type=int, default=6767, help="gloo http port") + ps_group.add_argument( + "--with_gloo", type=str, default="0", help="use gloo or not") + + # parameter elastic mode + elastic_group = parser.add_argument_group("Elastic Parameters") + elastic_group.add_argument( + "--max_restart", + type=int, + default=3, + help="the times can restart. Default 3") + + elastic_group.add_argument( + "--elastic_level", + type=int, + default=-1, + help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart" + ) + + elastic_group.add_argument( + "--elastic_timeout", + type=int, + default=30, + help="seconds to wait before elastic perform training") + + return parser.parse_known_args() diff --git a/python/paddle/distributed/launch/context/device.py b/python/paddle/distributed/launch/context/device.py new file mode 100644 index 0000000000000000000000000000000000000000..9163e7abd918371ddf4eca388bc912b630684f1f --- /dev/null +++ b/python/paddle/distributed/launch/context/device.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + + +class DeviceType: + CPU = 'cpu' + GPU = 'gpu' + XPU = 'xpu' + NPU = 'npu' + MLU = 'mlu' + + +class Device(object): + def __init__(self, dtype=None, memory="", labels=""): + self._dtype = dtype + self._memory = memory + self._labels = labels + + def __str__(self): + return ",".join(self._labels) + + @property + def dtype(self): + return self._dtype + + @property + def count(self): + return len(self._labels) or 1 + + @property + def memory(self): + return self._memory + + @property + def labels(self): + return self._labels + + @labels.setter + def labels(self, lbs): + if isinstance(lbs, str): + self._labels = lbs.split(',') + elif isinstance(lbs, list): + self._labels = lbs + else: + self._labels = [] + + def get_selected_flag_key(self): + if self._dtype == DeviceType.CPU: + return 'FLAGS_selected_cpus' + if self._dtype == DeviceType.GPU: + return 'FLAGS_selected_gpus' + if self._dtype == DeviceType.NPU: + return 'FLAGS_selected_npus' + if self._dtype == DeviceType.XPU: + return 'FLAGS_selected_xpus' + if self._dtype == DeviceType.MLU: + return 'FLAGS_selected_mlus' + return 'FLAGS_selected_devices' + + def get_selected_flag_label(self, idx): + if idx < len(self._labels): + return self._labels[idx] + else: + return '0' + + def selected_flags(self, idx=None): + if idx is None: + return {self.get_selected_flag_key(): ','.join(self._labels)} + else: + return { + self.get_selected_flag_key(): self.get_selected_flag_label(idx) + } + + @classmethod + def parse_device(self): + dev = Device() + visible_devices = None + if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: + dev._dtype = DeviceType.GPU + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif 'XPU_VISIBLE_DEVICES' in os.environ: + dev._dtype = DeviceType.XPU + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif 'ASCEND_VISIBLE_DEVICES' in os.environ: + dev._dtype = DeviceType.NPU + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + elif 'MLU_VISIBLE_DEVICES' in os.environ: + dev._dtype = DeviceType.MLU + visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + + if visible_devices is not None and visible_devices != 'all': + dev._labels = visible_devices.split(',') + else: + return self.detect_device() + + return dev + + @classmethod + def detect_device(self): + import paddle.fluid as fluid + + dev = Device() + num = 0 + visible_devices = None + if fluid.core.is_compiled_with_cuda(): + dev._dtype = DeviceType.GPU + num = fluid.core.get_cuda_device_count() + visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( + "NVIDIA_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_xpu(): + dev._dtype = DeviceType.XPU + num = fluid.core.get_xpu_device_count() + visible_devices = os.getenv("XPU_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_npu(): + dev._dtype = DeviceType.NPU + num = fluid.core.get_npu_device_count() + visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") + elif fluid.core.is_compiled_with_mlu(): + dev._dtype = DeviceType.MLU + num = fluid.core.get_mlu_device_count() + visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + + if num == 0: + dev._dtype = DeviceType.CPU + elif visible_devices is None or visible_devices == "all": + dev._labels = [str(x) for x in range(0, num)] + else: + dev._labels = visible_devices.split(',') + + return dev + + +if __name__ == '__main__': + d = Device.parse_device() + print(d.get_selected_flag()) diff --git a/python/paddle/distributed/run/context/event.py b/python/paddle/distributed/launch/context/event.py similarity index 100% rename from python/paddle/distributed/run/context/event.py rename to python/paddle/distributed/launch/context/event.py diff --git a/python/paddle/distributed/run/context/node.py b/python/paddle/distributed/launch/context/node.py similarity index 100% rename from python/paddle/distributed/run/context/node.py rename to python/paddle/distributed/launch/context/node.py diff --git a/python/paddle/distributed/run/context/resource.py b/python/paddle/distributed/launch/context/resource.py similarity index 100% rename from python/paddle/distributed/run/context/resource.py rename to python/paddle/distributed/launch/context/resource.py diff --git a/python/paddle/distributed/run/context/status.py b/python/paddle/distributed/launch/context/status.py similarity index 100% rename from python/paddle/distributed/run/context/status.py rename to python/paddle/distributed/launch/context/status.py diff --git a/python/paddle/distributed/run/controllers/__init__.py b/python/paddle/distributed/launch/controllers/__init__.py similarity index 98% rename from python/paddle/distributed/run/controllers/__init__.py rename to python/paddle/distributed/launch/controllers/__init__.py index e5557151ad5489cb4af0c34b3ad47c31774b3326..706131300f0d884b216071a6ea3fe75caaf714df 100644 --- a/python/paddle/distributed/run/controllers/__init__.py +++ b/python/paddle/distributed/launch/controllers/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__all__ = ["init"] +__all__ = [] from .collective import CollectiveController from .collective import CollectiveElasticController diff --git a/python/paddle/distributed/run/controllers/collective.py b/python/paddle/distributed/launch/controllers/collective.py similarity index 95% rename from python/paddle/distributed/run/controllers/collective.py rename to python/paddle/distributed/launch/controllers/collective.py index c4feb54428a07265693c0969e6e385a380e22f3d..c3fa4e6e07de9bca8acd8db585be679e3cf0244c 100644 --- a/python/paddle/distributed/run/controllers/collective.py +++ b/python/paddle/distributed/launch/controllers/collective.py @@ -89,6 +89,10 @@ class CollectiveController(Controller): "PADDLE_TRAINERS_NUM": "{}".format(global_size), "PADDLE_RANK_IN_NODE": str(i), } + if self.pod.replicas == 1: + e.update(self.ctx.node.device.selected_flags()) + else: + e.update(self.ctx.node.device.selected_flags(i)) self.add_container(envs=e, log_tag=i) return True @@ -106,7 +110,8 @@ class CollectiveElasticController(CollectiveController): def register(self): if self.job.id == 'default': self.ctx.logger.warning( - 'Using default job name may cause conflict, add --id in args') + 'Using default job name may cause conflict, add --job_id in args' + ) self.master.register_heartbeat(self.job.id, self.pod.name) @@ -114,6 +119,8 @@ class CollectiveElasticController(CollectiveController): ''' watch self and peer status, return true to exit ''' + + self.ctx.logger.info("Watching {}".format(self.pod)) while not self.ctx.status.is_done(): # self status status = self.pod.watch(timeout=2) @@ -171,13 +178,8 @@ class CollectiveElasticController(CollectiveController): continue self.master.set_status(self.ctx.status.RUNNING) - self.ctx.status.run() - - assert len(self.pod.containers) > 0, "No container in the pod" - self.ctx.logger.debug("Run {}".format(self.pod)) - self.ctx.logger.debug("Run {}".format(self.pod.containers[0])) - self.pod.deploy() + self.deploy_pod() if self.watch(): break diff --git a/python/paddle/distributed/run/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py similarity index 90% rename from python/paddle/distributed/run/controllers/controller.py rename to python/paddle/distributed/launch/controllers/controller.py index 2d904cf2a2cca5b9abaab06d1545c03c160e3d93..60e34b85a12bc46636f7edb9bbb4926eddfc73ba 100644 --- a/python/paddle/distributed/run/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -16,9 +16,9 @@ import sys import os import signal -from paddle.distributed.run.job import Job -from paddle.distributed.run.job import Pod -from paddle.distributed.run.job import Container +from paddle.distributed.launch.job.job import Job +from paddle.distributed.launch.job.pod import Pod +from paddle.distributed.launch.job.container import Container from .master import Master @@ -39,38 +39,43 @@ class ControllerBase(object): self.ctx = ctx self.master = Master.factory(self.ctx) - self.job = Job(np=self.ctx.args.np, + self.job = Job(nnodes=self.ctx.args.nnodes, mode=self.ctx.args.mode, - id=self.ctx.args.id) + jid=self.ctx.args.job_id) self.pod = Pod() self.join_server = None - def run(self): - self.build_job() - self.build_pod() + def deploy_pod(self): - if len(self.pod.containers) < 1: - self.ctx.logger.error("No container in the pod {}".format(self.pod)) - return + assert len(self.pod.containers) > 0, "No container in the pod" self.ctx.logger.info("Run {}".format(self.pod)) self.ctx.logger.debug(self.pod.containers[0]) + self.ctx.status.run() self.pod.deploy() + def run(self): + self.build_job() + self.build_pod() + + self.deploy_pod() + self.watch() def watch(self) -> bool: + self.ctx.logger.info("Watching {}".format(self.pod)) + status = self.pod.watch() if status == self.ctx.status.COMPLETED: self.ctx.logger.info("Pod {}".format(status)) elif status == self.ctx.status.FAILED: + fc = self.pod.failed_container() self.ctx.logger.info("Pod {}".format(status)) - self.ctx.logger.error("Container failed !!!\n{}".format( - self.pod.failed_container())) - self.pod.tail() + self.ctx.logger.error("Container failed !!!\n{}".format(fc[0])) + fc[0].tail() self.pod.stop() def stop(self, sigint=None): diff --git a/python/paddle/distributed/run/controllers/master.py b/python/paddle/distributed/launch/controllers/master.py similarity index 96% rename from python/paddle/distributed/run/controllers/master.py rename to python/paddle/distributed/launch/controllers/master.py index 257ba3bad8da3c331ac303b7a3ee415461fd13b8..f9f484eb125eed264cb7e8658476255281a048cf 100644 --- a/python/paddle/distributed/run/controllers/master.py +++ b/python/paddle/distributed/launch/controllers/master.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.distributed.run.utils.kv_client import KVClient -from paddle.distributed.run.utils.kv_server import KVServer +from paddle.distributed.launch.utils.kv_client import KVClient +from paddle.distributed.launch.utils.kv_server import KVServer import time import sys @@ -84,7 +84,7 @@ class HTTPMaster(Master): print("Copy the following command to other nodes to run.") cmd = [ - sys.executable.split('/')[-1], "-m", "paddle.distributed.run" + sys.executable.split('/')[-1], "-m", "paddle.distributed.launch" ] cmd.extend(["--master", self.endpoint]) cmd.extend(sys.argv[1:]) @@ -118,9 +118,12 @@ class HTTPMaster(Master): self._stop_server() def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): + if size < 2: return [value], 0 + self.ctx.logger.info("Waiting peer ready...") + self.lazy_init() while not self.ctx.status.is_done(): @@ -130,7 +133,7 @@ class HTTPMaster(Master): self.ctx.logger.warning("master not ready") time.sleep(0.1) - # 'aaaaaa' make suer main pod (master server) as rank 0 + # 'aaaaaa' make sure main pod (master server) as rank 0 ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key k = "{}/{}/{}".format(prefix, ky, rank) @@ -177,6 +180,12 @@ class ETCDMaster(Master): sync_peers gather all value for key under scope prefix result always be sorted either by rank or alphabet of pod.name ''' + + if size < 2: + return [value], 0 + + self.ctx.logger.info("Waiting peer ready...") + path = "{}/{}/{}".format(prefix, key, rank) self.client.delete_prefix(prefix) diff --git a/python/paddle/distributed/run/controllers/ps.py b/python/paddle/distributed/launch/controllers/ps.py similarity index 98% rename from python/paddle/distributed/run/controllers/ps.py rename to python/paddle/distributed/launch/controllers/ps.py index cc43c336cf1862fe075e5a4463b1f5b666a5005c..d3d0ef59bfd2f8102fe92beecef100c2079b1698 100644 --- a/python/paddle/distributed/run/controllers/ps.py +++ b/python/paddle/distributed/launch/controllers/ps.py @@ -22,7 +22,8 @@ class PSController(Controller): @classmethod def enable(cls, ctx): if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( - ctx.args.servers) > 0: + ctx.args.servers) > 0 or ctx.args.trainer_num or len( + ctx.args.trainers) > 0: ctx.logger.debug("{} enabled".format(cls.__name__)) ctx.args.mode = ControleMode.PS return True diff --git a/python/paddle/distributed/launch.py b/python/paddle/distributed/launch/job/__init__.py similarity index 78% rename from python/paddle/distributed/launch.py rename to python/paddle/distributed/launch/job/__init__.py index e02a439025b77f9ae612aa790bc521b521fb481f..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644 --- a/python/paddle/distributed/launch.py +++ b/python/paddle/distributed/launch/job/__init__.py @@ -1,18 +1,13 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from paddle.distributed.fleet import launch -launch.launch() - -__all__ = [] diff --git a/python/paddle/distributed/run/job/container.py b/python/paddle/distributed/launch/job/container.py similarity index 90% rename from python/paddle/distributed/run/job/container.py rename to python/paddle/distributed/launch/job/container.py index 651932d6c88378034d7ab9cb05bac00ee3ea7ddf..7105cae9024f25f4b5d63a8d74d5d492777d4046 100644 --- a/python/paddle/distributed/run/job/container.py +++ b/python/paddle/distributed/launch/job/container.py @@ -13,12 +13,11 @@ # limitations under the License. from collections import OrderedDict -from paddle.distributed.run.utils.process_context import ProcessContext +from paddle.distributed.launch.utils.process_context import ProcessContext from .status import Status import os, copy, sys -import time class Container(object): @@ -78,6 +77,11 @@ class Container(object): kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)} self._env.update(kwargs) + def _valide_env(self): + for k, v in self._env.items(): + assert isinstance(k, str) and isinstance( + v, str), 'env {}:{} must be str'.format(k, v) + def _get_fd(self, pth): if not pth: return None @@ -90,12 +94,12 @@ class Container(object): except: return None - def start(self, timeout=-1): - end = time.time() + timeout - + def start(self): if self._proc and self._proc.alive(): return True + self._valide_env() + self._stdout = self._get_fd(self._out) or sys.stdout if self._out == self._err: self._stderr = self._stdout @@ -106,14 +110,6 @@ class Container(object): self._entrypoint, env=self._env, out=self._stdout, err=self._stderr) self._proc.start() - while timeout > 0 and time.time() < end: - if self._proc.alive(): - time.sleep(0.1) - continue - if self._proc.exit_code() == 0: - return True - return False - def terminate(self, force=False): if self._log_handler: self._log_handler.close() @@ -125,9 +121,11 @@ class Container(object): def wait(self, timeout=None): self._proc.wait(timeout) + @property def exit_code(self): return self._proc.exit_code() if self._proc else -1 + @property def status(self): if not self._proc: return Status.UNINIT @@ -141,9 +139,9 @@ class Container(object): def __str__(self): return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format( self._rank, - self.status(), + self.status, self._entrypoint, - self.exit_code(), + self.exit_code, self.errfile, self._env, ) diff --git a/python/paddle/distributed/run/job/job.py b/python/paddle/distributed/launch/job/job.py similarity index 89% rename from python/paddle/distributed/run/job/job.py rename to python/paddle/distributed/launch/job/job.py index 3469ed862576faed3bd7546710927f638b8fe0d5..31827968ddce605b424e53f5609ff302e5e4ac95 100644 --- a/python/paddle/distributed/run/job/job.py +++ b/python/paddle/distributed/launch/job/job.py @@ -20,16 +20,16 @@ class JobMode: class Job(object): - def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"): + def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"): self._mode = mode - self._id = id + self._id = jid self._replicas = 0 self._replicas_min = self._replicas self._replicas_max = self._replicas self._elastic = False - self.set_replicas(str(np)) + self.set_replicas(str(nnodes)) def __str__(self): return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format( @@ -64,8 +64,8 @@ class Job(object): def replicas(self, replicas): self._replicas = replicas - def set_replicas(self, np: str): - np = str(np) if np else '1' + def set_replicas(self, nnodes: str): + np = str(nnodes) if nnodes else '1' if ':' in np: nps = np.split(':') diff --git a/python/paddle/distributed/run/job/pod.py b/python/paddle/distributed/launch/job/pod.py similarity index 78% rename from python/paddle/distributed/run/job/pod.py rename to python/paddle/distributed/launch/job/pod.py index f7c31edce1d552befac3a6f54e5e79c326b31c67..701adf45f94e880adef6108b6f7e77ea58155a9e 100644 --- a/python/paddle/distributed/run/job/pod.py +++ b/python/paddle/distributed/launch/job/pod.py @@ -34,7 +34,7 @@ class PodSepc(object): #self.status: Status = None self._rank = -1 - self._init_timeout = 120 # 2 min timeout for each init container + self._init_timeout = None self._restart = -1 self._replicas = 0 # number of containers self._exit_code = 0 @@ -45,15 +45,15 @@ class Pod(PodSepc): super().__init__() def __str__(self): - return "Pod: {}, replicas {}, status {}".format(self.name, - self.replicas, - self.status()) + return "Pod: {}, replicas {}, status {}".format( + self.name, self.replicas, self.status) def failed_container(self): + cs = [] for c in self._containers: - if c.status() == Status.FAILED: - return c - return None + if c.status == Status.FAILED: + cs.append(c) + return cs @property def name(self): @@ -65,7 +65,7 @@ class Pod(PodSepc): @replicas.setter def replicas(self, r): - self._replicas = r + self._replicas = max(r, 1) @property def rank(self): @@ -98,13 +98,15 @@ class Pod(PodSepc): @property def exit_code(self): for c in self._containers: - if c.exit_code() != 0: - return c.exit_code() + if c.exit_code != 0: + return c.exit_code return 0 def deploy(self): + # init container should stop before run containers for i in self._init_containers: - i.start(self._init_timeout) + i.start() + i.wait(self._init_timeout) for c in self._containers: c.start() @@ -120,6 +122,7 @@ class Pod(PodSepc): for c in self._containers: c.wait(None) + @property def status(self): if self.is_failed(): return Status.FAILED @@ -127,6 +130,9 @@ class Pod(PodSepc): if self.is_completed(): return Status.COMPLETED + if self.is_running(): + return Status.RUNNING + return Status.READY def reset(self): @@ -135,31 +141,31 @@ class Pod(PodSepc): def is_failed(self): for c in self._containers: - if c.status() == Status.FAILED: + if c.status == Status.FAILED: return True return False def is_completed(self): for c in self._containers: - if c.status() != Status.COMPLETED: + if c.status != Status.COMPLETED: + return False + return True + + def is_running(self): + for c in self._containers: + if c.status != Status.RUNNING: return False return True def logs(self, idx=None): if idx is None: - if self.failed_container(): - self.failed_container().logs() - else: - self._containers[0].logs() + self._containers[0].logs() else: self._containers[idx].logs() def tail(self, idx=None): if idx is None: - if self.failed_container(): - self.failed_container().tail() - else: - self._containers[0].tail() + self._containers[0].tail() else: self._containers[idx].tail() @@ -175,10 +181,10 @@ class Pod(PodSepc): end = time.time() + timeout while timeout < 0 or time.time() < end: for c in self._containers: - if c.status() in any_list: - return c.status() + if c.status in any_list: + return c.status - s = [c.status() for c in self._containers] + s = [c.status for c in self._containers] if len(set(s)) == 1 and s[0] in all_list: return s[0] diff --git a/python/paddle/distributed/run/job/status.py b/python/paddle/distributed/launch/job/status.py similarity index 100% rename from python/paddle/distributed/run/job/status.py rename to python/paddle/distributed/launch/job/status.py diff --git a/python/paddle/distributed/run/plugins/__init__.py b/python/paddle/distributed/launch/plugins/__init__.py similarity index 71% rename from python/paddle/distributed/run/plugins/__init__.py rename to python/paddle/distributed/launch/plugins/__init__.py index ec91402a7aad359c9860cf737a78cb7c1f1375d1..1862f75a77f65d39715e031b0ba72ebea6ab5523 100644 --- a/python/paddle/distributed/run/plugins/__init__.py +++ b/python/paddle/distributed/launch/plugins/__init__.py @@ -30,15 +30,26 @@ def process_args(ctx): argdev = ctx.args.devices if argdev: ctx.node.device.labels = argdev.split(',') - ctx.node.device.count = len(ctx.node.device.labels) ctx.logger.debug('Device reset by args {}'.format(argdev)) def collective_compatible(ctx): if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs: - ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0] + eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',') + hosts = set([h.split(':')[0] for h in eps]) + ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0]) + ctx.args.nnodes = len(hosts) + ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format( + eps)) + ''' if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs: - ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0] + eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',') + hosts = set([h.split(':')[0] for h in eps]) + ctx.args.master = eps[0] + ctx.args.nnodes = len(hosts) + ctx.logger.info( + 'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps)) + ''' def rewrite_host_ip(ctx): diff --git a/python/paddle/distributed/run/job/__init__.py b/python/paddle/distributed/launch/utils/__init__.py similarity index 78% rename from python/paddle/distributed/run/job/__init__.py rename to python/paddle/distributed/launch/utils/__init__.py index 66d2abbce21ebc72cda5373a3d3a242c077beaa8..97043fd7ba6885aac81cad5a49924c23c67d4d47 100644 --- a/python/paddle/distributed/run/job/__init__.py +++ b/python/paddle/distributed/launch/utils/__init__.py @@ -11,15 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .pod import Pod -from .job import Job -from .container import Container -from .status import Status - -__all__ = [ - 'Pod', - 'Job', - 'Container', - 'Status', -] diff --git a/python/paddle/distributed/run/utils/kv_client.py b/python/paddle/distributed/launch/utils/kv_client.py similarity index 100% rename from python/paddle/distributed/run/utils/kv_client.py rename to python/paddle/distributed/launch/utils/kv_client.py diff --git a/python/paddle/distributed/run/utils/kv_server.py b/python/paddle/distributed/launch/utils/kv_server.py similarity index 100% rename from python/paddle/distributed/run/utils/kv_server.py rename to python/paddle/distributed/launch/utils/kv_server.py diff --git a/python/paddle/distributed/run/utils/process_context.py b/python/paddle/distributed/launch/utils/process_context.py similarity index 100% rename from python/paddle/distributed/run/utils/process_context.py rename to python/paddle/distributed/launch/utils/process_context.py diff --git a/python/paddle/distributed/run/context/__init__.py b/python/paddle/distributed/run/context/__init__.py deleted file mode 100644 index 86dff0f1f8056e784268a6ef3a3ebabb44aa9c6d..0000000000000000000000000000000000000000 --- a/python/paddle/distributed/run/context/__init__.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from argparse import ArgumentParser, REMAINDER -import os, copy - -from paddle.distributed.run import plugins - -from .node import Node -from .status import Status - -import logging - - -class Context(object): - def __init__(self, enable_plugin=True): - os.environ.pop('http_proxy', None) - os.environ.pop('https_proxy', None) - - self.args = self.parse_args() - self.envs = self.fetch_envs() - self.logger = self.get_logger() - - self.node = Node() - self.status = Status() - - self.set_env_in_args() - - # design for event queue, later - self.events = [] - - if enable_plugin: - self._enable_plugin() - - def get_envs(self): - return self.envs.copy() - - def _enable_plugin(self): - for pl in plugins.enabled_plugins: - pl(self) - - def parse_args(self): - parser = ArgumentParser() - - base_group = parser.add_argument_group("Base Parameters") - - base_group.add_argument( - "--master", - type=str, - default=None, - help="the master/rendezvous server, ip:port") - - base_group.add_argument( - "--rank", type=int, default=-1, help="the peer rank") - - base_group.add_argument( - "--log", type=str, default="INFO", help="log level. Default INFO") - - base_group.add_argument( - "--np", - type=str, - default="1", - help="the number of peers, i.e. pod/node number") - - base_group.add_argument( - "--nproc_per_node", - type=int, - default=None, - help="the number of processes in a pod") - - base_group.add_argument( - "--log_dir", - type=str, - default="log", - help="the path for each process's log. Default ./log") - base_group.add_argument( - "--mode", - type=str, - default="collective", - help="run mode of the job, collective/ps/ps-heter") - - base_group.add_argument( - "--id", - type=str, - default="default", - help="unique id of the job. Default default") - - base_group.add_argument( - "--devices", - type=str, - default=None, - help="accelerate devices. as --gpus,npus,xps") - - base_group.add_argument( - "--host", type=str, default=None, help="host ip") - - base_group.add_argument( - "training_script", - type=str, - help="the full path of py script," - "followed by arguments for the " - "training script") - - base_group.add_argument('training_script_args', nargs=REMAINDER) - - ps_group = parser.add_argument_group("Parameter-Server Parameters") - # for parameter server - ps_group.add_argument( - "--servers", - type=str, - default='', - help="servers endpoints full list") - ps_group.add_argument( - "--trainers", - type=str, - default='', - help="trainers endpoints full list") - - ps_group.add_argument( - "--trainer_num", type=int, default=None, help="number of trainers") - ps_group.add_argument( - "--server_num", type=int, default=None, help="number of servers") - ps_group.add_argument( - "--gloo_port", type=int, default=6767, help="gloo http port") - ps_group.add_argument( - "--with_gloo", type=str, default="0", help="use gloo or not") - - # parameter elastic mode - elastic_group = parser.add_argument_group("Elastic Parameters") - elastic_group.add_argument( - "--max_restart", - type=int, - default=3, - help="the times can restart. Default 3") - - elastic_group.add_argument( - "--elastic_level", - type=int, - default=-1, - help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart" - ) - - elastic_group.add_argument( - "--elastic_timeout", - type=int, - default=30, - help="seconds to wait before elastic perform training") - return parser.parse_args() - - def _valide_env(self, key): - if key in ['POD_IP']: - return True - if key.endswith('_VISIBLE_DEVICES'): - return True - if key.startswith('PADDLE_'): - return True - - return False - - def fetch_envs(self): - ge = os.environ.copy() - - black_env_list = ['http_proxy', 'https_proxy'] - for key in black_env_list: - ge.pop(key, None) - - return ge - ''' - # use black list instead white list - return {k: ge[k] for k in ge if self._valide_env(k)} - ''' - - def get_logger(self, level=logging.INFO): - logger = logging.getLogger("PADDLERUN") - logger.setLevel(self.args.log.upper() or level) - formatter = logging.Formatter( - fmt='%(name)s %(levelname)s %(asctime)s %(message)s') - ch = logging.StreamHandler() - ch.setFormatter(formatter) - logger.addHandler(ch) - return logger - - def set_env_in_args(self): - env_args = { - 'POD_IP': 'host', - 'PADDLE_MASTER': 'master', - 'PADDLE_DEVICES': 'devices', - 'PADDLE_NP': 'np', - 'PADDLE_MODE': 'mode', - 'PADDLE_LOG': 'log', - 'PADDLE_NPROC_PER_NODE': 'nproc_per_node', - 'PADDLE_JOB_ID': 'id', - 'PADDLE_RANK': 'rank', - 'PADDLE_LOG_DIR': 'log_dir', - 'PADDLE_MAX_RESTlRT': 'max_restart', - 'PADDLE_ELASTIC_LEVEL': 'elastic_level', - 'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout', - 'PADDLE_SERVER_NUM': 'server_num', - 'PADDLE_TRAINER_NUM': 'trainer_num', - 'PADDLE_SERVERS_ENDPOINTS': 'servers', - 'PADDLE_TRAINERS_ENDPOINTS': 'trainers', - 'PADDLE_GLOO_PORT': 'gloo_port', - 'PADDLE_WITH_GLOO': 'with_gloo', - } - - for k, v in env_args.items(): - if k in self.envs: - setattr(self.args, v, self.envs[k]) diff --git a/python/paddle/distributed/run/context/device.py b/python/paddle/distributed/run/context/device.py deleted file mode 100644 index d8bbd851ccf83a1ebfac60758576384bbe1aa4f4..0000000000000000000000000000000000000000 --- a/python/paddle/distributed/run/context/device.py +++ /dev/null @@ -1,88 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os - - -class DeviceType: - CPU = 'cpu' - GPU = 'gpu' - XPU = 'xpu' - NPU = 'npu' - - -class Device(object): - def __init__(self, dtype=None, count=1, memory="", labels=""): - self.dtype = dtype - self.count = count - self.memory = memory - self.labels = labels - - def __str__(self): - return ",".join(self.labels) - - @classmethod - def parse_device(self): - dev = Device() - visible_devices = None - if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ: - dev.dtype = DeviceType.GPU - visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( - "NVIDIA_VISIBLE_DEVICES") - elif 'XPU_VISIBLE_DEVICES' in os.environ: - dev.dtype = DeviceType.XPU - visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif 'ASCEND_VISIBLE_DEVICES' in os.environ: - dev.dtype = DeviceType.NPU - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") - - if visible_devices and visible_devices != 'all': - dev.labels = visible_devices.split(',') - dev.count = len(dev.labels) - else: - return self.detect_device() - - return dev - - @classmethod - def detect_device(self): - import paddle.fluid as fluid - - dev = Device() - num = 0 - visible_devices = None - if fluid.core.is_compiled_with_cuda(): - dev.dtype = DeviceType.GPU - num = fluid.core.get_cuda_device_count() - visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv( - "NVIDIA_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_xpu(): - dev.dtype = DeviceType.XPU - num = fluid.core.get_xpu_device_count() - visible_devices = os.getenv("XPU_VISIBLE_DEVICES") - elif fluid.core.is_compiled_with_npu(): - dev.dtype = DeviceType.NPU - num = fluid.core.get_npu_device_count() - visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES") - - if num == 0: - dev.dtype = DeviceType.CPU - elif visible_devices is None or visible_devices == "all" or visible_devices == "": - dev.labels = [str(x) for x in range(0, num)] - dev.count = num - else: - dev.labels = visible_devices.split(',') - dev.count = len(dev.labels) - - return dev diff --git a/python/paddle/distributed/run/plugins/ip.py b/python/paddle/distributed/run/plugins/ip.py deleted file mode 100644 index 0809ed5864da9f3bea29235621e7c29b75823391..0000000000000000000000000000000000000000 --- a/python/paddle/distributed/run/plugins/ip.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import socket - - -def get_local_ip(ctx): - _, ip = _get_host_name_ip() - ctx.args.host = ip - ctx.envs["POD_IP"] = ip - - -def _get_host_name_ip(): - try: - host_name = socket.gethostname() - host_ip = socket.gethostbyname(host_name) - return host_name, host_ip - except: - return None diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 6957850d205794363183b4e6ca58a6daf3e11358..f4334085620f510e3d520f89332b754a93aa120a 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1155,7 +1155,8 @@ class Layer(object): layers[name] = None else: _buffers = self.__dict__.get('_buffers', None) - if type(value) == core.VarBase: + if type(value) == core.VarBase or \ + type(value) == core.eager.Tensor: if _buffers is None: raise ValueError( "super(YourLayer, self).__init__() should be called first" diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 44e6f8e8f2a6d11371f21fff5a9dccefcd72ebed..2acf530eea3fbd2d7fbc9cf04d2c792b7175035c 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -949,7 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE) endif() # setting timeout value as 15S -set_tests_properties(test_run PROPERTIES TIMEOUT 200) +set_tests_properties(test_run PROPERTIES TIMEOUT 120) set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh index aba95a68ab790828a63b1f46afaaa87f7826d248..9b99e553d182bc5f9ce8cd1835d7600ebfb956f2 100644 --- a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh +++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh @@ -17,5 +17,4 @@ set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). -launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py -CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op.py diff --git a/python/paddle/fluid/tests/unittests/test_egr_python_api.py b/python/paddle/fluid/tests/unittests/test_egr_python_api.py index 98ef339e04535bb943add02b6cf6efe490f0354b..8166598677a3eb7ce7a4cb42b8a96b6b100aeb20 100644 --- a/python/paddle/fluid/tests/unittests/test_egr_python_api.py +++ b/python/paddle/fluid/tests/unittests/test_egr_python_api.py @@ -637,6 +637,10 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase): self.assertTrue(tensor3.persistable, True) self.assertTrue(tensor3.stop_gradient, True) self.assertTrue(tensor3.place.is_gpu_place()) + tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned') + tensor5 = tensor4._copy_to(core.CUDAPlace(0), True) + self.assertTrue( + np.array_equal(tensor4.numpy(), tensor5.numpy())) else: tensor3 = tensor2._copy_to(core.CPUPlace(), True) self.assertTrue(np.array_equal(tensor3.numpy(), arr2)) diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py index 68b354775d13e659492d7b4355ecf162546d2f27..30f943e3248e90820abbef26214e81cf691f50a4 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_recv_op.py @@ -304,6 +304,35 @@ class API_GraphSendRecvOpTest(unittest.TestCase): "two value is\ {}\n{}, check diff!".format(np_res, ret_res)) + def test_set_outsize_gpu(self): + if paddle.fluid.core.is_compiled_with_cuda(): + x = paddle.to_tensor( + np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32") + src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32") + dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32") + res = paddle.incubate.graph_send_recv(x, src_index, dst_index, + "sum") + out_size = paddle.max(dst_index) + 1 + res_set_outsize = paddle.incubate.graph_send_recv( + x, src_index, dst_index, "sum", out_size) + + np_res = np.array( + [[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32") + np_res_set_outsize = np.array( + [[0, 2, 3], [1, 6, 8]], dtype="float32") + + self.assertTrue( + np.allclose( + np_res, res, atol=1e-6), + "two value is\ + {}\n{}, check diff!".format(np_res, res)) + self.assertTrue( + np.allclose( + np_res_set_outsize, res_set_outsize, atol=1e-6), + "two value is\ + {}\n{}, check diff!" + .format(np_res_set_outsize, res_set_outsize)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py index 6736783d1bc4627cb0b447ad578e6ba4bd5a7b76..27f7903a42e8a1c073b9840b8eebe950b526f5cb 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_eager_fluid.py @@ -195,6 +195,8 @@ class TestDygraphInplaceFlatten(TestDygraphInplace): return var.flatten_() +""" +# This case will fail while using `_C_ops.final_state_scatter`. class TestDygraphInplaceScatter(TestDygraphInplace): def init_data(self): self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]]) @@ -213,6 +215,7 @@ class TestDygraphInplaceScatter(TestDygraphInplace): [[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') return paddle.scatter_(var, index, updates, overwrite=False) +""" class TestDygraphInplaceElu(TestDygraphInplace): diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 8fe5fb9bb9455aa58d84bda03f9e9e16038a3be0..498aecf7c6e75cbb3dd1e73e4aae6151ed65f0db 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -39,9 +39,7 @@ import os env = os.environ.copy() assert "PADDLE_PSERVERS_IP_PORT_LIST" in env assert "PADDLE_TRAINER_ENDPOINTS" in env -#assert "PADDLE_PSERVER_ENDPOINTS" in env -#assert "PADDLE_TRAINER_ENDPOINTS" in env -#assert "PADDLE_ROLE" in env +assert "PADDLE_ROLE" in env #assert "PADDLE_RANK" in env ''' @@ -62,27 +60,24 @@ class Collective_Test(unittest.TestCase): write_file(pyname, colpyfile) def pdrun(self, args, env=None): - cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"] if args: cmd.extend(args.split(" ")) cmd.extend([pyname]) proc = subprocess.Popen(cmd, env) return proc - ''' def test_collective_1(self): - args = "--id test1" + args = "--job_id test1" p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) - ''' - def test_collective_2(self): if os.path.exists('./log'): shutil.rmtree('./log') - args = "--id test2 --devices 0,1,2" + args = "--job_id test2 --devices 0,1,2" p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) @@ -95,7 +90,7 @@ class Collective_Test(unittest.TestCase): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( + args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) @@ -113,14 +108,13 @@ class PS_Test(unittest.TestCase): write_file(pyname, pspyfile) def pdrun(self, args, env=None): - cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"] + cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"] if args: cmd.extend(args.split(" ")) cmd.extend([pyname]) proc = subprocess.Popen(cmd, env) return proc - ''' def test_ps_1(self): args = "--mode ps" p = self.pdrun(args) @@ -131,21 +125,20 @@ class PS_Test(unittest.TestCase): if os.path.exists('./log'): shutil.rmtree('./log') - args = "--id ps2 --server_num=2 --trainer_num=2" + args = "--job_id ps2 --server_num=2 --trainer_num=2" p = self.pdrun(args) p.wait() self.assertTrue(p.poll() == 0) c = get_files('log', 'ps2') self.assertTrue(len(c) == 5) - ''' def test_ps_3(self): if os.path.exists('./log'): shutil.rmtree('./log') port = random.randrange(6000, 8000) - args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( + args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format( port) p1 = self.pdrun(args) p2 = self.pdrun(args) @@ -161,7 +154,7 @@ class PS_Test(unittest.TestCase): if os.path.exists('./log'): shutil.rmtree('./log') - args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" + args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903" p1 = self.pdrun(args) p1.wait() self.assertTrue(p1.poll() == 0) diff --git a/python/paddle/incubate/operators/graph_send_recv.py b/python/paddle/incubate/operators/graph_send_recv.py index 45810621e42076aedbb1c416e7f6f5d90f1c0f6e..05f6a80a442f28fc00d6766747d571664bb389fc 100644 --- a/python/paddle/incubate/operators/graph_send_recv.py +++ b/python/paddle/incubate/operators/graph_send_recv.py @@ -19,7 +19,12 @@ from paddle.fluid import core from paddle import _C_ops -def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): +def graph_send_recv(x, + src_index, + dst_index, + pool_type="sum", + out_size=None, + name=None): r""" Graph Learning Send_Recv combine operator. @@ -27,7 +32,7 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor - in different pooling types, like sum, mean, max, or min. + in different pooling types, like sum, mean, max, or min. Besides, we can set `out_size` to get necessary output shape. .. code-block:: text @@ -43,6 +48,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): pool_type = "sum" + out_size = None + Then: Out = [[0, 2, 3], @@ -56,6 +63,9 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): The available data type is int32, int64. pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`. Default value is `sum`. + out_size (int64|None): We can set `out_size` to get necessary output shape. If not set, then this + attribute will not be used. If set, it should be equal with or larger than + max(dst_index) + 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -75,6 +85,21 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum") # Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]] + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out_size = paddle.max(dst_index) + 1 + out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size) + # Outputs: [[0., 2., 3.], [[2., 8., 10.]]] + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum") + # Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]] + """ if pool_type not in ["sum", "mean", "max", "min"]: @@ -82,9 +107,16 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): "pool_type should be `sum`, `mean`, `max` or `min`, but received %s" % pool_type) + # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. + if in_dygraph_mode(): - out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type', - pool_type.upper()) + if out_size is None or out_size <= 0: + out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, + 'pool_type', pool_type.upper()) + else: + out, tmp = _C_ops.graph_send_recv( + x, src_index, dst_index, 'pool_type', + pool_type.upper(), 'out_size', out_size) return out check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"), @@ -105,5 +137,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None): "Dst_index": dst_index}, outputs={"Out": out, "Dst_count": dst_count}, - attrs={"pool_type": pool_type.upper()}) + attrs={ + "pool_type": pool_type.upper(), + "out_size": 0 if out_size is None or out_size <= 0 else out_size + }) return out diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index cef6ba48f83994cfcc47ae408bed1bbe80ddb178..ed7fd91b7a7aa90f4f595d7e33f29d065113bc36 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -578,6 +578,8 @@ def relu_(x, name=None): Inplace version of ``relu`` API, the output Tensor will be inplaced with input ``x``. Please refer to :ref:`api_nn_cn_relu`. """ + if _in_eager_mode(): + return _C_ops.final_state_relu_(x) return _C_ops.relu_(x) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index bdb0eabe2bbb2968a00d5baf3f9ada14e05a635e..9cef336aa54ae4daae14a8bb99d70470e5dd7511 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -127,7 +127,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): "\n\tFaild to convert input data to a regular ndarray :\n\t - Usually " "this means the input data contains nested lists with different lengths. " ) - elif isinstance(data, paddle.Tensor): + elif isinstance(data, (paddle.Tensor, core.eager.Tensor)): data = data._copy_to(place, False) data = _handle_dtype(data, dtype) data.stop_gradient = stop_gradient diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index a4972061d420853b4790dce5df457d981127194c..98b9811f85c1f77e39a4ddd027aa426d5ae72fc4 100755 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1735,12 +1735,12 @@ def tile(x, repeat_times, name=None): Args: x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64. - repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements + repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - N-D Tensor. The data type is the same as ``x``. + N-D Tensor. The data type is the same as ``x``. The size of the i-th dimension is equal to ``x[i] * repeat_times[i]``. Examples: .. code-block:: python @@ -1750,16 +1750,18 @@ def tile(x, repeat_times, name=None): data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) np_out = out.numpy() - # [[1, 2, 3], [1, 2, 3]] + # [[1, 2, 3] + # [1, 2, 3]] - out = paddle.tile(data, repeat_times=[2, 2]) + out = paddle.tile(data, repeat_times=(2, 2)) np_out = out.numpy() - # [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]] + # [[1, 2, 3, 1, 2, 3] + # [1, 2, 3, 1, 2, 3]] - repeat_times = paddle.to_tensor([2, 1], dtype='int32') + repeat_times = paddle.to_tensor([1, 2], dtype='int32') out = paddle.tile(data, repeat_times=repeat_times) np_out = out.numpy() - # [[1, 2, 3], [1, 2, 3]] + # [[1, 2, 3, 1, 2, 3]] """ if paddle.in_dynamic_mode(): if _in_eager_mode(): diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index ba9598db64c416dafe7f8ebbed267913e14a3960..ee6503a39c6f126b52db23d302a59b2ef1196f74 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -36,9 +36,9 @@ func : conj - api : copy_to - args : (Tensor x, Backend backend, bool blocking) + args : (Tensor x, Place place, bool blocking) output : Tensor - invoke : copy_to_impl(x, backend, blocking) + invoke : copy_to_impl(x, place, blocking) - api : divide args : (Tensor x, Tensor y) @@ -57,7 +57,7 @@ func : dot - api : empty - args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU) + args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor infer_meta : func : CreateInferMeta @@ -69,7 +69,7 @@ backend : place - api : empty_like - args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED) + args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -89,7 +89,7 @@ func : flatten - api : full - args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU) + args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor infer_meta : func : CreateInferMeta @@ -101,7 +101,7 @@ backend : place - api : full_like - args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED) + args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor infer_meta : func : CreateLikeInferMeta @@ -138,7 +138,7 @@ func : multiply - api : ones_like - args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={}) output : Tensor invoke : full_like(x, 1, dtype, place) @@ -219,7 +219,7 @@ data_type : x - api : zeros_like - args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED) + args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {}) output : Tensor invoke : full_like(x, 0, dtype, place) @@ -231,7 +231,7 @@ func : OneHotInferMeta kernel : func : one_hot - + - api : digamma args : (Tensor x) output : Tensor @@ -1484,3 +1484,16 @@ kernel : func : mv backward : mv_grad + + + +# =================================== sep0 + + +# =================================== sep1 + + +# =================================== sep2 + + +# =================================== sep3 diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index bf3d7b3d19eab806706f1d2d654957aac5b33434..a42691d320875ef8d8eaf04faadbb99ab7b41cbf 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -99,7 +99,7 @@ class BaseAPI(object): 'double': 'double', 'bool': 'bool', 'str': 'const std::string&', - 'Backend': 'Backend', + 'Place': 'Place', 'DataLayout': 'DataLayout', 'DataType': 'DataType', 'int64[]': 'const std::vector&', @@ -118,7 +118,7 @@ class BaseAPI(object): 'float': 'paddle::optional', 'double': 'paddle::optional', 'bool': 'paddle::optional', - 'Backend': 'paddle::optional', + 'Place': 'paddle::optional', 'DataLayout': 'paddle::optional', 'DataType': 'paddle::optional', 'int64[]': 'paddle::optional>', @@ -327,9 +327,9 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self attr_layout_count = 0 attr_data_type_count = 0 for attr_name in attrs['names']: - if attrs['attr_info'][attr_name][0] == 'Backend': + if attrs['attr_info'][attr_name][0] == 'Place': assert kernel['backend'] is not None, \ - f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually." + f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually." attr_backend_count = attr_backend_count + 1 if attrs['attr_info'][attr_name][0] == 'DataLayout': assert kernel['layout'] is not None, \ @@ -348,8 +348,8 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self assert len( vars_list ) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}." - assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \ - f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type." + assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \ + f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type." kernel_select_code = kernel_select_code + f""" kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()}); """ diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 0c55735f3b105b25003d0cd527ea1b4e952ddf95..c6611781b69478fcd86e75458c121b8fee736540 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -56,6 +56,16 @@ kernel : func : abs_grad +- backward_api : relu_grad + forward : relu (Tensor x) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : relu_grad + - backward_api : trunc_grad forward : trunc (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -868,3 +878,16 @@ param : [x, vec] kernel : func : mv_grad + + + +# =================================== sep0 + + +# =================================== sep1 + + +# =================================== sep2 + + +# =================================== sep3 diff --git a/python/setup.py.in b/python/setup.py.in index 44998bd3e1675f2a3f77edd26c9cd8fa85121b6a..0a10e9dcc698d7433ceab5dd35ee4d5fa1729636 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -282,6 +282,12 @@ packages=['paddle', 'paddle.distribution', 'paddle.distributed.sharding', 'paddle.distributed.fleet', + 'paddle.distributed.launch', + 'paddle.distributed.launch.context', + 'paddle.distributed.launch.controllers', + 'paddle.distributed.launch.job', + 'paddle.distributed.launch.plugins', + 'paddle.distributed.launch.utils', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.elastic', 'paddle.distributed.fleet.meta_optimizers', @@ -727,7 +733,7 @@ with redirect_stdout(): }, entry_points={ 'console_scripts': [ - 'fleetrun = paddle.distributed.fleet.launch:launch' + 'fleetrun = paddle.distributed.launch.__main__:launch' ] }, classifiers=[