Commit 9501b7e5 authored by P phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -14,9 +14,8 @@ English | [简体中文](./README_cn.md)
Welcome to the PaddlePaddle GitHub.
PaddlePaddle, as the only independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
PaddlePaddle originated from industrial practices, with a dedication and commitment to industrialization. It has been widely adopted across a wide range of sectors, including manufacturing, agriculture, and enterprise services, while serving more than 4 million developers. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
PaddlePaddle originated from industrial practices, with a dedication and commitment to industrialization. It has been widely adopted across a wide range of sectors, including manufacturing, agriculture, and enterprise services, serving more than 4 million developers and 157,000 companies and generating 476,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
## Installation
......
......@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
barrierTensors.push_back(dt);
}
auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
......@@ -417,7 +417,7 @@ void CheckTensorsInDifferentDevices(const std::vector<Tensor>& tensors,
std::set<Place> used_devices;
for (const auto& t : tensors) {
PADDLE_ENFORCE_EQ(t.is_cuda() && t.is_dense_tensor(), true,
PADDLE_ENFORCE_EQ(t.is_gpu() && t.is_dense_tensor(), true,
platform::errors::InvalidArgument(
"Tensors must be CUDA and dense tensor."));
......
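The two hunks above are part of this commit's Backend-to-Place migration: the barrier tensor is now created on a concrete phi::GPUPlace() rather than the phi::Backend::GPU enum value, and the device predicate is renamed from is_cuda() to is_gpu(). A minimal sketch of the new call shapes, using only signatures that appear in this diff (a fragment, not compilable outside the Paddle tree):

// Allocate a scalar barrier tensor directly on the GPU via a phi::Place.
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
// Device checks are keyed on the place kind rather than "CUDA".
PADDLE_ENFORCE_EQ(dt.is_gpu() && dt.is_dense_tensor(), true,
                  platform::errors::InvalidArgument(
                      "Tensors must be CUDA and dense tensor."));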
......@@ -321,7 +321,7 @@ EagerReducer::EagerReducer(
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
TransToBackend(inner_place_));
inner_place_);
}
}
......@@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups(
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
// experimental::Backend backend = TransToBackend(inner_place_);
group.dense_contents_ = paddle::experimental::empty(
ScalarArray({group.all_length_}), group.dtype_,
TransToBackend(inner_place_));
ScalarArray({group.all_length_}), group.dtype_, inner_place_);
}
// map tensors to this group by VariableLocator
......
......@@ -43,8 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue(
const phi::DataType& dtype, const phi::DataLayout& layout, float value,
bool is_leaf) {
paddle::experimental::Tensor out = paddle::experimental::full(
phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype,
phi::TransToPhiBackend(place));
phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);
auto meta = EagerUtils::autograd_meta(&out);
if (is_leaf) {
......
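The reducer and CreateTensorWithValue hunks drop the TransToBackend()/TransToPhiBackend() hop: paddle::experimental::empty and paddle::experimental::full now take the phi::Place the caller already holds. A sketch of the two call shapes after this change (n, place, ddim, value, and dtype are placeholders for whatever the caller has at hand):

auto flags  = paddle::experimental::empty(
    ScalarArray({static_cast<int32_t>(n)}), DataType::INT32, place);
auto filled = paddle::experimental::full(
    phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, place);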
......@@ -29,7 +29,7 @@ yaml_types_mapping = {
'int' : 'int', 'int32' : 'int32_t', 'int64' : 'int64_t', 'size_t' : 'size_t', \
'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
'str' : 'std::string', \
'Backend' : 'paddle::experimental::Backend', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
'int64[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
'Tensor' : 'Tensor',
'Tensor[]' : 'std::vector<Tensor>',
......@@ -56,6 +56,14 @@ def ParseArguments():
#################
### Helpers ###
#################
def RecoverBaseNameOfInplaceFunction(function_name):
return function_name[:-1]
def GetInplacedFunctionName(function_name):
return function_name + "_"
def FindGradName(string):
return string + "_grad"
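These helpers encode the naming scheme the generator relies on for inplace support: an inplaced API is the base name plus a trailing underscore, its backward is the base name plus "_grad", and RecoverBaseNameOfInplaceFunction strips the underscore again (used later so an inplaced op reuses the grad node of its base op). A standalone C++ mirror of the convention, for illustration only:

#include <cassert>
#include <string>

std::string GetInplacedFunctionName(const std::string& name) { return name + "_"; }
std::string RecoverBaseNameOfInplaceFunction(const std::string& name) {
  return name.substr(0, name.size() - 1);  // drop the trailing '_'
}
std::string FindGradName(const std::string& name) { return name + "_grad"; }

int main() {
  assert(GetInplacedFunctionName("relu") == "relu_");
  assert(RecoverBaseNameOfInplaceFunction("relu_") == "relu");
  assert(FindGradName("relu") == "relu_grad");
}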
......@@ -149,6 +157,24 @@ def ReadBwdFile(filepath):
######################
### Yaml Parsers ###
######################
def ParseInplaceInfo(string):
# string: "(x -> out0), (y -> out2)"
inplace_map = {}
for pair in string.split(","):
pair = pair.strip()
if pair.startswith("("):
pair = pair[1:]
if pair.endswith(")"):
pair = pair[:-1]
key = pair.split("->")[0].strip()
val = pair.split("->")[1].strip()
inplace_map[key] = val
return inplace_map
def RemoveSpecialSymbolsInName(string):
# Remove any name after '@'
ret = string.split("@")[0]
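ParseInplaceInfo above turns the yaml inplace annotation, for example "(x -> out0), (y -> out2)", into a map from each inplaced input to the output it aliases. The same parsing, mirrored as a standalone C++ snippet purely for illustration:

#include <iostream>
#include <map>
#include <sstream>
#include <string>

std::map<std::string, std::string> ParseInplaceInfo(const std::string& spec) {
  std::map<std::string, std::string> inplace_map;
  std::stringstream ss(spec);
  std::string pair;
  auto strip = [](const std::string& s) {
    size_t l = s.find_first_not_of(" ()");
    size_t r = s.find_last_not_of(" ()");
    return s.substr(l, r - l + 1);
  };
  while (std::getline(ss, pair, ',')) {
    size_t arrow = pair.find("->");
    inplace_map[strip(pair.substr(0, arrow))] = strip(pair.substr(arrow + 2));
  }
  return inplace_map;
}

int main() {
  auto m = ParseInplaceInfo("(x -> out0), (y -> out2)");
  std::cout << m["x"] << " " << m["y"] << "\n";  // prints: out0 out2
}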
......@@ -684,9 +710,10 @@ std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std:
def GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs):
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -724,19 +751,19 @@ def GenerateNodeCreationCodes(
output_autograd_meta_vec_name = GetAutoGradMetaVectorName(name)
if num_fwd_outputs == 1:
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&api_result);"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&api_result);\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
else:
# Tuple api_result
if IsPlainTensorType(rtype):
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
output_autograd_meta = f" egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));"
else:
assert IsVectorTensorType(rtype)
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
output_autograd_meta = f" std::vector<egr::AutogradMeta*> {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&std::get<{pos}>(api_result));\n"
output_autograd_meta += f" std::vector<egr::AutogradMeta*>* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};"
outputs_autograd_meta_list.append(output_autograd_meta)
pass_stop_gradient_args_list.append(output_autograd_meta_name)
......@@ -745,16 +772,34 @@ def GenerateNodeCreationCodes(
outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list)
pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list)
# Check Inplace
check_inplace_str = ""
bump_inplace_version_str = ""
for inplace_name in inplace_map.keys():
inplace_autograd_meta_name = GetAutoGradMetaName(inplace_name)
check_inplace_str += f"""
// Check Inplace
egr::EagerUtils::CheckInplace({inplace_name}, {inplace_autograd_meta_name}, require_any_grad);\n
"""
bump_inplace_version_str += f"""
// Bump Inplace Version
{inplace_name}.bump_inplace_version();
VLOG(3) << \"Tensor(\" << {inplace_name}.name() << \") uses Inplace Strategy.\";\n
"""
# Node Construction
num_bwd_inputs = len(backward_grad_input_map.keys())
num_bwd_outputs = len(backward_grad_output_map.keys())
grad_node_name = GetGradNodeName(fwd_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
grad_node_name = GetGradNodeName(
RecoverBaseNameOfInplaceFunction(
fwd_api_name)) if inplace_map else GetGradNodeName(fwd_api_name)
node_construction_str = f" auto grad_node = std::make_shared<{grad_node_name}>({num_bwd_inputs}, {num_bwd_outputs});"
# SetAttributes
set_attributes_list = []
for name, _, _, _ in backward_attrs_list:
set_attributes = f" grad_node->SetAttribute{name}({name});"
set_attributes = f" grad_node->SetAttribute{name}({name});"
set_attributes_list.append(set_attributes)
set_attributes_str = "\n".join(set_attributes_list)
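For an API carrying an inplace annotation, the template fragments above wrap the forward call between an inplace check and a version bump, and the grad node is constructed from the base (non-underscored) API name so the inplaced variant reuses the existing GradNode class. The emitted code has roughly the following shape for a hypothetical op relu_ with pair (x -> out); the variable and class names below are illustrative, not the generator's exact output:

// Check Inplace
egr::EagerUtils::CheckInplace(x, p_autograd_x, require_any_grad);
// Forward API Call
auto api_result = paddle::experimental::relu_(x);
// Bump Inplace Version
x.bump_inplace_version();
VLOG(3) << "Tensor(" << x.name() << ") uses Inplace Strategy.";
if (require_any_grad) {
  // Node Construction: grad node of the *base* op "relu", so the inplaced
  // variant shares its GradNode class (class name here is illustrative).
  auto grad_node = std::make_shared<GradNodeRelu>(1, 1);
  // ... SetAttributes / SetTensorWrappers / SetGradOutMeta etc. as above ...
}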
......@@ -765,9 +810,9 @@ def GenerateNodeCreationCodes(
if is_fwd_input:
if is_optional:
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);"
else:
if num_fwd_outputs > 1:
# Aligned with forward output position
......@@ -778,9 +823,9 @@ def GenerateNodeCreationCodes(
tw_name = f"api_result"
if is_optional:
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers = f" if({tw_name}.is_initialized()) grad_node->SetTensorWrapper{name}({tw_name}, false);"
else:
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({tw_name}, false);"
set_tensor_wrappers_list.append(set_tensor_wrappers)
set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list)
......@@ -789,8 +834,8 @@ def GenerateNodeCreationCodes(
set_edges_list = []
for name, (_, pos) in forward_inputs_position_map.items():
input_autograd_meta_name = GetAutoGradMetaName(name)
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta = f" grad_node->SetGradOutMeta({name}, {pos});"
set_edges = f" grad_node->AddEdges({input_autograd_meta_name}, {pos});"
set_grad_out_meta_list.append(set_grad_out_meta)
set_edges_list.append(set_edges)
set_grad_out_meta_str = "\n".join(set_grad_out_meta_list)
......@@ -804,14 +849,14 @@ def GenerateNodeCreationCodes(
num_outputs = len(forward_outputs_position_map.keys())
for name, (_, pos) in forward_outputs_position_map.items():
output_autograd_meta_name = GetAutoGradMetaName(name)
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
set_out_rank = f" egr::EagerUtils::SetOutRankWithSlot({output_autograd_meta_name}, {pos});"
set_history = f" egr::EagerUtils::SetHistory({output_autograd_meta_name}, grad_node);"
if num_outputs == 1:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(api_result);"
set_grad_in_meta = f" grad_node->SetGradInMeta(api_result, {pos});"
else:
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_retain_grad = f" egr::EagerUtils::CheckAndRetainGrad(std::get<{pos}>(api_result));"
set_grad_in_meta = f" grad_node->SetGradInMeta(std::get<{pos}>(api_result), {pos});"
set_out_rank_list.append(set_out_rank)
set_history_list.append(set_history)
......@@ -823,55 +868,64 @@ def GenerateNodeCreationCodes(
set_grad_in_meta_str = "\n".join(set_grad_in_meta_list)
set_retain_grad_str = "\n".join(set_retain_grad_list)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
"""
node_creation_event_str = NODE_CREATION_TEMPLATE.format(node_event_name)
NODE_CREATION_TEMPLATE = """
// Get AutoGradMeta
{}
{}
bool trace_backward = egr::Controller::Instance().HasGrad();
bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({});
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
// Node Construction
{}
// SetAttributes
// Forward API Call
{}
{}
// SetTensorWrappers
{{
{}
// SetGradOutMeta & SetEdges
{}
if(require_any_grad) {{
egr::EagerUtils::PassStopGradient({});
// Node Construction
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
// SetAttributes
{}
// SetTensorWrappers
{}
// SetGradOutMeta & SetEdges
{}
{}
// SetOutRank & SetHistory & SetGradInMeta & RetainGrad
{}
{}
{}
{}
}}
}}
"""
node_creation_str = NODE_CREATION_TEMPLATE.format(
inputs_autograd_meta_str, outputs_autograd_meta_str,
compute_require_grad_args_str, pass_stop_gradient_args_str,
node_construction_str, set_attributes_str, set_tensor_wrappers_str,
set_grad_out_meta_str, set_edges_str, set_out_rank_str, set_history_str,
set_grad_in_meta_str, set_retain_grad_str)
inputs_autograd_meta_str, compute_require_grad_args_str,
check_inplace_str, forward_call_str, bump_inplace_version_str,
node_creation_event_str, outputs_autograd_meta_str,
pass_stop_gradient_args_str, node_construction_str, set_attributes_str,
set_tensor_wrappers_str, set_grad_out_meta_str, set_edges_str,
set_out_rank_str, set_history_str, set_grad_in_meta_str,
set_retain_grad_str)
return node_creation_str
def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list,
optional_inputs, intermediate_outputs):
def GenerateForwardDefinition(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs, inplace_map):
# fwd_api_name = ""
# forward_inputs_position_map = { "name" : [type, fwd_position] }
# forward_outputs_position_map = { "name" : [type, fwd_position] }
......@@ -895,7 +949,10 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
if is_optional:
arg_str = f"const paddle::optional<paddle::experimental::Tensor>& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
if inplace_map and name in inplace_map.keys():
arg_str = f"paddle::experimental::Tensor& {name}"
else:
arg_str = f"const paddle::experimental::Tensor& {name}"
else:
assert IsVectorTensorType(ttype)
arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
......@@ -958,26 +1015,16 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
node_creation_str = GenerateNodeCreationCodes(
fwd_api_name, bwd_api_name, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list,
forward_outputs_position_map, forward_attrs_list, forward_call_str,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs)
node_event_name = fwd_api_name + " node_creation"
NODE_CREATION_TEMPLATE = """{{\n
paddle::platform::RecordEvent node_creation_record_event(\"{}\", paddle::platform::TracerEventType::Operator, 1);\n
{}\n
}}"""
node_creation_str = NODE_CREATION_TEMPLATE.format(node_event_name,
node_creation_str)
backward_grad_output_map, backward_attrs_list, optional_inputs,
inplace_map)
dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{fwd_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
FORWARD_FUNCTION_TEMPLATE = """
{} {}({}) {{
{}
// Forward API Call
{}
{}
......@@ -989,7 +1036,7 @@ def GenerateForwardDefinition(fwd_api_name, bwd_api_name,
forward_function_name = GetForwardFunctionName(fwd_api_name)
forward_function_str = FORWARD_FUNCTION_TEMPLATE.format(
returns_type_str, forward_function_name, inputs_args_definition_str,
dygraph_event_str, forward_call_str, node_creation_str, returns_str)
dygraph_event_str, node_creation_str, returns_str)
forward_function_declaration_str = f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});"
return forward_function_str, forward_function_declaration_str
......@@ -1191,6 +1238,10 @@ if __name__ == "__main__":
fwd_args_str = fwd_api['args']
fwd_returns_str = fwd_api['output']
inplace_map = {}
if 'inplace' in fwd_api.keys():
inplace_map = ParseInplaceInfo(fwd_api['inplace'])
bwd_api_name = fwd_api['backward']
assert bwd_api_name in grad_api_dict.keys(), bwd_api_name
bwd_api = grad_api_dict[bwd_api_name]
......@@ -1287,7 +1338,7 @@ if __name__ == "__main__":
forward_outputs_position_map, orig_forward_attrs_list,
backward_fwd_input_map, backward_grad_input_map,
backward_grad_output_map, backward_attrs_list, optional_inputs,
intermediate_outputs)
intermediate_outputs, {})
print("Generated Forward Definition: ", forward_definition_str)
print("Generated Forward Declaration: ", forward_declaration_str)
yaml_forward_definition_str += definition_declaration_pair[0]
......@@ -1298,6 +1349,30 @@ if __name__ == "__main__":
forward_outputs_position_map,
orig_forward_attrs_list)
# Inplaced Version Dygraph Function Generation
if fwd_api_name != "sum" and "inplace" in fwd_api.keys():
fwd_api_name_inplaced = GetInplacedFunctionName(fwd_api_name)
# Node Definition Generation
definition_declaration_pair = GenerateForwardDefinition(
fwd_api_name_inplaced, bwd_api_name,
forward_inputs_position_map, forward_outputs_position_map,
forward_attrs_list, backward_fwd_input_map,
backward_grad_input_map, backward_grad_output_map,
backward_attrs_list, optional_inputs, intermediate_outputs,
inplace_map)
print("Generated Inplaced Forward Definition: ",
forward_definition_str)
print("Generated Inplaced Forward Declaration: ",
forward_declaration_str)
forward_definition_str += definition_declaration_pair[0]
forward_declaration_str += definition_declaration_pair[1]
# For python-level API dispatch
CollectCoreOpsInformation(
fwd_api_name_inplaced, forward_inputs_position_map,
forward_outputs_position_map, forward_attrs_list)
if len(namespace) > 0:
forward_definition_str += f"""namespace {namespace} {{
{yaml_forward_definition_str}
......
......@@ -15,7 +15,7 @@
import os
import argparse
import logging
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap
from eager_gen import namespace, yaml_types_mapping, ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap, GetInplacedFunctionName, ParseInplaceInfo
###########################
## Global Configurations ##
......@@ -43,7 +43,7 @@ atype_to_parsing_function = {
"std::vector<std::string>": "CastPyArg2Strings",
"paddle::experimental::Scalar": "CastPyArg2Scalar",
"paddle::experimental::ScalarArray": "CastPyArg2ScalarArray",
"paddle::experimental::Backend": "CastPyArg2Backend",
"paddle::experimental::Place": "CastPyArg2Place",
"paddle::experimental::DataType": "CastPyArg2DataType",
}
......@@ -71,6 +71,14 @@ RECORD_EVENT_TEMPLATE = \
" paddle::platform::RecordEvent {}(\"{} {}\", paddle::platform::TracerEventType::Operator, 1);"
RETURN_INPLACE_PYOBJECT_TEMPLATE = \
"""
ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info, \"final_state_{}\", \"{}\");
ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info, \"final_state_{}\", \"{}\");
return ToPyObject(out, return_id, args, arg_id);
"""
PYTHON_C_FUNCTION_TEMPLATE = \
"""
static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObject *kwargs)
......@@ -94,7 +102,7 @@ static PyObject * eager_final_state_api_{}(PyObject *self, PyObject *args, PyObj
PyEval_RestoreThread(tstate);
tstate = nullptr;
return ToPyObject(out);
{}
}}
catch(...) {{
if (tstate) {{
......@@ -287,9 +295,10 @@ class PythonCSingleFunctionGenerator:
self.forward_inputs_position_map, self.forward_outputs_position_map = DetermineForwardPositionMap(
forward_inputs_list, forward_returns_list)
def GeneratePythonCFunction(self):
def GeneratePythonCFunction(self, inplace_map):
namespace = self.namespace
forward_api_name = self.forward_api_name
forward_api_name = GetInplacedFunctionName(
self.forward_api_name) if inplace_map else self.forward_api_name
forward_attrs_list = self.forward_attrs_list
forward_inputs_position_map = self.forward_inputs_position_map
forward_outputs_position_map = self.forward_outputs_position_map
......@@ -341,19 +350,31 @@ class PythonCSingleFunctionGenerator:
fwd_function_name = FUNCTION_NAME_TEMPLATE.format(
"::", namespace, GetForwardFunctionName(forward_api_name))
if inplace_map:
assert len(
inplace_map
) == 1, f"size of inplace_map must be 1, but inplace_map of \"{forward_api_name}\" op got {len(inplace_map)}"
for inplace_input, inplace_output in inplace_map.items():
return_str = RETURN_INPLACE_PYOBJECT_TEMPLATE.format(
forward_api_name, inplace_input, forward_api_name,
inplace_output)
break
else:
return_str = " return ToPyObject(out);"
# Generate Record Event for performance profiling
pythonc_record_event_str = RECORD_EVENT_TEMPLATE.format(
"pythonc_record_event", forward_api_name, "pybind_imperative_func")
self.python_c_function_str = PYTHON_C_FUNCTION_TEMPLATE.format(
forward_api_name, pythonc_record_event_str, forward_api_name,
get_eager_tensor_str, parse_attributes_str, fwd_function_name,
dygraph_function_call_str)
dygraph_function_call_str, return_str)
# Generate Python-C Function Registration
self.python_c_function_reg_str = PYTHON_C_FUNCTION_REG_TEMPLATE.format(
forward_api_name, namespace, forward_api_name, forward_api_name)
def run(self):
def run(self, inplace_map):
# Initialized is_forward_only
self.CollectIsForwardOnly()
......@@ -384,7 +405,7 @@ class PythonCSingleFunctionGenerator:
)
# Code Generation
self.GeneratePythonCFunction()
self.GeneratePythonCFunction(inplace_map)
logging.info(
f"Generated Python-C Function: {self.python_c_function_str}")
logging.info(
......@@ -416,12 +437,23 @@ class PythonCYamlGenerator:
for forward_api_content in forward_api_list:
f_generator = PythonCSingleFunctionGenerator(forward_api_content,
namespace)
status = f_generator.run()
status = f_generator.run({})
if status == True:
self.python_c_functions_reg_str += f_generator.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator.python_c_function_str + "\n"
if 'inplace' in forward_api_content.keys():
inplace_map = ParseInplaceInfo(forward_api_content['inplace'])
f_generator_inplace = PythonCSingleFunctionGenerator(
forward_api_content, namespace)
status = f_generator_inplace.run(inplace_map)
if status == True:
self.python_c_functions_reg_str += f_generator_inplace.python_c_function_reg_str + ",\n"
self.python_c_functions_str += f_generator_inplace.python_c_function_str + "\n"
def InferNameSpace(self):
yaml_path = self.yaml_path
if "sparse" in yaml_path:
......
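python_c_gen.py now runs the single-function generator a second time for every API that carries an inplace entry. The inplaced wrapper replaces the plain return ToPyObject(out) with RETURN_INPLACE_PYOBJECT_TEMPLATE, so the Python-side object of the inplaced input can be handed back instead of a freshly wrapped output. With hypothetical names filled in (op relu_, pair x -> out), the emitted return path looks roughly like this:

ssize_t arg_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_args_info,
                                          "final_state_relu_", "x");
ssize_t return_id = GetIdxFromCoreOpsInfoMap(core_ops_final_state_returns_info,
                                             "final_state_relu_", "out");
return ToPyObject(out, return_id, args, arg_id);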
......@@ -90,7 +90,7 @@ TEST(Tensor, MemberFunction) {
VLOG(6) << "Set impl";
CHECK_EQ(et3.initialized(), true);
CHECK_EQ(et3.is_cpu(), true);
CHECK_EQ(et3.is_cuda(), false);
CHECK_EQ(et3.is_gpu(), false);
CHECK_EQ(et3.numel(), 2);
auto expected_dim = phi::make_ddim({1, 2});
CHECK_EQ(et3.dims(), expected_dim);
......
......@@ -38,7 +38,7 @@ class GraphSendRecvGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
auto in_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto in_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), in_dims);
}
......@@ -68,6 +68,12 @@ class GraphSendRecvOpMaker : public framework::OpProtoAndCheckerMaker {
"tensors of Dst_index.")
.SetDefault("SUM")
.InEnum({"SUM", "MEAN", "MIN", "MAX"});
AddAttr<int64_t>(
"out_size",
"(int64_t, default 0)"
"Define the first dimension of Output tensor."
"If set default 0, then the shape of Out is the same with X.")
.SetDefault(0);
AddComment(R"DOC(
Graph Learning Send_Recv combine operator.
......@@ -93,6 +99,7 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetType("graph_send_recv_grad");
op->SetInput("Src_index", this->Input("Src_index"));
op->SetInput("Dst_index", this->Input("Dst_index"));
op->SetInput("X", this->Input("X"));
if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MEAN") {
op->SetInput("Dst_count", this->Output("Dst_count"));
......@@ -100,7 +107,6 @@ class GraphSendRecvGradOpMaker : public framework::SingleGradOpMaker<T> {
if (BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MIN" ||
BOOST_GET_CONST(std::string, this->GetAttr("pool_type")) == "MAX") {
op->SetInput("X", this->Input("X"));
op->SetInput("Out", this->Output("Out"));
}
......
......@@ -152,6 +152,21 @@ __device__ __forceinline__ void ThreadReduce(phi::Array<const T*, Num> arrs,
}
}
template <typename T>
__device__ __forceinline__ void ReduceMeanAndVar(T* mean, T* var, T x_mean,
T x_var, int size) {
const int nc = blockIdx.x;
x_mean = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
x_mean, kps::AddFunctor<T>());
x_var = kps::details::BlockXReduce<T, kps::AddFunctor<T>>(
x_var, kps::AddFunctor<T>());
__syncthreads();
if (threadIdx.x == 0) {
mean[nc] = static_cast<T>(x_mean / size);
var[nc] = static_cast<T>(x_var / size);
}
}
template <typename T>
__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
int i = blockIdx.x;
......@@ -162,10 +177,7 @@ __global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) {
x_mean += val;
x_var += val * val;
}
x_mean /= size;
x_var /= size;
CudaAtomicAddWithWarp(&mean[i], x_mean);
CudaAtomicAddWithWarp(&var[i], x_var);
ReduceMeanAndVar<T>(mean, var, x_mean, x_var, size);
}
template <typename T, typename AccT, int VecSize>
......@@ -174,21 +186,12 @@ __global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var,
int i = blockIdx.x;
AccT x_mean = static_cast<AccT>(0);
AccT x_var = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * size;
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
phi::Array<const T*, 1> ins;
ins[0] = x;
ThreadReduce<T, AccT, VecSize, 1>(ins, size, input_offset, &x_mean, &x_var);
x_mean = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_mean, kps::AddFunctor<AccT>());
x_var = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
x_var, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
mean[i] = static_cast<T>(x_mean / size);
var[i] = static_cast<T>(x_var / size);
}
ReduceMeanAndVar<AccT>(mean, var, x_mean, x_var, size);
}
template <typename T, int flags>
......@@ -272,10 +275,6 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
Tensor temp_var;
temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
set_zero(dev_ctx, mean, static_cast<T>(0));
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
auto* x_data = x->data<T>();
auto* y_data = y->data<T>();
auto* mean_data = mean->data<T>();
......@@ -319,7 +318,7 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 grids(x_dims[0] * groups);
dim3 blocks(block_size_nchw);
if (size < vec_size) {
if (size < vec_size * block_size_nchw) {
ScalarGetMeanAndVarNCHW<T><<<grids, blocks, 0, dev_ctx.stream()>>>(
x_data, mean_data, temp_var_data, size);
} else {
......@@ -328,6 +327,8 @@ class GroupNormKernel<platform::CUDADeviceContext, T>
x_data, mean_data, temp_var_data, size);
}
} else {
set_zero(dev_ctx, mean, static_cast<T>(0));
set_zero(dev_ctx, &temp_var, static_cast<T>(0));
GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data,
temp_var_data);
......@@ -424,24 +425,15 @@ __global__ void VectorizedGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
int i = blockIdx.x;
AccT ds_sum = static_cast<AccT>(0);
AccT db_sum = static_cast<AccT>(0);
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
x += i * imsize;
const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T);
phi::Array<const T*, 2> ins;
ins[0] = x;
ins[1] = dy;
ThreadReduce<T, AccT, VecSize, 2>(ins, imsize, input_offset, &db_sum,
&ds_sum);
ds_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
ds_sum, kps::AddFunctor<AccT>());
db_sum = kps::details::BlockXReduce<AccT, kps::AddFunctor<AccT>>(
db_sum, kps::AddFunctor<AccT>());
__syncthreads();
if (threadIdx.x == 0) {
ds[i] = ds_sum;
db[i] = db_sum;
}
ReduceMeanAndVar<AccT>(db, ds, db_sum, ds_sum, 1);
}
template <typename T>
......@@ -455,8 +447,7 @@ __global__ void ScalarGetDsDbCUDAKernel(int imsize, const T* x, const T* dy,
ds_sum += dy[index] * x[index];
db_sum += dy[index];
}
CudaAtomicAddWithWarp(&ds[nc], ds_sum);
CudaAtomicAddWithWarp(&db[nc], db_sum);
ReduceMeanAndVar<T>(db, ds, db_sum, ds_sum, 1);
}
template <typename T>
......@@ -641,13 +632,7 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
}
block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize);
dim3 blocks(block_size_nchw);
if (imsize < vec_size) {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
}
if (d_bias) {
set_zero(dev_ctx, d_bias, static_cast<T>(0));
}
if (imsize < vec_size * block_size_nchw) {
ScalarGetDsDbCUDAKernel<
T><<<x_dims[0] * C, blocks, 0, dev_ctx.stream()>>>(
imsize, x_data, dy_data, ds_data, db_data);
......@@ -687,7 +672,6 @@ class GroupNormGradKernel<platform::CUDADeviceContext, T>
imsize, C, group_size, groups, p1_data, p2_data, p3_data, x_data,
dy_data, d_x_data);
}
} else {
if (d_scale) {
set_zero(dev_ctx, d_scale, static_cast<T>(0));
......
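Two related changes in the group_norm CUDA kernels: the per-warp atomic accumulation is replaced by the shared ReduceMeanAndVar block reduction, which writes its result directly and therefore no longer needs the set_zero initialization on the NCHW fast path, and the scalar-versus-vectorized dispatch now compares the reduction size against a full block's worth of vector loads instead of against vec_size alone. The revised dispatch condition, restated as a standalone helper with a worked number:

// Chooses the scalar kernel only when one block cannot cover `size` elements
// with vector loads; previously the test was just `size < vec_size`.
bool UseScalarGetMeanAndVar(int size, int vec_size, int block_size_nchw) {
  return size < vec_size * block_size_nchw;
}
// e.g. vec_size = 4, block_size_nchw = 128: sizes below 512 take the scalar
// kernel, larger reductions take VectorizedGetMeanAndVarNCHW.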
......@@ -265,67 +265,6 @@ class ReduceKernel : public framework::OpKernel<T> {
framework::TransToPhiDataType(cast_out_dtype), output);
}
};
template <typename DeviceContext, typename OutT, typename Functor>
class BoolReduceKernel : public framework::OpKernel<OutT> {
public:
void Compute(const framework::ExecutionContext& context) const override {
bool reduce_all = context.Attr<bool>("reduce_all");
auto* input = context.Input<Tensor>("X");
auto* output = context.Output<Tensor>("Out");
output->mutable_data<OutT>(context.GetPlace());
auto dims = context.Attr<std::vector<int>>("dim");
bool keep_dim = context.Attr<bool>("keep_dim");
// The dims has full dim, set the reduce_all is True
const auto& input_dim_size = context.Input<Tensor>("X")->dims().size();
std::set<int> dims_set(dims.begin(), dims.end());
bool full_dim = true;
for (auto i = 0; i < input_dim_size; i++) {
if (dims_set.find(i) == dims_set.end()) {
full_dim = false;
break;
}
}
reduce_all = (reduce_all || full_dim);
if (reduce_all) {
// Flatten and reduce 1-D tensor
auto x = EigenVector<OutT>::Flatten(*input);
auto out = EigenScalar<OutT>::From(*output);
auto& place =
*context.template device_context<DeviceContext>().eigen_device();
auto reduce_dim = Eigen::array<int, 1>({{0}});
Functor functor;
functor(place, &x, &out, reduce_dim);
} else {
int ndim = input->dims().size();
int rdim = dims.size();
// comments for accelerating compiling temporarily.
if (ndim > 6) {
HandleLargeDim<DeviceContext, OutT, Functor>(context, input, output,
dims, keep_dim);
} else {
HANDLE_DIM(6, 5);
HANDLE_DIM(6, 4);
HANDLE_DIM(6, 3);
HANDLE_DIM(6, 2);
HANDLE_DIM(6, 1);
HANDLE_DIM(5, 4);
HANDLE_DIM(5, 3);
HANDLE_DIM(5, 2);
HANDLE_DIM(5, 1);
HANDLE_DIM(4, 3);
HANDLE_DIM(4, 2);
HANDLE_DIM(4, 1);
HANDLE_DIM(3, 2);
HANDLE_DIM(3, 1);
HANDLE_DIM(2, 1);
HANDLE_DIM(1, 1);
}
}
}
};
template <typename DeviceContext, typename T, typename Functor>
void LaunchReduceGradKernel(const framework::ExecutionContext& context,
......
......@@ -132,8 +132,7 @@ void InitTensorWithTensor(TensorObject* self,
self->tensor.set_impl(impl);
VLOG(4) << "Same place, do ShareDataWith";
} else {
self->tensor.set_impl(
src.copy_to(phi::TransToPhiBackend(place), true).impl());
self->tensor.set_impl(src.copy_to(place, true).impl());
VLOG(4) << "Different place, do TensorCopy";
}
if (src.get_autograd_meta()) {
......@@ -156,8 +155,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self,
} else {
auto temp =
paddle::experimental::Tensor(std::make_shared<phi::DenseTensor>(src));
self->tensor.set_impl(
temp.copy_to(phi::TransToPhiBackend(place), true).impl());
self->tensor.set_impl(temp.copy_to(place, true).impl());
VLOG(4) << "Different place, do TensorCopy";
}
egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false);
......
......@@ -159,7 +159,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args,
auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2);
bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3);
dst = src.copy_to(phi::TransToPhiBackend(place), blocking);
dst = src.copy_to(place, blocking);
egr::EagerUtils::autograd_meta(&dst)->SetStopGradient(
egr::EagerUtils::autograd_meta(&(src))->StopGradient());
egr::EagerUtils::autograd_meta(&dst)->SetPersistable(
......
......@@ -175,7 +175,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr);
if (self->tensor.is_cpu()) {
if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
platform::CPUPlace place;
......@@ -184,7 +184,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::array_proxy(array)->data),
place, dense_tensor->data(), sizeof_dtype * numel);
#if defined(PADDLE_WITH_CUDA)
} else if (self->tensor.is_cuda()) {
} else if (self->tensor.is_gpu()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
......@@ -218,8 +218,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
EAGER_TRY
auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0);
bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
auto cp_tensor =
self->tensor.copy_to(phi::TransToPhiBackend(place), blocking);
auto cp_tensor = self->tensor.copy_to(place, blocking);
egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
egr::EagerUtils::autograd_meta(&cp_tensor)
->SetPersistable(
......@@ -231,8 +230,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
static PyObject* tensor_method_cpu(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
auto cp_tensor =
self->tensor.copy_to(phi::TransToPhiBackend(phi::CPUPlace()), true);
auto cp_tensor = self->tensor.copy_to(phi::CPUPlace(), true);
egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true);
egr::EagerUtils::autograd_meta(&cp_tensor)
->SetPersistable(
......
......@@ -929,28 +929,10 @@ std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
return result;
}
paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos) {
if (obj == Py_None) {
PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be "
"int or place, but got %s",
op_type, arg_pos + 1,
((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT
}
PyTypeObject* type = obj->ob_type;
auto type_name = std::string(type->tp_name);
if (type_name == "int") {
int value = CastPyArg2Int(obj, op_type, arg_pos);
return static_cast<paddle::experimental::Backend>(value);
} else {
platform::Place place = CastPyArg2Place(obj, arg_pos);
return phi::TransToPhiBackend(place);
}
return paddle::experimental::Backend::CPU;
paddle::experimental::Place CastPyArg2Place(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos) {
return CastPyArg2Place(obj, arg_pos);
}
paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
......
......@@ -154,9 +154,9 @@ paddle::experimental::Scalar CastPyArg2Scalar(PyObject* obj,
paddle::experimental::ScalarArray CastPyArg2ScalarArray(
PyObject* obj, const std::string& op_type, ssize_t arg_pos);
paddle::experimental::Backend CastPyArg2Backend(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos);
paddle::experimental::Place CastPyArg2Place(PyObject* obj,
const std::string& op_type,
ssize_t arg_pos);
paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
const std::string& op_type,
......
......@@ -31,7 +31,6 @@ using gpuStream_t = hipStream_t;
#include "paddle/phi/api/ext/dll_decl.h"
#include "paddle/phi/api/ext/place.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/common/place.h"
......@@ -269,12 +268,20 @@ class PADDLE_API Tensor final {
bool is_cpu() const;
/**
* @brief Determine whether the tensor device is CUDA
* @brief Determine whether the tensor device is GPU
*
* @return true
* @return false
*/
bool is_cuda() const;
bool is_gpu() const;
/**
* @brief Determine whether the tensor device is GPU_PINNED
*
* @return true
* @return false
*/
bool is_gpu_pinned() const;
/* Part 4: Data Access methods */
......@@ -407,11 +414,11 @@ class PADDLE_API Tensor final {
/**
* @brief Transfer the current Tensor to the specified device and return.
*
* @param backend, The target backend of which the tensor will copy to.
* @param place, The target place of which the tensor will copy to.
* @param blocking, Should we copy this in sync way.
* @return Tensor
*/
Tensor copy_to(Backend backend, bool blocking) const;
Tensor copy_to(Place place, bool blocking) const;
/**
* @brief Transfer the source Tensor to current Tensor.
......
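tensor.h now declares the place-based surface used throughout this commit: is_gpu() and the new is_gpu_pinned() replace is_cuda(), and copy_to takes a phi::Place plus a blocking flag. A short usage fragment built only from calls declared in this diff (it assumes an existing paddle::experimental::Tensor t):

// Copy to host, waiting for the copy to finish.
paddle::experimental::Tensor host_copy = t.copy_to(phi::CPUPlace(), /*blocking=*/true);
// Copy to the GPU without blocking the caller.
paddle::experimental::Tensor dev_copy = t.copy_to(phi::GPUPlace(), /*blocking=*/false);
// Place predicates follow the place names.
bool on_gpu = t.is_gpu();
bool pinned = t.is_gpu_pinned();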
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "paddle/phi/api/lib/data_transform.h"
#include "paddle/phi/api/lib/kernel_dispatch.h"
#include "paddle/phi/api/lib/utils/storage.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/infermeta/binary.h"
......@@ -31,9 +32,10 @@ limitations under the License. */
namespace paddle {
namespace experimental {
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
kernel_key_set.backend_set =
kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
......@@ -57,8 +59,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(
*dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out);
(*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
return out;
}
......
......@@ -15,15 +15,14 @@ limitations under the License. */
#pragma once
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/common/backend.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h"
namespace paddle {
namespace experimental {
// TODO(chenweihang): Replace backend by place when place is ready
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking);
Tensor copy_to_impl(const Tensor& x, Place place, bool blocking);
std::vector<Tensor> split_impl(const Tensor& x,
const ScalarArray& num_or_sections,
......
......@@ -35,7 +35,7 @@ class BackendSet final {
: bitset_(b == Backend::UNDEFINED ? 0 : 1ULL << (static_cast<uint8_t>(b) -
1)) {}
uint64_t bitset() const { return bitset_; }
inline uint64_t bitset() const { return bitset_; }
bool inline Has(Backend b) const {
PD_CHECK(b != Backend::UNDEFINED, "Backend argument can't be UNDEFINED.");
......
......@@ -39,7 +39,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input,
const TransformFlag& transform_flag) {
bool ret = transform_flag.need_trans_backend() &&
target != Backend::ALL_BACKEND &&
!platform::is_same_place(input, phi::TransToPhiPlace(target));
phi::TransToPhiBackend(input) != target;
return ret;
}
......@@ -180,21 +180,20 @@ std::shared_ptr<phi::DenseTensor> PrepareData(
const phi::TensorArgDef& target_args_def,
const TransformFlag& transform_flag) {
const auto& tensor_in = input.impl();
VLOG(6) << tensor_in->dtype() << "\t" << target_args_def.dtype;
if (!transform_flag.NeedTransform() || !tensor_in->initialized() ||
phi::DenseTensor& dense_tensor =
*static_cast<phi::DenseTensor*>(tensor_in.get());
if (!transform_flag.NeedTransform() || !dense_tensor.initialized() ||
(!NeedTransformPlace(
tensor_in->place(), target_args_def.backend, transform_flag) &&
dense_tensor.place(), target_args_def.backend, transform_flag) &&
!NeedTransformDataType(
tensor_in->dtype(), target_args_def.dtype, transform_flag) &&
dense_tensor.dtype(), target_args_def.dtype, transform_flag) &&
!NeedTransformLayout(
tensor_in->layout(), target_args_def.layout, transform_flag))) {
dense_tensor.layout(), target_args_def.layout, transform_flag))) {
return std::static_pointer_cast<phi::DenseTensor>(tensor_in);
}
phi::DenseTensor out =
TransformData(*(static_cast<phi::DenseTensor*>(tensor_in.get())),
target_args_def,
transform_flag);
TransformData(dense_tensor, target_args_def, transform_flag);
return std::make_shared<phi::DenseTensor>(std::move(out));
}
......
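The data-transform predicate now compares the input's backend, obtained via TransToPhiBackend, against the kernel's target backend instead of comparing concrete places, and PrepareData casts the impl to a DenseTensor once up front. A toy restatement of the new condition (the enum is a stand-in, not phi's real Backend):

enum class Backend { UNDEFINED, CPU, GPU, ALL_BACKEND };  // toy stand-in

// Mirrors the revised NeedTransformPlace: only the backend kind matters now,
// so an input already on the target backend is passed through untouched.
bool NeedTransformPlace(Backend input_backend, Backend target,
                        bool need_trans_backend) {
  return need_trans_backend && target != Backend::ALL_BACKEND &&
         input_backend != target;
}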
......@@ -30,7 +30,7 @@ class DataTypeSet final {
? 0
: 1ULL << (static_cast<uint8_t>(dtype) - 1)) {}
uint64_t bitset() const { return bitset_; }
inline uint64_t bitset() const { return bitset_; }
bool inline Has(DataType dtype) const {
PD_CHECK(dtype != DataType::UNDEFINED,
......
......@@ -16,13 +16,16 @@ limitations under the License. */
#include "paddle/phi/api/include/context_pool.h"
#include "paddle/phi/core/compat/convert_utils.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
namespace paddle {
namespace experimental {
namespace detail {
BackendSet GetTensorBackendSet(const Tensor& t) {
BackendSet backend_set(phi::TransToPhiBackend(t.inner_place()));
BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
BackendSet backend_set(phi::TransToPhiBackend(t.place()));
switch (t.layout()) {
case DataLayout::MKLDNN:
backend_set = backend_set | BackendSet(Backend::MKLDNN);
......@@ -35,6 +38,11 @@ BackendSet GetTensorBackendSet(const Tensor& t) {
}
std::size_t CountLeadingZeros(uint64_t val) {
#if defined(__clang__) || defined(__GNUC__)
return __builtin_clzl(val);
#elif defined(_MSC_VER)
return __lzcnt64(val);
#else
if (val == 0) {
return 64;
}
......@@ -48,6 +56,7 @@ std::size_t CountLeadingZeros(uint64_t val) {
}
}
return zero_bits;
#endif
}
} // namespace detail
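CountLeadingZeros now prefers compiler intrinsics (__builtin_clzl on GCC/Clang, __lzcnt64 on MSVC) and keeps the bit loop only as a portable fallback; ParseBackend later recovers the highest-priority backend from a BackendSet bitset as 64 minus this count. A standalone mirror with a small check (the GPU enum value used in main is an assumption for illustration):

#include <cassert>
#include <cstddef>
#include <cstdint>

std::size_t CountLeadingZeros(std::uint64_t val) {
#if defined(__clang__) || defined(__GNUC__)
  return val == 0 ? 64 : __builtin_clzll(val);
#else
  std::size_t zero_bits = 0;
  for (std::uint64_t mask = 1ULL << 63; mask != 0 && !(val & mask); mask >>= 1) {
    ++zero_bits;
  }
  return zero_bits;
#endif
}

int main() {
  // A BackendSet stores backend b at bit (b - 1); 64 - clz recovers b.
  std::uint64_t bitset = 1ULL << (2 - 1);       // assume Backend::GPU == 2
  assert(64 - CountLeadingZeros(bitset) == 2);  // highest-priority backend
}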
......@@ -82,13 +91,17 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) {
return dtype != DataType::UNDEFINED ? dtype : ParseDataType(tensor);
}
Backend ParseBackend(Backend backend) { return backend; }
Backend ParseBackend(const Place& place) {
return phi::TransToPhiBackend(place);
}
Backend ParseBackend(const Tensor& tensor) {
return phi::TransToPhiBackend(tensor.inner_place());
}
Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) {
return backend != Backend::UNDEFINED ? backend : ParseBackend(tensor);
Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor) {
return place.GetType() != phi::AllocationType::UNDEFINED
? ParseBackend(place)
: ParseBackend(tensor);
}
DataLayout ParseLayout(DataLayout layout) { return layout; }
......
......@@ -33,7 +33,7 @@ namespace paddle {
namespace experimental {
namespace detail {
BackendSet GetTensorBackendSet(const Tensor& t);
BackendSet GetTensorBackendSet(const phi::TensorBase& t);
std::size_t CountLeadingZeros(uint64_t val);
} // namespace detail
......@@ -93,11 +93,13 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
// TODO(chenweihang): deal with multiple diff input Tensors
// TODO(chenweihang): add global device guard method to set backend
void operator()(const Tensor& x) {
key_set.backend_set = key_set.backend_set | detail::GetTensorBackendSet(x);
// TODO(chenweihang): selecte multi layout and dtype
key_set.layout = x.layout();
key_set.dtype = x.type();
dtype_set = dtype_set | DataTypeSet(x.dtype());
const phi::TensorBase& tensor = *x.impl();
key_set.backend_set =
key_set.backend_set | detail::GetTensorBackendSet(tensor);
// TODO(chenweihang): select multi layout and dtype
key_set.layout = tensor.layout();
key_set.dtype = tensor.dtype();
dtype_set = dtype_set | DataTypeSet(key_set.dtype);
auto promote_result = PromoteTypes(dtype_set);
if (promote_result != DataType::UNDEFINED) {
key_set.dtype = promote_result;
......@@ -105,11 +107,12 @@ struct KernelKeyParser : ArgsIterator<KernelKeyParser> {
}
void operator()(const std::vector<Tensor>& x) {
const phi::TensorBase& tensor = *x.at(0).impl();
key_set.backend_set =
key_set.backend_set | detail::GetTensorBackendSet(x[0]);
// TODO(chenweihang): selecte multi layout and dtype
key_set.layout = x[0].layout();
key_set.dtype = x[0].type();
key_set.backend_set | detail::GetTensorBackendSet(tensor);
// TODO(chenweihang): select multi layout and dtype
key_set.layout = tensor.layout();
key_set.dtype = tensor.dtype();
}
// skip other type args, these args don't used in kernel selection
......@@ -154,7 +157,7 @@ DataType ParseDataType(const Tensor& tensor);
DataType ParseDataType(const std::vector<Tensor>& tensors);
DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor);
Backend ParseBackend(Backend backend);
Backend ParseBackend(const Place& place);
Backend ParseBackend(const Tensor& tensor);
template <typename T, typename... Args>
Backend ParseBackend(T t, Args... args) {
......@@ -163,7 +166,7 @@ Backend ParseBackend(T t, Args... args) {
return static_cast<Backend>(64 -
detail::CountLeadingZeros(backend_set.bitset()));
}
Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor);
Backend ParseBackendWithInputOrder(const Place& place, const Tensor& tensor);
DataLayout ParseLayout(DataLayout layout);
DataLayout ParseLayout(const Tensor& tensor);
......
......@@ -163,10 +163,14 @@ bool Tensor::is_cpu() const {
return paddle::platform::is_cpu_place(inner_place());
}
bool Tensor::is_cuda() const {
bool Tensor::is_gpu() const {
return paddle::platform::is_gpu_place(inner_place());
}
bool Tensor::is_gpu_pinned() const {
return paddle::platform::is_cuda_pinned_place(inner_place());
}
/* Part 4: Data Access methods */
template <typename T>
......
......@@ -27,14 +27,14 @@ namespace paddle {
namespace experimental {
// declare cast api
Tensor cast(const Tensor &x, DataType out_dtype);
Tensor copy_to(const Tensor &x, Backend backend, bool blocking);
Tensor copy_to(const Tensor &x, Place place, bool blocking);
Tensor Tensor::cast(DataType target_type) const {
return experimental::cast(*this, target_type);
}
Tensor Tensor::copy_to(Backend backend, bool blocking) const {
return experimental::copy_to(*this, backend, blocking);
Tensor Tensor::copy_to(Place place, bool blocking) const {
return experimental::copy_to(*this, place, blocking);
}
template <typename T>
......@@ -44,7 +44,7 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const {
"`copy_to` method without template argument instead. "
"reason: copying a Tensor to another device does not need "
"to specify the data type template argument.";
return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false);
return copy_to(ConvertExtPlaceToInnerPlace(target_place), /*blocking=*/false);
}
template PADDLE_API Tensor
......
......@@ -203,5 +203,10 @@ namespace paddle {
namespace experimental {
using AllocationType = phi::AllocationType;
using Place = phi::Place;
using CPUPlace = phi::CPUPlace;
using GPUPlace = phi::GPUPlace;
using GPUPinnedPlace = phi::GPUPinnedPlace;
using XPUPlace = phi::XPUPlace;
using NPUPlace = phi::NPUPlace;
} // namespace experimental
} // namespace paddle
......@@ -26,13 +26,14 @@ limitations under the License. */
namespace phi {
Backend TransToPhiBackend(const phi::Place& place) {
if (place.GetType() == phi::AllocationType::CPU) {
auto allocation_type = place.GetType();
if (allocation_type == phi::AllocationType::CPU) {
return Backend::CPU;
} else if (place.GetType() == phi::AllocationType::GPU) {
} else if (allocation_type == phi::AllocationType::GPU) {
return Backend::GPU;
} else if (place.GetType() == phi::AllocationType::XPU) {
} else if (allocation_type == phi::AllocationType::XPU) {
return Backend::XPU;
} else if (place.GetType() == phi::AllocationType::CUSTOM) {
} else if (allocation_type == phi::AllocationType::CUSTOM) {
return static_cast<Backend>(
static_cast<size_t>(Backend::NUM_BACKENDS) +
GetOrRegisterGlobalDeviceTypeId(place.GetDeviceType()));
......
......@@ -145,6 +145,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
const MetaTensor& src_index,
const MetaTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
MetaTensor* out,
MetaTensor* dst_count) {
auto src_index_dims = src_index.dims();
......@@ -187,11 +188,23 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
"Src_index and Dst_index should have the same shape."));
auto dims = x.dims();
out->set_dims(dims);
if (out_size <= 0) {
out->set_dims(dims);
} else {
std::vector<int64_t> dims_ = phi::vectorize(dims);
if (dims_.size() > 0) {
dims_[0] = out_size;
}
out->set_dims(phi::make_ddim(dims_));
}
out->set_dtype(x.dtype());
if (pool_type == "MEAN") {
dst_count->set_dims({dims[0]});
if (out_size <= 0) {
dst_count->set_dims({dims[0]});
} else {
dst_count->set_dims({out_size});
}
dst_count->set_dtype(DataType::INT32);
}
}
......
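GraphSendRecvInferMeta threads the new out_size attribute through shape inference: a non-positive out_size keeps the input shape, otherwise the leading dimension is replaced by out_size, and Dst_count shrinks to out_size for MEAN pooling. The shape rule as a small standalone function:

#include <cassert>
#include <cstdint>
#include <vector>

// Output shape rule introduced above: out_size <= 0 keeps x's shape,
// otherwise the leading dimension becomes out_size.
std::vector<int64_t> GraphSendRecvOutDims(std::vector<int64_t> x_dims,
                                          int64_t out_size) {
  if (out_size > 0 && !x_dims.empty()) {
    x_dims[0] = out_size;
  }
  return x_dims;
}

int main() {
  assert((GraphSendRecvOutDims({10, 16}, 0) == std::vector<int64_t>{10, 16}));
  assert((GraphSendRecvOutDims({10, 16}, 4) == std::vector<int64_t>{4, 16}));
}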
......@@ -51,6 +51,7 @@ void GraphSendRecvInferMeta(const MetaTensor& x,
const MetaTensor& src_index,
const MetaTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
MetaTensor* out,
MetaTensor* dst_count);
......
......@@ -23,15 +23,14 @@
namespace phi {
template <typename T, typename IndexT, typename Functor>
void GraphSendRecvCpuGradLoop(const int& input_size,
const int& index_size,
void GraphSendRecvCpuGradLoop(const int& index_size,
const IndexT* s_index,
const IndexT* d_index,
const DenseTensor& src,
const DenseTensor& input,
DenseTensor* dst,
const std::string& pool_type,
const int* dst_count = nullptr,
const DenseTensor* input = nullptr,
const DenseTensor* output = nullptr) {
if (pool_type == "SUM") {
Functor functor;
......@@ -55,7 +54,7 @@ void GraphSendRecvCpuGradLoop(const int& input_size,
for (int i = 0; i < index_size; ++i) {
const IndexT& forward_src_idx = d_index[i];
const IndexT& forward_dst_idx = s_index[i];
auto input_slice = input->Slice(forward_src_idx, forward_src_idx + 1);
auto input_slice = input.Slice(forward_src_idx, forward_src_idx + 1);
auto output_slice = output->Slice(forward_dst_idx, forward_dst_idx + 1);
auto eigen_input = phi::EigenVector<T>::Flatten(input_slice);
auto eigen_output = phi::EigenVector<T>::Flatten(output_slice);
......@@ -73,18 +72,18 @@ template <typename Context, typename T, typename IndexT>
void GraphSendRecvGradOpKernelLaunchHelper(
const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
DenseTensor* x_grad,
const DenseTensor* dst_count = nullptr,
const DenseTensor* x = nullptr,
const DenseTensor* out = nullptr) {
const int& index_size = dst_index.dims()[0];
ctx.template Alloc<T>(x_grad);
T* p_output = x_grad->data<T>();
const auto& src_dims = out_grad.dims();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
const size_t& memset_bytes = memset_size * sizeof(T);
......@@ -97,29 +96,22 @@ void GraphSendRecvGradOpKernelLaunchHelper(
if (pool_type == "SUM") {
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
src_dims[0], index_size, d_index, s_index, out_grad, x_grad, pool_type);
index_size, d_index, s_index, out_grad, x, x_grad, pool_type);
} else if (pool_type == "MEAN") {
const int* s_count = dst_count->data<int>();
// Functor not used here.
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(src_dims[0],
index_size,
d_index,
s_index,
out_grad,
x_grad,
pool_type,
s_count);
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvSumFunctor<T>>(
index_size, d_index, s_index, out_grad, x, x_grad, pool_type, s_count);
} else if (pool_type == "MIN" || pool_type == "MAX") {
// Functor not used here.
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(src_dims[0],
index_size,
GraphSendRecvCpuGradLoop<T, IndexT, GraphSendRecvMinFunctor<T>>(index_size,
d_index,
s_index,
out_grad,
x,
x_grad,
pool_type,
nullptr,
x,
out);
}
}
......@@ -127,7 +119,7 @@ void GraphSendRecvGradOpKernelLaunchHelper(
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......@@ -139,23 +131,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
GraphSendRecvGradOpKernelLaunchHelper<Context, T, int32_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvGradOpKernelLaunchHelper<Context, T, int64_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
}
}
......
......@@ -83,6 +83,7 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count = nullptr) {
const int& index_size = src_index.dims()[0];
......@@ -91,7 +92,16 @@ void GraphSendRecvOpKernelLaunchHelper(const Context& ctx,
T* p_output = out->data<T>();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) memset_size *= src_dims[i];
if (out_size <= 0) {
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
} else {
memset_size = out_size;
for (int i = 1; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
}
const size_t& memset_bytes = memset_size * sizeof(T);
memset(p_output, 0, memset_bytes);
......@@ -129,15 +139,16 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count) {
auto index_type = src_index.dtype();
if (index_type == phi::DataType::INT32) {
GraphSendRecvOpKernelLaunchHelper<Context, T, int32_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvOpKernelLaunchHelper<Context, T, int64_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
}
}
......
......@@ -99,8 +99,8 @@ void ReduceSumGradKernel(const Context& dev_ctx,
ReduceGradKernel<Context, T, funcs::SumGradFunctor, true>(dev_ctx,
x,
out_grad,
paddle::none,
out_grad,
dims,
keep_dim,
reduce_all,
......@@ -121,8 +121,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MeanGradFunctor, true>(dev_ctx,
x,
out_grad,
paddle::none,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -14,7 +14,7 @@
#pragma once
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(__NVCC__) || defined(__HIPCC__)
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/extrema.h>
......@@ -143,7 +143,7 @@ static void ModeAssign(const Type& input_height,
}
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename T>
static void GetModebySort(const phi::GPUContext& dev_ctx,
const DenseTensor* input_tensor,
......
......@@ -168,57 +168,82 @@ __global__ void CumsumOneBlock(
}
}
// where_index
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary,
int IsMaskData>
int MaskData>
struct SelectCaller {
__device__ void inline operator()(OutT *store_data,
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
// where_index op
IdT index_reg[VecSize];
// Set data index of global
kps::InitWithDataIndex<IdT, VecSize, 1, 1>(&index_reg[0], data_offset);
int data_offset,
int store_num,
int thread_fix,
int num) {
int64_t in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
// set index
kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
// Get store data according to mask_idt
kps::OperatorTernary<MT, IdT, OutT, Functor>(
store_data, mask_data, &index_reg[0], func, VecSize);
kps::OperatorTernary<MT, int64_t, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
}
};
// masked_select
template <typename OutT,
typename MT,
typename InT,
typename IdT,
typename Functor,
int VecSize,
int IsBoundary>
struct SelectCaller<OutT,
MT,
InT,
IdT,
Functor,
VecSize,
IsBoundary,
1> { // masked_select
__device__ void inline operator()(OutT *store_data,
struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int num,
int data_offset) {
int data_offset,
int store_num,
int thread_fix,
int num) {
InT in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
// Get store data according to mask_idt
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
}
};
// masked_select_grad
template <typename OutT,
typename MT,
typename InT,
typename Functor,
int VecSize,
int IsBoundary>
struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
__device__ void inline operator()(OutT *out,
const MT *mask_data,
const InT *in,
Functor func,
int data_offset,
int store_num,
int thread_fix,
int num) {
InT in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
}
};
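The MaskData template parameter now selects among three behaviours: 0 for where_index, 1 for masked_select, and 2 for the new masked_select_grad path. A rough NumPy sketch of the semantics each specialization is expected to produce:

import numpy as np

mask = np.array([True, False, True, True])
x = np.array([1., 2., 3., 4.])

# MaskData == 0 (where_index): indices of the true entries
idx = np.flatnonzero(mask)            # [0, 2, 3]

# MaskData == 1 (masked_select): gather values at the true positions
sel = x[mask]                         # [1., 3., 4.]

# MaskData == 2 (masked_select_grad): scatter out_grad back, zeros elsewhere
out_grad = np.array([10., 30., 40.])
x_grad = np.zeros_like(x)
x_grad[mask] = out_grad               # [10., 0., 30., 40.]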
......@@ -253,7 +278,6 @@ SelectKernelImpl(OutT *out,
IdT num_thread[kCVecSize];
IdT cumsum_thread[kCVecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
MT mask_data[VecSize];
IdT mask_idt[VecSize];
// init data_pr
......@@ -271,17 +295,15 @@ SelectKernelImpl(OutT *out,
// Get cumsum_thread cumsum from 0 to num_thread cumsum_thread[0] is the
// thread_fix
kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
// Get store data(index) according to mask_idt
SelectCaller<OutT, MT, InT, IdT, Functor, VecSize, IsBoundary, MaskData>
compute;
compute(&store_data[0], &mask_data[0], in, func, num, data_offset);
// get thread_fix
int thread_fix =
(static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
// get how many data need to store
int store_num = static_cast<int>(num_thread[0]) * store_rank;
// thread store num data, each thread may has different num
kps::details::WriteData<OutT>(out + thread_fix, &store_data[0], store_num);
// Get store data(index) according to mask_idt
SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, MaskData> select;
select(out, mask_data, in, func, data_offset, store_num, thread_fix, num);
}
template <typename MT,
......@@ -303,15 +325,17 @@ __global__ void SelectKernel(OutT *out,
int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
int repeat = 0;
int size = VecSize * BLOCK_ID_X;
CT block_store_offset = 0;
for (; data_offset < main_offset; data_offset += stride) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
// niuliling todo: use ReadData API
int block_store_offset = cumsum[idx_cumsum];
kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, false>(
out + block_store_offset * store_rank,
out + out_fix,
mask + data_offset,
in + data_offset,
in + in_fix,
func,
size,
data_offset,
......@@ -323,12 +347,13 @@ __global__ void SelectKernel(OutT *out,
if (num > 0) {
// Cumsum index
int idx_cumsum = repeat * GRID_NUM_X + BLOCK_ID_X;
// niuliling todo: use ReadData API
int block_store_offset = static_cast<int>(cumsum[idx_cumsum]);
kps::details::ReadData<CT>(&block_store_offset, cumsum + idx_cumsum, 1);
int out_fix = MaskData < 2 ? block_store_offset * store_rank : data_offset;
int in_fix = MaskData < 2 ? data_offset : block_store_offset * store_rank;
SelectKernelImpl<InT, MT, OutT, Functor, VecSize, MaskData, true>(
out + block_store_offset * store_rank,
out + out_fix,
mask + data_offset,
in + data_offset,
in + in_fix,
func,
num,
data_offset,
......@@ -402,6 +427,7 @@ void SelectKernel(const KPDevice &dev_ctx,
const int kCumVesize = 2;
const int block_c = 256;
const int main_offset_c = Floor(size_count_block, (kCumVesize * block_c));
using Add = kps::AddFunctor<CT>;
CumsumOneBlock<CT, CT, Add, kCumVesize><<<1, block_c, 0, stream>>>(
count_data, cumsum_data, size_count_block, main_offset_c, Add());
......@@ -418,10 +444,13 @@ void SelectKernel(const KPDevice &dev_ctx,
dev_ctx.Wait();
// 3.1.2 allock for out with total_true_num
std::vector<int64_t> out_dim = {static_cast<int64_t>(total_true_num)};
if (SelectData == 0) { // where_index
if (SelectData == 1) {
out->Resize(phi::make_ddim(out_dim));
} else if (SelectData == 0) { // == 0 where_index
out_dim.push_back(rank);
out->Resize(phi::make_ddim(out_dim));
}
out->Resize(phi::make_ddim(out_dim));
auto out_data = out->mutable_data<OutT>(cuda_place);
// 3.2 get true data's index according to cond_data and cumsum_data
if (total_true_num <= 0) return;
......
......@@ -87,7 +87,8 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
} else if ((paddle::platform::is_cpu_place(src_place) ||
paddle::platform::is_cuda_pinned_place(src_place)) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_cpu_place = src_place;
auto dst_gpu_place = dst_place;
......
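The branch above now also accepts CUDA-pinned sources for host-to-device copies; a small dygraph sketch of the path this enables (mirroring the eager-mode test further below):

import paddle

if paddle.is_compiled_with_cuda():
    # tensor backed by page-locked host memory
    t = paddle.to_tensor([1, 2, 3], place=paddle.CUDAPinnedPlace())
    g = t.cuda()  # pinned -> GPU copy; previously only plain CPU -> GPU was routed here
    print(g.place)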
......@@ -28,19 +28,19 @@ template <typename Context, typename T, typename IndexT>
void GraphSendRecvGradOpCUDAKernelLaunchHelper(
const Context& ctx,
const DenseTensor& out_grad,
const DenseTensor& x,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
DenseTensor* x_grad,
const DenseTensor* dst_count = nullptr,
const DenseTensor* x = nullptr,
const DenseTensor* out = nullptr) {
const int& index_size = dst_index.dims()[0];
ctx.template Alloc<T>(x_grad);
T* p_output = x_grad->data<T>();
const auto& src_dims = out_grad.dims();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
......@@ -86,7 +86,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
ManipulateMeanGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_src, d_index, s_index, p_output, index_size, slice_size, s_count);
} else if (pool_type == "MAX" || pool_type == "MIN") {
const T* ptr_input = x->data<T>();
const T* ptr_input = x.data<T>();
const T* ptr_output = out->data<T>();
ManipulateMinMaxGradCUDAKernel<T, IndexT><<<grid, block, 0, ctx.stream()>>>(
p_src,
......@@ -103,7 +103,7 @@ void GraphSendRecvGradOpCUDAKernelLaunchHelper(
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......@@ -115,23 +115,23 @@ void GraphSendRecvGradKernel(const Context& ctx,
GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int32_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvGradOpCUDAKernelLaunchHelper<Context, T, int64_t>(
ctx,
out_grad,
x,
src_index,
dst_index,
pool_type,
x_grad,
dst_count.get_ptr(),
x.get_ptr(),
out.get_ptr());
}
}
......
......@@ -32,6 +32,7 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count = nullptr) {
const int& index_size = src_index.dims()[0];
......@@ -39,8 +40,15 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
T* p_output = out->data<T>();
const auto& src_dims = x.dims();
int64_t memset_size = 1;
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
if (out_size <= 0) {
for (int i = 0; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
} else {
memset_size = out_size;
for (int i = 1; i < src_dims.size(); ++i) {
memset_size *= src_dims[i];
}
}
const size_t& memset_bytes = memset_size * sizeof(T);
if (pool_type == "SUM" || pool_type == "MEAN") {
......@@ -100,6 +108,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
IndexT>><<<grid, block, 0, ctx.stream()>>>(
p_src, s_index, d_index, p_output, index_size, slice_size, functor);
if (out_size > 0) {
input_size = out_size;
}
int64_t grid_max_tmp = (input_size * slice_size + block - 1) / block;
int64_t grid_max =
grid_max_tmp < max_grid_dimx ? grid_max_tmp : max_grid_dimx;
......@@ -114,6 +125,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
IndexT>><<<grid, block, 0, ctx.stream()>>>(
p_src, s_index, d_index, p_output, index_size, slice_size, functor);
if (out_size > 0) {
input_size = out_size;
}
int64_t grid_min_tmp = (input_size * slice_size + block - 1) / block;
int64_t grid_min =
grid_min_tmp < max_grid_dimx ? grid_min_tmp : max_grid_dimx;
......@@ -130,6 +144,9 @@ void GraphSendRecvOpCUDAKernelLaunchHelper(const Context& ctx,
ctx.template Alloc<int32_t>(dst_count);
int32_t* p_dst_count = dst_count->data<int32_t>();
if (out_size > 0) {
input_size = out_size;
}
#ifdef PADDLE_WITH_HIP
hipMemset(p_dst_count, 0, input_size * sizeof(int));
......@@ -155,15 +172,16 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count) {
auto index_type = src_index.dtype();
if (index_type == phi::DataType::INT32) {
GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int32_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
} else if (index_type == phi::DataType::INT64) {
GraphSendRecvOpCUDAKernelLaunchHelper<Context, T, int64_t>(
ctx, x, src_index, dst_index, pool_type, out, dst_count);
ctx, x, src_index, dst_index, pool_type, out_size, out, dst_count);
}
}
......
......@@ -17,38 +17,31 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
__global__ void SetMaskArrayT(const bool* mask, int32_t* mask_array, int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx])
mask_array[idx] = 1;
else
mask_array[idx] = 0;
}
}
template <typename MT, typename InT, typename OutT>
struct MaskedSelectGradFunctor {
HOSTDEVICE MaskedSelectGradFunctor() {}
template <typename T>
__global__ void SelectGradWithPrefixMask(const int32_t* mask_prefix_sum,
const bool* mask,
const T* input,
T* out,
int size) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < size; idx += blockDim.x * gridDim.x) {
if (mask[idx]) {
int index = mask_prefix_sum[idx];
out[idx] = input[index];
} else {
out[idx] = 0;
HOSTDEVICE inline void operator()(OutT* out,
const MT* mask,
const InT* value,
int num) {
int read_fix = 0;
for (int idx = 0; idx < num; idx++) {
if (mask[idx]) {
out[idx] = value[read_fix++];
} else {
out[idx] = 0;
}
}
}
}
};
template <typename T, typename Context>
void MaskedSelectGradKernel(const Context& dev_ctx,
......@@ -56,42 +49,12 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& mask,
DenseTensor* x_grad) {
auto* mask_data = mask.data<bool>();
auto* input_data = out_grad.data<T>();
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
auto input_size = out_grad.numel();
auto mask_size = mask.numel();
auto mask_dim = mask.dims();
auto out_size = mask_size;
DenseTensor mask_array;
DenseTensor mask_prefix_sum;
mask_array.Resize(mask_dim);
mask_prefix_sum.Resize(mask_dim);
int32_t* mask_array_data =
mask_array.mutable_data<int32_t>(dev_ctx.GetPlace());
int32_t* mask_prefix_sum_data =
mask_prefix_sum.mutable_data<int32_t>(dev_ctx.GetPlace());
int threads = 512;
int grid = (mask_size + threads - 1) / threads;
auto stream = dev_ctx.stream();
SetMaskArrayT<<<grid, threads, 0, stream>>>(
mask_data, mask_array_data, mask_size);
thrust::device_ptr<int32_t> mask_array_dev_ptr =
thrust::device_pointer_cast(mask_array_data);
thrust::device_vector<int32_t> mask_array_vec(mask_array_dev_ptr,
mask_array_dev_ptr + mask_size);
thrust::exclusive_scan(thrust::device,
mask_array_vec.begin(),
mask_array_vec.end(),
mask_prefix_sum_data);
SelectGradWithPrefixMask<T><<<grid, threads, 0, stream>>>(
mask_prefix_sum_data, mask_data, input_data, out_data, mask_size);
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
if (mask_size <= 0) return;
using Functor = MaskedSelectGradFunctor<bool, T, T>;
phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
dev_ctx, mask, out_grad, x_grad, Functor());
}
} // namespace phi
......
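As a quick end-to-end check of the rewritten grad path, the backward of masked_select should place ones at masked positions and zeros elsewhere; an illustrative sketch:

import paddle

x = paddle.to_tensor([1., 2., 3., 4.], stop_gradient=False)
mask = paddle.to_tensor([True, False, True, True])
y = paddle.masked_select(x, mask)
y.sum().backward()
print(x.grad)  # expected: [1., 0., 1., 1.]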
......@@ -17,11 +17,12 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename MT, typename InT, typename OutT>
......
......@@ -20,13 +20,14 @@
namespace cub = hipcub;
#endif
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/where_index_kernel.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T1, typename T2, typename OutT>
struct IndexFunctor {
......
......@@ -23,7 +23,7 @@ namespace phi {
template <typename T, typename Context>
void GraphSendRecvGradKernel(const Context& ctx,
const DenseTensor& out_grad,
paddle::optional<const DenseTensor&> x,
const DenseTensor& x,
paddle::optional<const DenseTensor&> out,
const DenseTensor& src_index,
const DenseTensor& dst_index,
......
......@@ -25,6 +25,7 @@ void GraphSendRecvKernel(const Context& ctx,
const DenseTensor& src_index,
const DenseTensor& dst_index,
const std::string& pool_type,
int64_t out_size,
DenseTensor* out,
DenseTensor* dst_count);
......
......@@ -33,7 +33,7 @@ void FrobeniusNormGradKernel(const Context& ctx,
DataType out_dtype,
DenseTensor* dx) {
ReduceGradKernel<Context, T, funcs::FrobeniusNormGradFunctor>(
ctx, x, dout, out, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
ctx, x, out, dout, axis, keep_dim, reduce_all, in_dtype, out_dtype, dx);
}
} // namespace phi
......@@ -87,8 +87,8 @@ template <typename Context,
bool kNoNeedBufferY = false>
void ReduceGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const paddle::optional<DenseTensor>& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceMinGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceMinGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::MaxOrMinGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -24,8 +24,8 @@ namespace phi {
template <typename T, typename Context>
void ReduceProdGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -34,8 +34,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
ReduceGradKernel<Context, T, funcs::ProdGradFunctor>(dev_ctx,
x,
out_grad,
out,
out_grad,
dims,
keep_dim,
reduce_all,
......
......@@ -123,6 +123,15 @@ __device__ __forceinline__ void WriteData(T* dst,
dst[i] = src[i];
}
}
template <typename T>
__device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num) {
for (int i = 0; i < num; i++) {
dst[i] = src[i];
}
}
#undef INT_BITS
} // namespace details
......
......@@ -43,8 +43,8 @@ void ReduceMeanGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceProdGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -55,8 +55,8 @@ void ReduceProdGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceMaxGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......@@ -67,8 +67,8 @@ void ReduceMaxGradKernel(const Context& dev_ctx,
template <typename T, typename Context>
void ReduceMinGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& out_grad,
const DenseTensor& out,
const DenseTensor& out_grad,
const std::vector<int64_t>& dims,
bool keep_dim,
bool reduce_all,
......
......@@ -18,10 +18,17 @@ namespace phi {
KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) {
bool is_test = paddle::any_cast<bool>(ctx.Attr("is_test"));
bool use_global_stats = paddle::any_cast<bool>(ctx.Attr("use_global_stats"));
bool use_global_stats =
ctx.HasAttr("use_global_stats")
? paddle::any_cast<bool>(ctx.Attr("use_global_stats"))
: false;
bool trainable_statistics =
paddle::any_cast<bool>(ctx.Attr("trainable_statistics"));
bool fuse_with_relu = paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"));
ctx.HasAttr("trainable_statistics")
? paddle::any_cast<bool>(ctx.Attr("trainable_statistics"))
: false;
bool fuse_with_relu = ctx.HasAttr("fuse_with_relu")
? paddle::any_cast<bool>(ctx.Attr("fuse_with_relu"))
: false;
// Dispenable `MomentumTensor` is useless now
if (is_test && !use_global_stats && !trainable_statistics &&
!fuse_with_relu) {
......
......@@ -16,6 +16,14 @@ limitations under the License. */
namespace phi {
KernelSignature GraphSendRecvOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("graph_send_recv",
{"X", "Src_index", "Dst_index"},
{"pool_type", "out_size"},
{"Out", "Dst_count"});
}
KernelSignature GraphSendRecvGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
......@@ -27,5 +35,8 @@ KernelSignature GraphSendRecvGradOpArgumentMapping(
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(graph_send_recv,
phi::GraphSendRecvOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(graph_send_recv_grad,
phi::GraphSendRecvGradOpArgumentMapping);
......@@ -149,7 +149,7 @@ KernelSignature ReduceMaxGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"max_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......@@ -158,7 +158,7 @@ KernelSignature ReduceMinGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"min_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......@@ -167,7 +167,7 @@ KernelSignature ReduceProdGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"prod_grad",
{"X", GradVarName("Out"), "Out"},
{"X", "Out", GradVarName("Out")},
{"dim", "keep_dim", "reduce_all", "in_dtype", "out_dtype"},
{GradVarName("X")});
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -39,10 +40,10 @@ TEST(API, data_transform_same_place) {
auto x = paddle::experimental::full({3, 3},
1.0,
experimental::DataType::COMPLEX128,
experimental::Backend::CPU);
experimental::CPUPlace());
auto y = paddle::experimental::full(
{3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
{3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
std::vector<phi::dtype::complex<double>> sum(9, 6.0);
......@@ -74,10 +75,10 @@ TEST(API, data_transform_same_place) {
TEST(Tensor, data_transform_diff_place) {
// 1. create tensor
auto x = paddle::experimental::full(
{3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::Backend::CPU);
{3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace());
auto y = paddle::experimental::full(
{3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::Backend::GPU);
{3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace());
std::vector<float> sum(9, 6.0);
......@@ -95,7 +96,7 @@ TEST(Tensor, data_transform_diff_place) {
ASSERT_EQ(out.impl()->place(),
phi::TransToPhiPlace(experimental::Backend::GPU));
auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true);
auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true);
auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(ref_out.impl());
for (size_t i = 0; i < 9; i++) {
......
......@@ -30,7 +30,7 @@ namespace tests {
TEST(API, scale) {
auto x = experimental::full(
{3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::Backend::CPU);
{3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
const size_t cycles = 300;
phi::tests::Timer timer;
......
......@@ -69,10 +69,10 @@ TEST(API, copy_to) {
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = paddle::experimental::copy_to(x, phi::Backend::GPU, false);
auto out = paddle::experimental::copy_to(tmp, phi::Backend::CPU, true);
auto tmp = paddle::experimental::copy_to(x, phi::GPUPlace(), false);
auto out = paddle::experimental::copy_to(tmp, phi::CPUPlace(), true);
#else
auto out = paddle::experimental::copy_to(x, phi::Backend::CPU, false);
auto out = paddle::experimental::copy_to(x, phi::CPUPlace(), false);
#endif
// 3. check result
......@@ -85,10 +85,10 @@ TEST(Tensor, copy_to) {
// 2. test API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto tmp = x.copy_to(phi::Backend::GPU, false);
auto out = tmp.copy_to(phi::Backend::CPU, true);
auto tmp = x.copy_to(phi::GPUPlace(), false);
auto out = tmp.copy_to(phi::CPUPlace(), true);
#else
auto out = x.copy_to(phi::Backend::CPU, false);
auto out = x.copy_to(phi::CPUPlace(), false);
#endif
// 3. check result
......
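These C++ API tests now take Place objects (CPUPlace/GPUPlace) instead of Backend enums; at the Python layer the same device round trip looks roughly like this (sketch only, using the dygraph convenience methods rather than copy_to itself):

import paddle

x = paddle.full([3, 4], 1.0, dtype='float32')
if paddle.is_compiled_with_cuda():
    tmp = x.cuda()   # analogous to copy_to(GPUPlace(), blocking=False)
    out = tmp.cpu()  # analogous to copy_to(CPUPlace(), blocking=True)
else:
    out = x.cpu()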
......@@ -12,74 +12,69 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .job.container import Container
from .job.pod import Pod
from .job.job import Job
from . import plugins
#__all__ = [Container, Pod, Job]
__all__ = []
'''
Paddle distribution training entry ``python -m paddle.distributed.run``.
Paddle distributed training entry ``python -m paddle.distributed.launch``.
Help
# for arg usage and explanation, try the following command
# python -m paddle.distributed.run -h
# python -m paddle.distributed.launch -h
Collective Mode
Case 1: 1 node
use all visible devices
# python -m paddle.distributed.run train.py
# python -m paddle.distributed.launch train.py
use specified devices
# python -m paddle.distributed.run --devices=0,1,2,3 train.py
# python -m paddle.distributed.launch --devices=0,1,2,3 train.py
Case 2: multi-node, auto detect ip/port
# python -m paddle.distributed.run --np 2 train.py
# python -m paddle.distributed.launch --nnodes 2 train.py
# auto print following command
# python -m paddle.distributed.run --master 10.0.0.1:13538 --np 2 demo.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py
# then copy and paste above command to other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.run --np 2 --master 10.0.0.1:2379 train.py
# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py
# the master ip must be the ip of one of the nodes and the port must be available
Parameter Server Mode
Case 1.1: 1 node, 1 ps, 1 trainer
# python -m paddle.distributed.run --mode ps train.py
# python -m paddle.distributed.run --server_num=1 --trainer_num=1 train.py
# python -m paddle.distributed.launch --mode ps train.py
# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py
Case 1.2: 1 node, 2 ps, 2 trainer
# python -m paddle.distributed.run --server_num=2 --trainer_num=2 train.py
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py
Case 2: 2 node, 2 ps, 2 trainer per node
# python -m paddle.distributed.run --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py
# auto print following command
# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# then copy and paste above command to other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.run --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --np 2 train.py
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# the master ip must be the ip of one of the nodes and the port must be available
Case 4: specified servers and trainers in each node
python -m paddle.distributed.run --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
Elastic Mode
# run the following command on 3 nodes to start immediately, or on 2 nodes to start after elastic_timeout
# python -m paddle.distributed.run --master etcd://10.0.0.1:2379 --np 2:3 train.py
# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py
# once the peer number changes between 2:3, the strategy holds
......
......@@ -15,14 +15,28 @@
from .context import Context
from . import controllers
# initialize the context to run
ctx = Context()
# initialize the selected controller
c = controllers.init(ctx)
def launch():
# initialize the context to run
ctx = Context()
# run the pods
c.run()
if ctx.is_legacy_mode():
# manager or just wait pod
c.finalize()
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
# manager or just wait pod
c.finalize()
if __name__ == "__main__":
launch()
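The new entry point chooses between the refactored controllers and the legacy fleet launcher via Context.is_legacy_mode(); a small sketch of probing that decision (module path and argv handling assumed):

import sys
from paddle.distributed.launch.context import Context  # path assumed

sys.argv = ["launch", "train.py"]  # satisfy the required training_script positional
ctx = Context(enable_plugin=False)
print(ctx.is_legacy_mode())        # True with --legacy, unknown args, or legacy env vars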
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.launch import plugins
from .node import Node
from .status import Status
from .args_envs import parse_args, fetch_envs, env_args_mapping
import logging
class Context(object):
def __init__(self, enable_plugin=True):
self.args, self.unknown_args = parse_args()
self.envs = fetch_envs()
self.logger = self.get_logger()
self.node = Node()
self.status = Status()
self.set_env_in_args()
# design for event queue, later
self.events = []
if enable_plugin:
self._enable_plugin()
def is_legacy_mode(self):
if self.args.legacy:
return True
if len(self.unknown_args) > 0:
self.logger.warning("Compatible mode enable with args {}".format(
self.unknown_args))
return True
legacy_env_list = [
'DISTRIBUTED_TRAINER_ENDPOINTS',
'PADDLE_ELASTIC_JOB_ID',
'PADDLE_DISTRI_BACKEND',
'FLAGS_START_PORT',
]
for env in legacy_env_list:
if env in self.envs:
self.logger.warning(
"ENV {} is deprecated, legacy launch enable".format(env))
return True
if self.args.master:
return False
return False
def get_envs(self):
return self.envs.copy()
def _enable_plugin(self):
for pl in plugins.enabled_plugins:
pl(self)
def get_logger(self, level=logging.INFO):
logger = logging.getLogger("LAUNCH")
logger.setLevel(self.args.log_level.upper() or level)
formatter = logging.Formatter(
fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
def set_env_in_args(self):
for k, v in env_args_mapping.items():
if k in self.envs:
setattr(self.args, v, self.envs[k])
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from argparse import ArgumentParser, REMAINDER
env_args_mapping = {
'POD_IP': 'host',
'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices',
'PADDLE_NNODES': 'nnodes',
'PADDLE_MODE': 'mode',
'PADDLE_LOG_LEVEL': 'log_level',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'job_id',
'PADDLE_RANK': 'rank',
'PADDLE_LOG_DIR': 'log_dir',
'PADDLE_MAX_RESTART': 'max_restart',
'PADDLE_ELASTIC_LEVEL': 'elastic_level',
'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
'PADDLE_SERVER_NUM': 'server_num',
'PADDLE_TRAINER_NUM': 'trainer_num',
'PADDLE_SERVERS_ENDPOINTS': 'servers',
'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
'PADDLE_GLOO_PORT': 'gloo_port',
'PADDLE_WITH_GLOO': 'with_gloo',
}
def fetch_envs():
os.environ.pop('http_proxy', None)
os.environ.pop('https_proxy', None)
return os.environ.copy()
def parse_args():
parser = ArgumentParser()
base_group = parser.add_argument_group("Base Parameters")
base_group.add_argument(
"--master",
type=str,
default=None,
help="the master/rendezvous server, ip:port")
base_group.add_argument(
"--legacy", type=bool, default=False, help="use legacy launch")
base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank")
base_group.add_argument(
"--log_level", type=str, default="INFO", help="log level. Default INFO")
base_group.add_argument(
"--nnodes",
type=str,
default="1",
help="the number of peers, i.e. pod/node number")
base_group.add_argument(
"--nproc_per_node",
type=int,
default=None,
help="the number of processes in a pod")
base_group.add_argument(
"--log_dir",
type=str,
default="log",
help="the path for each process's log. Default ./log")
base_group.add_argument(
"--mode",
type=str,
default="collective",
help="run mode of the job, collective/ps/ps-heter")
base_group.add_argument(
"--job_id",
type=str,
default="default",
help="unique id of the job. Default default")
base_group.add_argument(
"--devices",
type=str,
default=None,
help="accelerate devices. as --gpus,npus,xps")
base_group.add_argument("--host", type=str, default=None, help="host ip")
base_group.add_argument(
"training_script",
type=str,
help="the full path of py script,"
"followed by arguments for the "
"training script")
base_group.add_argument('training_script_args', nargs=REMAINDER)
ps_group = parser.add_argument_group("Parameter-Server Parameters")
# for parameter server
ps_group.add_argument(
"--servers", type=str, default='', help="servers endpoints full list")
ps_group.add_argument(
"--trainers", type=str, default='', help="trainers endpoints full list")
ps_group.add_argument(
"--trainer_num", type=int, default=None, help="number of trainers")
ps_group.add_argument(
"--server_num", type=int, default=None, help="number of servers")
ps_group.add_argument(
"--gloo_port", type=int, default=6767, help="gloo http port")
ps_group.add_argument(
"--with_gloo", type=str, default="0", help="use gloo or not")
# parameter elastic mode
elastic_group = parser.add_argument_group("Elastic Parameters")
elastic_group.add_argument(
"--max_restart",
type=int,
default=3,
help="the times can restart. Default 3")
elastic_group.add_argument(
"--elastic_level",
type=int,
default=-1,
help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
)
elastic_group.add_argument(
"--elastic_timeout",
type=int,
default=30,
help="seconds to wait before elastic perform training")
return parser.parse_known_args()
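Every key in env_args_mapping can stand in for the corresponding command-line flag; a sketch of that equivalence (module path assumed):

import os, sys
from paddle.distributed.launch.context import Context  # path assumed

os.environ["PADDLE_NNODES"] = "2"   # behaves like passing --nnodes 2
sys.argv = ["launch", "train.py"]
ctx = Context(enable_plugin=False)
print(ctx.args.nnodes)              # '2'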
......@@ -20,36 +20,90 @@ class DeviceType:
GPU = 'gpu'
XPU = 'xpu'
NPU = 'npu'
MLU = 'mlu'
class Device(object):
def __init__(self, dtype=None, count=1, memory="", labels=""):
self.dtype = dtype
self.count = count
self.memory = memory
self.labels = labels
def __init__(self, dtype=None, memory="", labels=""):
self._dtype = dtype
self._memory = memory
self._labels = labels
def __str__(self):
return ",".join(self.labels)
return ",".join(self._labels)
@property
def dtype(self):
return self._dtype
@property
def count(self):
return len(self._labels) or 1
@property
def memory(self):
return self._memory
@property
def labels(self):
return self._labels
@labels.setter
def labels(self, lbs):
if isinstance(lbs, str):
self._labels = lbs.split(',')
elif isinstance(lbs, list):
self._labels = lbs
else:
self._labels = []
def get_selected_flag_key(self):
if self._dtype == DeviceType.CPU:
return 'FLAGS_selected_cpus'
if self._dtype == DeviceType.GPU:
return 'FLAGS_selected_gpus'
if self._dtype == DeviceType.NPU:
return 'FLAGS_selected_npus'
if self._dtype == DeviceType.XPU:
return 'FLAGS_selected_xpus'
if self._dtype == DeviceType.MLU:
return 'FLAGS_selected_mlus'
return 'FLAGS_selected_devices'
def get_selected_flag_label(self, idx):
if idx < len(self._labels):
return self._labels[idx]
else:
return '0'
def selected_flags(self, idx=None):
if idx is None:
return {self.get_selected_flag_key(): ','.join(self._labels)}
else:
return {
self.get_selected_flag_key(): self.get_selected_flag_label(idx)
}
@classmethod
def parse_device(self):
dev = Device()
visible_devices = None
if 'CUDA_VISIBLE_DEVICES' in os.environ or 'NVIDIA_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.GPU
dev._dtype = DeviceType.GPU
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
"NVIDIA_VISIBLE_DEVICES")
elif 'XPU_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.XPU
dev._dtype = DeviceType.XPU
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif 'ASCEND_VISIBLE_DEVICES' in os.environ:
dev.dtype = DeviceType.NPU
dev._dtype = DeviceType.NPU
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif 'MLU_VISIBLE_DEVICES' in os.environ:
dev._dtype = DeviceType.MLU
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if visible_devices and visible_devices != 'all':
dev.labels = visible_devices.split(',')
dev.count = len(dev.labels)
if visible_devices is not None and visible_devices != 'all':
dev._labels = visible_devices.split(',')
else:
return self.detect_device()
......@@ -63,26 +117,33 @@ class Device(object):
num = 0
visible_devices = None
if fluid.core.is_compiled_with_cuda():
dev.dtype = DeviceType.GPU
dev._dtype = DeviceType.GPU
num = fluid.core.get_cuda_device_count()
visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") or os.getenv(
"NVIDIA_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_xpu():
dev.dtype = DeviceType.XPU
dev._dtype = DeviceType.XPU
num = fluid.core.get_xpu_device_count()
visible_devices = os.getenv("XPU_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_npu():
dev.dtype = DeviceType.NPU
dev._dtype = DeviceType.NPU
num = fluid.core.get_npu_device_count()
visible_devices = os.getenv("ASCEND_VISIBLE_DEVICES")
elif fluid.core.is_compiled_with_mlu():
dev._dtype = DeviceType.MLU
num = fluid.core.get_mlu_device_count()
visible_devices = os.getenv("MLU_VISIBLE_DEVICES")
if num == 0:
dev.dtype = DeviceType.CPU
elif visible_devices is None or visible_devices == "all" or visible_devices == "":
dev.labels = [str(x) for x in range(0, num)]
dev.count = num
dev._dtype = DeviceType.CPU
elif visible_devices is None or visible_devices == "all":
dev._labels = [str(x) for x in range(0, num)]
else:
dev.labels = visible_devices.split(',')
dev.count = len(dev.labels)
dev._labels = visible_devices.split(',')
return dev
if __name__ == '__main__':
d = Device.parse_device()
print(d.get_selected_flag())
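Device now derives count from its labels and exposes the per-rank FLAGS_selected_* environment through selected_flags; roughly (import path assumed):

from paddle.distributed.launch.context.device import Device, DeviceType  # path assumed

dev = Device(dtype=DeviceType.GPU)
dev.labels = "0,1,2,3"          # the setter splits the string
print(dev.count)                # 4, derived from the labels
print(dev.selected_flags())     # {'FLAGS_selected_gpus': '0,1,2,3'}
print(dev.selected_flags(2))    # {'FLAGS_selected_gpus': '2'}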
......@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
__all__ = ["init"]
__all__ = []
from .collective import CollectiveController
from .collective import CollectiveElasticController
......
......@@ -89,6 +89,10 @@ class CollectiveController(Controller):
"PADDLE_TRAINERS_NUM": "{}".format(global_size),
"PADDLE_RANK_IN_NODE": str(i),
}
if self.pod.replicas == 1:
e.update(self.ctx.node.device.selected_flags())
else:
e.update(self.ctx.node.device.selected_flags(i))
self.add_container(envs=e, log_tag=i)
return True
......@@ -106,7 +110,8 @@ class CollectiveElasticController(CollectiveController):
def register(self):
if self.job.id == 'default':
self.ctx.logger.warning(
'Using default job name may cause conflict, add --id in args')
'Using default job name may cause conflict, add --job_id in args'
)
self.master.register_heartbeat(self.job.id, self.pod.name)
......@@ -114,6 +119,8 @@ class CollectiveElasticController(CollectiveController):
'''
watch self and peer status, return true to exit
'''
self.ctx.logger.info("Watching {}".format(self.pod))
while not self.ctx.status.is_done():
# self status
status = self.pod.watch(timeout=2)
......@@ -171,13 +178,8 @@ class CollectiveElasticController(CollectiveController):
continue
self.master.set_status(self.ctx.status.RUNNING)
self.ctx.status.run()
assert len(self.pod.containers) > 0, "No container in the pod"
self.ctx.logger.debug("Run {}".format(self.pod))
self.ctx.logger.debug("Run {}".format(self.pod.containers[0]))
self.pod.deploy()
self.deploy_pod()
if self.watch():
break
......
......@@ -16,9 +16,9 @@ import sys
import os
import signal
from paddle.distributed.run.job import Job
from paddle.distributed.run.job import Pod
from paddle.distributed.run.job import Container
from paddle.distributed.launch.job.job import Job
from paddle.distributed.launch.job.pod import Pod
from paddle.distributed.launch.job.container import Container
from .master import Master
......@@ -39,38 +39,43 @@ class ControllerBase(object):
self.ctx = ctx
self.master = Master.factory(self.ctx)
self.job = Job(np=self.ctx.args.np,
self.job = Job(nnodes=self.ctx.args.nnodes,
mode=self.ctx.args.mode,
id=self.ctx.args.id)
jid=self.ctx.args.job_id)
self.pod = Pod()
self.join_server = None
def run(self):
self.build_job()
self.build_pod()
def deploy_pod(self):
if len(self.pod.containers) < 1:
self.ctx.logger.error("No container in the pod {}".format(self.pod))
return
assert len(self.pod.containers) > 0, "No container in the pod"
self.ctx.logger.info("Run {}".format(self.pod))
self.ctx.logger.debug(self.pod.containers[0])
self.ctx.status.run()
self.pod.deploy()
def run(self):
self.build_job()
self.build_pod()
self.deploy_pod()
self.watch()
def watch(self) -> bool:
self.ctx.logger.info("Watching {}".format(self.pod))
status = self.pod.watch()
if status == self.ctx.status.COMPLETED:
self.ctx.logger.info("Pod {}".format(status))
elif status == self.ctx.status.FAILED:
fc = self.pod.failed_container()
self.ctx.logger.info("Pod {}".format(status))
self.ctx.logger.error("Container failed !!!\n{}".format(
self.pod.failed_container()))
self.pod.tail()
self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
fc[0].tail()
self.pod.stop()
def stop(self, sigint=None):
......
......@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.run.utils.kv_client import KVClient
from paddle.distributed.run.utils.kv_server import KVServer
from paddle.distributed.launch.utils.kv_client import KVClient
from paddle.distributed.launch.utils.kv_server import KVServer
import time
import sys
......@@ -84,7 +84,7 @@ class HTTPMaster(Master):
print("Copy the following command to other nodes to run.")
cmd = [
sys.executable.split('/')[-1], "-m", "paddle.distributed.run"
sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"
]
cmd.extend(["--master", self.endpoint])
cmd.extend(sys.argv[1:])
......@@ -118,9 +118,12 @@ class HTTPMaster(Master):
self._stop_server()
def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
self.lazy_init()
while not self.ctx.status.is_done():
......@@ -130,7 +133,7 @@ class HTTPMaster(Master):
self.ctx.logger.warning("master not ready")
time.sleep(0.1)
# 'aaaaaa' make suer main pod (master server) as rank 0
# 'aaaaaa' make sure main pod (master server) as rank 0
ky = 'aaaaaa' if rank < 0 and self.role == Master.MAIN else key
k = "{}/{}/{}".format(prefix, ky, rank)
......@@ -177,6 +180,12 @@ class ETCDMaster(Master):
sync_peers gathers all values for the key under the scope prefix
the result is always sorted either by rank or alphabetically by pod.name
'''
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
path = "{}/{}/{}".format(prefix, key, rank)
self.client.delete_prefix(prefix)
......
......@@ -22,7 +22,8 @@ class PSController(Controller):
@classmethod
def enable(cls, ctx):
if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len(
ctx.args.servers) > 0:
ctx.args.servers) > 0 or ctx.args.trainer_num or len(
ctx.args.trainers) > 0:
ctx.logger.debug("{} enabled".format(cls.__name__))
ctx.args.mode = ControleMode.PS
return True
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.distributed.fleet import launch
launch.launch()
__all__ = []
......@@ -13,12 +13,11 @@
# limitations under the License.
from collections import OrderedDict
from paddle.distributed.run.utils.process_context import ProcessContext
from paddle.distributed.launch.utils.process_context import ProcessContext
from .status import Status
import os, copy, sys
import time
class Container(object):
......@@ -78,6 +77,11 @@ class Container(object):
kwargs = {k: v for k, v in kwargs.items() if isinstance(v, str)}
self._env.update(kwargs)
def _valide_env(self):
for k, v in self._env.items():
assert isinstance(k, str) and isinstance(
v, str), 'env {}:{} must be str'.format(k, v)
def _get_fd(self, pth):
if not pth:
return None
......@@ -90,12 +94,12 @@ class Container(object):
except:
return None
def start(self, timeout=-1):
end = time.time() + timeout
def start(self):
if self._proc and self._proc.alive():
return True
self._valide_env()
self._stdout = self._get_fd(self._out) or sys.stdout
if self._out == self._err:
self._stderr = self._stdout
......@@ -106,14 +110,6 @@ class Container(object):
self._entrypoint, env=self._env, out=self._stdout, err=self._stderr)
self._proc.start()
while timeout > 0 and time.time() < end:
if self._proc.alive():
time.sleep(0.1)
continue
if self._proc.exit_code() == 0:
return True
return False
def terminate(self, force=False):
if self._log_handler:
self._log_handler.close()
......@@ -125,9 +121,11 @@ class Container(object):
def wait(self, timeout=None):
self._proc.wait(timeout)
@property
def exit_code(self):
return self._proc.exit_code() if self._proc else -1
@property
def status(self):
if not self._proc:
return Status.UNINIT
......@@ -141,9 +139,9 @@ class Container(object):
def __str__(self):
return 'Container rank {} status {} cmd {} code {} log {} \nenv {}'.format(
self._rank,
self.status(),
self.status,
self._entrypoint,
self.exit_code(),
self.exit_code,
self.errfile,
self._env, )
......
......@@ -20,16 +20,16 @@ class JobMode:
class Job(object):
def __init__(self, id='default', mode=JobMode.COLLECTIVE, np="1"):
def __init__(self, jid='default', mode=JobMode.COLLECTIVE, nnodes="1"):
self._mode = mode
self._id = id
self._id = jid
self._replicas = 0
self._replicas_min = self._replicas
self._replicas_max = self._replicas
self._elastic = False
self.set_replicas(str(np))
self.set_replicas(str(nnodes))
def __str__(self):
return "Job: {}, mode {}, replicas {}[{}:{}], elastic {}".format(
......@@ -64,8 +64,8 @@ class Job(object):
def replicas(self, replicas):
self._replicas = replicas
def set_replicas(self, np: str):
np = str(np) if np else '1'
def set_replicas(self, nnodes: str):
np = str(nnodes) if nnodes else '1'
if ':' in np:
nps = np.split(':')
......
......@@ -34,7 +34,7 @@ class PodSepc(object):
#self.status: Status = None
self._rank = -1
self._init_timeout = 120 # 2 min timeout for each init container
self._init_timeout = None
self._restart = -1
self._replicas = 0 # number of containers
self._exit_code = 0
......@@ -45,15 +45,15 @@ class Pod(PodSepc):
super().__init__()
def __str__(self):
return "Pod: {}, replicas {}, status {}".format(self.name,
self.replicas,
self.status())
return "Pod: {}, replicas {}, status {}".format(
self.name, self.replicas, self.status)
def failed_container(self):
cs = []
for c in self._containers:
if c.status() == Status.FAILED:
return c
return None
if c.status == Status.FAILED:
cs.append(c)
return cs
@property
def name(self):
......@@ -65,7 +65,7 @@ class Pod(PodSepc):
@replicas.setter
def replicas(self, r):
self._replicas = r
self._replicas = max(r, 1)
@property
def rank(self):
......@@ -98,13 +98,15 @@ class Pod(PodSepc):
@property
def exit_code(self):
for c in self._containers:
if c.exit_code() != 0:
return c.exit_code()
if c.exit_code != 0:
return c.exit_code
return 0
def deploy(self):
# init container should stop before run containers
for i in self._init_containers:
i.start(self._init_timeout)
i.start()
i.wait(self._init_timeout)
for c in self._containers:
c.start()
......@@ -120,6 +122,7 @@ class Pod(PodSepc):
for c in self._containers:
c.wait(None)
@property
def status(self):
if self.is_failed():
return Status.FAILED
......@@ -127,6 +130,9 @@ class Pod(PodSepc):
if self.is_completed():
return Status.COMPLETED
if self.is_running():
return Status.RUNNING
return Status.READY
def reset(self):
......@@ -135,31 +141,31 @@ class Pod(PodSepc):
def is_failed(self):
for c in self._containers:
if c.status() == Status.FAILED:
if c.status == Status.FAILED:
return True
return False
def is_completed(self):
for c in self._containers:
if c.status() != Status.COMPLETED:
if c.status != Status.COMPLETED:
return False
return True
def is_running(self):
for c in self._containers:
if c.status != Status.RUNNING:
return False
return True
def logs(self, idx=None):
if idx is None:
if self.failed_container():
self.failed_container().logs()
else:
self._containers[0].logs()
self._containers[0].logs()
else:
self._containers[idx].logs()
def tail(self, idx=None):
if idx is None:
if self.failed_container():
self.failed_container().tail()
else:
self._containers[0].tail()
self._containers[0].tail()
else:
self._containers[idx].tail()
......@@ -175,10 +181,10 @@ class Pod(PodSepc):
end = time.time() + timeout
while timeout < 0 or time.time() < end:
for c in self._containers:
if c.status() in any_list:
return c.status()
if c.status in any_list:
return c.status
s = [c.status() for c in self._containers]
s = [c.status for c in self._containers]
if len(set(s)) == 1 and s[0] in all_list:
return s[0]
......
......@@ -30,15 +30,26 @@ def process_args(ctx):
argdev = ctx.args.devices
if argdev:
ctx.node.device.labels = argdev.split(',')
ctx.node.device.count = len(ctx.node.device.labels)
ctx.logger.debug('Device reset by args {}'.format(argdev))
def collective_compatible(ctx):
if 'PADDLE_TRAINER_ENDPOINTS' in ctx.envs:
ctx.master = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')[0]
eps = ctx.envs['PADDLE_TRAINER_ENDPOINTS'].split(',')
hosts = set([h.split(':')[0] for h in eps])
ctx.args.master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
ctx.args.nnodes = len(hosts)
ctx.logger.info('args reset by env PADDLE_TRAINER_ENDPOINTS\n{}'.format(
eps))
'''
if 'DISTRIBUTED_TRAINER_ENDPOINTS' in ctx.envs:
ctx.master = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')[0]
eps = ctx.envs['DISTRIBUTED_TRAINER_ENDPOINTS'].split(',')
hosts = set([h.split(':')[0] for h in eps])
ctx.args.master = eps[0]
ctx.args.nnodes = len(hosts)
ctx.logger.info(
'args reset by env DISTRIBUTED_TRAINER_ENDPOINTS\n{}'.format(eps))
'''
def rewrite_host_ip(ctx):
......
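For reference, the endpoint-compat shim above boils down to this small transformation (self-contained repro of the new logic):

eps = "10.0.0.1:6170,10.0.0.2:6170".split(',')
hosts = set(h.split(':')[0] for h in eps)
master = eps[0] if ':' in eps[0] else '{}:6768'.format(eps[0])
nnodes = len(hosts)
print(master, nnodes)   # 10.0.0.1:6170 2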
......@@ -11,15 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .pod import Pod
from .job import Job
from .container import Container
from .status import Status
__all__ = [
'Pod',
'Job',
'Container',
'Status',
]
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser, REMAINDER
import os, copy
from paddle.distributed.run import plugins
from .node import Node
from .status import Status
import logging
class Context(object):
def __init__(self, enable_plugin=True):
os.environ.pop('http_proxy', None)
os.environ.pop('https_proxy', None)
self.args = self.parse_args()
self.envs = self.fetch_envs()
self.logger = self.get_logger()
self.node = Node()
self.status = Status()
self.set_env_in_args()
# design for event queue, later
self.events = []
if enable_plugin:
self._enable_plugin()
def get_envs(self):
return self.envs.copy()
def _enable_plugin(self):
for pl in plugins.enabled_plugins:
pl(self)
def parse_args(self):
parser = ArgumentParser()
base_group = parser.add_argument_group("Base Parameters")
base_group.add_argument(
"--master",
type=str,
default=None,
help="the master/rendezvous server, ip:port")
base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank")
base_group.add_argument(
"--log", type=str, default="INFO", help="log level. Default INFO")
base_group.add_argument(
"--np",
type=str,
default="1",
help="the number of peers, i.e. pod/node number")
base_group.add_argument(
"--nproc_per_node",
type=int,
default=None,
help="the number of processes in a pod")
base_group.add_argument(
"--log_dir",
type=str,
default="log",
help="the path for each process's log. Default ./log")
base_group.add_argument(
"--mode",
type=str,
default="collective",
help="run mode of the job, collective/ps/ps-heter")
base_group.add_argument(
"--id",
type=str,
default="default",
help="unique id of the job. Default default")
base_group.add_argument(
"--devices",
type=str,
default=None,
help="accelerate devices. as --gpus,npus,xps")
base_group.add_argument(
"--host", type=str, default=None, help="host ip")
base_group.add_argument(
"training_script",
type=str,
help="the full path of py script,"
"followed by arguments for the "
"training script")
base_group.add_argument('training_script_args', nargs=REMAINDER)
ps_group = parser.add_argument_group("Parameter-Server Parameters")
# for parameter server
ps_group.add_argument(
"--servers",
type=str,
default='',
help="servers endpoints full list")
ps_group.add_argument(
"--trainers",
type=str,
default='',
help="trainers endpoints full list")
ps_group.add_argument(
"--trainer_num", type=int, default=None, help="number of trainers")
ps_group.add_argument(
"--server_num", type=int, default=None, help="number of servers")
ps_group.add_argument(
"--gloo_port", type=int, default=6767, help="gloo http port")
ps_group.add_argument(
"--with_gloo", type=str, default="0", help="use gloo or not")
# parameter elastic mode
elastic_group = parser.add_argument_group("Elastic Parameters")
elastic_group.add_argument(
"--max_restart",
type=int,
default=3,
help="the times can restart. Default 3")
elastic_group.add_argument(
"--elastic_level",
type=int,
default=-1,
help="elastic level: -1 disable, 0 failed exit, peers hold, 1 internal restart"
)
elastic_group.add_argument(
"--elastic_timeout",
type=int,
default=30,
help="seconds to wait before elastic perform training")
return parser.parse_args()
def _valide_env(self, key):
if key in ['POD_IP']:
return True
if key.endswith('_VISIBLE_DEVICES'):
return True
if key.startswith('PADDLE_'):
return True
return False
def fetch_envs(self):
ge = os.environ.copy()
black_env_list = ['http_proxy', 'https_proxy']
for key in black_env_list:
ge.pop(key, None)
return ge
'''
# use black list instead white list
return {k: ge[k] for k in ge if self._valide_env(k)}
'''
def get_logger(self, level=logging.INFO):
logger = logging.getLogger("PADDLERUN")
logger.setLevel(self.args.log.upper() or level)
formatter = logging.Formatter(
fmt='%(name)s %(levelname)s %(asctime)s %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
return logger
def set_env_in_args(self):
env_args = {
'POD_IP': 'host',
'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices',
'PADDLE_NP': 'np',
'PADDLE_MODE': 'mode',
'PADDLE_LOG': 'log',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'id',
'PADDLE_RANK': 'rank',
'PADDLE_LOG_DIR': 'log_dir',
'PADDLE_MAX_RESTART': 'max_restart',
'PADDLE_ELASTIC_LEVEL': 'elastic_level',
'PADDLE_ELASTIC_TIMEOUT': 'elastic_timeout',
'PADDLE_SERVER_NUM': 'server_num',
'PADDLE_TRAINER_NUM': 'trainer_num',
'PADDLE_SERVERS_ENDPOINTS': 'servers',
'PADDLE_TRAINERS_ENDPOINTS': 'trainers',
'PADDLE_GLOO_PORT': 'gloo_port',
'PADDLE_WITH_GLOO': 'with_gloo',
}
for k, v in env_args.items():
if k in self.envs:
setattr(self.args, v, self.envs[k])
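For illustration, a minimal, self-contained sketch of the override rule implemented above: any recognised PADDLE_* variable found in the captured environment replaces the corresponding CLI default. The Namespace below stands in for the parsed arguments and only two mappings from the table are reproduced; this is a hedged sketch, not the launcher's real Context class.

import os
from argparse import Namespace

# Environment capture as in fetch_envs: everything except the proxy variables.
envs = os.environ.copy()
for key in ('http_proxy', 'https_proxy'):
    envs.pop(key, None)

# Two entries from the env_args table above, applied the same way.
env_args = {'PADDLE_JOB_ID': 'id', 'PADDLE_NPROC_PER_NODE': 'nproc_per_node'}
args = Namespace(id='default', nproc_per_node=None)
for k, v in env_args.items():
    if k in envs:
        setattr(args, v, envs[k])   # the environment wins over the CLI default
                                    # (values taken from the environment stay strings)

print(args.id, args.nproc_per_node)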
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import socket
def get_local_ip(ctx):
_, ip = _get_host_name_ip()
ctx.args.host = ip
ctx.envs["POD_IP"] = ip
def _get_host_name_ip():
try:
host_name = socket.gethostname()
host_ip = socket.gethostbyname(host_name)
return host_name, host_ip
except socket.error:
return None, None
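A hedged sketch of how a plugin such as get_local_ip above could be exercised in isolation; the SimpleNamespace object is a stand-in carrying only the two fields the plugin touches, not the real launcher context.

from types import SimpleNamespace

# Minimal stand-in context: just an args holder and an envs dict.
ctx = SimpleNamespace(args=SimpleNamespace(host=None), envs={})
get_local_ip(ctx)                      # fills in the local host IP
print(ctx.args.host, ctx.envs.get("POD_IP"))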
......@@ -1155,7 +1155,8 @@ class Layer(object):
layers[name] = None
else:
_buffers = self.__dict__.get('_buffers', None)
if type(value) == core.VarBase:
if type(value) == core.VarBase or \
type(value) == core.eager.Tensor:
if _buffers is None:
raise ValueError(
"super(YourLayer, self).__init__() should be called first"
......
......@@ -949,7 +949,7 @@ if (WITH_DISTRIBUTE AND NOT APPLE)
endif()
# setting timeout value as 15S
set_tests_properties(test_run PROPERTIES TIMEOUT 200)
set_tests_properties(test_run PROPERTIES TIMEOUT 120)
set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120)
set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200)
......
......@@ -17,5 +17,4 @@
set -e
# use default values
# FIXME: random fails on Unknown command lines -c (or -m).
launch_py=${PADDLE_BINARY_DIR}/python/paddle/distributed/launch.py
CUDA_VISIBLE_DEVICES=0,1 python ${launch_py} c_comm_init_op.py
CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch c_comm_init_op.py
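The same launch can also be driven from Python, which is how the unit tests further below spawn the module entry point. A hedged equivalent of the shell line above:

import os
import subprocess
import sys

# Mirror of the shell invocation; c_comm_init_op.py is the script used in the test above.
env = {**os.environ, "CUDA_VISIBLE_DEVICES": "0,1"}
cmd = [sys.executable, "-m", "paddle.distributed.launch", "c_comm_init_op.py"]
proc = subprocess.Popen(cmd, env=env)
proc.wait()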
......@@ -637,6 +637,10 @@ class EagerVariablePropertiesAndMethodsTestCase(unittest.TestCase):
self.assertTrue(tensor3.persistable, True)
self.assertTrue(tensor3.stop_gradient, True)
self.assertTrue(tensor3.place.is_gpu_place())
tensor4 = paddle.to_tensor([1, 2, 3], place='gpu_pinned')
tensor5 = tensor4._copy_to(core.CUDAPlace(0), True)
self.assertTrue(
np.array_equal(tensor4.numpy(), tensor5.numpy()))
else:
tensor3 = tensor2._copy_to(core.CPUPlace(), True)
self.assertTrue(np.array_equal(tensor3.numpy(), arr2))
......
......@@ -304,6 +304,35 @@ class API_GraphSendRecvOpTest(unittest.TestCase):
"two value is\
{}\n{}, check diff!".format(np_res, ret_res))
def test_set_outsize_gpu(self):
if paddle.fluid.core.is_compiled_with_cuda():
x = paddle.to_tensor(
np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]]), dtype="float32")
src_index = paddle.to_tensor(np.array([0, 0, 1]), dtype="int32")
dst_index = paddle.to_tensor(np.array([0, 1, 1]), dtype="int32")
res = paddle.incubate.graph_send_recv(x, src_index, dst_index,
"sum")
out_size = paddle.max(dst_index) + 1
res_set_outsize = paddle.incubate.graph_send_recv(
x, src_index, dst_index, "sum", out_size)
np_res = np.array(
[[0, 2, 3], [1, 6, 8], [0, 0, 0]], dtype="float32")
np_res_set_outsize = np.array(
[[0, 2, 3], [1, 6, 8]], dtype="float32")
self.assertTrue(
np.allclose(
np_res, res, atol=1e-6),
"two value is\
{}\n{}, check diff!".format(np_res, res))
self.assertTrue(
np.allclose(
np_res_set_outsize, res_set_outsize, atol=1e-6),
"two value is\
{}\n{}, check diff!"
.format(np_res_set_outsize, res_set_outsize))
if __name__ == '__main__':
unittest.main()
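The expected arrays in test_set_outsize_gpu follow directly from a sum-pool scatter: rows of x gathered at src_index are accumulated onto dst_index, and out_size = max(dst_index) + 1 simply drops the unused trailing rows. A NumPy re-derivation for reference:

import numpy as np

x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 6]], dtype="float32")
src_index = np.array([0, 0, 1])
dst_index = np.array([0, 1, 1])

out = np.zeros_like(x)                      # default output: same first dim as x
np.add.at(out, dst_index, x[src_index])     # "sum" pooling
# out == [[0, 2, 3], [1, 6, 8], [0, 0, 0]]

out_size = dst_index.max() + 1              # == 2
# out[:out_size] == [[0, 2, 3], [1, 6, 8]]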
......@@ -195,6 +195,8 @@ class TestDygraphInplaceFlatten(TestDygraphInplace):
return var.flatten_()
"""
# This case will fail while using `_C_ops.final_state_scatter`.
class TestDygraphInplaceScatter(TestDygraphInplace):
def init_data(self):
self.input_var_numpy = np.array([[1, 1], [2, 2], [3, 3]])
......@@ -213,6 +215,7 @@ class TestDygraphInplaceScatter(TestDygraphInplace):
[[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32')
return paddle.scatter_(var, index, updates, overwrite=False)
"""
class TestDygraphInplaceElu(TestDygraphInplace):
......
......@@ -39,9 +39,7 @@ import os
env = os.environ.copy()
assert "PADDLE_PSERVERS_IP_PORT_LIST" in env
assert "PADDLE_TRAINER_ENDPOINTS" in env
#assert "PADDLE_PSERVER_ENDPOINTS" in env
#assert "PADDLE_TRAINER_ENDPOINTS" in env
#assert "PADDLE_ROLE" in env
assert "PADDLE_ROLE" in env
#assert "PADDLE_RANK" in env
'''
......@@ -62,27 +60,24 @@ class Collective_Test(unittest.TestCase):
write_file(pyname, colpyfile)
def pdrun(self, args, env=None):
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
if args:
cmd.extend(args.split(" "))
cmd.extend([pyname])
proc = subprocess.Popen(cmd, env=env)
return proc
'''
def test_collective_1(self):
args = "--id test1"
args = "--job_id test1"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
'''
def test_collective_2(self):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id test2 --devices 0,1,2"
args = "--job_id test2 --devices 0,1,2"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
......@@ -95,7 +90,7 @@ class Collective_Test(unittest.TestCase):
shutil.rmtree('./log')
port = random.randrange(6000, 8000)
args = "--id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
args = "--job_id test3 --devices 0,1 --master 127.0.0.1:{} --np 2".format(
port)
p1 = self.pdrun(args)
p2 = self.pdrun(args)
......@@ -113,14 +108,13 @@ class PS_Test(unittest.TestCase):
write_file(pyname, pspyfile)
def pdrun(self, args, env=None):
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.run"]
cmd = [sys.executable.split('/')[-1], "-m", "paddle.distributed.launch"]
if args:
cmd.extend(args.split(" "))
cmd.extend([pyname])
proc = subprocess.Popen(cmd, env=env)
return proc
'''
def test_ps_1(self):
args = "--mode ps"
p = self.pdrun(args)
......@@ -131,21 +125,20 @@ class PS_Test(unittest.TestCase):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id ps2 --server_num=2 --trainer_num=2"
args = "--job_id ps2 --server_num=2 --trainer_num=2"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
c = get_files('log', 'ps2')
self.assertTrue(len(c) == 5)
'''
def test_ps_3(self):
if os.path.exists('./log'):
shutil.rmtree('./log')
port = random.randrange(6000, 8000)
args = "--id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
args = "--job_id ps3 --master 127.0.0.1:{} --np 2 --server_num=1 --trainer_num=1".format(
port)
p1 = self.pdrun(args)
p2 = self.pdrun(args)
......@@ -161,7 +154,7 @@ class PS_Test(unittest.TestCase):
if os.path.exists('./log'):
shutil.rmtree('./log')
args = "--id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
args = "--job_id ps4 --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903"
p1 = self.pdrun(args)
p1.wait()
self.assertTrue(p1.poll() == 0)
......
......@@ -19,7 +19,12 @@ from paddle.fluid import core
from paddle import _C_ops
def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
def graph_send_recv(x,
src_index,
dst_index,
pool_type="sum",
out_size=None,
name=None):
r"""
Graph Learning Send_Recv combine operator.
......@@ -27,7 +32,7 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
This operator is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory
consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index`
to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor
in different pooling types, like sum, mean, max, or min.
in different pooling types, like sum, mean, max, or min. In addition, `out_size` can be set to control the first dimension of the output.
.. code-block:: text
......@@ -43,6 +48,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
pool_type = "sum"
out_size = None
Then:
Out = [[0, 2, 3],
......@@ -56,6 +63,9 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
The available data type is int32, int64.
pool_type (str): The pooling type of graph_send_recv, including `sum`, `mean`, `max`, `min`.
Default value is `sum`.
out_size (int64|None): Controls the first dimension of the output. If not set, this
attribute is ignored. If set, it should be equal to or larger than
max(dst_index) + 1.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -75,6 +85,21 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
# Outputs: [[0., 2., 3.], [2., 8., 10.], [1., 4., 5.]]
x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
src_index = indexes[:, 0]
dst_index = indexes[:, 1]
out_size = paddle.max(dst_index) + 1
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum", out_size=out_size)
# Outputs: [[0., 2., 3.], [2., 8., 10.]]
x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
indexes = paddle.to_tensor([[0, 1], [2, 1], [0, 0]], dtype="int32")
src_index = indexes[:, 0]
dst_index = indexes[:, 1]
out = paddle.incubate.graph_send_recv(x, src_index, dst_index, pool_type="sum")
# Outputs: [[0., 2., 3.], [2., 8., 10.], [0., 0., 0.]]
"""
if pool_type not in ["sum", "mean", "max", "min"]:
......@@ -82,9 +107,16 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
"pool_type should be `sum`, `mean`, `max` or `min`, but received %s"
% pool_type)
# TODO(daisiming): Should we add a check that out_size >= max(dst_index) + 1?
if in_dygraph_mode():
out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index, 'pool_type',
pool_type.upper())
if out_size is None or out_size <= 0:
out, tmp = _C_ops.graph_send_recv(x, src_index, dst_index,
'pool_type', pool_type.upper())
else:
out, tmp = _C_ops.graph_send_recv(
x, src_index, dst_index, 'pool_type',
pool_type.upper(), 'out_size', out_size)
return out
check_variable_and_dtype(x, "X", ("float32", "float64", "int32", "int64"),
......@@ -105,5 +137,8 @@ def graph_send_recv(x, src_index, dst_index, pool_type="sum", name=None):
"Dst_index": dst_index},
outputs={"Out": out,
"Dst_count": dst_count},
attrs={"pool_type": pool_type.upper()})
attrs={
"pool_type": pool_type.upper(),
"out_size": 0 if out_size is None or out_size <= 0 else out_size
})
return out
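In both branches above, 0 acts as the sentinel meaning "out_size was not provided"; the kernel then sizes the first output dimension from x. A hedged helper capturing that convention (normalize_out_size is a hypothetical name, not part of the API):

def normalize_out_size(out_size):
    # 0 -> let the kernel infer the first output dimension from x;
    # any positive value is passed through as the requested size.
    if out_size is None or out_size <= 0:
        return 0
    return int(out_size)

assert normalize_out_size(None) == 0
assert normalize_out_size(2) == 2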
......@@ -578,6 +578,8 @@ def relu_(x, name=None):
Inplace version of ``relu`` API, the output Tensor will be inplaced with input ``x``.
Please refer to :ref:`api_nn_cn_relu`.
"""
if _in_eager_mode():
return _C_ops.final_state_relu_(x)
return _C_ops.relu_(x)
......
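A short usage sketch of the inplace API dispatched above; the observable result is the same whichever branch (eager final-state op or legacy op) is taken:

import paddle

x = paddle.to_tensor([-1.0, 0.0, 2.0])
paddle.nn.functional.relu_(x)    # inplace: x becomes [0., 0., 2.]
print(x.numpy())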
......@@ -127,7 +127,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True):
"\n\tFaild to convert input data to a regular ndarray :\n\t - Usually "
"this means the input data contains nested lists with different lengths. "
)
elif isinstance(data, paddle.Tensor):
elif isinstance(data, (paddle.Tensor, core.eager.Tensor)):
data = data._copy_to(place, False)
data = _handle_dtype(data, dtype)
data.stop_gradient = stop_gradient
......
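For context, a minimal sketch of the behaviour the widened isinstance check preserves: handing an existing Tensor to to_tensor copies it to the requested place and applies the requested stop_gradient, whether the input is a legacy VarBase or an eager Tensor.

import paddle

a = paddle.to_tensor([1.0, 2.0, 3.0])
b = paddle.to_tensor(a, place=paddle.CPUPlace(), stop_gradient=False)
print(b.place, b.stop_gradient)   # a CPU copy with gradients enabled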
......@@ -1735,12 +1735,12 @@ def tile(x, repeat_times, name=None):
Args:
x (Tensor): The input tensor, its data type should be bool, float32, float64, int32 or int64.
repeat_times (Tensor|tuple|list): The number of repeating times. If repeat_times is a list or tuple, all its elements
repeat_times (list|tuple|Tensor): The number of repeating times. If repeat_times is a list or tuple, all its elements
should be integers or 1-D Tensors with the data type int32. If repeat_times is a Tensor, it should be an 1-D Tensor with the data type int32.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
N-D Tensor. The data type is the same as ``x``.
N-D Tensor. The data type is the same as ``x``. The size of the i-th dimension is equal to ``x.shape[i] * repeat_times[i]``.
Examples:
.. code-block:: python
......@@ -1750,16 +1750,18 @@ def tile(x, repeat_times, name=None):
data = paddle.to_tensor([1, 2, 3], dtype='int32')
out = paddle.tile(data, repeat_times=[2, 1])
np_out = out.numpy()
# [[1, 2, 3], [1, 2, 3]]
# [[1, 2, 3]
# [1, 2, 3]]
out = paddle.tile(data, repeat_times=[2, 2])
out = paddle.tile(data, repeat_times=(2, 2))
np_out = out.numpy()
# [[1, 2, 3, 1, 2, 3], [1, 2, 3, 1, 2, 3]]
# [[1, 2, 3, 1, 2, 3]
# [1, 2, 3, 1, 2, 3]]
repeat_times = paddle.to_tensor([2, 1], dtype='int32')
repeat_times = paddle.to_tensor([1, 2], dtype='int32')
out = paddle.tile(data, repeat_times=repeat_times)
np_out = out.numpy()
# [[1, 2, 3], [1, 2, 3]]
# [[1, 2, 3, 1, 2, 3]]
"""
if paddle.in_dynamic_mode():
if _in_eager_mode():
......
......@@ -36,9 +36,9 @@
func : conj
- api : copy_to
args : (Tensor x, Backend backend, bool blocking)
args : (Tensor x, Place place, bool blocking)
output : Tensor
invoke : copy_to_impl(x, backend, blocking)
invoke : copy_to_impl(x, place, blocking)
- api : divide
args : (Tensor x, Tensor y)
......@@ -57,7 +57,7 @@
func : dot
- api : empty
args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
args : (ScalarArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
output: Tensor
infer_meta :
func : CreateInferMeta
......@@ -69,7 +69,7 @@
backend : place
- api : empty_like
args : (Tensor x, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {})
output: Tensor
infer_meta :
func : CreateLikeInferMeta
......@@ -89,7 +89,7 @@
func : flatten
- api : full
args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Backend place=Backend::CPU)
args : (ScalarArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace())
output: Tensor
infer_meta :
func : CreateInferMeta
......@@ -101,7 +101,7 @@
backend : place
- api : full_like
args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Backend place = Backend::UNDEFINED)
args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {})
output: Tensor
infer_meta :
func : CreateLikeInferMeta
......@@ -138,7 +138,7 @@
func : multiply
- api : ones_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
output : Tensor
invoke : full_like(x, 1, dtype, place)
......@@ -219,7 +219,7 @@
data_type : x
- api : zeros_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED)
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {})
output : Tensor
invoke : full_like(x, 0, dtype, place)
......@@ -231,7 +231,7 @@
func : OneHotInferMeta
kernel :
func : one_hot
- api : digamma
args : (Tensor x)
output : Tensor
......@@ -1484,3 +1484,16 @@
kernel :
func : mv
backward : mv_grad
# =================================== sep0
# =================================== sep1
# =================================== sep2
# =================================== sep3
......@@ -99,7 +99,7 @@ class BaseAPI(object):
'double': 'double',
'bool': 'bool',
'str': 'const std::string&',
'Backend': 'Backend',
'Place': 'Place',
'DataLayout': 'DataLayout',
'DataType': 'DataType',
'int64[]': 'const std::vector<int64_t>&',
......@@ -118,7 +118,7 @@ class BaseAPI(object):
'float': 'paddle::optional<float>',
'double': 'paddle::optional<double>',
'bool': 'paddle::optional<bool>',
'Backend': 'paddle::optional<Backend>',
'Place': 'paddle::optional<Place>',
'DataLayout': 'paddle::optional<DataLayout>',
'DataType': 'paddle::optional<DataType>',
'int64[]': 'paddle::optional<std::vector<int64_t>>',
......@@ -327,9 +327,9 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
attr_layout_count = 0
attr_data_type_count = 0
for attr_name in attrs['names']:
if attrs['attr_info'][attr_name][0] == 'Backend':
if attrs['attr_info'][attr_name][0] == 'Place':
assert kernel['backend'] is not None, \
f"{api} api: When there is a parameter with 'Backend' type in attributes, you must set backend of kernel manually."
f"{api} api: When there is a parameter with 'Place' type in attributes, you must set backend of kernel manually."
attr_backend_count = attr_backend_count + 1
if attrs['attr_info'][attr_name][0] == 'DataLayout':
assert kernel['layout'] is not None, \
......@@ -348,8 +348,8 @@ PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self
assert len(
vars_list
) == 2, f"{api} api: The number of params to set backend with '>' only allows 2, but received {len(vars_list)}."
assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Backend'), \
f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Backend type."
assert (vars_list[0].strip() in attrs['names']) and (attrs['attr_info'][vars_list[0].strip()][0] == 'Place'), \
f"{api} api: When use '>' to set kernel backend, the first param should be a attribute with Place type."
kernel_select_code = kernel_select_code + f"""
kernel_backend = ParseBackendWithInputOrder({vars_list[0].strip()}, {vars_list[1].strip()});
"""
......
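A hedged, stand-alone sketch of the rule enforced above (the function name and the simplified dict shapes are assumptions; the real BaseAPI class carries far more state):

def check_place_attr_backend(api, attrs, kernel):
    # If any attribute is declared with 'Place' type, the YAML entry must
    # also pin the kernel backend explicitly, as in `backend : place`.
    has_place_attr = any(
        attrs['attr_info'][name][0] == 'Place' for name in attrs['names'])
    if has_place_attr and kernel.get('backend') is None:
        raise AssertionError(
            f"{api} api: When there is a parameter with 'Place' type in "
            "attributes, you must set the kernel backend manually.")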
......@@ -56,6 +56,16 @@
kernel :
func : abs_grad
- backward_api : relu_grad
forward : relu (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out_grad)
output : Tensor(x_grad)
infer_meta :
func : UnchangedInferMeta
param : [x]
kernel :
func : relu_grad
- backward_api : trunc_grad
forward : trunc (Tensor x) -> Tensor(out)
args : (Tensor out_grad)
......@@ -868,3 +878,16 @@
param : [x, vec]
kernel :
func : mv_grad
# =================================== sep0
# =================================== sep1
# =================================== sep2
# =================================== sep3
......@@ -282,6 +282,12 @@ packages=['paddle',
'paddle.distribution',
'paddle.distributed.sharding',
'paddle.distributed.fleet',
'paddle.distributed.launch',
'paddle.distributed.launch.context',
'paddle.distributed.launch.controllers',
'paddle.distributed.launch.job',
'paddle.distributed.launch.plugins',
'paddle.distributed.launch.utils',
'paddle.distributed.fleet.base',
'paddle.distributed.fleet.elastic',
'paddle.distributed.fleet.meta_optimizers',
......@@ -727,7 +733,7 @@ with redirect_stdout():
},
entry_points={
'console_scripts': [
'fleetrun = paddle.distributed.fleet.launch:launch'
'fleetrun = paddle.distributed.launch.__main__:launch'
]
},
classifiers=[
......