Commit da478d1e authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -258,6 +258,12 @@ copy(inference_lib_dist
 copy(inference_lib_dist
   SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
   DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
+copy(inference_lib_dist
+  SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
+  DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
 copy(inference_lib_dist
   SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
   DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
......
@@ -39,8 +39,9 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
 }
 std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
-operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
-           bool create_graph) {
+operator()(
+    std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+    bool create_graph) {
   VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
   PADDLE_ENFORCE(grads.size() == 1,
                  paddle::platform::errors::Fatal(
......
@@ -35,7 +35,7 @@ class GradNodeAccumulation : public GradNodeBase {
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
       bool create_graph = false) override;
   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
......
@@ -145,8 +145,9 @@ void GradNodeScale::SetTensorWrappers_X(
 void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
 std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
-operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
-           bool create_graph) {
+operator()(
+    std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
+    bool create_graph) {
   // 1. Check Output Size
   PADDLE_ENFORCE(
       ((grads.size() == 1) && (grads[0].size() == 1)),
......
@@ -39,7 +39,7 @@ class GradNodeScale : public GradNodeBase {
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
       bool create_graph = false) override;
   void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
......
@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
 static std::unordered_map<std::string, paddle::framework::AttributeMap>
     operators_with_attrs = {};
+static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
+    "split"};
 /* --- Black Ops list that's NO NEED to apply code generation --- */
 static std::unordered_set<std::string> black_ops_list = {"run_program"};
@@ -2243,11 +2246,21 @@ static std::string GenerateGradNodeCCContents(
   // [Generation] Get Full Grad Function
   const char* GRAD_FUNCTION_TEMPLATE =
       "std::vector<std::vector<paddle::experimental::Tensor>> "
-      "GradNode%s::operator()(const "
-      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, "
-      "bool create_graph) {\n%s\n}";
-  std::string grad_function_str = paddle::string::Sprintf(
-      GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
+      "GradNode%s::operator()("
+      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
+      "create_graph) {\n"
+      "%s"
+      "%s"
+      "\n}";
+  std::string fill_zero_str = "";
+  if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) {
+    fill_zero_str =
+        "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, "
+        "this->InputMeta());\n";
+  }
+  std::string grad_function_str =
+      paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type,
+                              fill_zero_str, generated_grad_function_body);
   VLOG(6) << "Generated returns";
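For reference, a minimal sketch of what GRAD_FUNCTION_TEMPLATE expands to for an op listed in ops_to_fill_zero_for_empty_grads, using "split" as the op type and substituting the template placeholders literally; the body comment is only a stand-in for generated_grad_function_body, not real generated output:

    std::vector<std::vector<paddle::experimental::Tensor>>
    GradNodesplit::operator()(
        std::vector<std::vector<paddle::experimental::Tensor>>& grads,
        bool create_graph) {
      // fill_zero_str, emitted only for ops in ops_to_fill_zero_for_empty_grads:
      egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());
      // ... generated grad-function body (builds and runs the split grad op) ...
    }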
@@ -2279,9 +2292,9 @@ static std::string GenerateGradNodeHeaderContents(
       "  ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
       "\n"
       "  virtual std::vector<std::vector<paddle::experimental::Tensor>> "
-      "operator()(const "
-      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
-      "bool create_graph = false) "
+      "operator()("
+      "std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
+      "create_graph = false) "
       "override;\n"
       "\n"
       "  void ClearTensorWrappers() override { \n"
......
@@ -17,6 +17,8 @@ import re
 import argparse
 import os
+ops_to_fill_zero_for_empty_grads = set(["split"])
 # For API dispatch used at python-level
 # { op_name : [arg_name, ...] }
 core_ops_returns_info = {}
@@ -599,7 +601,8 @@ class {} : public egr::GradNodeBase {{
   ~{}() override = default;
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
   std::string name() override {{ return \" {} \"; }}
   void ClearTensorWrappers() override {{
@@ -657,10 +660,11 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     for _, (ttype, fwd_position,
             grad_api_position) in backward_grad_input_map.items():
         if IsPlainTensorType(ttype):
-            grad_api_args[grad_api_position] = f"grads[{fwd_position}][0]"
+            grad_api_args[
+                grad_api_position] = f"hooked_grads[{fwd_position}][0]"
         else:
             assert IsVectorTensorType(ttype)
-            grad_api_args[grad_api_position] = f"grads[{fwd_position}]"
+            grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]"
     for name, _, _, grad_api_position in backward_attrs_list:
         saved_attribute_name = GetSavedName(name)
@@ -688,23 +692,30 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
     grad_node_name = GetGradNodeName(fwd_api_name)
+    fill_zero_str = ""
+    if fwd_api_name in ops_to_fill_zero_for_empty_grads:
+        fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
     if len(namespace) > 0:
         grad_api_namespace = f"paddle::experimental::{namespace}"
     else:
         grad_api_namespace = f"paddle::experimental"
     FUNCTION_TEMPLATE = """
-std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
+std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
+    {}
+    auto hooked_grads = ApplyGradientHooks(grads);
     // Call grad_api function
-    VLOG(3) << \"Finally State Running: \" << \"{}\";
+    VLOG(3) << \"Final State Running: \" << \"{}\";
     auto grad_api_returns = {}::{}({});
     {}
 }}
 """
     node_definition_str = FUNCTION_TEMPLATE.format(
-        grad_node_name, grad_node_name, grad_api_namespace, bwd_api_name,
-        grad_api_args_str, returns_str)
+        grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
+        bwd_api_name, grad_api_args_str, returns_str)
     return node_definition_str
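Correspondingly, a rough sketch of the C++ text FUNCTION_TEMPLATE.format(...) now yields for a grad node; the node name GradNodeFoo, the backward API foo_grad, and its arguments are hypothetical stand-ins, not actual generated output:

    std::vector<std::vector<paddle::experimental::Tensor>> GradNodeFoo::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {
      // fill_zero_str goes here (empty unless the op is in ops_to_fill_zero_for_empty_grads)
      auto hooked_grads = ApplyGradientHooks(grads);
      // Call grad_api function
      VLOG(3) << "Final State Running: " << "GradNodeFoo";
      auto grad_api_returns = paddle::experimental::foo_grad(/* grad_api_args_str */);
      /* returns_str: pack grad_api_returns into the returned vector-of-vectors */
    }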
@@ -799,8 +810,15 @@ def GenerateNodeCreationCodes(
     # SetAttributes
     set_attributes_list = []
-    for name, _, _, _ in backward_attrs_list:
-        set_attributes = f" grad_node->SetAttribute{name}({name});"
+    forward_attrs_name_set = set()
+    for name, _, _, _ in forward_attrs_list:
+        forward_attrs_name_set.add(name)
+    for name, _, default_val_attr, _ in backward_attrs_list:
+        if name in forward_attrs_name_set:
+            set_attributes = f" grad_node->SetAttribute{name}({name});"
+        else:
+            set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});"
         set_attributes_list.append(set_attributes)
     set_attributes_str = "\n".join(set_attributes_list)
......
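As an illustration of the SetAttributes change above, a hedged sketch of the lines it now emits for a hypothetical op whose forward signature has an attr "axis" while its backward signature also takes a backward-only attr "use_addto" with default value false (the setter names follow the f-string literally):

    grad_node->SetAttributeaxis(axis);        // attr exists in the forward op: forward its runtime value
    grad_node->SetAttributeuse_addto(false);  // backward-only attr: fall back to its default value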
@@ -20,8 +20,8 @@
 namespace egr {
 std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
-operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
-           bool create_graph) {
+operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+           bool create_graph) {  // NOLINT
   paddle::CustomOpKernelContext ctx;
   auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
       egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
......
@@ -37,8 +37,9 @@ class RunCustomOpNode : public GradNodeBase {
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
-      bool create_graph) override;
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      bool create_graph = false)  // NOLINT
+      override;
   std::string name() {
     return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
......
@@ -102,6 +102,7 @@ const std::vector<std::vector<GradSlotMeta>>& GradNodeBase::OutputMeta() const {
 void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
                                  size_t slot_rank) {
+  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
   auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out);
   PADDLE_ENFORCE_LE(
       slot_rank, (bwd_in_meta_.size() - 1),
@@ -117,6 +118,12 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
   auto& meta = metas[0];
   meta.SetStopGradient(fwd_out_meta->StopGradient());
+  if (!fwd_out.is_initialized()) {
+    VLOG(6)
+        << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
+    return;
+  }
   // Record TensorMeta
   if (phi::DenseTensor::classof(fwd_out.impl().get())) {
     // Only Copy Meta
@@ -128,7 +135,9 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
         paddle::platform::errors::Fatal(
            "Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED,"
            "which is illegal."));
     meta.SetTensorMeta(dense_tensor->meta());
+    meta.SetPlace(fwd_out.inner_place());
     if (paddle::framework::IsComplexType(
             paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
@@ -143,6 +152,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
 void GradNodeBase::SetGradInMeta(
     const std::vector<paddle::experimental::Tensor>& fwd_out,
     size_t slot_rank) {
+  VLOG(6) << "Set GradSlotMeta for Grad Inputs";
   size_t slot_size = fwd_out.size();
   PADDLE_ENFORCE_LE(
       slot_rank, (bwd_in_meta_.size() - 1),
@@ -172,6 +182,12 @@ void GradNodeBase::SetGradInMeta(
       meta.SetStopGradient(fwd_out_meta->StopGradient());
     }
+    if (!fwd_out_tensor.is_initialized()) {
+      VLOG(6)
+          << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
+      return;
+    }
     // Record TensorMeta
     if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) {
       // Only Copy Meta
@@ -184,6 +200,8 @@ void GradNodeBase::SetGradInMeta(
             "with phi::DataType::UNDEFINED,"
             "which is illegal."));
       meta.SetTensorMeta(dense_tensor->meta());
+      meta.SetPlace(fwd_out_tensor.inner_place());
       if (paddle::framework::IsComplexType(
               paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
         need_complex_to_real_ = true;
@@ -228,6 +246,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in,
             "with phi::DataType::UNDEFINED,"
            "which is illegal."));
       meta.SetTensorMeta(dense_tensor->meta());
+      meta.SetPlace(fwd_in.inner_place());
     }
   } else {
     VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
@@ -272,6 +291,7 @@ void GradNodeBase::SetGradOutMeta(
             "phi::DataType::UNDEFINED,"
            "which is illegal."));
       meta.SetTensorMeta(dense_tensor->meta());
+      meta.SetPlace(fwd_in_tensor.inner_place());
     }
   } else {
     VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
......
@@ -76,8 +76,12 @@ class GradSlotMeta {
     return *meta_.get();
   }
+  void SetPlace(const phi::Place& place) { place_ = place; }
+  const phi::Place& GetPlace() const { return place_; }
  private:
   bool stop_gradient_{false};
+  phi::Place place_;
   std::shared_ptr<phi::DenseTensorMeta> meta_ = nullptr;
 };
@@ -102,7 +106,7 @@ class GradNodeBase {
    * is better choice to fit this format.
    * **/
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,  // NOLINT
       bool create_graph = false) = 0;
   virtual void ClearTensorWrappers() = 0;
......
@@ -53,7 +53,7 @@ class GradTensorHolder {
     return buffer_[pos];
   }
-  const std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
+  std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
     return buffer_;
   }
......
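A brief hedged sketch of why Buffers() drops the const: the eager backward pass can now hand the holder's mutable buffer straight to a grad node, which may overwrite empty slots in place (for example via FillZeroForEmptyGradInputs) before running the grad kernel. The variable names below are illustrative, not from the source.

    // Sketch only: grad_node is a GradNodeBase*, input_buffer a GradTensorHolder
    // accumulated for that node; both names are hypothetical.
    auto grad_output_tensors =
        (*grad_node)(input_buffer->Buffers(), /*create_graph=*/false);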
@@ -80,13 +80,15 @@ TEST(AccumulationNode, Tensor) {
   grad_meta->SetStopGradient(false);
   // operator()
-  paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0];
+  std::vector<std::vector<paddle::experimental::Tensor>> et0_vec = {{et0}};
+  paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0];
   auto* ret_et0_ptr =
       std::dynamic_pointer_cast<phi::DenseTensor>(ret_et0.impl())
           ->data<paddle::platform::float16>();
   CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f));
-  paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0];
+  std::vector<std::vector<paddle::experimental::Tensor>> et1_vec = {{et1}};
+  paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0];
   auto* ret_et1_ptr =
       std::dynamic_pointer_cast<phi::DenseTensor>(ret_et1.impl())
@@ -121,7 +123,7 @@ TEST(AccumulationNode, Tensor) {
       std::make_shared<egr::CppTensorVoidHook>(reduce_hook_1));
   // operator()
-  paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0];
+  paddle::experimental::Tensor _ret = node->operator()(et0_vec)[0][0];
   // Check operator() result, should be 36.0
   auto* _ret_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(_ret.impl())
......
@@ -32,7 +32,7 @@ class GradTestNode : public egr::GradNodeBase {
   GradTestNode() : GradNodeBase() { val_ = 1.0; }
   std::string name() override { return "GradTestNode"; }
   std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
+      std::vector<std::vector<paddle::experimental::Tensor>>& grads,
       bool create_graph = false) override {
     val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
                ->data<float>()[0];
......
@@ -247,4 +247,20 @@ TEST(EagerUtils, GetGradAccumulationNode) {
   ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0));
 }
+TEST(EagerUtils, FillZeroForEmptyGradInputs) {
+  std::vector<std::vector<paddle::experimental::Tensor>> grads = {
+      std::vector<paddle::experimental::Tensor>(1)};
+  std::vector<std::vector<GradSlotMeta>> slot_metas = {
+      std::vector<GradSlotMeta>(1)};
+  phi::DenseTensorMeta tensor_meta;
+  tensor_meta.dtype = paddle::experimental::DataType::FLOAT32;
+  tensor_meta.dims = {2, 4};
+  slot_metas[0][0].SetTensorMeta(tensor_meta);
+  slot_metas[0][0].SetPlace(phi::CPUPlace());
+  EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas);
+  eager_test::CompareTensorWithValue<float>(grads[0][0], 0.0);
+}
 }  // namespace egr
@@ -370,7 +370,7 @@ class GradNodeRunProgram : public egr::GradNodeBase {
   ~GradNodeRunProgram() override = default;
   // Functor: perform backward computations
   virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
-      const std::vector<std::vector<paddle::experimental::Tensor>> &grads,
+      std::vector<std::vector<paddle::experimental::Tensor>> &grads,  // NOLINT
       bool create_graph) override {
     VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
     PADDLE_ENFORCE_EQ(
......
@@ -20,6 +20,7 @@
 #include "paddle/phi/api/all.h"
 #include "paddle/phi/common/layout.h"
+#include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/tensor_meta.h"
 #include "paddle/fluid/framework/data_layout.h"
@@ -392,4 +393,28 @@ std::shared_ptr<egr::GradNodeBase> EagerUtils::GetGradAccumulationNode(
   }
 }
+void EagerUtils::FillZeroForEmptyGradInputs(
+    std::vector<std::vector<paddle::experimental::Tensor>>* in_grads,
+    const std::vector<std::vector<GradSlotMeta>>& grad_in_metas) {
+  for (size_t i = 0; i < in_grads->size(); i++) {
+    for (size_t j = 0; j < (*in_grads)[i].size(); j++) {
+      paddle::experimental::Tensor& grad = (*in_grads)[i][j];
+      if (!grad.is_initialized()) {
+        const GradSlotMeta& grad_in_meta = grad_in_metas[i][j];
+        PADDLE_ENFORCE(
+            grad_in_meta.HasTensorMeta(),
+            paddle::platform::errors::Fatal(
+                "Unable to fill empty grad inputs due to empty GradSlotMeta"));
+        const auto& tensor_meta = grad_in_meta.GetTensorMeta();
+        phi::Place place = grad_in_meta.GetPlace();
+        auto tensor_with_zero = paddle::experimental::full(
+            phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, place);
+        grad.set_impl(tensor_with_zero.impl());
+      }
+    }
+  }
+}
 }  // namespace egr
@@ -217,6 +217,13 @@ class EagerUtils {
       const std::vector<paddle::experimental::Tensor>& tensors);
   static std::shared_ptr<egr::GradNodeBase> GetGradAccumulationNode(
       const paddle::experimental::Tensor& tensor);
+  /**
+   * Fill Zero
+   * **/
+  static void FillZeroForEmptyGradInputs(
+      std::vector<std::vector<paddle::experimental::Tensor>>* out_grads,
+      const std::vector<std::vector<GradSlotMeta>>& grad_out_metas);
 };
 }  // namespace egr
@@ -176,6 +176,20 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
                      const std::map<std::string, std::string>& inplace_map,
                      paddle::framework::AttributeMap* passed_default_attrs_,
                      bool use_default_attr_map) {
+  TraceOpImpl<VarType>(type, ins, outs, attrs, place, trace_backward,
+                       inplace_map, passed_default_attrs_,
+                       use_default_attr_map);
+}
+
+template <typename VarType>
+void Tracer::TraceOpImpl(const std::string& type,
+                         const NameVarMap<VarType>& ins,
+                         const NameVarMap<VarType>& outs,
+                         framework::AttributeMap& attrs,
+                         const platform::Place& place, bool trace_backward,
+                         const std::map<std::string, std::string>& inplace_map,
+                         paddle::framework::AttributeMap* passed_default_attrs_,
+                         bool use_default_attr_map) {
   platform::RecordEvent op_type_record_event(
       type + " trace_op", platform::TracerEventType::Operator, 1);
   platform::ScopedFlushDenormal flush;
@@ -340,25 +354,33 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
 void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
                      const NameTensorMap& outs,
-                     paddle::framework::AttributeMap attrs,
+                     paddle::framework::AttributeMap& attrs,
                      const paddle::platform::Place& place,
                      paddle::framework::AttributeMap* default_attrs,
                      bool use_default_attr_map,
                      const std::map<std::string, std::string>& inplace_map) {
   VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: "
           << use_default_attr_map;
-  TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs), place, false,
-                              inplace_map, default_attrs, use_default_attr_map);
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, place, false,
+                                  inplace_map, default_attrs,
+                                  use_default_attr_map);
+}
+
+void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
+                     const NameTensorMap& outs,
+                     paddle::framework::AttributeMap attrs) {
+  VLOG(6) << "Running On Eager TraceOp(4 args): ";
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
+                                  false, {}, nullptr, true);
 }
 void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
                      const NameTensorMap& outs,
-                     paddle::framework::AttributeMap attrs,
+                     paddle::framework::AttributeMap& attrs,
                      const std::map<std::string, std::string>& inplace_map) {
   VLOG(6) << "Running On Eager TraceOp(less): ";
-  TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs),
-                              expected_place_, false, inplace_map, nullptr,
-                              true);
+  TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
+                                  false, inplace_map, nullptr, true);
 }
 void Tracer::SetExpectedPlace(platform::Place place) {
......
@@ -74,16 +74,32 @@ class Tracer {
       paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
       bool use_default_attr_map = true);
+  template <typename VarType>
+  void TraceOpImpl(
+      const std::string& type, const NameVarMap<VarType>& ins,
+      const NameVarMap<VarType>& outs,
+      framework::AttributeMap& attrs,  // NOLINT
+      const platform::Place& place, bool trace_backward,
+      const std::map<std::string, std::string>& inplace_map = {},
+      paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
+      bool use_default_attr_map = true);
+
   void TraceOp(const std::string& type, const NameVarBaseMap& ins,
                const NameVarBaseMap& outs, framework::AttributeMap attrs,
                const std::map<std::string, std::string>& inplace_map = {});
   void TraceOp(const std::string& type, const NameTensorMap& ins,
-               const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap& attrs,  // NOLINT
                const std::map<std::string, std::string>& inplace_map = {});
   void TraceOp(const std::string& type, const NameTensorMap& ins,
-               const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap attrs);
+
+  void TraceOp(const std::string& type, const NameTensorMap& ins,
+               const NameTensorMap& outs,
+               paddle::framework::AttributeMap& attrs,  // NOLINT
                const paddle::platform::Place& place,
                paddle::framework::AttributeMap* default_attrs,
                bool use_default_attr_map,
......
@@ -34,6 +34,7 @@
 #include "paddle/fluid/memory/allocation/thread_local_allocator.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
     InitNaiveBestFitCPUAllocator();
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     allow_free_idle_chunk_ = allow_free_idle_chunk;
-    if (!FLAGS_use_stream_safe_cuda_allocator) {
-      for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
-           ++dev_id) {
-        InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
-                                    allow_free_idle_chunk_);
-      }
+    for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
+      InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
+                                  allow_free_idle_chunk_);
+    }
+
+    // Note(Ruibiao): For the GPU multi-stream case, the 'allocators_' map
+    // (place -> Allocator) holds the StreamSafeCUDAAllocator related to the
+    // default stream (i.e., the stream directly got from the DeviceContext),
+    // while the 'cuda_allocators_' map (place -> map(stream -> Allocator))
+    // holds the StreamSafeCUDAAllocators related to non-default streams
+    // (i.e., the streams users pass in). The default-stream Allocator is
+    // built in the structure of AllocatorFacadePrivate, while the
+    // non-default-stream ones are built in a delayed manner in GetAllocator
+    // with 'create_if_not_found = true'. We give the default stream special
+    // treatment for performance reasons: since most Alloc calls are for the
+    // default stream in applications, treating it separately avoids a lot of
+    // overhead from acquiring the default stream and taking the read-write
+    // lock.
+    if (FLAGS_use_stream_safe_cuda_allocator) {
+      WrapStreamSafeCUDAAllocatorForDefault();
     }
     InitNaiveBestFitCUDAPinnedAllocator();
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
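A usage-level sketch of the scheme described in the Note above, built only from the AllocatorFacade API declared elsewhere in this diff plus its existing Instance()/Alloc entry points; the stream value is a placeholder:

    // Sketch only: default-stream requests are served by 'allocators_', while an
    // explicitly passed stream goes through 'cuda_allocators_' (created lazily).
    platform::CUDAPlace place(0);
    auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();

    // Served by the default-stream StreamSafeCUDAAllocator held in 'allocators_'.
    auto default_stream_alloc = facade.Alloc(place, /*size=*/1024);

    // Served by a lazily created StreamSafeCUDAAllocator in 'cuda_allocators_'
    // (GetAllocator internally uses create_if_not_found = true).
    gpuStream_t user_stream = nullptr;  // placeholder for a caller-created stream
    auto stream_alloc = facade.GetAllocator(place, user_stream)->Allocate(1024);

    // The default stream can later be switched from outside, e.g. by a DeviceContext.
    facade.SetDefaultStream(place, user_stream);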
@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
     CheckAllocThreadSafe();
 #ifdef PADDLE_WITH_CUDA
-    if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
+    if (FLAGS_use_stream_safe_cuda_allocator == false &&
+        UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
       WrapCUDAGraphAllocator();
     }
 #endif
@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
   const std::shared_ptr<Allocator>& GetAllocator(
       const platform::CUDAPlace& place, const gpuStream_t& stream,
       bool create_if_not_found = false) {
-    {  // shared_lock_guard
+    if (stream == GetDefaultStream(place)) {
+      VLOG(7) << "Get Allocator by passing in a default stream";
+      return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
+    }
+
+    /* shared_lock_guard */ {
       std::shared_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       if (LIKELY(HasCUDAAllocator(place, stream))) {
@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
       }
     }
-    {  // unique_lock_guard
+    /* unique_lock_guard */ {
       std::unique_lock<std::shared_timed_mutex> lock_guard(
           cuda_allocator_mutex_);
       InitStreamSafeCUDAAllocator(place, stream);
@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
     }
   }
-  gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
+  const std::shared_ptr<StreamSafeCUDAAllocator>
+  GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
+    const auto iter = default_stream_safe_cuda_allocators_.find(place);
+    PADDLE_ENFORCE_NE(
+        iter, default_stream_safe_cuda_allocators_.end(),
+        platform::errors::NotFound(
+            "No StreamSafeCUDAAllocator found for the place, %s", place));
+    return iter->second;
+  }
+
+  const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    return allocator->GetDefaultStream();
+  }
+
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream) {
+    const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
+        GetDefaultStreamSafeCUDAAllocator(place);
+    allocator->SetDefaultStream(stream);
+    VLOG(8) << "Set default stream to " << stream
+            << " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
+            << place;
+  }
+
+  void SetDefaultStreamFromDeviceContext() {
+    VLOG(8) << "Set default stream from DeviceContext";
+    for (auto& pair : default_stream_safe_cuda_allocators_) {
+      platform::DeviceContextPool& pool =
+          platform::DeviceContextPool::Instance();
+      pair.second->SetDefaultStream(
+          static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
+    }
   }

   void RecordStream(std::shared_ptr<phi::Allocation> allocation,
@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
         /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
   }
+  void WrapStreamSafeCUDAAllocatorForDefault() {
+    for (auto& pair : allocators_) {
+      auto& place = pair.first;
+      if (platform::is_gpu_place(place)) {
+        std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
+            std::make_shared<StreamSafeCUDAAllocator>(
+                pair.second, place, /* default_stream = */ nullptr,
+                /* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
+        pair.second = allocator;
+
+        // NOTE(Ruibiao): A tricky implementation to give
+        // StreamSafeCUDAAllocator the ability to interact with the outside
+        // world, i.e., to have its default stream changed from outside.
+        default_stream_safe_cuda_allocators_[place] = allocator;
+        VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
+                << ", allocator address = " << pair.second.get();
+      }
+    }
+  }
+
   void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
                               size_t retry_time) {
     PADDLE_ENFORCE_GT(
@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
 #endif
   }
-  // NOTE(Ruibiao): Old single-stream version, will be removed later
   void WrapCUDARetryAllocator(size_t retry_time) {
     PADDLE_ENFORCE_GT(
         retry_time, 0,
@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // a standalone CUDA allocator to support multi-stream GC in new executor
+  std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
+      default_stream_safe_cuda_allocators_;
   CUDAAllocatorMap cuda_allocators_;
   std::shared_timed_mutex cuda_allocator_mutex_;
 #endif
@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
 const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
     const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    AllocatorFacadePrivate* m = GetPrivate();
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()->GetAllocator(
       place, /* A non-zero num to choose allocator_ */ 1);
 }
@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
   return GetPrivate()->GetBasePtr(allocation);
 }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
-    const platform::Place& place, const gpuStream_t& stream) {
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    return GetPrivate()->GetAllocator(place, stream,
-                                      /*create_if_not_found=*/true);
-  }
-  return GetPrivate()->GetAllocator(
-      place, /* A non-zero num to choose allocator_ */ 1);
-}
-#endif
 const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
     const platform::Place& place) {
   return GetPrivate()->GetAllocator(place, /* zero size */ 0);
@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
 AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
                                      size_t size) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      size > 0 && FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
-        GetPrivate()->GetDefaultStream(cuda_place)));
-    return Alloc(cuda_place, size, default_stream);
-  }
-#endif
   return GetPrivate()->GetAllocator(place, size)->Allocate(size);
 }
 uint64_t AllocatorFacade::Release(const platform::Place& place) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
-      FLAGS_use_system_allocator == false) {
-    platform::CUDAPlace cuda_place(place.GetDeviceId());
-    return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
-  }
-#endif
   return GetPrivate()
       ->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
       ->Release(place);
@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
   GetPrivate()->RecordStream(allocation, stream);
 }
+const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
+    const platform::Place& place, const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
+      FLAGS_use_system_allocator == false) {
+    return GetPrivate()->GetAllocator(place, stream,
+                                      /*create_if_not_found=*/true);
+  }
+  return GetPrivate()->GetAllocator(
+      place, /* A non-zero num to choose allocator_ */ 1);
+}
+
 const gpuStream_t& AllocatorFacade::GetStream(
     const std::shared_ptr<phi::Allocation>& allocation) const {
   PADDLE_ENFORCE_EQ(
@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
   return GetPrivate()->GetStream(allocation);
 }
+void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
+                                       const gpuStream_t& stream) {
+  if (FLAGS_use_stream_safe_cuda_allocator) {
+    GetPrivate()->SetDefaultStream(place, stream);
+  }
+}
+
 #ifdef PADDLE_WITH_CUDA
 void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
   PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
           "The memory pool of the CUDA Graph with ID %d have been prepared.",
           id));
   allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
+  allocator->SetDefaultStreamFromDeviceContext();
   VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
 }
......
@@ -55,11 +55,6 @@ class AllocatorFacade {
   void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
-                                                 const gpuStream_t& stream);
-#endif
   const std::shared_ptr<Allocator>& GetZeroAllocator(
       const platform::Place& place);
@@ -86,8 +81,12 @@ class AllocatorFacade {
   uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
   void RecordStream(std::shared_ptr<Allocation> allocation,
                     const gpuStream_t& stream);
+  const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
+                                                 const gpuStream_t& stream);
   const gpuStream_t& GetStream(
       const std::shared_ptr<Allocation>& allocation) const;
+  void SetDefaultStream(const platform::CUDAPlace& place,
+                        const gpuStream_t& stream);
 #endif
 #ifdef PADDLE_WITH_CUDA
......
@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
 bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
+const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
+  return default_stream_;
+}
+
+void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
+  default_stream_ = stream;
+}
+
 phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
   platform::RecordEvent("StreamSafeCUDAAllocator::Free",
                         platform::TracerEventType::UserDefined, 9 /*level*/);
   StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
-      dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
-  PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
-                          platform::errors::InvalidArgument(
-                              "Failed to dynamic cast %p from Allocation* to "
-                              "StreamSafeCUDAAllocation*",
-                              allocation));
+      static_cast<StreamSafeCUDAAllocation*>(allocation);
   VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
   if (stream_safe_cuda_allocation->CanBeFreed()) {
     VLOG(9) << "Directly delete allocation";
@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
 }
 void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
+  // NOTE(Ruibiao): This condition is to reduce lock contention. It does not
+  // need to be thread-safe since occasional misjudgments here are permissible.
+  if (unfreed_allocations_.empty()) {
+    return;
+  }
+
   std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
   for (auto it = unfreed_allocations_.begin();
        it != unfreed_allocations_.end();) {
......
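The early-return above relies on a common pattern: do a cheap, unsynchronized emptiness check before taking the lock, accepting that a stale read can at worst postpone cleanup until the next call. A self-contained sketch of the same pattern (not Paddle code, names are illustrative):

    #include <list>
    #include <mutex>

    void ProcessIfNeeded(std::list<int>* unfreed, std::mutex* mu) {
      if (unfreed->empty()) {  // racy pre-check; a misjudgment only delays work
        return;
      }
      std::lock_guard<std::mutex> guard(*mu);
      for (auto it = unfreed->begin(); it != unfreed->end();) {
        it = unfreed->erase(it);  // the real processing happens under the lock
      }
    }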
@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
                           platform::CUDAPlace place, gpuStream_t default_stream,
                           bool in_cuda_graph_capturing = false);
   ~StreamSafeCUDAAllocator();
   bool IsAllocThreadSafe() const override;
+  const gpuStream_t &GetDefaultStream() const;
+  void SetDefaultStream(const gpuStream_t &stream);
  protected:
   phi::Allocation *AllocateImpl(size_t size) override;
......
@@ -24,7 +24,9 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
+    // The cinn-graph may have no input now that CINN supports fill_constant:
+    // all of its inputs may be generated by fill_constant instead of by fetch.
+    // OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
     OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
                    "CinnInstructionRun");
     const CinnCompiledObject& compiled_object =
@@ -43,6 +45,53 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
     });
     ctx->SetOutputsDim(kOutputs, output_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // Why do we need to override GetExpectedKernelType?
+    // A cinn-graph may have no input var; the base implementation would check
+    // whether the input tensors are initialized. Here we rewrite the function
+    // so that we can infer the kernel type from the output data type instead.
+    if (ctx.InputSize(kX)) {
+      // If the instruction has inputs, infer the kernel type from the input
+      // data type:
+      return OperatorWithKernel::GetExpectedKernelType(ctx);
+    }
+
+    // Else infer the kernel type from the output data type:
+    // `OutputVar` will check whether kOutputs holds exactly one output var.
+    const framework::Variable* var = ctx.OutputVar(kOutputs);
+    PADDLE_ENFORCE_NE(
+        var, nullptr,
+        platform::errors::InvalidArgument(
+            "The cinn_instruction_run Op's Output Variable should not be "
+            "empty."));
+
+    const framework::Tensor* tensor = nullptr;
+    if (var->IsType<framework::Tensor>()) {
+      tensor = &var->Get<framework::Tensor>();
+    } else if (var->IsType<framework::LoDTensor>()) {
+      tensor = &var->Get<framework::LoDTensor>();
+    } else if (var->IsType<phi::SelectedRows>()) {
+      tensor = &(var->Get<phi::SelectedRows>().value());
+    } else if (var->IsType<framework::LoDTensorArray>()) {
+      auto t_arr = &var->Get<framework::LoDTensorArray>();
+      PADDLE_ENFORCE_EQ(t_arr->size(), 1UL,
+                        platform::errors::InvalidArgument(
+                            "The cinn_instruction_run Op should have just one "
+                            "Output when its Input is empty."));
+      tensor = &(t_arr->front());
+    }
+
+    PADDLE_ENFORCE_NE(
+        tensor, nullptr,
+        platform::errors::InvalidArgument(
+            "The cinn_instruction_run Op's Output Tensor should not be "
+            "empty."));
+
+    VLOG(4) << "The tensor [" << ctx.OutputName(kOutputs) << "]'s dtype is "
+            << paddle::framework::DataType2String(tensor->dtype());
+    auto output_type = paddle::framework::TransToProtoVarType(tensor->dtype());
+    return framework::OpKernelType(output_type, ctx.device_context());
+  }
 };
 class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker {
......
@@ -87,9 +87,12 @@ class CinnLaunchOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
-                   "Input", string::format_string("%s|%s", kX, kNoNeedBufferX),
-                   "CinnLaunchOp");
+    // The cinn-graph may have no input now that CINN supports fill_constant:
+    // all of its inputs may be generated by fill_constant instead of by fetch.
+    // OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
+    //                "Input", string::format_string("%s|%s", kX,
+    //                kNoNeedBufferX),
+    //                "CinnLaunchOp");
     OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
                    "CinnLaunchOp");
   }
......
...@@ -35,143 +35,99 @@ limitations under the License. */ ...@@ -35,143 +35,99 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/functors.h" #include "paddle/phi/kernels/funcs/functors.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
template <typename T1, typename T2 = T1, typename OutT = T1>
struct DstMaskGenerator {
const float dropout_prob_;
const bool is_upscale_in_train_;
using MT = typename details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline DstMaskGenerator(const float dropout_prob,
const bool is_upscale_in_train)
: dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) {
factor = static_cast<MT>(1.0f / (1.0f - dropout_prob_));
}
template <typename T, typename MaskType> HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val,
__global__ void RandomGenerator(const size_t n, uint64_t seed, const T2* rand, int num) const {
const float dropout_prob, const T* src, static constexpr int kCount =
MaskType* mask, T* dst, phi::funcs::uniform_distribution<T2>::kReturnsCount;
bool is_upscale_in_train, uint64_t increment) { // 0 ~ kCount -1 is dist , kCount ~ 2 * kCount - 1 is mask
using MT = typename details::MPTypeTrait<T>::Type; #pragma unroll
int idx = blockDim.x * blockIdx.x + threadIdx.x; for (int i = 0; i < kCount; i++) {
#ifdef PADDLE_WITH_HIP if (rand[i] < dropout_prob_) {
hiprandStatePhilox4_32_10_t state; dst[i] = static_cast<T1>(0);
hiprand_init(seed, idx, increment, &state); dst[i + kCount] = dst[i];
#else } else {
curandStatePhilox4_32_10_t state; dst[i] = is_upscale_in_train_
curand_init(seed, idx, increment, &state); ? static_cast<T1>(static_cast<MT>(src_val[i]) * factor)
#endif : static_cast<T1>(src_val[i]);
dst[i + kCount] = static_cast<T1>(1);
MaskType mask_val; }
T dst_val;
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (; idx < n; idx += blockDim.x * gridDim.x) {
T src_val = src[idx];
#ifdef PADDLE_WITH_HIP
if (hiprand_uniform(&state) < dropout_prob) {
#else
if (curand_uniform(&state) < dropout_prob) {
#endif
mask_val = 0;
dst_val = 0;
} else {
mask_val = 1;
dst_val = is_upscale_in_train
? static_cast<T>(static_cast<MT>(src_val) * factor)
: src_val;
} }
mask[idx] = mask_val;
dst[idx] = dst_val;
} }
} };
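The new functor packs output and mask into one scratch array: slots [0, kCount) receive the (optionally upscaled) values, slots [kCount, 2*kCount) the 0/1 mask. A minimal host-side sketch of that convention, with made-up values standing in for the uniform random numbers:

#include <cstdio>

int main() {
  constexpr int kCount = 4;
  const float dropout_prob = 0.5f;
  const float factor = 1.0f / (1.0f - dropout_prob);    // upscale_in_train path
  const float src[kCount] = {1.f, 2.f, 3.f, 4.f};
  const float rand_vals[kCount] = {0.1f, 0.9f, 0.3f, 0.7f};  // pretend uniforms
  float dst[kCount * 2];  // [0, kCount): dst, [kCount, 2*kCount): mask
  for (int i = 0; i < kCount; ++i) {
    if (rand_vals[i] < dropout_prob) {
      dst[i] = 0.f;
      dst[i + kCount] = 0.f;   // dropped: value and mask are both zero
    } else {
      dst[i] = src[i] * factor;
      dst[i + kCount] = 1.f;   // kept: mask is one
    }
  }
  for (int i = 0; i < kCount * 2; ++i) std::printf("%f\n", dst[i]);
  return 0;
}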
template <typename T, typename MaskType, int VecSize> template <typename T, typename MaskType>
__global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed, __global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
const float dropout_prob, const float dropout_prob,
const T* src, MaskType* mask, T* dst, const T* src, MaskType* mask, T* dst,
bool is_upscale_in_train, bool is_upscale_in_train,
uint64_t increment) { uint64_t increment,
using MT = typename details::MPTypeTrait<T>::Type; size_t main_offset) {
using LoadT = phi::AlignedVector<T, VecSize>; size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>; static constexpr int kCount =
phi::funcs::uniform_distribution<float>::kReturnsCount;
size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
hiprandStatePhilox4_32_10_t state; hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx, increment, &state); hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#else #else
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
curandStatePhilox4_32_10_t state; curandStatePhilox4_32_10_t state;
curand_init(seed, idx, increment, &state); curand_init(seed, idx + THREAD_ID_X, increment, &state);
#endif using SType = curandStatePhilox4_32_10_t;
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
LoadT src_val;
phi::Load<T, VecSize>(&src[i], &src_val);
#ifdef PADDLE_WITH_HIP
float4 rand = hiprand_uniform4(&state);
#else
float4 rand = curand_uniform4(&state);
#endif #endif
T dst_mask[kCount * 2]; // 0 ~ kCount-1: dst; kCount ~ 2*kCount-1: mask
LoadT dst_val; float rands[kCount];
MaskLoadT mask_val; MaskType mask_result[kCount];
using Rand = phi::funcs::uniform_distribution<float>;
#pragma unroll using Cast = kps::IdentityFunctor<T>;
for (int j = 0; j < VecSize; j++) { int deal_size = BLOCK_NUM_X * kCount;
if ((&rand.x)[j] < dropout_prob) { auto dst_functor =
dst_val[j] = 0; DstMaskGenerator<T, float>(dropout_prob, is_upscale_in_train);
mask_val[j] = 0; size_t fix = idx * kCount;
} else { for (; fix < main_offset; fix += stride) {
dst_val[j] = is_upscale_in_train kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
? static_cast<T>(static_cast<MT>(src_val[j]) * factor) kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
: src_val[j]; &state);
mask_val[j] = 1; // dst
} kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
} &dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
phi::Store<T, VecSize>(dst_val, &dst[i]); // mask
phi::Store<MaskType, VecSize>(mask_val, &mask[i]); kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, 1, false>(mask + fix, &mask_result[0],
deal_size);
} }
} int remainder = n - fix;
if (remainder > 0) {
template <typename T, typename MaskType> kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
struct CudaDropoutGradFunctor { kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
using MT = typename details::MPTypeTrait<T>::Type; &state);
// dst
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {} kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
__device__ __forceinline__ T operator()(const T dout, kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
const MaskType mask) const { // mask
return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) * kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
factor_); &mask_result[0], &dst_mask[kCount], Cast());
} kps::WriteData<MaskType, kCount, 1, 1, true>(mask + fix, &mask_result[0],
remainder);
private:
MT factor_;
};
template <typename T, typename MaskType, int VecSize>
__global__ void DropoutGradCUDAKernel(
const T* dout, const MaskType* mask,
const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
T* dx) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_val;
phi::Load<T, VecSize>(&dout[i], &dout_val);
MaskLoadT mask_val;
phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
LoadT dx_val;
#pragma unroll
for (int j = 0; j < VecSize; j++) {
dx_val[j] = static_cast<T>(static_cast<MT>(dout_val[j]) *
static_cast<MT>(mask_val[j]) * factor);
}
phi::Store<T, VecSize>(dx_val, &dx[i]);
} }
} }
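In the rewritten kernel, main_offset is the largest multiple of the per-block tile (BLOCK_NUM_X * kCount elements) not exceeding n, so the first loop reads and writes full tiles without bounds checks and only the tail is range-checked. A simplified host-side sketch (single tile size, grid stride ignored; names are illustrative):

#include <cstddef>
#include <cstdio>

int main() {
  const size_t n = 1000003;       // total number of elements
  const size_t block_size = 256;  // threads per block
  const size_t k_count = 4;       // elements handled per thread per step
  const size_t tile = block_size * k_count;
  // Largest multiple of the tile not exceeding n: full tiles use unguarded
  // vector I/O, the remaining tail is processed with range checks.
  const size_t main_offset = n / tile * tile;
  std::printf("main_offset=%zu tail=%zu\n", main_offset, n - main_offset);
  return 0;
}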
...@@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, ...@@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
uint64_t seed_data; uint64_t seed_data;
uint64_t increment; uint64_t increment;
// VectorizedRandomGenerator uses curand_uniform4, so we only support // VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size is 4; // kVecSize is 4;
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1; constexpr int kVecSize =
phi::funcs::uniform_distribution<float>::kReturnsCount;
auto gpu_config = auto gpu_config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
auto offset = auto offset =
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size; ((x_numel - 1) / (gpu_config.GetThreadNum() * kVecSize) + 1) * kVecSize;
GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
&seed_data, &increment); &seed_data, &increment);
size_t main_offset = size / (gpu_config.GetBlockSize() * kVecSize) *
#ifdef __HIPCC__ (gpu_config.GetBlockSize() * kVecSize);
if (vec_size == 4 && size % 4 == 0) { VectorizedRandomGenerator<T, uint8_t><<<
hipLaunchKernelGGL( gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream>>>(
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>), size, seed_data, dropout_prob, x_data, mask_data, y_data,
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size, upscale_in_train, increment, main_offset);
seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
increment);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
}
#else
if (vec_size == 4 && size % 4 == 0) {
VectorizedRandomGenerator<T, uint8_t, 4><<<
gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
} else {
RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
}
#endif
} else { } else {
if (upscale_in_train) { if (upscale_in_train) {
// todo: can y share data with x directly? // todo: can y share data with x directly?
...@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, ...@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
} }
} }
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
__device__ __forceinline__ T operator()(const T dout,
const MaskType mask) const {
return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
factor_);
}
private:
MT factor_;
};
template <typename T> template <typename T>
void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx, void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
const std::string dropout_implementation, const std::string dropout_implementation,
......
...@@ -58,19 +58,15 @@ __global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale, ...@@ -58,19 +58,15 @@ __global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
} }
template <typename T> template <typename T>
__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale, __global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
T max_range, const int num, const T max_range,
const int cin, const int cout, const int64_t num,
T* out) { const int n_scales,
int bid = blockIdx.x; const int quant_stride, T* out) {
T s = scale[bid % cout]; int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
int wh_size = num / (cin * cout); T s = scale[(i / quant_stride) % n_scales];
const T* in_current = in + bid * wh_size; out[i] = in[i] * s / max_range;
T* out_current = out + bid * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
out_current[i] = in_current[i] * s / max_range;
} }
} }
...@@ -98,20 +94,32 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> { ...@@ -98,20 +94,32 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
const T* in_data = in->data<T>(); const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace()); T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) { if (scale_num == 1) {
int num = in->numel(); int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>(); const T* scale_factor = scales[0]->data<T>();
if (quant_axis == 0) { if (quant_axis == 0) {
int grid = in_dims[0]; int grid = in_dims[0];
int block = 1024; int block = 1024;
DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>( DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[0], out_data); in_data, scale_factor, max_range, num, in_dims[0], out_data);
} else if (quant_axis == 1) { } else {
// Dequantize weight of Cin * Cout * W * H int quant_stride = 1;
int grid = in_dims[0] * in_dims[1]; for (int i = quant_axis + 1; i < in_dims.size(); i++) {
int block = 1024; quant_stride *= in_dims[i];
DequantizeOneScaleQuantAxis1<T><<<grid, block, 0, dev_ctx.stream()>>>( }
in_data, scale_factor, max_range, num, in_dims[0], in_dims[1],
out_data); int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(
((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
} }
} else if (scale_num == 2) { } else if (scale_num == 2) {
// Not need to consider quant_axis // Not need to consider quant_axis
......
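The generalized kernel picks the per-channel scale as scale[(i / quant_stride) % n_scales], where quant_stride is the product of the dimensions after quant_axis. A host-side sketch with a made-up 4-D shape shows the mapping from flat index to scale slot:

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int64_t> dims = {2, 3, 4, 5};  // e.g. a 4-D weight
  const int quant_axis = 1;                        // scales laid out along dims[1]
  int64_t quant_stride = 1;
  for (size_t i = quant_axis + 1; i < dims.size(); ++i) quant_stride *= dims[i];
  const int64_t n_scales = dims[quant_axis];       // here: 3 scales, stride 20
  for (long long i : {0LL, 19LL, 20LL, 59LL, 60LL, 119LL}) {
    std::printf("element %lld -> scale[%lld]\n", i,
                (long long)((i / quant_stride) % n_scales));
  }
  return 0;
}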
...@@ -273,18 +273,18 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> { ...@@ -273,18 +273,18 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
template <typename T> template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt, const int bin_cnt,
const int n, const int c, const int64_t n,
T* out) { const int c, T* out) {
int tid = threadIdx.x; int tid = threadIdx.x;
int channel_size = n / c; int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size; const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size; T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x]; T s = scale[blockIdx.x];
T inv_s = inverse(s); T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) { for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i]; T x = in_c[i];
T v = x > s ? s : x; T v = x > s ? s : x;
v = v < -s ? -s : v; v = v < -s ? -s : v;
...@@ -293,25 +293,20 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale, ...@@ -293,25 +293,20 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
} }
} }
// ChannelClipAndQuantKernel for quant_axis is 1 // ChannelClipAndQuantKernel for quant_axis is N
template <typename T> template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale, __global__ void ChannelClipAndQuantKernelQuantAxisN(
const int bin_cnt, const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int n, const int cin, const int nScale, const int quant_stride, T* out) {
const int cout, T* out) { int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
T s = scale[blockIdx.x % cout]; for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T inv_s = inverse(s); T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
int wh_size = n / (cin * cout); T x = in[i];
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x; T v = x > s ? s : x;
v = v < -s ? -s : v; v = v < -s ? -s : v;
v = bin_cnt * inv_s * v; v = bin_cnt * inv_s * v;
out_c[i] = round(v); out[i] = round(v);
} }
} }
...@@ -327,7 +322,7 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> { ...@@ -327,7 +322,7 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
"the received is %d", "the received is %d",
quant_axis)); quant_axis));
int num = in.numel(); int64_t num = in.numel();
auto in_dims = in.dims(); auto in_dims = in.dims();
const T* in_data = in.data<T>(); const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>(); const T* scale_data = scale.data<T>();
...@@ -338,11 +333,24 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> { ...@@ -338,11 +333,24 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
int block = 1024; int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>( ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data); in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else if (quant_axis == 1) { } else {
int grid = in_dims[0] * in_dims[1]; int quant_stride = 1;
int block = 1024; for (int i = quant_axis + 1; i < in_dims.size(); i++) {
ChannelClipAndQuantKernelQuantAxis1<T><<<grid, block, 0, ctx.stream()>>>( quant_stride *= in_dims[i];
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data); }
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
} }
} }
}; };
......
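The launch sizing used above (block = min(num, maxThreadsPerBlock / 4), grid capped by the device's physical thread capacity) can be exercised in isolation; the device limits below are stand-ins for GetMaxThreadsPerBlock() and GetMaxPhysicalThreadCount():

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t num = int64_t{1} << 20;            // elements to process
  const int64_t max_threads_per_block = 1024;      // stand-in device limit
  const int64_t max_physical_threads = 80 * 2048;  // stand-in: SMs * threads per SM
  const int64_t block_size = std::min(num, max_threads_per_block / 4);
  const int64_t max_blocks =
      std::max((max_physical_threads - 1) / block_size + 1, int64_t{1});
  const int64_t grid_size =
      std::min(max_blocks, (num + block_size - 1) / block_size);
  std::printf("block=%lld grid=%lld\n", (long long)block_size, (long long)grid_size);
  return 0;
}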
...@@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel { ...@@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel {
end_axis = x_rank - 2; end_axis = x_rank - 2;
} }
PADDLE_ENFORCE_LE(frame_length, seq_length, bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
platform::errors::InvalidArgument( bool check = ctx->IsRuntime() || !contain_unknown_dim;
"Attribute(frame_length) of FrameOp should be less " if (check) {
"equal than sequence length, but got (%s) > (%s).", PADDLE_ENFORCE_LE(frame_length, seq_length,
frame_length, seq_length)); platform::errors::InvalidArgument(
"Attribute(frame_length) of FrameOp should be less "
"equal than sequence length, but got (%s) > (%s).",
frame_length, seq_length));
}
// It won't go into the for loop when x_rank == 1U. // It won't go into the for loop when x_rank == 1U.
for (int i = start_axis; i <= end_axis; i++) { for (int i = start_axis; i <= end_axis; i++) {
output_shape.push_back(x_dims[i]); output_shape.push_back(x_dims[i]);
} }
n_frames = 1 + (seq_length - frame_length) / hop_length; if (seq_length == -1) {
n_frames = -1;
} else {
n_frames = 1 + (seq_length - frame_length) / hop_length;
}
if (axis == 0) { if (axis == 0) {
// (n_frames, frame_length, ...) // (n_frames, frame_length, ...)
......
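The reshaped n_frames logic reduces to a small rule: propagate -1 when the sequence length is unknown at compile time, otherwise 1 + (seq_length - frame_length) / hop_length. A numeric sketch (infer_n_frames is an illustrative helper, not an existing API):

#include <cstdio>

int infer_n_frames(int seq_length, int frame_length, int hop_length) {
  if (seq_length == -1) return -1;  // dim not known until runtime
  return 1 + (seq_length - frame_length) / hop_length;
}

int main() {
  std::printf("%d\n", infer_n_frames(16000, 400, 160));  // 98 frames
  std::printf("%d\n", infer_n_frames(-1, 400, 160));     // unknown -> -1
  return 0;
}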
...@@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL( ...@@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL(
mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>, mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanKernel<paddle::platform::CPUDeviceContext, double>, ops::MeanKernel<paddle::platform::CPUDeviceContext, double>,
ops::MeanKernel<paddle::platform::CPUDeviceContext, ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>); paddle::platform::bfloat16>,
ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>, mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext, ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>); paddle::platform::bfloat16>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
...@@ -102,10 +102,17 @@ namespace plat = paddle::platform; ...@@ -102,10 +102,17 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>, mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>); ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
mean_grad, mean_grad,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>, ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>, ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>); paddle::platform::complex<float>>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel { ...@@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel {
std::vector<int64_t> output_shape; std::vector<int64_t> output_shape;
int n_frames; int n_frames;
int frame_length; int frame_length;
int seq_length;
int start_axis; int start_axis;
int end_axis; int end_axis;
...@@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel { ...@@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel {
end_axis = x_rank - 3; end_axis = x_rank - 3;
} }
PADDLE_ENFORCE_LE( bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
hop_length, frame_length, bool check = ctx->IsRuntime() || !contain_unknown_dim;
platform::errors::InvalidArgument( if (check) {
"Attribute(hop_length) of OverlapAddOp should be less or equal " PADDLE_ENFORCE_LE(
"than frame_length, but got hop_length(%s) > frame_length(%s).", hop_length, frame_length,
hop_length, frame_length)); platform::errors::InvalidArgument(
"Attribute(hop_length) of OverlapAddOp should be less or equal "
"than frame_length, but got hop_length(%s) > frame_length(%s).",
hop_length, frame_length));
}
const int seq_length = (n_frames - 1) * hop_length + frame_length; if (n_frames == -1) {
seq_length = -1;
} else {
seq_length = (n_frames - 1) * hop_length + frame_length;
}
// It won't go into the for loop when x_rank == 2U. // It won't go into the for loop when x_rank == 2U.
for (int i = start_axis; i <= end_axis; i++) { for (int i = start_axis; i <= end_axis; i++) {
......
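OverlapAddOp applies the inverse rule: seq_length = (n_frames - 1) * hop_length + frame_length, again propagating -1 for unknown dims. A matching sketch (infer_seq_length is illustrative only):

#include <cstdio>

int infer_seq_length(int n_frames, int frame_length, int hop_length) {
  if (n_frames == -1) return -1;  // unknown until runtime
  return (n_frames - 1) * hop_length + frame_length;
}

int main() {
  std::printf("%d\n", infer_seq_length(98, 400, 160));  // 15920 samples
  std::printf("%d\n", infer_seq_length(-1, 400, 160));  // unknown -> -1
  return 0;
}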
...@@ -16,451 +16,469 @@ ...@@ -16,451 +16,469 @@
#include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/spectral_op.h"
#ifdef PADDLE_WITH_HIP #if defined(PADDLE_WITH_ONEMKL)
#include "paddle/fluid/platform/dynload/hipfft.h" #include "paddle/phi/backends/dynload/mklrt.h"
#endif #elif defined(PADDLE_WITH_POCKETFFT)
#include "extern_pocketfft/pocketfft_hdronly.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#endif #endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct FFTConfigKey {
// between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t signal_ndim_;
// These include additional batch dimension as well.
int64_t sizes_[kMaxDataNdim];
int64_t input_shape_[kMaxDataNdim];
int64_t output_shape_[kMaxDataNdim];
FFTTransformType fft_type_;
ScalarType value_type_;
FFTConfigKey() = default;
FFTConfigKey(const std::vector<int64_t>& in_shape,
const std::vector<int64_t>& out_shape,
const std::vector<int64_t>& signal_size,
FFTTransformType fft_type, ScalarType value_type) {
// Padding bits must be zeroed for hashing
memset(this, 0, sizeof(*this));
signal_ndim_ = signal_size.size() - 1;
fft_type_ = fft_type;
value_type_ = value_type;
std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
}
};
#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
::cufftHandle handle_;
public: using Tensor = framework::Tensor;
CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
}
CuFFTHandle(const CuFFTHandle& other) = delete;
CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
CuFFTHandle(CuFFTHandle&& other) = delete; // FFT Functors
CuFFTHandle& operator=(CuFFTHandle&& other) = delete; #if defined(PADDLE_WITH_ONEMKL)
::cufftHandle& get() { return handle_; } #define MKL_DFTI_CHECK(expr) \
const ::cufftHandle& get() const { return handle_; } do { \
MKL_LONG status = (expr); \
if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \
PADDLE_THROW( \
platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
} while (0);
~CuFFTHandle() { struct DftiDescriptorDeleter {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_)); void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
if (handle != nullptr) {
MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
}
} }
}; };
using plan_size_type = long long int; // NOLINT // A RAII wrapper for MKL_DESCRIPTOR*
// This class contains all the information needed to execute a cuFFT plan: class DftiDescriptor {
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public: public:
// Only move semantics is enought for this class. Although we already use void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
// unique_ptr for the plan, still remove copy constructor and assignment op so MKL_LONG signal_ndim, MKL_LONG* sizes) {
// we don't accidentally copy and take perf hit. PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
explicit FFTConfig(const FFTConfigKey& plan_key) platform::errors::AlreadyExists(
: FFTConfig( "DftiDescriptor has already been initialized."));
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1), DFTI_DESCRIPTOR* raw_desc;
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
&raw_desc, precision, signal_type, signal_ndim, sizes));
// sizes are full signal, including batch size and always two-sided desc_.reset(raw_desc);
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
cudaDataType itype, otype, exec_type;
const auto complex_input = has_complex_input(fft_type);
const auto complex_output = has_complex_output(fft_type);
if (dtype == framework::proto::VarType::FP32) {
itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
exec_type = CUDA_C_32F;
} else if (dtype == framework::proto::VarType::FP64) {
itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
exec_type = CUDA_C_64F;
} else if (dtype == framework::proto::VarType::FP16) {
itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
exec_type = CUDA_C_16F;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"cuFFT only support transforms of type float16, float32 and "
"float64"));
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
batch, &ws_size_t, exec_type));
ws_size = ws_size_t;
} }
FFTConfig(const FFTConfig& other) = delete; DFTI_DESCRIPTOR* get() const {
FFTConfig& operator=(const FFTConfig& other) = delete; DFTI_DESCRIPTOR* raw_desc = desc_.get();
PADDLE_ENFORCE_NOT_NULL(raw_desc,
FFTConfig(FFTConfig&& other) = delete; platform::errors::PreconditionNotMet(
FFTConfig& operator=(FFTConfig&& other) = delete; "DFTI DESCRIPTOR has not been initialized."));
return raw_desc;
const cufftHandle& plan() const { return plan_ptr.get(); } }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private: private:
CuFFTHandle plan_ptr; std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
}; };
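DftiDescriptor manages the raw DFTI_DESCRIPTOR through a unique_ptr with a custom deleter, the standard RAII idiom for C-style handles. A generic, standalone sketch of that idiom (FakeHandle, fake_create and fake_destroy are made-up stand-ins, not MKL or cuFFT APIs):

#include <memory>

struct FakeHandle { int id; };
FakeHandle* fake_create() { return new FakeHandle{42}; }
void fake_destroy(FakeHandle* h) { delete h; }

struct FakeHandleDeleter {
  void operator()(FakeHandle* h) const {
    if (h != nullptr) fake_destroy(h);  // release exactly once, even on early return
  }
};

int main() {
  std::unique_ptr<FakeHandle, FakeHandleDeleter> handle(fake_create());
  // handle->id is usable here; fake_destroy runs automatically at scope exit.
  return handle->id == 42 ? 0 : 1;
}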
#elif defined(PADDLE_WITH_HIP) static DftiDescriptor _plan_mkl_fft(
// An RAII encapsulation of cuFFTHandle const framework::proto::VarType::Type& in_dtype,
class HIPFFTHandle { const framework::proto::VarType::Type& out_dtype,
::hipfftHandle handle_; const framework::DDim& in_strides, const framework::DDim& out_strides,
const std::vector<int>& signal_sizes, FFTNormMode normalization,
public: bool forward) {
HIPFFTHandle() { const DFTI_CONFIG_VALUE precision = [&] {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_)); switch (in_dtype) {
case framework::proto::VarType::FP32:
return DFTI_SINGLE;
case framework::proto::VarType::COMPLEX64:
return DFTI_SINGLE;
case framework::proto::VarType::FP64:
return DFTI_DOUBLE;
case framework::proto::VarType::COMPLEX128:
return DFTI_DOUBLE;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input datatype (%s), input data type should be FP32, "
"FP64, COMPLEX64 or COMPLEX128.",
framework::DataTypeToString(in_dtype)));
}
}();
// C2C, R2C, C2R
const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
const DFTI_CONFIG_VALUE domain =
(fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
DftiDescriptor descriptor;
std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
const MKL_LONG signal_ndim = fft_sizes.size() - 1;
descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
// placement inplace or not inplace
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
DFTI_NOT_INPLACE));
// number of transformations
const MKL_LONG batch_size = fft_sizes[0];
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
// input & output distance
const MKL_LONG idist = in_strides[0];
const MKL_LONG odist = out_strides[0];
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
DFTI_OUTPUT_DISTANCE, odist));
// input & output stride
std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
for (MKL_LONG i = 1; i <= signal_ndim; i++) {
mkl_in_stride[i] = in_strides[i];
mkl_out_stride[i] = out_strides[i];
} }
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
HIPFFTHandle(const HIPFFTHandle& other) = delete; descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete; MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
HIPFFTHandle(HIPFFTHandle&& other) = delete;
HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete; // conjugate even storage
if (!(fft_type == FFTTransformType::C2C)) {
::hipfftHandle& get() { return handle_; } MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
const ::hipfftHandle& get() const { return handle_; } descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
~HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
} }
};
using plan_size_type = int; MKL_LONG signal_numel =
// This class contains all the information needed to execute a cuFFT plan: std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
// 1. the plan std::multiplies<MKL_LONG>());
// 2. the workspace size needed if (normalization != FFTNormMode::none) {
class FFTConfig { const double scale =
public: ((normalization == FFTNormMode::by_sqrt_n)
// Only move semantics is enought for this class. Although we already use ? 1.0 / std::sqrt(static_cast<double>(signal_numel))
// unique_ptr for the plan, still remove copy constructor and assignment op so : 1.0 / static_cast<double>(signal_numel));
// we don't accidentally copy and take perf hit. const auto scale_direction = [&]() {
explicit FFTConfig(const FFTConfigKey& plan_key) if (fft_type == FFTTransformType::R2C ||
: FFTConfig( (fft_type == FFTTransformType::C2C && forward)) {
std::vector<int64_t>(plan_key.sizes_, return DFTI_FORWARD_SCALE;
plan_key.sizes_ + plan_key.signal_ndim_ + 1), } else {
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} // (fft_type == FFTTransformType::C2R ||
// (fft_type == FFTTransformType::C2C && !forward))
// sizes are full signal, including batch size and always two-sided return DFTI_BACKWARD_SCALE;
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
hipfftType exec_type = [&] {
if (dtype == framework::proto::VarType::FP32) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_C2C;
case FFTTransformType::R2C:
return HIPFFT_R2C;
case FFTTransformType::C2R:
return HIPFFT_C2R;
}
} else if (dtype == framework::proto::VarType::FP64) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_Z2Z;
case FFTTransformType::R2C:
return HIPFFT_D2Z;
case FFTTransformType::C2R:
return HIPFFT_Z2D;
}
} }
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}(); }();
MKL_DFTI_CHECK(
// disable auto allocation of workspace to use allocator from the framework phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
batch, &ws_size_t));
ws_size = ws_size_t;
} }
const hipfftHandle& plan() const { return plan_ptr.get(); } // commit the descriptor
MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
FFTTransformType transform_type() const { return fft_type_; } return descriptor;
ScalarType data_type() const { return value_type_; } }
size_t workspace_size() const { return ws_size; }
private: // Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
HIPFFTHandle plan_ptr; template <typename DeviceContext, typename Ti, typename To>
size_t ws_size; void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
FFTTransformType fft_type_; const std::vector<int64_t>& axes, FFTNormMode normalization,
ScalarType value_type_; bool forward) {
}; const framework::DDim& in_sizes = x->dims();
#endif const int ndim = in_sizes.size();
const int signal_ndim = axes.size();
const int batch_ndim = ndim - signal_ndim;
const framework::DDim& out_sizes = out->dims();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), 0);
std::vector<bool> is_transformed_dim(ndim, false);
for (const auto& d : axes) {
is_transformed_dim[d] = true;
}
const auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](size_t axis) { return !is_transformed_dim[axis]; });
std::copy(axes.cbegin(), axes.cend(), batch_end);
// transpose input according to that permutation
framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
std::vector<int64_t> transposed_input_shape_ =
phi::vectorize(transposed_input_shape);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
const auto place = ctx.GetPlace();
transposed_input.mutable_data<Ti>(place);
TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
dim_permute);
// make a collapsed input: collapse batch axes for input
const int batch_size = std::accumulate(
transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
1L, std::multiplies<int64_t>());
std::vector<int> collapsed_input_shape_(1 + signal_ndim);
collapsed_input_shape_[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_ndim,
transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
const framework::DDim collapsed_input_shape =
phi::make_ddim(collapsed_input_shape_);
transposed_input.Resize(collapsed_input_shape);
framework::Tensor& collapsed_input = transposed_input;
// make a collapsed output
std::vector<int> collapsed_output_shape_(1 + signal_ndim);
collapsed_output_shape_[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
}
const framework::DDim collapsed_output_shape =
phi::make_ddim(collapsed_output_shape_);
framework::Tensor collapsed_output;
collapsed_output.Resize(collapsed_output_shape);
collapsed_output.mutable_data(place, out->type());
// signal sizes
std::vector<int> signal_sizes(1 + signal_ndim);
signal_sizes[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
signal_sizes[1 + i] =
std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
}
// Hashing machinery for Key // input & output stride
// Fowler–Noll–Vo hash function const framework::DDim input_stride = phi::stride(collapsed_input_shape);
// see const framework::DDim output_stride = phi::stride(collapsed_output_shape);
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
template <typename Key> // make a DFTI_DESCRIPTOR
struct KeyHash { DftiDescriptor desc =
// Key must be a POD because we read out its memory _plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
// contenst as char* when hashing framework::TransToProtoVarType(out->dtype()), input_stride,
static_assert(std::is_pod<Key>::value, "Key must be plain old data type"); output_stride, signal_sizes, normalization, forward);
size_t operator()(const Key& params) const { const FFTTransformType fft_type =
auto ptr = reinterpret_cast<const uint8_t*>(&params); GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
uint32_t value = 0x811C9DC5; framework::TransToProtoVarType(out->type()));
for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) { if (fft_type == FFTTransformType::C2R && forward) {
value ^= ptr[i]; framework::Tensor collapsed_input_conj(collapsed_input.dtype());
value *= 0x01000193; collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
ctx.GetPlace());
// conjugate the input
platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
collapsed_input.numel(),
collapsed_input_conj.data<Ti>());
for_range(functor);
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
} else if (fft_type == FFTTransformType::R2C && !forward) {
framework::Tensor collapsed_output_conj(collapsed_output.dtype());
collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
ctx.GetPlace());
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
// conjugate the output
platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
collapsed_output.numel(),
collapsed_output.data<To>());
for_range(functor);
} else {
if (forward) {
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
} else {
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
} }
return static_cast<size_t>(value);
} }
};
template <typename Key> // resize for the collapsed output
struct KeyEqual { framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
// Key must be a POD because we read out its memory collapsed_output.Resize(transposed_output_shape);
// contenst as char* when comparing framework::Tensor& transposed_output = collapsed_output;
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
bool operator()(const Key& a, const Key& b) const { // reverse the transposition
auto ptr1 = reinterpret_cast<const uint8_t*>(&a); std::vector<int> reverse_dim_permute(ndim);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b); for (int i = 0; i < ndim; i++) {
return memcmp(ptr1, ptr2, sizeof(Key)) == 0; reverse_dim_permute[dim_permute[i]] = i;
} }
}; TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
out, reverse_dim_permute);
#if CUDA_VERSION < 10000 }
// Note that the max plan number for CUDA version < 10 has to be 1023
// due to a bug that fails on the 1024th plan
constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
#else
constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
// The default max cache size chosen for CUDA version > 10 is arbitrary.
// This number puts a limit on how big of a plan cache should we maintain by
// default. Users can always configure it via cufft_set_plan_cache_max_size.
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
#endif
static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
"CUFFT_MAX_PLAN_NUM not in size_t range");
static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
"CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
// This cache assumes that the mapping from key to value never changes.
// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
// value returned from try_emplace_value.
// The contract of using this cache is that try_emplace_value should only be
// used when the max_size is positive.
class FFTConfigCache {
public:
using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
using map_t = typename std::unordered_map<
std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
using map_kkv_iter_t = typename map_t::iterator;
FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
FFTConfigCache(const FFTConfigCache& other) = delete;
FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
FFTConfigCache(FFTConfigCache&& other) noexcept
: _usage_list(std::move(other._usage_list)),
_cache_map(std::move(other._cache_map)),
_max_size(other._max_size) {}
FFTConfigCache& operator=(FFTConfigCache&& other) noexcept { template <typename Ti, typename To>
_usage_list = std::move(other._usage_list); struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
_cache_map = std::move(other._cache_map); void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
_max_size = other._max_size; Tensor* out, const std::vector<int64_t>& axes,
return *this; FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
} }
};
// If key is in this cache, return the cached config. Otherwise, emplace the template <typename Ti, typename To>
// config in this cache and return it. struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
FFTConfig& lookup(FFTConfigKey params) { void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
PADDLE_ENFORCE_GT(_max_size, 0, Tensor* out, const std::vector<int64_t>& axes,
platform::errors::InvalidArgument( FFTNormMode normalization, bool forward) {
"The max size of FFTConfigCache must be great than 0," exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
"But received is [%d]", normalization, forward);
_max_size)); }
};
map_kkv_iter_t map_it = _cache_map.find(params);
// Hit, put to list front
if (map_it != _cache_map.end()) {
_usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
return map_it->second->second;
}
// Miss template <typename Ti, typename To>
// remove if needed struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
if (_usage_list.size() >= _max_size) { void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
auto last = _usage_list.end(); Tensor* out, const std::vector<int64_t>& axes,
last--; FFTNormMode normalization, bool forward) {
_cache_map.erase(last->first); if (axes.size() > 1) {
_usage_list.pop_back(); const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
Tensor temp;
temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
const std::vector<int64_t> new_axes{axes.back()};
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
normalization, forward);
} else {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
} }
// construct new plan at list front, then insert into _cache_map
_usage_list.emplace_front(std::piecewise_construct,
std::forward_as_tuple(params),
std::forward_as_tuple(params));
auto kv_it = _usage_list.begin();
_cache_map.emplace(std::piecewise_construct,
std::forward_as_tuple(kv_it->first),
std::forward_as_tuple(kv_it));
return kv_it->second;
} }
};
void clear() { #elif defined(PADDLE_WITH_POCKETFFT)
_cache_map.clear();
_usage_list.clear(); template <typename T>
T compute_factor(int64_t size, FFTNormMode normalization) {
constexpr auto one = static_cast<T>(1);
switch (normalization) {
case FFTNormMode::none:
return one;
case FFTNormMode::by_n:
return one / static_cast<T>(size);
case FFTNormMode::by_sqrt_n:
return one / std::sqrt(static_cast<T>(size));
} }
PADDLE_THROW(
platform::errors::InvalidArgument("Unsupported normalization type"));
}
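compute_factor above applies the same scale that the oneMKL path installs via DFTI_FORWARD_SCALE/DFTI_BACKWARD_SCALE: 1, 1/n, or 1/sqrt(n) over the signal element count. A tiny standalone sketch of those three modes (Norm and factor are illustrative names):

#include <cmath>
#include <cstdint>
#include <cstdio>

enum class Norm { none, by_n, by_sqrt_n };

double factor(int64_t n, Norm norm) {
  switch (norm) {
    case Norm::none:      return 1.0;
    case Norm::by_n:      return 1.0 / static_cast<double>(n);
    case Norm::by_sqrt_n: return 1.0 / std::sqrt(static_cast<double>(n));
  }
  return 1.0;
}

int main() {
  const int64_t n = 1024;
  std::printf("%g %g %g\n", factor(n, Norm::none), factor(n, Norm::by_n),
              factor(n, Norm::by_sqrt_n));
  return 0;
}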
void resize(int64_t new_size) { template <typename Ti, typename To>
_set_max_size(new_size); struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
auto cur_size = _usage_list.size(); void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
if (cur_size > _max_size) { Tensor* out, const std::vector<int64_t>& axes,
auto delete_it = _usage_list.end(); FFTNormMode normalization, bool forward) {
for (size_t i = 0; i < cur_size - _max_size; i++) { using R = typename Ti::value_type;
delete_it--; using C = std::complex<R>;
_cache_map.erase(delete_it->first);
} const auto& input_dim = x->dims();
_usage_list.erase(delete_it, _usage_list.end()); const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
// compuet factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
} }
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
out_data, factor);
} }
};
size_t size() const { return _cache_map.size(); } template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
size_t max_size() const noexcept { return _max_size; } void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = Ti;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
std::mutex mutex; const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
private: const auto* in_data = x->data<R>();
// Only sets size and does value check. Does not resize the data structures. auto* out_data = reinterpret_cast<C*>(out->data<To>());
void _set_max_size(int64_t new_size) { // pocketfft requires std::vector<size_t>
// We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since std::vector<size_t> axes_(axes.size());
// CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check std::copy(axes.begin(), axes.end(), axes_.begin());
// first. // compute normalization factor
PADDLE_ENFORCE_GE( int64_t signal_numel = 1;
new_size, 0, for (auto i : axes) {
platform::errors::InvalidArgument( signal_numel *= in_sizes[i];
"cuFFT plan cache size must be non-negative, But received is [%d]", }
new_size)); R factor = compute_factor<R>(signal_numel, normalization);
PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM, pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
platform::errors::InvalidArgument( out_data, factor);
"cuFFT plan cache size can not be larger than [%d], "
"But received is [%d]",
CUFFT_MAX_PLAN_NUM, new_size));
_max_size = static_cast<size_t>(new_size);
} }
std::list<kv_t> _usage_list;
map_t _cache_map;
size_t _max_size;
}; };
static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches; template <typename Ti, typename To>
static std::mutex plan_caches_mutex; struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) { Tensor* out, const std::vector<int64_t>& axes,
std::lock_guard<std::mutex> guard(plan_caches_mutex); FFTNormMode normalization, bool forward) {
using R = To;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
if (device_index >= plan_caches.size()) { const auto& output_dim = out->dims();
plan_caches.resize(device_index + 1); const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
} std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
if (!plan_caches[device_index]) { const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
plan_caches[device_index] = std::make_unique<FFTConfigCache>(); auto* out_data = out->data<R>();
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
// compute normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= out_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
} }
};
return *plan_caches[device_index]; #endif
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,28 +13,7 @@ ...@@ -13,28 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#if defined(PADDLE_WITH_ONEMKL)
#include "paddle/phi/backends/dynload/mklrt.h"
#elif defined(PADDLE_WITH_POCKETFFT)
#include "extern_pocketfft/pocketfft_hdronly.h"
#endif
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { ...@@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) {
norm)); norm));
} }
// FFT Functors
#if defined(PADDLE_WITH_ONEMKL)
#define MKL_DFTI_CHECK(expr) \
do { \
MKL_LONG status = (expr); \
if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \
PADDLE_THROW( \
platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
} while (0);
namespace {
struct DftiDescriptorDeleter {
void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
if (handle != nullptr) {
MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
}
}
};
// A RAII wrapper for MKL_DESCRIPTOR*
class DftiDescriptor {
public:
void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
MKL_LONG signal_ndim, MKL_LONG* sizes) {
PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
platform::errors::AlreadyExists(
"DftiDescriptor has already been initialized."));
DFTI_DESCRIPTOR* raw_desc;
MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
&raw_desc, precision, signal_type, signal_ndim, sizes));
desc_.reset(raw_desc);
}
DFTI_DESCRIPTOR* get() const {
DFTI_DESCRIPTOR* raw_desc = desc_.get();
PADDLE_ENFORCE_NOT_NULL(raw_desc,
platform::errors::PreconditionNotMet(
"DFTI DESCRIPTOR has not been initialized."));
return raw_desc;
}
private:
std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
};
DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
const framework::proto::VarType::Type& out_dtype,
const framework::DDim& in_strides,
const framework::DDim& out_strides,
const std::vector<int>& signal_sizes,
FFTNormMode normalization, bool forward) {
const DFTI_CONFIG_VALUE precision = [&] {
switch (in_dtype) {
case framework::proto::VarType::FP32:
return DFTI_SINGLE;
case framework::proto::VarType::COMPLEX64:
return DFTI_SINGLE;
case framework::proto::VarType::FP64:
return DFTI_DOUBLE;
case framework::proto::VarType::COMPLEX128:
return DFTI_DOUBLE;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input datatype (%s), input data type should be FP32, "
"FP64, COMPLEX64 or COMPLEX128.",
framework::DataTypeToString(in_dtype)));
}
}();
// C2C, R2C, C2R
const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
const DFTI_CONFIG_VALUE domain =
(fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
DftiDescriptor descriptor;
std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
const MKL_LONG signal_ndim = fft_sizes.size() - 1;
descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
// placement inplace or not inplace
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
DFTI_NOT_INPLACE));
// number of transformations
const MKL_LONG batch_size = fft_sizes[0];
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
// input & output distance
const MKL_LONG idist = in_strides[0];
const MKL_LONG odist = out_strides[0];
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
DFTI_OUTPUT_DISTANCE, odist));
// input & output stride
std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
for (MKL_LONG i = 1; i <= signal_ndim; i++) {
mkl_in_stride[i] = in_strides[i];
mkl_out_stride[i] = out_strides[i];
}
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
// conjugate even storage
if (!(fft_type == FFTTransformType::C2C)) {
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
}
MKL_LONG signal_numel =
std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
std::multiplies<MKL_LONG>());
if (normalization != FFTNormMode::none) {
const double scale =
((normalization == FFTNormMode::by_sqrt_n)
? 1.0 / std::sqrt(static_cast<double>(signal_numel))
: 1.0 / static_cast<double>(signal_numel));
const auto scale_direction = [&]() {
if (fft_type == FFTTransformType::R2C ||
(fft_type == FFTTransformType::C2C && forward)) {
return DFTI_FORWARD_SCALE;
} else {
// (fft_type == FFTTransformType::C2R ||
// (fft_type == FFTTransformType::C2C && !forward))
return DFTI_BACKWARD_SCALE;
}
}();
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
}
// commit the descriptor
MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
return descriptor;
}
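// Illustrative sketch (hypothetical helper, for illustration only): how the
// distance and stride values passed to MKL above come out for a collapsed
// input of shape {batch = 4, 8, 16} in row-major layout. Only standard
// facilities already used in this file are required.
void example_mkl_layout_for_illustration() {
  const std::vector<MKL_LONG> collapsed_shape{4, 8, 16};
  // row-major strides: {8 * 16, 16, 1} == {128, 16, 1}
  std::vector<MKL_LONG> strides(collapsed_shape.size(), 1);
  for (int i = static_cast<int>(collapsed_shape.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * collapsed_shape[i + 1];
  }
  const MKL_LONG idist = strides[0];  // 128: distance between consecutive batches
  // DFTI_INPUT_STRIDES expects a leading offset of 0: {0, 16, 1}
  const std::vector<MKL_LONG> mkl_stride{0, strides[1], strides[2]};
  (void)idist;
  (void)mkl_stride;
}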
// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
const std::vector<int64_t>& axes, FFTNormMode normalization,
bool forward) {
const framework::DDim& in_sizes = x->dims();
const int ndim = in_sizes.size();
const int signal_ndim = axes.size();
const int batch_ndim = ndim - signal_ndim;
const framework::DDim& out_sizes = out->dims();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), 0);
std::vector<bool> is_transformed_dim(ndim, false);
for (const auto& d : axes) {
is_transformed_dim[d] = true;
}
const auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](size_t axis) { return !is_transformed_dim[axis]; });
std::copy(axes.cbegin(), axes.cend(), batch_end);
// transpose input according to that permutation
framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
std::vector<int64_t> transposed_input_shape_ =
phi::vectorize(transposed_input_shape);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
const auto place = ctx.GetPlace();
transposed_input.mutable_data<Ti>(place);
TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
dim_permute);
// make a collapsed input: collapse the batch axes of the input
const int batch_size = std::accumulate(
transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
1L, std::multiplies<int64_t>());
std::vector<int> collapsed_input_shape_(1 + signal_ndim);
collapsed_input_shape_[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_ndim,
transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
const framework::DDim collapsed_input_shape =
phi::make_ddim(collapsed_input_shape_);
transposed_input.Resize(collapsed_input_shape);
framework::Tensor& collapsed_input = transposed_input;
// make a collapsed output
std::vector<int> collapsed_output_shape_(1 + signal_ndim);
collapsed_output_shape_[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
}
const framework::DDim collapsed_output_shape =
phi::make_ddim(collapsed_output_shape_);
framework::Tensor collapsed_output;
collapsed_output.Resize(collapsed_output_shape);
collapsed_output.mutable_data(place, out->type());
// signal sizes
std::vector<int> signal_sizes(1 + signal_ndim);
signal_sizes[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
signal_sizes[1 + i] =
std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
}
// input & output stride
const framework::DDim input_stride = phi::stride(collapsed_input_shape);
const framework::DDim output_stride = phi::stride(collapsed_output_shape);
// make a DFTI_DESCRIPTOR
DftiDescriptor desc =
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->dtype()), input_stride,
output_stride, signal_sizes, normalization, forward);
const FFTTransformType fft_type =
GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->type()));
if (fft_type == FFTTransformType::C2R && forward) {
framework::Tensor collapsed_input_conj(collapsed_input.dtype());
collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
ctx.GetPlace());
// conjugate the input
platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
collapsed_input.numel(),
collapsed_input_conj.data<Ti>());
for_range(functor);
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
} else if (fft_type == FFTTransformType::R2C && !forward) {
framework::Tensor collapsed_output_conj(collapsed_output.dtype());
collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
ctx.GetPlace());
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
// conjugate the output
platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
collapsed_output.numel(),
collapsed_output.data<To>());
for_range(functor);
} else {
if (forward) {
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
} else {
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
}
}
// resize for the collapsed output
framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
collapsed_output.Resize(transposed_output_shape);
framework::Tensor& transposed_output = collapsed_output;
// reverse the transposition
std::vector<int> reverse_dim_permute(ndim);
for (int i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
out, reverse_dim_permute);
}
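// Illustrative sketch (hypothetical helper, for illustration only): the
// dimension bookkeeping done by exec_fft above for axes = {1, 3} on a 4-D
// input of shape {2, 5, 3, 8}. A stable partition is used here so the result
// is deterministic; the transform itself does not depend on the order of the
// batch dimensions.
void example_dim_permute_for_illustration() {
  const std::vector<int64_t> shape{2, 5, 3, 8};
  const std::vector<int64_t> axes{1, 3};
  std::vector<int> dim_permute(shape.size());
  std::iota(dim_permute.begin(), dim_permute.end(), 0);
  std::vector<bool> is_transformed(shape.size(), false);
  for (auto a : axes) {
    is_transformed[a] = true;
  }
  auto batch_end =
      std::stable_partition(dim_permute.begin(), dim_permute.end(),
                            [&](int d) { return !is_transformed[d]; });
  std::copy(axes.cbegin(), axes.cend(), batch_end);
  // dim_permute == {0, 2, 1, 3}, so the transposed shape is {2, 3, 5, 8};
  // the batch dimensions collapse to 2 * 3 = 6 and the collapsed input
  // shape handed to the FFT is {6, 5, 8}.
}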
} // anonymous namespace
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.size() > 1) {
const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
Tensor temp;
temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
const std::vector<int64_t> new_axes{axes.back()};
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
normalization, forward);
} else {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
}
};
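// Illustrative sketch (for illustration only): the axis split used by
// FFTC2RFunctor above. A C2R transform over axes {0, 1, 2} is computed as a
// C2C transform over {0, 1} followed by a one-sided C2R over the last axis
// {2}; for an output whose last transformed dimension has length 10, the
// complex input along that axis has length 10 / 2 + 1 = 6.
static void example_c2r_axis_split_for_illustration() {
  const std::vector<int64_t> axes{0, 1, 2};
  const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);  // {0, 1}
  const std::vector<int64_t> last_axis{axes.back()};                  // {2}
  (void)c2c_dims;
  (void)last_axis;
}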
#elif defined(PADDLE_WITH_POCKETFFT)
namespace {
template <typename T>
T compute_factor(int64_t size, FFTNormMode normalization) {
constexpr auto one = static_cast<T>(1);
switch (normalization) {
case FFTNormMode::none:
return one;
case FFTNormMode::by_n:
return one / static_cast<T>(size);
case FFTNormMode::by_sqrt_n:
return one / std::sqrt(static_cast<T>(size));
}
PADDLE_THROW(
platform::errors::InvalidArgument("Unsupported normalization type"));
}
} // anonymous namespace
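// Illustrative sketch (hypothetical helper, for illustration only): the values
// compute_factor above produces for a transform over 8 points.
static void example_compute_factor_for_illustration() {
  const double none_ = compute_factor<double>(8, FFTNormMode::none);        // 1.0
  const double by_n = compute_factor<double>(8, FFTNormMode::by_n);         // 0.125
  const double sqrt_n = compute_factor<double>(8, FFTNormMode::by_sqrt_n);  // ~0.35355
  (void)none_;
  (void)by_n;
  (void)sqrt_n;
}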
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = typename Ti::value_type;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
// compute the normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
out_data, factor);
}
};
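// Illustrative sketch (hypothetical helper, for illustration only): pocketfft
// takes strides in bytes, which is why the element strides above are scaled by
// the element size. For a complex64 tensor of shape {4, 16} the element
// strides are {16, 1}; with sizeof(std::complex<float>) == 8 the byte strides
// become {128, 8}.
static void example_byte_strides_for_illustration() {
  std::vector<std::ptrdiff_t> strides{16, 1};
  const std::ptrdiff_t elem_size = sizeof(std::complex<float>);
  std::transform(strides.begin(), strides.end(), strides.begin(),
                 [&](std::ptrdiff_t s) { return s * elem_size; });
  // strides == {128, 8}
}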
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = Ti;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto* in_data = x->data<R>();
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
// compute the normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = To;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = out->data<R>();
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
// compute the normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= out_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
};
#endif
} // namespace operators
} // namespace paddle
......
...@@ -8,496 +8,9 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <functional>
#include <list>
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/spectral_op.cu.h"
#include "paddle/fluid/operators/spectral_helper.h"
#include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
namespace paddle {
namespace operators {
namespace {
// Calculates the normalization constant
double fft_normalization_scale(FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dims) {
// auto norm = static_cast<fft_norm_mode>(normalization);
if (normalization == FFTNormMode::none) {
return static_cast<double>(1.0);
}
int64_t signal_numel = 1;
for (auto dim : dims) {
signal_numel *= sizes[dim];
}
const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
? std::sqrt(signal_numel)
: static_cast<double>(signal_numel);
return static_cast<double>(1.0 / scale_denom);
}
template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& axes) {
double scale = fft_normalization_scale(normalization, sizes, axes);
if (scale != 1.0) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto dev = ctx.eigen_device();
EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
static_cast<T>(scale),
static_cast<T>(0), false);
} else {
framework::TensorCopy(*in, ctx.GetPlace(), out);
}
}
#if defined(PADDLE_WITH_CUDA)
FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Build the key for the transform plan (fetched from the cache or created locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.dtype()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
// execute transform plan
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_cufft_plan_raw(config, input->data(), output->data(), forward);
}
}
#elif defined(PADDLE_WITH_HIP)
FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Build the key for the transform plan (fetched from the cache or created locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.type()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
auto value_type = config.data_type();
if (value_type == framework::proto::VarType::FP32) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
plan, static_cast<hipfftReal*>(in_data),
static_cast<hipfftComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftReal*>(out_data)));
return;
}
}
} else if (value_type == framework::proto::VarType::FP64) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
plan, static_cast<hipfftDoubleReal*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleReal*>(out_data)));
return;
}
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
}
}
#endif
// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
const std::vector<int64_t>& dim, bool forward) {
const auto x_dims = phi::vectorize(X->dims());
const int64_t ndim = static_cast<int64_t>(X->dims().size());
auto tensor_place = ctx.GetPlace();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), int{0});
std::vector<bool> is_transformed_dim(ndim);
for (const auto& d : dim) {
is_transformed_dim[d] = true;
}
auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](int64_t d) { return !is_transformed_dim[d]; });
std::sort(dim_permute.begin(), batch_end);
std::copy(dim.cbegin(), dim.cend(), batch_end);
// transpose input according to dim permutation
auto transposed_input_shape = X->dims().transpose(dim_permute);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
transposed_input.mutable_data<Ti>(tensor_place);
TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
dim_permute);
// Reshape batch dimensions into a single dimension
const int64_t signal_ndim = static_cast<int64_t>(dim.size());
std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
const int64_t batch_dims = ndim - signal_ndim;
auto batch_size =
std::accumulate(transposed_input_shape_.begin(),
transposed_input_shape_.begin() + batch_dims,
static_cast<int>(1), std::multiplies<int>());
collapsed_input_shape[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_dims,
transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
framework::Tensor& collapsed_input = transposed_input;
collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
// make a collapsed output
const auto out_dims = phi::vectorize(out->dims());
std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
collapsed_output_shape[0] = batch_size;
for (size_t i = 0; i < dim.size(); ++i) {
collapsed_output_shape[i + 1] = out_dims[dim[i]];
}
framework::Tensor collapsed_output;
collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
collapsed_output.mutable_data<To>(tensor_place);
FFTConfig* config = nullptr;
#if defined(PADDLE_WITH_CUDA)
std::unique_ptr<FFTConfig> config_ = nullptr;
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
bool using_cache = false;
#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
using_cache = true;
#endif
if (using_cache) {
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
} else {
config_ = std::make_unique<FFTConfig>(key);
config = config_.get();
}
// prepare cufft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#elif defined(PADDLE_WITH_HIP)
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
// prepare hipfft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#endif
// Restore the output: reshape and transpose back to the original batch and dimension order
auto transposed_out_shape = out->dims().transpose(dim_permute);
collapsed_output.Resize(transposed_out_shape);
auto& transposed_output = collapsed_output;
std::vector<int> reverse_dim_permute(ndim);
for (size_t i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
reverse_dim_permute);
}
} // anonymous namespace
// Use the optimized path to perform a single R2C or C2R transform when the
// transformed dimensions are supported by cuFFT
bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
// For performance reasons, do not use the optimized path when the axes
// start with (0, 1).
if (axes.size() > kMaxFFTNdim ||
(axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
return false;
} else {
return true;
}
}
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.empty()) {
framework::TensorCopy(*X, ctx.GetPlace(), out);
return;
}
framework::Tensor* p_out = out;
std::vector<int64_t> out_dims = phi::vectorize(X->dims());
std::vector<int64_t> working_axes(axes.begin(), axes.end());
std::vector<int64_t> first_dims;
size_t max_dims;
framework::Tensor working_tensor;
working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::Tensor* p_working_tensor = &working_tensor;
framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
while (true) {
max_dims =
std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
first_dims.assign(working_axes.end() - max_dims, working_axes.end());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
p_out, first_dims, forward);
working_axes.resize(working_axes.size() - max_dims);
first_dims.clear();
if (working_axes.empty()) {
break;
}
std::swap(p_out, p_working_tensor);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, p_out, out, normalization, out_dims, axes);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
std::vector<int64_t> in_dims = phi::vectorize(X->dims());
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
if (use_optimized_fft_path(axes)) {
framework::Tensor x_copy(X->type());
x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
forward);
} else {
framework::Tensor temp_tensor;
temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
{axes.back()}, forward);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, out, out, normalization, out_dims, axes);
}
};
// n dimension real to complex FFT use cufft lib
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
// Step1: R2C transform on the last dimension
framework::Tensor* r2c_out = out;
const std::vector<int64_t> last_dim{axes.back()};
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
forward);
// Step2: C2C transform on the remaining dimension
framework::Tensor c2c_out;
if (axes.size() > 1) {
c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
forward);
}
const auto in_sizes = phi::vectorize(X->dims());
framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
exec_normalization<platform::CUDADeviceContext, To>(
ctx, norm_tensor, out, normalization, in_sizes, axes);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <list>
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#endif
namespace paddle {
namespace operators {
using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct FFTConfigKey {
// between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t signal_ndim_;
// These sizes include the additional batch dimension as well.
int64_t sizes_[kMaxDataNdim];
int64_t input_shape_[kMaxDataNdim];
int64_t output_shape_[kMaxDataNdim];
FFTTransformType fft_type_;
ScalarType value_type_;
FFTConfigKey() = default;
FFTConfigKey(const std::vector<int64_t>& in_shape,
const std::vector<int64_t>& out_shape,
const std::vector<int64_t>& signal_size,
FFTTransformType fft_type, ScalarType value_type) {
// Padding bits must be zeroed for hashing
memset(this, 0, sizeof(*this));
signal_ndim_ = signal_size.size() - 1;
fft_type_ = fft_type;
value_type_ = value_type;
std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
}
};
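// Illustrative sketch (hypothetical helper, for illustration only): the key
// for a batched one-sided R2C plan with a float32 input of shape {4, 16} and a
// complex64 output of shape {4, 9}. The signal size keeps the two-sided length
// 16 on the transformed axis, and its first entry is the batch size.
static FFTConfigKey example_r2c_key_for_illustration() {
  const std::vector<int64_t> in_shape{4, 16};
  const std::vector<int64_t> out_shape{4, 9};
  const std::vector<int64_t> signal_size{4, 16};
  return FFTConfigKey(in_shape, out_shape, signal_size, FFTTransformType::R2C,
                      framework::proto::VarType::FP32);
}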
#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
::cufftHandle handle_;
public:
CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
}
CuFFTHandle(const CuFFTHandle& other) = delete;
CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
CuFFTHandle(CuFFTHandle&& other) = delete;
CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
::cufftHandle& get() { return handle_; }
const ::cufftHandle& get() const { return handle_; }
~CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_));
}
};
using plan_size_type = long long int; // NOLINT
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public:
// Only move semantics is enough for this class. Although we already use a
// unique_ptr for the plan, we still remove the copy constructor and assignment
// operator so we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
// sizes are full signal, including batch size and always two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
cudaDataType itype, otype, exec_type;
const auto complex_input = has_complex_input(fft_type);
const auto complex_output = has_complex_output(fft_type);
if (dtype == framework::proto::VarType::FP32) {
itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
exec_type = CUDA_C_32F;
} else if (dtype == framework::proto::VarType::FP64) {
itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
exec_type = CUDA_C_64F;
} else if (dtype == framework::proto::VarType::FP16) {
itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
exec_type = CUDA_C_16F;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"cuFFT only support transforms of type float16, float32 and "
"float64"));
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
batch, &ws_size_t, exec_type));
ws_size = ws_size_t;
}
FFTConfig(const FFTConfig& other) = delete;
FFTConfig& operator=(const FFTConfig& other) = delete;
FFTConfig(FFTConfig&& other) = delete;
FFTConfig& operator=(FFTConfig&& other) = delete;
const cufftHandle& plan() const { return plan_ptr.get(); }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private:
CuFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
};
#elif defined(PADDLE_WITH_HIP)
// An RAII encapsulation of hipfftHandle
class HIPFFTHandle {
::hipfftHandle handle_;
public:
HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_));
}
HIPFFTHandle(const HIPFFTHandle& other) = delete;
HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
HIPFFTHandle(HIPFFTHandle&& other) = delete;
HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
::hipfftHandle& get() { return handle_; }
const ::hipfftHandle& get() const { return handle_; }
~HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
}
};
using plan_size_type = int;
// This class contains all the information needed to execute a hipFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public:
// Only move semantics is enough for this class. Although we already use a
// unique_ptr for the plan, we still remove the copy constructor and assignment
// operator so we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
// sizes are full signal, including batch size and always two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
hipfftType exec_type = [&] {
if (dtype == framework::proto::VarType::FP32) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_C2C;
case FFTTransformType::R2C:
return HIPFFT_R2C;
case FFTTransformType::C2R:
return HIPFFT_C2R;
}
} else if (dtype == framework::proto::VarType::FP64) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_Z2Z;
case FFTTransformType::R2C:
return HIPFFT_D2Z;
case FFTTransformType::C2R:
return HIPFFT_Z2D;
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}();
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
batch, &ws_size_t));
ws_size = ws_size_t;
}
const hipfftHandle& plan() const { return plan_ptr.get(); }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private:
HIPFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
};
#endif
// Hashing machinery for Key
// Fowler–Noll–Vo hash function
// see
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
template <typename Key>
struct KeyHash {
// Key must be a POD because we read out its memory
// contents as char* when hashing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
size_t operator()(const Key& params) const {
auto ptr = reinterpret_cast<const uint8_t*>(&params);
uint32_t value = 0x811C9DC5;
for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
value ^= ptr[i];
value *= 0x01000193;
}
return static_cast<size_t>(value);
}
};
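// Illustrative sketch (hypothetical helper, for illustration only): the same
// 32-bit FNV-1a scheme as KeyHash above, written out for an arbitrary byte
// buffer. 0x811C9DC5 is the FNV offset basis and 0x01000193 the FNV prime.
static uint32_t fnv1a_32_for_illustration(const uint8_t* data, size_t len) {
  uint32_t value = 0x811C9DC5;
  for (size_t i = 0; i < len; ++i) {
    value ^= data[i];
    value *= 0x01000193;
  }
  return value;
}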
template <typename Key>
struct KeyEqual {
// Key must be a POD because we read out its memory
// contents as char* when comparing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
bool operator()(const Key& a, const Key& b) const {
auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
}
};
#if CUDA_VERSION < 10000
// Note that the max plan number for CUDA version < 10 has to be 1023
// due to a bug that fails on the 1024th plan
constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
#else
constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
// The default max cache size chosen for CUDA version > 10 is arbitrary.
// This number puts a limit on how big a plan cache we should maintain by
// default. Users can always configure it via cufft_set_plan_cache_max_size.
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
#endif
static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
"CUFFT_MAX_PLAN_NUM not in size_t range");
static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
"CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
// This cache assumes that the mapping from key to value never changes.
// This is **NOT** thread-safe. Please hold the mutex both when calling
// lookup() **AND** while using the FFTConfig it returns.
// The contract of using this cache is that lookup() should only be
// called when max_size is positive.
class FFTConfigCache {
public:
using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
using map_t = typename std::unordered_map<
std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
using map_kkv_iter_t = typename map_t::iterator;
FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
FFTConfigCache(const FFTConfigCache& other) = delete;
FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
FFTConfigCache(FFTConfigCache&& other) noexcept
: _usage_list(std::move(other._usage_list)),
_cache_map(std::move(other._cache_map)),
_max_size(other._max_size) {}
FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
_usage_list = std::move(other._usage_list);
_cache_map = std::move(other._cache_map);
_max_size = other._max_size;
return *this;
}
// If key is in this cache, return the cached config. Otherwise, emplace the
// config in this cache and return it.
FFTConfig& lookup(FFTConfigKey params) {
PADDLE_ENFORCE_GT(_max_size, 0,
platform::errors::InvalidArgument(
"The max size of FFTConfigCache must be great than 0,"
"But received is [%d]",
_max_size));
map_kkv_iter_t map_it = _cache_map.find(params);
// Hit, put to list front
if (map_it != _cache_map.end()) {
_usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
return map_it->second->second;
}
// Miss
// remove if needed
if (_usage_list.size() >= _max_size) {
auto last = _usage_list.end();
last--;
_cache_map.erase(last->first);
_usage_list.pop_back();
}
// construct new plan at list front, then insert into _cache_map
_usage_list.emplace_front(std::piecewise_construct,
std::forward_as_tuple(params),
std::forward_as_tuple(params));
auto kv_it = _usage_list.begin();
_cache_map.emplace(std::piecewise_construct,
std::forward_as_tuple(kv_it->first),
std::forward_as_tuple(kv_it));
return kv_it->second;
}
void clear() {
_cache_map.clear();
_usage_list.clear();
}
void resize(int64_t new_size) {
_set_max_size(new_size);
auto cur_size = _usage_list.size();
if (cur_size > _max_size) {
auto delete_it = _usage_list.end();
for (size_t i = 0; i < cur_size - _max_size; i++) {
delete_it--;
_cache_map.erase(delete_it->first);
}
_usage_list.erase(delete_it, _usage_list.end());
}
}
size_t size() const { return _cache_map.size(); }
size_t max_size() const noexcept { return _max_size; }
std::mutex mutex;
private:
// Only sets size and does value check. Does not resize the data structures.
void _set_max_size(int64_t new_size) {
// We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
// CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
// first.
PADDLE_ENFORCE_GE(
new_size, 0,
platform::errors::InvalidArgument(
"cuFFT plan cache size must be non-negative, But received is [%d]",
new_size));
PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
platform::errors::InvalidArgument(
"cuFFT plan cache size can not be larger than [%d], "
"But received is [%d]",
CUFFT_MAX_PLAN_NUM, new_size));
_max_size = static_cast<size_t>(new_size);
}
std::list<kv_t> _usage_list;
map_t _cache_map;
size_t _max_size;
};
static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
static std::mutex plan_caches_mutex;
static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
std::lock_guard<std::mutex> guard(plan_caches_mutex);
if (device_index >= plan_caches.size()) {
plan_caches.resize(device_index + 1);
}
if (!plan_caches[device_index]) {
plan_caches[device_index] = std::make_unique<FFTConfigCache>();
}
return *plan_caches[device_index];
}
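// Illustrative sketch (hypothetical helper, for illustration only): the
// intended use of the per-device plan cache. The cache mutex must be held both
// for lookup() and while the returned FFTConfig (and its cuFFT/hipFFT plan) is
// in use.
static void example_plan_cache_use_for_illustration(const FFTConfigKey& key,
                                                     int64_t device_id) {
  FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
  std::unique_lock<std::mutex> guard(plan_cache.mutex);
  FFTConfig& config = plan_cache.lookup(key);
  // ... pass config.plan() to cufftSetStream / hipfftSetStream etc. while the
  // lock is held ...
  (void)config;
}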
// Calculates the normalization constant
static double fft_normalization_scale(FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dims) {
// auto norm = static_cast<fft_norm_mode>(normalization);
if (normalization == FFTNormMode::none) {
return static_cast<double>(1.0);
}
int64_t signal_numel = 1;
for (auto dim : dims) {
signal_numel *= sizes[dim];
}
const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
? std::sqrt(signal_numel)
: static_cast<double>(signal_numel);
return static_cast<double>(1.0 / scale_denom);
}
template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& axes) {
double scale = fft_normalization_scale(normalization, sizes, axes);
if (scale != 1.0) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto dev = ctx.eigen_device();
EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
static_cast<T>(scale),
static_cast<T>(0), false);
} else {
framework::TensorCopy(*in, ctx.GetPlace(), out);
}
}
#if defined(PADDLE_WITH_CUDA)
static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Build the key for the transform plan (fetched from the cache or created locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.dtype()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
// execute transform plan
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_cufft_plan_raw(config, input->data(), output->data(), forward);
}
}
#elif defined(PADDLE_WITH_HIP)
static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Build the key for the transform plan (fetched from the cache or created locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.type()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
auto value_type = config.data_type();
if (value_type == framework::proto::VarType::FP32) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
plan, static_cast<hipfftReal*>(in_data),
static_cast<hipfftComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftReal*>(out_data)));
return;
}
}
} else if (value_type == framework::proto::VarType::FP64) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
plan, static_cast<hipfftDoubleReal*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleReal*>(out_data)));
return;
}
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
}
}
#endif
// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
const std::vector<int64_t>& dim, bool forward) {
const auto x_dims = phi::vectorize(X->dims());
const int64_t ndim = static_cast<int64_t>(X->dims().size());
auto tensor_place = ctx.GetPlace();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), int{0});
std::vector<bool> is_transformed_dim(ndim);
for (const auto& d : dim) {
is_transformed_dim[d] = true;
}
auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](int64_t d) { return !is_transformed_dim[d]; });
std::sort(dim_permute.begin(), batch_end);
std::copy(dim.cbegin(), dim.cend(), batch_end);
// transpose input according to dim permutation
auto transposed_input_shape = X->dims().transpose(dim_permute);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
transposed_input.mutable_data<Ti>(tensor_place);
TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
dim_permute);
// Reshape batch dimensions into a single dimension
const int64_t signal_ndim = static_cast<int64_t>(dim.size());
std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
const int64_t batch_dims = ndim - signal_ndim;
auto batch_size =
std::accumulate(transposed_input_shape_.begin(),
transposed_input_shape_.begin() + batch_dims,
static_cast<int>(1), std::multiplies<int>());
collapsed_input_shape[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_dims,
transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
framework::Tensor& collapsed_input = transposed_input;
collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
// make a collapsed output
const auto out_dims = phi::vectorize(out->dims());
std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
collapsed_output_shape[0] = batch_size;
for (size_t i = 0; i < dim.size(); ++i) {
collapsed_output_shape[i + 1] = out_dims[dim[i]];
}
framework::Tensor collapsed_output;
collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
collapsed_output.mutable_data<To>(tensor_place);
FFTConfig* config = nullptr;
#if defined(PADDLE_WITH_CUDA)
std::unique_ptr<FFTConfig> config_ = nullptr;
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
bool using_cache = false;
#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
using_cache = true;
#endif
if (using_cache) {
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
} else {
config_ = std::make_unique<FFTConfig>(key);
config = config_.get();
}
// prepare cufft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#elif defined(PADDLE_WITH_HIP)
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
  // prepare hipfft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#endif
  // Restore the original layout: reshape and transpose the output back to the
  // original batch and signal dimensions
auto transposed_out_shape = out->dims().transpose(dim_permute);
collapsed_output.Resize(transposed_out_shape);
auto& transposed_output = collapsed_output;
std::vector<int> reverse_dim_permute(ndim);
  for (int64_t i = 0; i < ndim; i++) {
    reverse_dim_permute[dim_permute[i]] = static_cast<int>(i);
}
TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
reverse_dim_permute);
}
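// The following is a minimal standalone sketch (not called by the kernels in
// this file) that mirrors only the shape bookkeeping performed by exec_fft
// above: non-transformed axes are partitioned to the front and collapsed into
// a single batch dimension, while the transformed axes are kept, in order, at
// the back. The example shapes in the comment are hypothetical.
//
//   x_shape = {8, 3, 64, 64}, dim = {2, 3}
//     -> batch axes {0, 1}, batch_size = 8 * 3 = 24
//     -> collapsed input shape {24, 64, 64}
static inline std::vector<int64_t> SketchCollapsedFFTInputShape(
    const std::vector<int64_t>& x_shape, const std::vector<int64_t>& dim) {
  const int64_t ndim = static_cast<int64_t>(x_shape.size());
  std::vector<int64_t> dim_permute(ndim);
  std::iota(dim_permute.begin(), dim_permute.end(), int64_t{0});
  std::vector<bool> is_transformed(ndim, false);
  for (const auto& d : dim) {
    is_transformed[d] = true;
  }
  // batch axes first (sorted), transformed axes last (in the requested order)
  auto batch_end =
      std::partition(dim_permute.begin(), dim_permute.end(),
                     [&](int64_t d) { return !is_transformed[d]; });
  std::sort(dim_permute.begin(), batch_end);
  std::copy(dim.cbegin(), dim.cend(), batch_end);
  // collapse all batch axes into one leading dimension
  const int64_t batch_dims = ndim - static_cast<int64_t>(dim.size());
  int64_t batch_size = 1;
  for (int64_t i = 0; i < batch_dims; ++i) {
    batch_size *= x_shape[dim_permute[i]];
  }
  std::vector<int64_t> collapsed_shape{batch_size};
  for (const auto& d : dim) {
    collapsed_shape.push_back(x_shape[d]);
  }
  return collapsed_shape;
}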
// Use the optimized path to perform a single R2C or C2R transform when the
// transformed dims are supported by cuFFT
static bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
  // For performance reasons, do not use the optimized path when the axes
  // start with (0, 1).
if (axes.size() > kMaxFFTNdim ||
(axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
return false;
} else {
return true;
}
}
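// Worked examples for the heuristic above, assuming kMaxFFTNdim == 3 (this
// value is an assumption used only for illustration):
//   axes = {2}          -> true   (single trailing axis)
//   axes = {1, 2}       -> true
//   axes = {0, 1}       -> false  (leading (0, 1) pair is excluded on purpose)
//   axes = {0, 1, 2, 3} -> false  (more axes than kMaxFFTNdim)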
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.empty()) {
framework::TensorCopy(*X, ctx.GetPlace(), out);
return;
}
framework::Tensor* p_out = out;
std::vector<int64_t> out_dims = phi::vectorize(X->dims());
std::vector<int64_t> working_axes(axes.begin(), axes.end());
std::vector<int64_t> first_dims;
size_t max_dims;
framework::Tensor working_tensor;
working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::Tensor* p_working_tensor = &working_tensor;
framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
while (true) {
max_dims =
std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
first_dims.assign(working_axes.end() - max_dims, working_axes.end());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
p_out, first_dims, forward);
working_axes.resize(working_axes.size() - max_dims);
first_dims.clear();
if (working_axes.empty()) {
break;
}
std::swap(p_out, p_working_tensor);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, p_out, out, normalization, out_dims, axes);
}
};
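// Hypothetical trace of the chunking loop above, assuming kMaxFFTNdim == 3:
//   axes = {0, 1, 2, 3, 4}
//   iter 1: first_dims = {2, 3, 4}; transform the copy of X into `out`
//   iter 2: buffers swapped; first_dims = {0, 1}; transform `out` into the
//           working tensor
// exec_normalization then writes the (optionally scaled) final result from
// whichever buffer p_out points at into `out`.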
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
std::vector<int64_t> in_dims = phi::vectorize(X->dims());
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
if (use_optimized_fft_path(axes)) {
framework::Tensor x_copy(X->type());
x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
forward);
} else {
framework::Tensor temp_tensor;
temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
{axes.back()}, forward);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, out, out, normalization, out_dims, axes);
}
};
// n-dimensional real-to-complex FFT using the cuFFT library
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
// Step1: R2C transform on the last dimension
framework::Tensor* r2c_out = out;
const std::vector<int64_t> last_dim{axes.back()};
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
forward);
    // Step2: C2C transform on the remaining dimensions
framework::Tensor c2c_out;
if (axes.size() > 1) {
c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
forward);
}
const auto in_sizes = phi::vectorize(X->dims());
framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
exec_normalization<platform::CUDADeviceContext, To>(
ctx, norm_tensor, out, normalization, in_sizes, axes);
}
};
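// For a real-to-complex transform, the last transformed axis of length n
// yields n / 2 + 1 complex bins (the rest follow from conjugate symmetry),
// e.g. n = 512 -> 257 bins. Multi-axis R2C is therefore decomposed above into
// an R2C on the last axis followed by a C2C over the remaining axes.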
} // namespace operators
} // namespace paddle
...@@ -11,8 +11,11 @@ ...@@ -11,8 +11,11 @@
#pragma once #pragma once
#define NOMINMAX // to use std::min std::max correctly on windows #define NOMINMAX // to use std::min std::max correctly on windows
#include <algorithm>
#include <functional>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <numeric>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
...@@ -23,8 +26,10 @@ ...@@ -23,8 +26,10 @@
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h" #include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/padding.h" #include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h" #include "thrust/device_vector.h"
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stft_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
namespace paddle {
namespace operators {
class StftOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "stft");
const int n_fft = ctx->Attrs().Get<int>("n_fft");
const int hop_length = ctx->Attrs().Get<int>("hop_length");
const auto x_dims = ctx->GetInputDim("X");
const int x_rank = x_dims.size();
const bool onesided = ctx->Attrs().Get<bool>("onesided");
PADDLE_ENFORCE_EQ(
x_rank, 2,
platform::errors::InvalidArgument(
"Input(X) of StftOp should be a tensor with shape [N, T], "
"but got rank %s.",
x_rank));
PADDLE_ENFORCE_GT(
hop_length, 0,
platform::errors::InvalidArgument(
"Attribute(hop_length) should be greater than 0, but got %s.",
hop_length));
int seq_length = x_dims[x_rank - 1];
int n_frames = 1 + (seq_length - n_fft) / hop_length;
PADDLE_ENFORCE_LE(n_fft, seq_length,
platform::errors::InvalidArgument(
"Attribute(frame_length) should be less equal than "
"sequence length, but got (%s) > (%s).",
n_fft, seq_length));
std::vector<int64_t> output_shape;
output_shape.push_back(x_dims[0]);
if (onesided) {
output_shape.push_back(n_fft / 2 + 1);
} else {
output_shape.push_back(n_fft);
}
output_shape.push_back(n_frames);
ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(in_dtype, ctx.GetPlace());
}
};
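// Worked example of the shape inference above (all values hypothetical):
//   X: [N, T] = [8, 16000], n_fft = 512, hop_length = 160, onesided = true
//   n_frames = 1 + (16000 - 512) / 160 = 1 + 96 = 97
//   Out: [8, 512 / 2 + 1, 97] = [8, 257, 97]
// With onesided = false the frequency axis keeps all n_fft bins: [8, 512, 97].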
class StftOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input waveforms with shape (N, T)");
AddOutput("Out",
"The complex STFT output tensor with shape (N, n_fft, "
"num_frames) or (N, n_fft/2 + 1, num_frames)");
AddAttr<int>("n_fft", "The number of input samples to perform FFT");
AddAttr<int>("hop_length", "Number of samples between adjacent frames");
AddAttr<bool>("normalized",
"Control whether to scale the output by 1/sqrt(n_fft)");
AddAttr<bool>("onesided",
"Control whether to return half of the FFT output");
AddComment(R"DOC(
Short-time Fourier transform (STFT).
)DOC");
}
};
template <typename T>
class StftGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("stft_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
}
};
class StftGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
const auto out_grad_name = framework::GradVarName("Out");
OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
"stft_grad");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft_grad");
const auto x_grad_name = framework::GradVarName("X");
OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
"stft_grad");
ctx->ShareDim("X", /*->*/ x_grad_name);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const auto in_dtype = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
const auto kernel_dtype = framework::ToRealType(in_dtype);
return framework::OpKernelType(kernel_dtype, ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(stft, ops::StftOp, ops::StftOpMaker,
ops::StftGradOpMaker<paddle::framework::OpDesc>,
ops::StftGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(stft_grad, ops::StftGradOp);
REGISTER_OP_CPU_KERNEL(
stft, ops::StftKernel<paddle::platform::CPUDeviceContext, float>,
ops::StftKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
stft_grad, ops::StftGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::StftGradKernel<paddle::platform::CPUDeviceContext, double>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/spectral_op.cu.h"
#include "paddle/fluid/operators/stft_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
stft, ops::StftKernel<paddle::platform::CUDADeviceContext, float>,
ops::StftKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
stft_grad, ops::StftGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::StftGradKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/frame_op.h"
#include "paddle/fluid/operators/spectral_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class StftKernel : public framework::OpKernel<T> {
public:
/*
Batch Signals (N, T) -> Frames (N, n_fft, num_frames) -> FFTR2C -> (N,
n_fft/2 + 1, num_frames) or (N, n_fft, num_frames)
*/
void Compute(const framework::ExecutionContext& ctx) const override {
using C = paddle::platform::complex<T>;
const Tensor* x = ctx.Input<Tensor>("X");
Tensor* out = ctx.Output<Tensor>("Out");
out->mutable_data<C>(ctx.GetPlace());
const size_t x_rank = x->dims().size();
const size_t out_rank = out->dims().size();
const int n_fft = ctx.Attr<int>("n_fft");
const int hop_length = ctx.Attr<int>("hop_length");
const bool normalized = ctx.Attr<bool>("normalized");
const bool onesided = ctx.Attr<bool>("onesided");
const int n_frames = out->dims()[out_rank - 1];
const int seq_length = x->dims()[x_rank - 1];
auto& dev_ctx = ctx.device_context<DeviceContext>();
std::vector<int64_t> axes = {1};
// Frame
Tensor frames;
framework::DDim frames_dims(out->dims());
frames_dims.at(axes.back()) = n_fft;
frames.mutable_data<T>(frames_dims, ctx.GetPlace());
FrameFunctor<DeviceContext, T>()(dev_ctx, x, &frames, seq_length, n_fft,
n_frames, hop_length, /*is_grad*/ false);
// FFTR2C
FFTNormMode normalization;
if (normalized) {
normalization = get_norm_from_string("ortho", true);
} else {
normalization = get_norm_from_string("backward", true);
}
FFTR2CFunctor<DeviceContext, T, C> fft_r2c_func;
if (onesided) {
fft_r2c_func(dev_ctx, &frames, out, axes, normalization, true);
} else {
framework::DDim onesided_dims(out->dims());
const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1;
onesided_dims.at(axes.back()) = onesided_axis_size;
Tensor onesided_out;
onesided_out.mutable_data<C>(onesided_dims, ctx.GetPlace());
fft_r2c_func(dev_ctx, &frames, &onesided_out, axes, normalization, true);
fill_conj<DeviceContext, C>(dev_ctx, &onesided_out, out, axes);
}
}
};
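// Normalization note (stated as an assumption about get_norm_from_string):
// with normalized = true the forward transform is scaled by 1 / sqrt(n_fft)
// ("ortho"), e.g. n_fft = 512 -> a factor of roughly 0.0442; with
// normalized = false ("backward") the forward transform is left unscaled.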
template <typename DeviceContext, typename T>
class StftGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using C = paddle::platform::complex<T>;
auto& dev_ctx = ctx.device_context<DeviceContext>();
const auto* dy = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const size_t dy_rank = dy->dims().size();
const size_t dx_rank = dx->dims().size();
const int n_fft = ctx.Attr<int>("n_fft");
const int hop_length = ctx.Attr<int>("hop_length");
const bool normalized = ctx.Attr<bool>("normalized");
const bool onesided = ctx.Attr<bool>("onesided");
const int n_frames = dy->dims()[dy_rank - 1];
const int seq_length = dx->dims()[dx_rank - 1];
std::vector<int64_t> axes = {1};
Tensor d_frames;
framework::DDim d_frames_dims(dy->dims());
d_frames_dims.at(axes.back()) = n_fft;
d_frames.mutable_data<T>(d_frames_dims, ctx.GetPlace());
Tensor complex_d_frames;
complex_d_frames.mutable_data<C>(d_frames_dims, ctx.GetPlace());
// dy -> d_frames
FFTNormMode normalization;
if (normalized) {
normalization = get_norm_from_string("ortho", true);
} else {
normalization = get_norm_from_string("backward", true);
}
FFTC2CFunctor<DeviceContext, C, C> fft_c2c_func;
if (!onesided) {
fft_c2c_func(dev_ctx, dy, &complex_d_frames, axes, normalization, false);
} else {
Tensor full_dy;
full_dy.mutable_data<C>(d_frames_dims, ctx.GetPlace());
auto zero_length = static_cast<int>(full_dy.dims().at(axes.back()) -
dy->dims().at(axes.back()));
auto rank = dy->dims().size();
std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length;
phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_d_frames, axes, normalization,
false);
}
framework::TransComplexToReal(
framework::TransToProtoVarType(d_frames.dtype()),
framework::TransToProtoVarType(complex_d_frames.dtype()),
complex_d_frames, &d_frames);
// d_frames -> dx
FrameFunctor<DeviceContext, T>()(dev_ctx, &d_frames, dx, seq_length, n_fft,
n_frames, hop_length, /*is_grad*/ true);
}
};
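// Sketch of the zero-padding used in the onesided gradient path above. The
// (before_i, after_i) per-dimension layout of `pads` is how this code fills
// it and is assumed to match phi::funcs::PaddingFunctor:
//   dy: [N, n_fft / 2 + 1, num_frames], axes = {1}
//   zero_length = n_fft - (n_fft / 2 + 1)
//   pads = {0, 0,             // dim 0: no padding
//           0, zero_length,   // dim 1: pad after the onesided bins
//           0, 0};            // dim 2: no padding
// so full_dy has shape [N, n_fft, num_frames] before the inverse C2C.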
} // namespace operators
} // namespace paddle
...@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext( ...@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
cuda_ctx, cuda_ctx,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext.")); "Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
// Note: A trick method to init context, why GetAllocator interface
// needs a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance() dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream()) .GetAllocator(p)
.get()); .get());
cuda_ctx->PartialInitWithAllocator(); cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator( dev_ctx->SetGenerator(
...@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() { ...@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
phi::GPUContext::PartialInitWithoutAllocator(); phi::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place)); cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
workspace_.reset(new phi::DnnWorkspaceHandle( auto& instance = memory::allocation::AllocatorFacade::Instance();
memory::allocation::AllocatorFacade::Instance() instance.SetDefaultStream(place, phi::GPUContext::stream());
.GetAllocator(place, phi::GPUContext::stream()) workspace_.reset(
.get())); new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
} }
CUDADeviceContext::~CUDADeviceContext() = default; CUDADeviceContext::~CUDADeviceContext() = default;
...@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const { ...@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
// return workspace_.get(); // return workspace_.get();
return phi::DnnWorkspaceHandle( return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance() memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace(), phi::GPUContext::stream()) .GetAllocator(GetPlace())
.get()); .get());
} }
return phi::GPUContext::cudnn_workspace_handle(); return phi::GPUContext::cudnn_workspace_handle();
......
...@@ -118,8 +118,9 @@ float CpuUtilization::GetCpuUtilization() { ...@@ -118,8 +118,9 @@ float CpuUtilization::GetCpuUtilization() {
float busy_time = (system_kernel_time_end - system_kernel_time_start) + float busy_time = (system_kernel_time_end - system_kernel_time_start) +
(system_user_time_end - system_user_time_start); (system_user_time_end - system_user_time_start);
float idle_time = system_idle_time_end - system_idle_time_start; float idle_time = system_idle_time_end - system_idle_time_start;
cpu_utilization = busy_time / (busy_time + idle_time); if (busy_time + idle_time != 0) {
cpu_utilization = busy_time / (busy_time + idle_time);
}
#elif defined(__linux__) #elif defined(__linux__)
float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) + float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) +
(system_tms_end_.tms_stime - system_tms_start_.tms_stime) + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) +
...@@ -127,7 +128,9 @@ float CpuUtilization::GetCpuUtilization() { ...@@ -127,7 +128,9 @@ float CpuUtilization::GetCpuUtilization() {
(irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) +
(steal_end_ - steal_start_); (steal_end_ - steal_start_);
float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_);
cpu_utilization = busy_time / (busy_time + idle_time); if (busy_time + idle_time != 0) {
cpu_utilization = busy_time / (busy_time + idle_time);
}
#else #else
LOG(WARNING) LOG(WARNING)
<< "Current System is not supported to get system cpu utilization" << "Current System is not supported to get system cpu utilization"
...@@ -148,13 +151,16 @@ float CpuUtilization::GetCpuCurProcessUtilization() { ...@@ -148,13 +151,16 @@ float CpuUtilization::GetCpuCurProcessUtilization() {
uint64_t end = FileTimeToUint64(end_); uint64_t end = FileTimeToUint64(end_);
float busy_time = (process_kernel_time_end - process_kernel_time_start) + float busy_time = (process_kernel_time_end - process_kernel_time_start) +
(process_user_time_end - process_user_time_start); (process_user_time_end - process_user_time_start);
cpu_process_utilization = busy_time / (end - start); if (end - start != 0) {
LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; cpu_process_utilization = busy_time / (end - start);
}
#elif defined(__linux__) #elif defined(__linux__)
float busy_time = float busy_time =
(process_tms_end_.tms_utime - process_tms_start_.tms_utime) + (process_tms_end_.tms_utime - process_tms_start_.tms_utime) +
(process_tms_end_.tms_stime - process_tms_start_.tms_stime); (process_tms_end_.tms_stime - process_tms_start_.tms_stime);
cpu_process_utilization = busy_time / (end_ - start_); if (end_ - start_ != 0) {
cpu_process_utilization = busy_time / (end_ - start_);
}
#else #else
LOG(WARNING) LOG(WARNING)
<< "Current System is not supported to get process cpu utilization" << "Current System is not supported to get process cpu utilization"
......
...@@ -44,6 +44,14 @@ std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) { ...@@ -44,6 +44,14 @@ std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) {
return std::unique_ptr<Profiler>(new Profiler(options)); return std::unique_ptr<Profiler>(new Profiler(options));
} }
bool Profiler::IsCuptiSupported() {
bool supported = false;
#ifdef PADDLE_WITH_CUPTI
supported = true;
#endif
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) { Profiler::Profiler(const ProfilerOptions& options) {
options_ = options; options_ = options;
std::bitset<32> trace_switch(options_.trace_switch); std::bitset<32> trace_switch(options_.trace_switch);
......
...@@ -43,6 +43,8 @@ class Profiler { ...@@ -43,6 +43,8 @@ class Profiler {
public: public:
static std::unique_ptr<Profiler> Create(const ProfilerOptions& options); static std::unique_ptr<Profiler> Create(const ProfilerOptions& options);
static bool IsCuptiSupported();
void Prepare(); void Prepare();
void Start(); void Start();
......
...@@ -18,7 +18,6 @@ limitations under the License. */ ...@@ -18,7 +18,6 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/cupti.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <ctime> #include <ctime>
#include <string> #include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h" #include "paddle/fluid/platform/os_info.h"
......
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -32,12 +33,14 @@ limitations under the License. */ ...@@ -32,12 +33,14 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h" #include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h"
#include "pybind11/detail/internals.h"
namespace paddle { namespace paddle {
namespace pybind { namespace pybind {
...@@ -150,12 +153,22 @@ bool PyCheckTensor(PyObject* obj) { ...@@ -150,12 +153,22 @@ bool PyCheckTensor(PyObject* obj) {
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
PADDLE_ENFORCE_EQ( auto& api = pybind11::detail::npy_api::get();
self->tensor.initialized(), true, if (!self->tensor.impl()) {
platform::errors::InvalidArgument( Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
"Tensor data of %s is Empty that indicates we have null tensor for " Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
"now, please check if it has no data and initialize it first.", py_dims[0] = 0;
self->tensor.name())); py_strides[0] = 0;
PyObject* array = api.PyArray_NewFromDescr_(
api.PyArray_Type_,
api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_FLOAT_), 1,
py_dims, py_strides, nullptr,
pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr);
return array;
}
auto tensor_dims = self->tensor.shape(); auto tensor_dims = self->tensor.shape();
auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type()); auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type());
auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type()); auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type());
...@@ -167,7 +180,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, ...@@ -167,7 +180,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
py_strides[i] = sizeof_dtype * numel; py_strides[i] = sizeof_dtype * numel;
numel *= py_dims[i]; numel *= py_dims[i];
} }
auto& api = pybind11::detail::npy_api::get();
PyObject* array = api.PyArray_NewFromDescr_( PyObject* array = api.PyArray_NewFromDescr_(
api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype), api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype),
tensor_dims.size(), py_dims, py_strides, nullptr, tensor_dims.size(), py_dims, py_strides, nullptr,
...@@ -175,6 +188,10 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args, ...@@ -175,6 +188,10 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_, pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr); nullptr);
if (!self->tensor.impl()->initialized()) {
return array;
}
if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) { if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
auto dense_tensor = auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl()); std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
...@@ -213,6 +230,20 @@ static PyObject* tensor_method__is_initialized(TensorObject* self, ...@@ -213,6 +230,20 @@ static PyObject* tensor_method__is_initialized(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* tensor_method__is_dense_tensor_hold_allocation(
TensorObject* self, PyObject* args, PyObject* kwargs) {
EAGER_TRY
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
if (dense_tensor) {
return ToPyObject(dense_tensor->IsInitialized());
} else {
return ToPyObject(false);
}
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -552,10 +583,13 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, ...@@ -552,10 +583,13 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
} }
if (op_type == "slice") { if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(), out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(), paddle::experimental::Tensor(), {}, {},
std::move(attrs)); std::move(attrs));
} else if (op_type == "strided_slice") { } else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs); out = strided_slice_dygraph_function(
self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(), paddle::experimental::Tensor(), {},
{}, {}, attrs);
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which " "Slice is only support slice and strided_slice, but we got %s which "
...@@ -604,6 +638,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, ...@@ -604,6 +638,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
auto select_index = paddle::experimental::Tensor( auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName()); egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>(); auto idx_tensor = std::make_shared<phi::DenseTensor>();
select_index.set_impl(idx_tensor);
auto* dev_ctx = platform::DeviceContextPool::Instance().Get( auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace()); egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx, paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
...@@ -617,6 +652,216 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, ...@@ -617,6 +652,216 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
VLOG(4) << "Call __setitem_eager_tensor";
auto self_tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
PyObject* _index = PyTuple_GET_ITEM(args, 0);
PyObject* value_obj = PyTuple_GET_ITEM(args, 1);
  // NOTE(zhiqiu): PyTuple_Pack increases the refcount of its arguments,
  // while PyTuple_New does not; see
// https://github.com/python/cpython/blob/24b63c695ae0a95b06379eaadace66735abac1e2/Objects/tupleobject.c#L251
PyObject* index_ptr =
!PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index_ptr, &_index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index_ptr);
VLOG(4) << "Call Py_DECREF";
}
});
  // TODO(pangyoki) add inplace(BumpInplaceVersion) if needed
  // 1. Check arguments
bool parse_index = true;
// Check whether _index can be parsed.
const int size = PyTuple_GET_SIZE(index_ptr);
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index_ptr, dim);
if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) ||
slice_item == Py_Ellipsis || slice_item == Py_None)) {
parse_index = false;
break;
}
}
// 2. Call op set_value to speed up if the condition is met,
// otherwise call TensorToPyArray.
// TODO(liym27): Try not to call TensorToPyArray because it always
  // copies data to cpu place, which reduces performance.
if (parse_index) {
std::vector<int> axes, starts, ends, steps, decrease_axes, none_axes,
infer_flags, list_select_idxs;
// if index is a list, list_select_flag will be true
bool list_select_flag = false;
ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, &steps,
&decrease_axes, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
framework::AttributeMap attrs = {{"axes", axes},
{"starts", starts},
{"ends", ends},
{"steps", steps},
{"decrease_axes", decrease_axes},
{"none_axes", none_axes}};
if (egr::Controller::Instance().HasGrad()) {
PADDLE_ENFORCE_EQ(
egr::egr_utils_api::IsLeafTensor(self->tensor) &&
!egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(),
false, platform::errors::InvalidArgument(
"Leaf Tensor (%s) that doesn't stop gradient can't use "
"inplace strategy.",
self->tensor.name()));
}
paddle::experimental::Tensor value_tensor;
if (PyCheckTensor(value_obj)) {
value_tensor = reinterpret_cast<TensorObject*>(value_obj)->tensor;
// pass the stop_gradient from value to tensor
if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() &&
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) {
egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false);
}
} else if (py::isinstance<py::array>(value_obj)) {
paddle::experimental::Tensor value_tensor_tmp(
std::make_shared<phi::DenseTensor>(),
egr::Controller::Instance().GenerateUniqueName());
py::object value_obj_tmp(py::handle(value_obj), true);
py::object value = value_obj_tmp;
if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
if (!py::isinstance<py::array_t<float>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<float>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::FLOAT64) {
if (!py::isinstance<py::array_t<double>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<double>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT32) {
if (!py::isinstance<py::array_t<int32_t>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<int32_t>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT64) {
if (!py::isinstance<py::array_t<int64_t>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<int64_t>(value_obj_tmp);
}
} else if (self->tensor.dtype() == paddle::experimental::DataType::BOOL) {
if (!py::isinstance<py::array_t<bool>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<bool>(value_obj_tmp);
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"When assign a numpy.np value to a paddle.Tensor, "
"the data type of the paddle.Tensor must be bool, "
"float32, int32 or int64, "
"please check the type of tensor."));
}
if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, platform::Place(platform::CUDAPlace(0)), false);
#else
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, platform::Place(platform::CPUPlace()), false);
#endif
} else {
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, value_tensor_tmp.inner_place(), false);
}
value_tensor = value_tensor_tmp;
} else {
py::object value_obj_tmp(py::handle(value_obj), true);
// convert the value to self data type
if (py::isinstance<py::float_>(value_obj_tmp) ||
py::isinstance<py::int_>(value_obj_tmp) ||
py::isinstance<py::bool_>(value_obj_tmp)) {
if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
attrs["fp32_values"] =
std::vector<float>{value_obj_tmp.cast<float>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::FLOAT64) {
attrs["fp64_values"] =
std::vector<double>{value_obj_tmp.cast<double>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT32) {
attrs["int32_values"] =
std::vector<int32_t>{value_obj_tmp.cast<int32_t>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT64) {
attrs["int64_values"] =
std::vector<int64_t>{value_obj_tmp.cast<int64_t>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::BOOL) {
attrs["bool_values"] = std::vector<int>{value_obj_tmp.cast<bool>()};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"When assign a value to a paddle.Tensor, "
"the data type of the paddle.Tensor must be bool, "
"float32, int32 or int64, "
"please check the type of tensor."));
}
attrs["shape"] = std::vector<int64_t>{1};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Value type error. The assign value allows "
"numpy.ndarray, integer, float or bool, "
"but received %s.",
Py_TYPE(value_obj)));
}
}
{
// Release gil and do tracing
py::gil_scoped_release release;
self->tensor = set_value_dygraph_function(self->tensor, value_tensor, {},
{}, {}, attrs);
}
} else {
auto self_numpy = TensorToPyArray(*self_tensor);
VLOG(4) << "parse_index is false";
if (PyCheckTensor(_index)) {
VLOG(4) << "index is tensor";
auto index_tensor = static_cast<phi::DenseTensor*>(
reinterpret_cast<TensorObject*>(_index)->tensor.impl().get());
auto index_numpy = TensorToPyArray(*index_tensor);
self_numpy[index_numpy] = py::object(py::handle(value_obj), true);
} else {
VLOG(4) << "index is not tensor";
self_numpy[_index] = py::object(py::handle(value_obj), true);
}
if (self->tensor.place() == paddle::PlaceType::kUNK) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
SetTensorFromPyArray(self_tensor, self_numpy,
platform::Place(platform::CUDAPlace(0)), false);
#else
SetTensorFromPyArray(self_tensor, self_numpy,
platform::Place(platform::CPUPlace()), false);
#endif
} else {
SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(),
false);
}
}
Py_INCREF(Py_None);
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
EAGER_TRY EAGER_TRY
...@@ -825,6 +1070,10 @@ PyMethodDef variable_methods[] = { ...@@ -825,6 +1070,10 @@ PyMethodDef variable_methods[] = {
{"_is_initialized", {"_is_initialized",
(PyCFunction)(void (*)(void))tensor_method__is_initialized, (PyCFunction)(void (*)(void))tensor_method__is_initialized,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"_is_dense_tensor_hold_allocation",
(PyCFunction)(
void (*)(void))tensor_method__is_dense_tensor_hold_allocation,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to, {"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_, {"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_,
...@@ -857,6 +1106,9 @@ PyMethodDef variable_methods[] = { ...@@ -857,6 +1106,9 @@ PyMethodDef variable_methods[] = {
{"_getitem_index_not_tensor", {"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor, (PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
{"__setitem_eager_tensor__",
(PyCFunction)(void (*)(void))tensor_method__setitem_eager_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook", {"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook, (PyCFunction)(void (*)(void))tensor_register_grad_hook,
METH_VARARGS | METH_KEYWORDS, NULL}, METH_VARARGS | METH_KEYWORDS, NULL},
......
...@@ -52,6 +52,12 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) { ...@@ -52,6 +52,12 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) {
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
PyObject* tensor_properties_is_leaf(TensorObject* self, void* closure) {
EAGER_TRY
return ToPyObject(egr::egr_utils_api::IsLeafTensor(self->tensor));
EAGER_CATCH_AND_THROW_RETURN_NULL
}
int tensor_properties_set_name(TensorObject* self, PyObject* value, int tensor_properties_set_name(TensorObject* self, PyObject* value,
void* closure) { void* closure) {
EAGER_TRY EAGER_TRY
...@@ -179,6 +185,7 @@ struct PyGetSetDef variable_properties[] = { ...@@ -179,6 +185,7 @@ struct PyGetSetDef variable_properties[] = {
nullptr}, nullptr},
{"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr}, {"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr},
{"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr}, {"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr},
{"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}}; {nullptr, nullptr, nullptr, nullptr, nullptr}};
} // namespace pybind } // namespace pybind
......
...@@ -386,46 +386,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) { ...@@ -386,46 +386,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result; return result;
} }
// cast numpy type form S to T, this may allocate new memory
template <class T, class S>
static py::array_t<T> CastNumpyType(py::array_t<S> array) {
if (std::is_same<T, S>::value) {
return array;
}
auto dim = array.ndim();
std::vector<py::ssize_t> result_shape(dim);
for (auto i = 0; i < dim; i++) {
result_shape[i] = array.shape(i);
}
py::array_t<T> result(result_shape);
return py::vectorize([](S s) { return static_cast<T>(s); })(array);
}
template <class T>
static py::array_t<T> CastNumpyArray(const py::object &array) {
if (py::isinstance<py::array_t<float>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<float>>());
} else if (py::isinstance<py::array_t<double>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<double>>());
} else if (py::isinstance<py::array_t<int32_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
} else if (py::isinstance<py::array_t<int64_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
} else if (py::isinstance<py::array_t<bool>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<bool>>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Value type error. The assign numpy value allows integer, float, "
"double and bool, "
"but received %s.",
Py_TYPE(array.ptr())->tp_name));
}
// can't reach here
return py::array_t<T>();
}
static imperative::NameVarBaseMap ConvertToNameVarBaseMap( static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
const PyNameVarBaseMap &map) { const PyNameVarBaseMap &map) {
imperative::NameVarBaseMap result; imperative::NameVarBaseMap result;
...@@ -854,27 +814,29 @@ void BindImperative(py::module *m_ptr) { ...@@ -854,27 +814,29 @@ void BindImperative(py::module *m_ptr) {
py::object value = value_obj; py::object value = value_obj;
if (self->DataType() == framework::proto::VarType::FP32) { if (self->DataType() == framework::proto::VarType::FP32) {
if (!py::isinstance<py::array_t<float>>(value_obj)) { if (!py::isinstance<py::array_t<float>>(value_obj)) {
value = CastNumpyArray<float>(value_obj); value = pybind11::detail::CastNumpyArray<float>(value_obj);
} }
} else if (self->DataType() == } else if (self->DataType() ==
framework::proto::VarType::FP64) { framework::proto::VarType::FP64) {
if (!py::isinstance<py::array_t<double>>(value_obj)) { if (!py::isinstance<py::array_t<double>>(value_obj)) {
value = CastNumpyArray<double>(value_obj); value = pybind11::detail::CastNumpyArray<double>(value_obj);
} }
} else if (self->DataType() == } else if (self->DataType() ==
framework::proto::VarType::INT32) { framework::proto::VarType::INT32) {
if (!py::isinstance<py::array_t<int32_t>>(value_obj)) { if (!py::isinstance<py::array_t<int32_t>>(value_obj)) {
value = CastNumpyArray<int32_t>(value_obj); value =
pybind11::detail::CastNumpyArray<int32_t>(value_obj);
} }
} else if (self->DataType() == } else if (self->DataType() ==
framework::proto::VarType::INT64) { framework::proto::VarType::INT64) {
if (!py::isinstance<py::array_t<int64_t>>(value_obj)) { if (!py::isinstance<py::array_t<int64_t>>(value_obj)) {
value = CastNumpyArray<int64_t>(value_obj); value =
pybind11::detail::CastNumpyArray<int64_t>(value_obj);
} }
} else if (self->DataType() == } else if (self->DataType() ==
framework::proto::VarType::BOOL) { framework::proto::VarType::BOOL) {
if (!py::isinstance<py::array_t<bool>>(value_obj)) { if (!py::isinstance<py::array_t<bool>>(value_obj)) {
value = CastNumpyArray<bool>(value_obj); value = pybind11::detail::CastNumpyArray<bool>(value_obj);
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
......
...@@ -38,7 +38,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = { ...@@ -38,7 +38,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"assign", {"X"}}, {"assign", {"X"}},
{"reshape2", {"X", "Shape"}}, {"reshape2", {"X", "Shape"}},
{"expand", {"X", "ExpandTimes"}}, {"expand", {"X", "ExpandTimes"}},
{"slice", {"Input", "StartsTensor", "EndsTensor"}}, {"slice",
{"Input", "StartsTensor", "EndsTensor", "StartsTensorList",
"EndsTensorList"}},
{"strided_slice",
{"Input", "StartsTensor", "EndsTensor", "StridesTensor",
"StartsTensorList", "EndsTensorList", "StridesTensorList"}},
{"set_value",
{"Input", "ValueTensor", "StartsTensorList", "EndsTensorList",
"StepsTensorList"}},
{"fake_quantize_dequantize_moving_average_abs_max", {"fake_quantize_dequantize_moving_average_abs_max",
{"X", "InScale", "InAccum", "InState"}}, {"X", "InScale", "InAccum", "InState"}},
{"nll_loss", {"X", "Label", "Weight"}}, {"nll_loss", {"X", "Label", "Weight"}},
...@@ -89,6 +97,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = { ...@@ -89,6 +97,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs", {"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs",
"CustomDistAlias", "CustomDistAliasProbs"}}, "CustomDistAlias", "CustomDistAliasProbs"}},
{"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}}, {"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}},
{"group_norm", {"X", "Scale", "Bias"}},
}; };
// NOTE(zhiqiu): Like op_ins_map. // NOTE(zhiqiu): Like op_ins_map.
......
...@@ -3322,6 +3322,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3322,6 +3322,7 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<paddle::platform::Profiler>(m, "_Profiler") py::class_<paddle::platform::Profiler>(m, "_Profiler")
.def("create", &paddle::platform::Profiler::Create, .def("create", &paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("prepare", .def("prepare",
[](paddle::platform::Profiler *profiler) { [](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder(); platform::EnableHostEventRecorder();
......
...@@ -52,6 +52,46 @@ constexpr int NPY_UINT16_ = 4; ...@@ -52,6 +52,46 @@ constexpr int NPY_UINT16_ = 4;
constexpr int NPY_COMPLEX64 = 14; constexpr int NPY_COMPLEX64 = 14;
constexpr int NPY_COMPLEX128 = 15; constexpr int NPY_COMPLEX128 = 15;
// cast numpy type from S to T; this may allocate new memory
template <class T, class S>
static py::array_t<T> CastNumpyType(py::array_t<S> array) {
if (std::is_same<T, S>::value) {
return array;
}
auto dim = array.ndim();
std::vector<py::ssize_t> result_shape(dim);
for (auto i = 0; i < dim; i++) {
result_shape[i] = array.shape(i);
}
py::array_t<T> result(result_shape);
return py::vectorize([](S s) { return static_cast<T>(s); })(array);
}
template <class T>
static py::array_t<T> CastNumpyArray(const py::object &array) {
if (py::isinstance<py::array_t<float>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<float>>());
} else if (py::isinstance<py::array_t<double>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<double>>());
} else if (py::isinstance<py::array_t<int32_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
} else if (py::isinstance<py::array_t<int64_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
} else if (py::isinstance<py::array_t<bool>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<bool>>());
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Value type error. The assign numpy value allows integer, float, "
"double and bool, "
"but received %s.",
Py_TYPE(array.ptr())->tp_name));
}
// can't reach here
return py::array_t<T>();
}
// Note: Since float16 is not a builtin type in C++, we register // Note: Since float16 is not a builtin type in C++, we register
// paddle::platform::float16 as numpy.float16. // paddle::platform::float16 as numpy.float16.
// Ref: https://github.com/pybind/pybind11/issues/1776 // Ref: https://github.com/pybind/pybind11/issues/1776
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/pad3d_kernel.h" #include "paddle/phi/kernels/pad3d_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -574,5 +575,13 @@ void Pad3dKernel(const Context& dev_ctx, ...@@ -574,5 +575,13 @@ void Pad3dKernel(const Context& dev_ctx,
} // namespace phi } // namespace phi
PD_REGISTER_KERNEL( PD_REGISTER_KERNEL(pad3d,
pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {} CPU,
ALL_LAYOUT,
phi::Pad3dKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
...@@ -50,11 +50,15 @@ struct exponential_transform { ...@@ -50,11 +50,15 @@ struct exponential_transform {
HOSTDEVICE inline T operator()(T val) const { HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
if (std::is_same<T, double>::value) { T log = -std::numeric_limits<T>::epsilon() / 2;
return static_cast<T>(-1.0) / lambda_ * log(val); if (val < static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2) {
} else { if (std::is_same<T, double>::value) {
return static_cast<T>(-1.0) / lambda_ * __logf(val); log = logf(val);
} else {
log = __logf(val);
}
} }
return static_cast<T>(-1.0) / lambda_ * log;
#else #else
return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val); return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif #endif
...@@ -114,13 +118,19 @@ struct normal_transform { ...@@ -114,13 +118,19 @@ struct normal_transform {
namespace kps = phi::kps; namespace kps = phi::kps;
/*********************** Distribution Function *************************/ /*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T> template <typename T>
struct normal_distribution; struct normal_distribution;
#if defined(__NVCC__) #if defined(__NVCC__)
template <typename T>
struct uniform_distribution {
__device__ inline T operator()(curandStatePhilox4_32_10_t *state) const {
return static_cast<T>(curand_uniform(state));
}
static constexpr int kReturnsCount = 1;
};
template <> template <>
struct uniform_distribution<float> { struct uniform_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
...@@ -177,6 +187,14 @@ struct normal_distribution<double> { ...@@ -177,6 +187,14 @@ struct normal_distribution<double> {
}; };
#else #else
template <typename T>
struct uniform_distribution {
__device__ inline T operator()(hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform(state);
}
static constexpr int kReturnsCount = 1;
};
template <> template <>
struct uniform_distribution<float> { struct uniform_distribution<float> {
__device__ inline float4 operator()( __device__ inline float4 operator()(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/for_range.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace phi {
namespace funcs {
template <typename T>
struct IsComplex : public std::false_type {};
template <>
struct IsComplex<::phi::dtype::complex<float>> : public std::true_type {};
template <>
struct IsComplex<::phi::dtype::complex<double>> : public std::true_type {};
template <typename InputIterator, typename OutputIterator, typename BinaryOp>
static void CubInclusiveScan(InputIterator x_iter,
OutputIterator y_iter,
size_t n,
BinaryOp op,
const phi::GPUContext &dev_ctx) {
paddle::memory::allocation::AllocationPtr allocation;
void *temp_storage = nullptr;
size_t temp_storage_bytes = 0;
for (size_t i = 0; i < 2; ++i) {
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::InclusiveScan(temp_storage,
temp_storage_bytes,
x_iter,
y_iter,
op,
static_cast<int>(n),
dev_ctx.stream()));
if (i == 0 && temp_storage_bytes > 0) {
allocation =
paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
temp_storage = allocation->ptr();
}
}
}
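// Minimal usage sketch of the two-pass CUB pattern above: the first
// InclusiveScan call only computes temp_storage_bytes (temp_storage is still
// nullptr), the second call performs the actual scan with the allocated
// workspace. The device pointers below are hypothetical.
//
//   const float *d_x = ...;  // n input values on the device
//   float *d_y = ...;        // n output values on the device
//   CubInclusiveScan(d_x, d_y, n, cub::Sum(), dev_ctx);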
template <typename T>
static auto MakeThrustReverseIterator(T *x) {
return thrust::reverse_iterator<thrust::device_ptr<T>>(
thrust::device_pointer_cast(x));
}
template <typename T, typename BinaryOp, bool kReverse>
struct InclusiveScanOuterOrMidDimFunctor {
HOSTDEVICE InclusiveScanOuterOrMidDimFunctor(
const T *x, T *y, size_t mid_dim, size_t inner_dim, T init, BinaryOp op)
: x_(x),
y_(y),
mid_dim_(mid_dim),
inner_dim_(inner_dim),
init_(init),
op_(op) {}
HOSTDEVICE void operator()(size_t idx) const {
auto outer_idx = idx / inner_dim_;
auto inner_idx = idx % inner_dim_;
if (kReverse) {
idx = outer_idx * mid_dim_ * inner_dim_ + (mid_dim_ - 1) * inner_dim_ +
inner_idx;
} else {
idx = outer_idx * mid_dim_ * inner_dim_ + inner_idx;
}
auto x_ptr = x_ + idx;
auto y_ptr = y_ + idx;
T acc_value = init_;
for (size_t i = 0; i < mid_dim_; ++i) {
acc_value = op_(acc_value, *x_ptr);
*y_ptr = acc_value;
if (kReverse) {
x_ptr -= inner_dim_;
y_ptr -= inner_dim_;
} else {
x_ptr += inner_dim_;
y_ptr += inner_dim_;
}
}
}
private:
const T *x_;
T *y_;
size_t mid_dim_;
size_t inner_dim_;
T init_;
BinaryOp op_;
};
template <typename T,
typename BinaryOp,
size_t kThreadNumX,
size_t kThreadNumY,
bool kReverse>
static __global__ void InclusiveScanInnerDimCUDAKernel(
const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) {
using RealT = phi::dtype::Real<T>;
constexpr auto kSharedBufferSize =
IsComplex<T>::value ? 4 * kThreadNumX : 2 * kThreadNumX;
__shared__ RealT sbuf[kThreadNumY][kSharedBufferSize];
T *row_buf = reinterpret_cast<T *>(sbuf[threadIdx.y]);
size_t block_row = static_cast<size_t>(blockIdx.x * kThreadNumY);
size_t block_row_stride = static_cast<size_t>(gridDim.x * kThreadNumY);
for (; block_row < num_rows; block_row += block_row_stride) {
size_t row = block_row + threadIdx.y;
T block_total = init;
const T *row_x = x + row * row_size;
T *row_y = y + row * row_size;
for (size_t block_col = 0; block_col < row_size;
block_col += 2 * kThreadNumX) {
size_t col1, col2;
if (kReverse) {
col1 = row_size - 1 - block_col - threadIdx.x;
col2 = col1 - kThreadNumX;
} else {
col1 = block_col + threadIdx.x;
col2 = col1 + kThreadNumX;
}
if (row < num_rows) {
if (col1 < row_size) {
row_buf[threadIdx.x] = row_x[col1];
} else {
row_buf[threadIdx.x] = init;
}
if (col2 < row_size) {
row_buf[kThreadNumX + threadIdx.x] = row_x[col2];
} else {
row_buf[kThreadNumX + threadIdx.x] = init;
}
if (threadIdx.x == 0) {
row_buf[0] = op(row_buf[0], block_total);
}
}
__syncthreads();
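      // Up-sweep (reduce) phase of a work-efficient scan over the
      // 2 * kThreadNumX elements this thread row holds in shared memory.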
for (size_t s = kThreadNumX, d = 1; s >= 1; s >>= 1, d <<= 1) {
if (row < num_rows && threadIdx.x < s) {
size_t offset = (2 * threadIdx.x + 1) * d - 1;
row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
}
__syncthreads();
}
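      // Down-sweep phase: propagate the partial sums so row_buf ends up
      // holding an inclusive scan of the tile.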
for (size_t s = 2, d = kThreadNumX / 2; d >= 1; s <<= 1, d >>= 1) {
if (row < num_rows && threadIdx.x < s - 1) {
size_t offset = 2 * (threadIdx.x + 1) * d - 1;
row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
}
__syncthreads();
}
if (row < num_rows) {
if (col1 < row_size) row_y[col1] = row_buf[threadIdx.x];
if (col2 < row_size) row_y[col2] = row_buf[kThreadNumX + threadIdx.x];
}
block_total = row_buf[2 * kThreadNumX - 1];
__syncthreads();
}
}
}
template <typename T, typename BinaryOp>
static void InclusiveScanInnerDim(const T *x,
T *y,
size_t outer_dim,
size_t inner_dim,
T init,
BinaryOp op,
bool reverse,
const phi::GPUContext &dev_ctx) {
constexpr size_t kThreadNumX = 16;
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
T,
BinaryOp,
kThreadNumX,
kThreadNumY,
/*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
x, y, outer_dim, inner_dim, init, op);
} else {
InclusiveScanInnerDimCUDAKernel<
T,
BinaryOp,
kThreadNumX,
kThreadNumY,
/*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
x, y, outer_dim, inner_dim, init, op);
}
}
template <typename T, typename BinaryOp>
void InclusiveScan(const T *x,
T *y,
size_t outer_dim,
size_t mid_dim,
size_t inner_dim,
T init,
BinaryOp op,
bool reverse,
const phi::GPUContext &dev_ctx) {
if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
if (outer_dim == 1 && inner_dim == 1) {
if (reverse) {
auto x_reverse_iter = MakeThrustReverseIterator(x + mid_dim);
auto y_reverse_iter = MakeThrustReverseIterator(y + mid_dim);
CubInclusiveScan(x_reverse_iter, y_reverse_iter, mid_dim, op, dev_ctx);
} else {
CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
}
} else if (inner_dim != 1) {
phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx,
outer_dim * inner_dim);
if (reverse) {
for_range(
InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
x, y, mid_dim, inner_dim, init, op));
} else {
for_range(
InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/false>(
x, y, mid_dim, inner_dim, init, op));
}
} else {
InclusiveScanInnerDim<T, BinaryOp>(
x, y, outer_dim, mid_dim, init, op, reverse, dev_ctx);
}
}
} // namespace funcs
} // namespace phi
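As a quick orientation for this new header, the sketch below shows one way its entry point could be invoked to compute row-wise prefix sums. The helper name RowwiseCumsum, the raw device pointers, and the assumption that a live phi::GPUContext is at hand are illustrative only and not part of this patch; the multinomial kernel later in this commit uses the same call shape for its cumulative probabilities.

#include <functional>

#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/inclusive_scan.h"

// Sketch: inclusive prefix sums along the last axis of a row-major
// [rows, cols] device buffer. d_in / d_out are assumed to be device
// pointers already allocated by the caller (hypothetical helper).
void RowwiseCumsum(const float *d_in, float *d_out, size_t rows, size_t cols,
                   const phi::GPUContext &dev_ctx) {
  phi::funcs::InclusiveScan<float, std::plus<float>>(
      d_in, d_out,
      /*outer_dim=*/rows,  // independent rows scanned in parallel
      /*mid_dim=*/cols,    // the axis being scanned
      /*inner_dim=*/1,     // 1 picks the cub path (rows == 1) or the tiled kernel
      /*init=*/static_cast<float>(0), std::plus<float>(),
      /*reverse=*/false, dev_ctx);
}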
...@@ -17,11 +17,10 @@ ...@@ -17,11 +17,10 @@
#include <thrust/reverse.h> #include <thrust/reverse.h>
#include <thrust/scan.h> #include <thrust/scan.h>
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h" #include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_grad_kernel.h" #include "paddle/phi/kernels/masked_select_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
template <typename MT, typename InT, typename OutT> template <typename MT, typename InT, typename OutT>
...@@ -50,7 +49,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx, ...@@ -50,7 +49,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
const DenseTensor& mask, const DenseTensor& mask,
DenseTensor* x_grad) { DenseTensor* x_grad) {
auto mask_size = mask.numel(); auto mask_size = mask.numel();
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace()); dev_ctx.template Alloc<T>(x_grad);
if (mask_size <= 0) return; if (mask_size <= 0) return;
using Functor = MaskedSelectGradFunctor<bool, T, T>; using Functor = MaskedSelectGradFunctor<bool, T, T>;
phi::funcs::SelectKernel<bool, T, T, 2, Functor>( phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
......
...@@ -23,11 +23,32 @@ limitations under the License. */ ...@@ -23,11 +23,32 @@ limitations under the License. */
#include <thrust/scan.h> #include <thrust/scan.h>
#include <thrust/transform.h> #include <thrust/transform.h>
#include "paddle/fluid/platform/transform.h" #ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/arg_min_max_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/inclusive_scan.h"
#include "paddle/phi/kernels/funcs/multinomial_functor.h" #include "paddle/phi/kernels/funcs/multinomial_functor.h"
#include "paddle/phi/kernels/top_k_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/transform.h"
DECLARE_bool(use_curand);
namespace phi { namespace phi {
...@@ -57,12 +78,12 @@ template <typename T> ...@@ -57,12 +78,12 @@ template <typename T>
__global__ void GetCumulativeProbs(T* norm_probs_data, __global__ void GetCumulativeProbs(T* norm_probs_data,
int64_t num_distributions, int64_t num_distributions,
int64_t num_categories, int64_t num_categories,
T* cumulative_probs) { T* cumulative_probs_data) {
int id = blockIdx.x; int id = blockIdx.x;
thrust::inclusive_scan(thrust::device, thrust::inclusive_scan(thrust::device,
norm_probs_data + id * num_categories, norm_probs_data + id * num_categories,
norm_probs_data + (id + 1) * num_categories, norm_probs_data + (id + 1) * num_categories,
cumulative_probs + id * num_categories); cumulative_probs_data + id * num_categories);
} }
template <typename T> template <typename T>
...@@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor { ...@@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor {
}; };
template <typename T> template <typename T>
__device__ int binarySearchFunctor(T* cumulative_probs, __device__ int binarySearchFunctor(T* cumulative_probs_data,
T* norm_probs_data, T* norm_probs_data,
int num_categories, int num_categories,
T rng_number) { T rng_number) {
...@@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs, ...@@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs,
while (right - left > 0) { while (right - left > 0) {
int mid = left + (right - left) / 2; int mid = left + (right - left) / 2;
T temp_prob = cumulative_probs[mid]; T temp_prob = cumulative_probs_data[mid];
if (temp_prob < rng_number) { if (temp_prob < rng_number) {
left = mid + 1; left = mid + 1;
} else { } else {
...@@ -114,26 +135,35 @@ __global__ void sampleMultinomialWithReplacement( ...@@ -114,26 +135,35 @@ __global__ void sampleMultinomialWithReplacement(
int64_t* out_data, int64_t* out_data,
const int64_t num_distributions, const int64_t num_distributions,
const int64_t num_categories, const int64_t num_categories,
T* cumulative_probs, T* cumulative_probs_data,
T* norm_probs_data) { T* norm_probs_data,
uint64_t seed,
uint64_t offset,
bool use_curand) {
// use binary search to get the selected category sample id. // use binary search to get the selected category sample id.
// let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. // let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id].
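  // When use_curand is on, each thread seeds its own Philox state with
  // (seed, global thread index, offset) and draws a fresh uniform per
  // (distribution, sample) pair instead of reading the host-generated rng_data.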
size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x +
threadIdx.x;
// for every distribution curandStatePhilox4_32_10_t state;
int dist = blockIdx.y; curand_init(seed, idx, offset, &state);
// for every sample
int sample = blockIdx.x * blockDim.x + threadIdx.x;
if (sample < num_samples) {
T rng_number = rng_data[sample + dist * num_samples];
// Find the bucket that a uniform random number lies in int sample = blockIdx.x * blockDim.x + threadIdx.x;
int selected_category = for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) {
binarySearchFunctor<T>(cumulative_probs + dist * num_categories, if (sample < num_samples) {
norm_probs_data + dist * num_categories, T rng_number = rng_data[sample + dist * num_samples];
num_categories, if (use_curand) {
rng_number); rng_number = static_cast<T>(curand_uniform4(&state).x);
}
// Find the bucket that a uniform random number lies in
int selected_category =
binarySearchFunctor<T>(cumulative_probs_data + dist * num_categories,
norm_probs_data + dist * num_categories,
num_categories,
rng_number);
out_data[sample + dist * num_samples] = selected_category; out_data[sample + dist * num_samples] = selected_category;
}
} }
} }
...@@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx, ...@@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx,
in_data_numel * sizeof(T), in_data_numel * sizeof(T),
cudaMemcpyDeviceToHost); cudaMemcpyDeviceToHost);
#endif #endif
if (FLAGS_use_curand) {
for (size_t i = 0; i < num_distributions; ++i) {
int zero_num = 0;
for (size_t j = 0; j < num_categories; ++j) {
          T weight = cpu_in_data[i * num_categories + j];
PADDLE_ENFORCE_GE(
weight,
0,
errors::InvalidArgument(
"Each element of multinomial'input must >= 0, but got %f.",
weight));
if (weight == static_cast<T>(0)) {
zero_num++;
}
}
int valid_samples = num_categories - zero_num;
PADDLE_ENFORCE_LE(
num_samples,
valid_samples,
errors::InvalidArgument("When replacement=False, 'num_samples' "
"must less than or eaqual to the number of "
"positive item of input"));
}
// Refer to [gumbel softmax algorithm]
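    // Sampling note: dividing the non-negative weights by independent Exp(1)
    // draws and taking the arg-max (or top-k when num_samples > 1) of the
    // ratios selects indices with probability proportional to the weights
    // (the exponential / Gumbel top-k trick); the uniform draw, exponential
    // transform, element-wise division and ArgMax/TopK below implement this.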
DenseTensor rand = EmptyLike<T, Context>(dev_ctx, x);
T* rand_data = rand.data<T>();
funcs::uniform_distribution<T> dist;
funcs::exponential_transform<T> trans(1.0);
funcs::distribution_and_transform<T>(dev_ctx, &rand, dist, trans);
funcs::ForRange<Context> for_range(dev_ctx, x.numel());
for_range([rand_data, in_data] __device__(size_t idx) {
rand_data[idx] = in_data[idx] / rand_data[idx];
});
if (num_samples == 1) {
ArgMaxKernel<T, Context>(
dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out);
} else {
std::vector<int64_t> out_dim_vec = vectorize<int64_t>(out->dims());
DenseTensor value =
Empty<T, Context>(dev_ctx, ScalarArray(out_dim_vec));
TopkKernel<T, Context>(
dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out);
}
return;
}
funcs::MultinomialFunctor<T>(dev_ctx, funcs::MultinomialFunctor<T>(dev_ctx,
cpu_out_data, cpu_out_data,
...@@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx, ...@@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx,
auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor); auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor);
// number of threads in a block is min(num_categories, 512) // number of threads in a block is min(num_categories, 512)
dim3 block_norm(num_categories < 512 ? num_categories : 512); int block_size = num_categories < 512 ? num_categories : 512;
dim3 block_norm(block_size);
dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>( NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(
norm_probs_data, norm_probs_data,
...@@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx, ...@@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx,
num_categories); num_categories);
// Get cumulative probability of each distribution. It's the same function // Get cumulative probability of each distribution. It's the same function
// of // of ``cumsum`` op.
// ``cumsum`` op.
DenseTensor cumulative_probs_tensor; DenseTensor cumulative_probs_tensor;
cumulative_probs_tensor.Resize({num_distributions, num_categories}); cumulative_probs_tensor.Resize({num_distributions, num_categories});
auto* cumulative_probs = dev_ctx.template Alloc<T>(&cumulative_probs_tensor); auto* cumulative_probs_data =
dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
dim3 block_cumsum(1);
dim3 grid_cumsum(num_distributions); if (FLAGS_use_curand) {
GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>( // 'phi::funcs::InclusiveScan' has higher accuracy than
norm_probs_data, num_distributions, num_categories, cumulative_probs); // 'thrust::inclusive_scan'
funcs::InclusiveScan<T, std::plus<T>>(
/*in*/ norm_probs_data,
/*out*/ cumulative_probs_data,
/*outer_dim*/ static_cast<size_t>(num_distributions),
/*mid_dim*/ static_cast<size_t>(num_categories),
/*inner_dim*/ static_cast<size_t>(1),
/*init*/ static_cast<T>(0),
std::plus<T>(),
/*reverse=*/false,
dev_ctx);
} else {
dim3 block_cumsum(1);
dim3 grid_cumsum(num_distributions);
GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
norm_probs_data,
num_distributions,
num_categories,
cumulative_probs_data);
}
// Generate random number for each sample. // Generate random number for each sample.
std::random_device rd; std::random_device rd;
...@@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx, ...@@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx,
RandomGeneratorCudaFunctor<T>(seed)); RandomGeneratorCudaFunctor<T>(seed));
// Sample the multinomial distributions. // Sample the multinomial distributions.
dim3 block_sample(128); dim3 block(128);
dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
sampleMultinomialWithReplacement< const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
T><<<grid_sample, block_sample, 0, dev_ctx.stream()>>>(rng_data, int grid_y = std::min<int64_t>(num_distributions, prop.maxGridSize[1]);
num_samples, dim3 grid((num_samples - 1) / block.x + 1, grid_y);
out_data,
num_distributions, auto gen_cuda = dev_ctx.GetGenerator();
num_categories, size_t curand4_loop_times =
cumulative_probs, (num_distributions + 4 * grid_y - 1) / (4 * grid_y);
                                                                norm_probs_data); // 'increment' should be a multiple of 4 (curand_uniform4 yields four values per call)
uint64_t increment = curand4_loop_times * 4;
auto seed_offset = gen_cuda->IncrementOffset(increment);
sampleMultinomialWithReplacement<T><<<grid, block, 0, dev_ctx.stream()>>>(
rng_data,
num_samples,
out_data,
num_distributions,
num_categories,
cumulative_probs_data,
norm_probs_data,
seed_offset.first,
seed_offset.second,
FLAGS_use_curand);
} }
} // namespace phi } // namespace phi
......
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
namespace phi { namespace phi {
...@@ -585,4 +586,6 @@ PD_REGISTER_KERNEL(pad3d, ...@@ -585,4 +586,6 @@ PD_REGISTER_KERNEL(pad3d,
float, float,
double, double,
int, int,
int64_t) {} int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# limitations under the License. # limitations under the License.
from .spawn import spawn # noqa: F401 from .spawn import spawn # noqa: F401
from .fleet.launch import launch # noqa: F401 from .launch.main import launch # noqa: F401
from .parallel import init_parallel_env # noqa: F401 from .parallel import init_parallel_env # noqa: F401
from .parallel import get_rank # noqa: F401 from .parallel import get_rank # noqa: F401
......
...@@ -1482,3 +1482,512 @@ register_distributed_operator_impl("matmul_v2", ...@@ -1482,3 +1482,512 @@ register_distributed_operator_impl("matmul_v2",
DistributedMatmulV2Impl1("row_parallel")) DistributedMatmulV2Impl1("row_parallel"))
register_distributed_operator_impl( register_distributed_operator_impl(
"matmul_v2", DistributedMatmulV2Impl2("replicate_parallel")) "matmul_v2", DistributedMatmulV2Impl2("replicate_parallel"))
class DistributedMul(DistributedOperatorImplContainer):
def __init__(self, op_type):
super(DistributedMul, self).__init__(op_type)
register_distributed_operator_impl_container(DistributedMul("mul"))
# ColumnParallel
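# Column parallel: the weight Y is sharded along its column (last) axis, so each
# rank computes X * Y_shard after a c_identity on X; Out stays sharded along its
# last dimension and the forward pass needs no allreduce.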
class DistributedMulImpl0(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl0, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
-1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
"""
kwargs: inputname_mapping & outputname_mapping
"""
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
src_op = dist_op_context.cur_src_op
rank_id = dist_op_context.rank_id
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert op_dist_attr is not None, "op [{}] doesn't have dist attribute!".format(
str(src_op))
# FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
if rank_id not in op_dist_attr.process_mesh.processes:
rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
rank_id)
        # validate inputs / outputs
for input_name in src_op.desc.input_names():
assert input_name in kwargs, "input [{}] is not given".format(
input_name)
assert len(kwargs[input_name]) == len(
src_op.desc.input(input_name)
), "number of tensor for input [{}] is not match".format(input_name)
for output_name in src_op.desc.output_names():
assert output_name in kwargs, "input [{}] is not given".format(
output_name)
assert len(kwargs[output_name]) == len(
src_op.desc.output(output_name)
), "number of tensor for input [{}] is not match".format(
output_name)
X_var = main_block.var(kwargs['X'][0])
Weight_var = main_block._var_recursive(kwargs['Y'][0])
Out_var = main_block.var(kwargs['Out'][0])
# TODO infer logic comm presentation
matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping(
Weight_var.name)[-1]
        assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's col should be divided by a specific mesh axis, but got [{}]".format(
matmul_col_dim_mapping)
process_mesh_shape = op_dist_attr.process_mesh.topology
process_mesh_group = op_dist_attr.process_mesh.processes
parallel_axis = matmul_col_dim_mapping
group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
parallel_axis, rank_id)
group = new_process_group(group_ranks)
# infer new var shape with op dist attr
x_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(X_var)
assert x_tensor_dist_attr is not None
identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name)
assert identity_var_dist_attr is not None
ref_shape_x = infer_shape(main_block, X_var, x_tensor_dist_attr,
identity_var_dist_attr)
# infer out var shape with op dist attr
out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
assert out_tensor_dist_attr is not None
out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert out_var_dist_attr is not None
ref_shape_out = infer_shape(main_block, Out_var, out_tensor_dist_attr,
out_var_dist_attr)
intermediate_var_0 = main_block.create_var(
name=unique_name.generate_with_ignorable_key(".".join(
["c_identity", 'tmp'])),
dtype=X_var.dtype,
shape=X_var.shape,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=X_var.stop_gradient)
# set intermediate_var_0's dist_attr with X_var's dist_attr
ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
identity_var_dist_attr)
check_variable_and_dtype(
X_var, 'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity')
c_identity_op = main_block.append_op(
type='c_identity',
inputs={'X': [X_var]},
outputs={'Out': intermediate_var_0},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
'use_model_parallel': True,
})
if intermediate_var_0.shape != ref_shape_x:
intermediate_var_0.desc.set_shape(ref_shape_x)
check_variable_and_dtype(intermediate_var_0, 'x',
['float16', 'float32', 'float64'], 'linear')
check_dtype(intermediate_var_0.dtype, 'dtype',
['float16', 'float32', 'float64'], 'linear')
# attrs = {'trans_x': False, 'trans_y': False}
attrs = {
"x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
"y_num_col_dims": src_op.desc.attr("y_num_col_dims")
}
inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
mul_op = main_block.append_op(
type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
if Out_var.shape != ref_shape_out:
Out_var.desc.set_shape(ref_shape_out)
# set dist op's dist_attr with serial op's dist_attr
# c_identity
identity_op_dist_attr = OperatorDistributedAttribute()
identity_op_dist_attr.process_mesh = op_dist_attr.process_mesh
identity_op_dist_attr.impl_type = op_dist_attr.impl_type
identity_op_dist_attr.impl_idx = op_dist_attr.impl_idx
# input
input_varname = c_identity_op.desc.input_arg_names()[0]
input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
identity_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
# output
output_varname = c_identity_op.desc.output_arg_names()[0]
identity_op_dist_attr.set_output_dist_attr(output_varname,
input_dist_attr)
ctx.set_op_dist_attr_for_program(c_identity_op, identity_op_dist_attr)
# matmulv2
matmulv2_op_dist_attr = OperatorDistributedAttribute()
matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in mul_op.desc.input_arg_names():
if input_varname in src_op.desc.input_arg_names():
input_dist_attr = op_dist_attr.get_input_dist_attr(
input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
else:
input_var = main_block.var(input_varname)
tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(
input_var)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
tensor_dist_attr)
for output_varname in mul_op.desc.output_arg_names():
output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
# init param sync
if Weight_var.is_parameter and not op_dist_attr.is_recompute:
_init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
rank_id)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
# RowParallel
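# Row parallel: the weight Y is sharded along its row axis and X along its last
# dimension, so each rank computes a partial X_shard * Y_shard and a
# c_allreduce_sum over the model-parallel group reconstructs the full Out.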
class DistributedMulImpl1(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl1, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
# Other dimensions must be replicate except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_shard(out_dims_mapping[-1]):
return False
# Other dimensions must be replicate except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
"""
kwargs: inputname_mapping & outputname_mapping
"""
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
src_op = dist_op_context.cur_src_op
rank_id = dist_op_context.rank_id
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert op_dist_attr is not None, "op [{}] doesn't have dist attribute!".format(
str(src_op))
# FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
if rank_id not in op_dist_attr.process_mesh.processes:
rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
rank_id)
        # validate inputs / outputs
for input_name in src_op.desc.input_names():
assert input_name in kwargs, "input [{}] is not given".format(
input_name)
assert len(kwargs[input_name]) == len(
src_op.desc.input(input_name)
), "number of tensor for input [{}] is not match".format(input_name)
for output_name in src_op.desc.output_names():
assert output_name in kwargs, "input [{}] is not given".format(
output_name)
assert len(kwargs[output_name]) == len(
src_op.desc.output(output_name)
), "number of tensor for input [{}] is not match".format(
output_name)
X_var = main_block.var(kwargs['X'][0])
Weight_var = main_block._var_recursive(kwargs['Y'][0])
Out_var = main_block.var(kwargs['Out'][0])
# TODO infer logic comm presentation
matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping(
Weight_var.name)[-2]
assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be divided by a specific mesh axis, but got [{}]".format(
matmul_row_dim_mapping)
process_mesh_shape = op_dist_attr.process_mesh.topology
process_mesh_group = op_dist_attr.process_mesh.processes
parallel_axis = matmul_row_dim_mapping
group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
parallel_axis, rank_id)
group = new_process_group(group_ranks)
check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'],
'linear')
check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'],
'linear')
# attrs = {'trans_x': False, 'trans_y': False}
attrs = {
"x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
"y_num_col_dims": src_op.desc.attr("y_num_col_dims")
}
inputs = {'X': X_var, 'Y': Weight_var}
# infer out var shape with op dist attr
out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
assert out_tensor_dist_attr is not None
out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert out_var_dist_attr is not None
ref_shape = infer_shape(main_block, Out_var, out_tensor_dist_attr,
out_var_dist_attr)
intermediate_var_0 = main_block.create_var(
shape=Out_var.shape,
dtype=Out_var.dtype,
type=Out_var.type,
lod_level=Out_var.lod_level,
persistable=False,
is_data=False,
need_check_feed=Out_var.desc.need_check_feed())
# set intermediate_var_0's dist_attr with Out_var's dist_attr
ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
out_var_dist_attr)
mul_op = main_block.append_op(
type='mul',
inputs=inputs,
outputs={'Out': intermediate_var_0},
attrs=attrs)
if intermediate_var_0.shape != ref_shape:
intermediate_var_0.desc.set_shape(ref_shape)
c_allreduce_sum_op = main_block.append_op(
type='c_allreduce_sum',
inputs={'X': intermediate_var_0},
outputs={'Out': Out_var},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
'use_model_parallel': True
})
if Out_var.shape != ref_shape:
Out_var.desc.set_shape(ref_shape)
# set dist op's dist_attr with serial op's dist_attr
# matmulv2
matmulv2_op_dist_attr = OperatorDistributedAttribute()
matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in mul_op.desc.input_arg_names():
input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
output_varname = mul_op.desc.output_arg_names()[0]
output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
# allreduce
allreduce_op_dist_attr = OperatorDistributedAttribute()
allreduce_op_dist_attr.process_mesh = op_dist_attr.process_mesh
allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type
allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in c_allreduce_sum_op.desc.input_arg_names():
input_var = main_block.var(input_varname)
tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var)
assert tensor_dist_attr is not None
allreduce_op_dist_attr.set_input_dist_attr(input_varname,
tensor_dist_attr)
for output_varname in c_allreduce_sum_op.desc.output_arg_names():
output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
allreduce_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(c_allreduce_sum_op,
allreduce_op_dist_attr)
# init param sync
if Weight_var.is_parameter and not op_dist_attr.is_recompute:
_init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
rank_id)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
# ReplicateParallel
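# Replicate parallel: neither operand is sharded along the contracted axes, so the
# serial mul is reused via DistributedDefaultImpl0 and no communication op is added.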
class DistributedMulImpl2(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl2, self).__init__(name)
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
register_distributed_operator_impl("mul",
DistributedMulImpl0("column_parallel"))
register_distributed_operator_impl("mul", DistributedMulImpl1("row_parallel"))
register_distributed_operator_impl("mul",
DistributedMulImpl2("replicate_parallel"))
...@@ -13,69 +13,3 @@ ...@@ -13,69 +13,3 @@
# limitations under the License. # limitations under the License.
__all__ = [] __all__ = []
'''
Paddle distributed training entry ``python -m paddle.distributed.launch``.
Help
# for arg usage and explanation, try the following command
# python -m paddle.distributed.launch -h
Collective Mode
Case 1: 1 node
use all visible devices
# python -m paddle.distributed.launch train.py
use specified devices
# python -m paddle.distributed.launch --devices=0,1,2,3 train.py
Case 2: multi-node, auto detect ip/port
# python -m paddle.distributed.launch --nnodes 2 train.py
        # the following command will be printed automatically
        # python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py
        # then copy and paste the above command to the other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py
        # the master ip must be one of the nodes and the port must be available
Parameter Server Mode
Case 1.1: 1 node, 1 ps, 1 trainer
# python -m paddle.distributed.launch --mode ps train.py
# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py
Case 1.2: 1 node, 2 ps, 2 trainer
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py
Case 2: 2 node, 2 ps, 2 trainer per node
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py
        # the following command will be printed automatically
        # python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
        # then copy and paste the above command to the other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
        # the master ip must be one of the nodes and the port must be available
Case 4: specified servers and trainers in each node
python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
Elastic Mode
        # run the following command on 3 nodes to start immediately, or on 2 nodes to start after elastic_timeout
# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py
        # as long as the peer number stays within 2:3, the elastic strategy holds
'''
...@@ -12,31 +12,6 @@ ...@@ -12,31 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from .context import Context from .main import launch
from . import controllers
launch()
def launch():
# initialize the context to run
ctx = Context()
if ctx.is_legacy_mode():
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
        # manage the job or just wait for the pod
c.finalize()
if __name__ == "__main__":
launch()
...@@ -82,6 +82,12 @@ class Context(object): ...@@ -82,6 +82,12 @@ class Context(object):
logger.addHandler(ch) logger.addHandler(ch)
return logger return logger
def continous_log(self) -> bool:
if self.args.log_level.upper() in ['DEBUG', 'ERROR']:
return True
else:
return False
def set_env_in_args(self): def set_env_in_args(self):
for k, v in env_args_mapping.items(): for k, v in env_args_mapping.items():
if k in self.envs: if k in self.envs:
......
...@@ -20,7 +20,7 @@ env_args_mapping = { ...@@ -20,7 +20,7 @@ env_args_mapping = {
'PADDLE_MASTER': 'master', 'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices', 'PADDLE_DEVICES': 'devices',
'PADDLE_NNODES': 'nnodes', 'PADDLE_NNODES': 'nnodes',
'PADDLE_MODE': 'mode', 'PADDLE_RUN_MODE': 'run_mode',
'PADDLE_LOG_LEVEL': 'log_level', 'PADDLE_LOG_LEVEL': 'log_level',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node', 'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'job_id', 'PADDLE_JOB_ID': 'job_id',
...@@ -60,7 +60,7 @@ def parse_args(): ...@@ -60,7 +60,7 @@ def parse_args():
"--legacy", type=bool, default=False, help="use legacy launch") "--legacy", type=bool, default=False, help="use legacy launch")
base_group.add_argument( base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank") "--rank", type=int, default=-1, help="the node rank")
base_group.add_argument( base_group.add_argument(
"--log_level", type=str, default="INFO", help="log level. Default INFO") "--log_level", type=str, default="INFO", help="log level. Default INFO")
...@@ -69,7 +69,7 @@ def parse_args(): ...@@ -69,7 +69,7 @@ def parse_args():
"--nnodes", "--nnodes",
type=str, type=str,
default="1", default="1",
help="the number of peers, i.e. pod/node number") help="the number of nodes, i.e. pod/node number")
base_group.add_argument( base_group.add_argument(
"--nproc_per_node", "--nproc_per_node",
...@@ -83,7 +83,7 @@ def parse_args(): ...@@ -83,7 +83,7 @@ def parse_args():
default="log", default="log",
help="the path for each process's log. Default ./log") help="the path for each process's log. Default ./log")
base_group.add_argument( base_group.add_argument(
"--mode", "--run_mode",
type=str, type=str,
default="collective", default="collective",
help="run mode of the job, collective/ps/ps-heter") help="run mode of the job, collective/ps/ps-heter")
...@@ -146,6 +146,6 @@ def parse_args(): ...@@ -146,6 +146,6 @@ def parse_args():
"--elastic_timeout", "--elastic_timeout",
type=int, type=int,
default=30, default=30,
help="seconds to wait before elastic perform training") help="seconds to wait before elastic job begin to train")
return parser.parse_known_args() return parser.parse_known_args()
...@@ -115,46 +115,6 @@ class CollectiveElasticController(CollectiveController): ...@@ -115,46 +115,6 @@ class CollectiveElasticController(CollectiveController):
self.master.register_heartbeat(self.job.id, self.pod.name) self.master.register_heartbeat(self.job.id, self.pod.name)
def watch(self) -> bool:
'''
watch self and peer status, return true to exit
'''
self.ctx.logger.info("Watching {}".format(self.pod))
while not self.ctx.status.is_done():
# self status
status = self.pod.watch(timeout=2)
self.ctx.logger.debug("Pod status {}, Ctx status {}".format(
status, self.ctx.status.current()))
# completed
if status == self.ctx.status.COMPLETED:
self.master.set_status(status)
self.ctx.status.complete()
self.ctx.logger.info("Pod complete {}".format(status))
return True
# self failure
elif status == self.ctx.status.FAILED:
self.master.set_status(status)
self.master.restart_peer()
self.ctx.logger.info("Pod failed {}".format(status))
self.pod.stop()
if self.ctx.args.elastic_level <= 0:
return True
else:
return False
# peer failure
if self.ctx.status.is_restarting() and self.master.get_status(
) != self.ctx.status.COMPLETED:
self.pod.stop()
return False
#peers = self.master.fetch_peer_alive()
#print("peers {}".format(peers))
def run(self): def run(self):
timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10 timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10
...@@ -164,6 +124,8 @@ class CollectiveElasticController(CollectiveController): ...@@ -164,6 +124,8 @@ class CollectiveElasticController(CollectiveController):
self.build_job() self.build_job()
self.ctx.logger.info("Waiting peer ready...")
ok, replicas = self.master.wait_peer_ready( ok, replicas = self.master.wait_peer_ready(
self.job.replicas_min, self.job.replicas_max, timeout) self.job.replicas_min, self.job.replicas_max, timeout)
if ok: if ok:
......
...@@ -40,7 +40,7 @@ class ControllerBase(object): ...@@ -40,7 +40,7 @@ class ControllerBase(object):
self.master = Master.factory(self.ctx) self.master = Master.factory(self.ctx)
self.job = Job(nnodes=self.ctx.args.nnodes, self.job = Job(nnodes=self.ctx.args.nnodes,
mode=self.ctx.args.mode, mode=self.ctx.args.run_mode,
jid=self.ctx.args.job_id) jid=self.ctx.args.job_id)
self.pod = Pod() self.pod = Pod()
...@@ -65,18 +65,51 @@ class ControllerBase(object): ...@@ -65,18 +65,51 @@ class ControllerBase(object):
self.watch() self.watch()
def watch(self) -> bool: def watch(self) -> bool:
'''
watch self and peer status, return true to exit
'''
#TODO(kuizhiqing) unify ctx.status and master status
self.ctx.logger.info("Watching {}".format(self.pod)) self.ctx.logger.info("Watching {}".format(self.pod))
status = self.pod.watch() while not self.ctx.status.is_done():
status = self.pod.watch(timeout=2)
if self.ctx.continous_log():
self.pod.logs()
# completed
if status == self.ctx.status.COMPLETED:
self.ctx.status.complete()
self.master.set_status(status)
self.ctx.logger.info("Pod {}".format(status))
return True
# self failure
elif status == self.ctx.status.FAILED:
self.ctx.status.fail()
self.master.set_status(status)
self.master.restart_peer()
fc = self.pod.failed_container()
self.ctx.logger.info("Pod {}".format(status))
self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
fc[0].tail()
self.pod.stop()
if self.ctx.args.elastic_level <= 0:
return True
else:
return False
if status == self.ctx.status.COMPLETED: # peer failure
self.ctx.logger.info("Pod {}".format(status)) if self.ctx.status.is_restarting() and self.master.get_status(
elif status == self.ctx.status.FAILED: ) != self.ctx.status.COMPLETED:
fc = self.pod.failed_container() self.pod.stop()
self.ctx.logger.info("Pod {}".format(status)) return False
self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
fc[0].tail()
self.pod.stop()
def stop(self, sigint=None): def stop(self, sigint=None):
self.ctx.logger.debug("Controller stop") self.ctx.logger.debug("Controller stop")
......
...@@ -43,6 +43,15 @@ class Master(object): ...@@ -43,6 +43,15 @@ class Master(object):
def stop(self): def stop(self):
raise NotImplementedError raise NotImplementedError
def set_status(self, status):
pass
def get_status(self):
return None
def restart_peer(self):
pass
def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int): def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
raise NotImplementedError raise NotImplementedError
...@@ -122,7 +131,7 @@ class HTTPMaster(Master): ...@@ -122,7 +131,7 @@ class HTTPMaster(Master):
if size < 2: if size < 2:
return [value], 0 return [value], 0
self.ctx.logger.info("Waiting peer ready...") self.ctx.logger.info("Waiting peer start...")
self.lazy_init() self.lazy_init()
...@@ -184,7 +193,7 @@ class ETCDMaster(Master): ...@@ -184,7 +193,7 @@ class ETCDMaster(Master):
if size < 2: if size < 2:
return [value], 0 return [value], 0
self.ctx.logger.info("Waiting peer ready...") self.ctx.logger.info("Waiting peer start...")
path = "{}/{}/{}".format(prefix, key, rank) path = "{}/{}/{}".format(prefix, key, rank)
......
...@@ -21,11 +21,11 @@ import os, shutil ...@@ -21,11 +21,11 @@ import os, shutil
class PSController(Controller): class PSController(Controller):
@classmethod @classmethod
def enable(cls, ctx): def enable(cls, ctx):
if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len( if ctx.args.run_mode == ControleMode.PS or ctx.args.server_num or len(
ctx.args.servers) > 0 or ctx.args.trainer_num or len( ctx.args.servers) > 0 or ctx.args.trainer_num or len(
ctx.args.trainers) > 0: ctx.args.trainers) > 0:
ctx.logger.debug("{} enabled".format(cls.__name__)) ctx.logger.debug("{} enabled".format(cls.__name__))
ctx.args.mode = ControleMode.PS ctx.args.run_mode = ControleMode.PS
return True return True
else: else:
return False return False
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .context import Context
def launch():
"""
Paddle distribution training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
[--log_level LOG_LEVEL] [--nnodes NNODES]
[--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
[--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
[--host HOST] [--servers SERVERS] [--trainers TRAINERS]
[--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
[--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
[--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
[--elastic_timeout ELASTIC_TIMEOUT]
training_script ...
Base Parameters:
        - ``--master``: The master/rendezvous server, supports http:// and etcd:// and defaults to http://. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.
        - ``--rank``: The rank of the node, which can be assigned automatically by the master. Default ``--rank=-1``.
        - ``--log_level``: The log level to set for logging.setLevel. Default ``--log_level=INFO``.
        - ``--nnodes``: The number of nodes for a distributed job; it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.
        - ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less than or equal to the number of gpus on your system. e.g., ``--nproc_per_node=8``
        - ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.
        - ``--run_mode``: The run mode of the job, one of collective/ps/ps-heter. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.
        - ``--job_id``: The unique job id; it affects the names of the log files. e.g., ``--job_id=job1``. Default ``--job_id=default``.
        - ``--devices``: The selected accelerator devices on the nodes, can be gpu/xpu/npu/mlu etc. e.g., ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.
        - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
- ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``
Collective Parameters:
- ``--ips``: [DEPRECATED] Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.
Parameter-Server Parameters:
- ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``
- ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``
- ``--workers``: [DEPRECATED] The same as trainers.
- ``--trainer_num``: Number of trainers on each node, can be 0.
- ``--worker_num``: [DEPRECATED] The same as trainer_num.
- ``--server_num``: Number of servers on each node, can be 0.
- ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
        - ``--heter_worker_num``: Number of heter_workers in each stage (recommended when emulating a distributed environment on a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.
- ``--with_gloo``: Using gloo or not. Default ``--with_gloo=0``.
Elastic Parameters:
- ``--max_restart``: The maximum restart times for an elastic job. Default ``--max_restart=3``.
- ``--elastic_level``: The elastic level: -1: disable, 0: failed exit, peers hold, 1: internal restart. Default ``--elastic_level=-1``.
- ``--elastic_timeout``: Seconds to wait before elastic job begin to train. Default ``--elastic_timeout=30``.
Returns:
``None``
Examples 0 (master, ip/port auto detection):
# For training on multi node, run the following command in one of the nodes
python -m paddle.distributed.launch --nnodes 2 train.py
            # Then the following info will be printed
# Copy the following command to other nodes to run.
# --------------------------------------------------------------------------------
# python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
# --------------------------------------------------------------------------------
            # Following the instruction above and pasting the command on the other nodes launches a multi-node training job.
            # There are two ways to launch a job with the same command for multi-node training
            # 1) use the following command on every node, making sure the ip is one of the training nodes and the port is available on that node
            # python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
            # 2) use the following command on every node with an independent etcd service
            # python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py
            # This works well for both collective and ps mode, even with other arguments.
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
# On 192.168.0.17:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers.
# On 192.168.0.16:
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
            # Or, with a master, the following command runs 2 servers and 2 trainers on each node.
python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py
Examples 5 (ps, gpu, single node):
.. code-block:: bash
:name: code-block-example-bash5
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, each worker use single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server and 2 workers.
# On 192.168.0.16:
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
Examples 7 (ps-heter, cpu + gpu, single node):
.. code-block:: bash
:name: code-block-example-bash7
# To simulate distributed environment using single node, e.g., 2 servers and 4 workers, two workers use gpu, two workers use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17 where each node with 1 server, 1 gpu worker, 1 cpu worker.
# On 192.168.0.16:
export CUDA_VISIBLE_DEVICES=0
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
export CUDA_VISIBLE_DEVICES=0
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
Examples 9 (elastic):
.. code-block:: bash
:name: code-block-example-bash9
# With the following command, the job will begin to run immediately if 4 nodes are ready,
            # or it will run after elastic_timeout if only 2 or 3 nodes are ready
python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py
            # as long as the number of nodes stays within 2:4 during training, the elastic strategy holds
"""
# initialize the context to run
ctx = Context()
if ctx.is_legacy_mode():
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
from . import controllers
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
        # manage the job or just wait for the pod
c.finalize()
if __name__ == "__main__":
launch()
...@@ -30,6 +30,7 @@ from paddle.fluid.framework import _set_expected_place, _current_expected_place, ...@@ -30,6 +30,7 @@ from paddle.fluid.framework import _set_expected_place, _current_expected_place,
import queue import queue
import paddle import paddle
import paddle.profiler as profiler
from .. import core, layers from .. import core, layers
from ..framework import in_dygraph_mode, _in_eager_mode from ..framework import in_dygraph_mode, _in_eager_mode
from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar
...@@ -250,6 +251,10 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): ...@@ -250,6 +251,10 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
self._exit_thread_expectedly() self._exit_thread_expectedly()
def __next__(self): def __next__(self):
trace_event = profiler.RecordEvent(
name="_DataLoaderIterSingleProcess",
event_type=profiler.TracerEventType.Dataloader)
trace_event.begin()
try: try:
if in_dygraph_mode(): if in_dygraph_mode():
if _in_eager_mode(): if _in_eager_mode():
...@@ -283,6 +288,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): ...@@ -283,6 +288,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
self._reader.shutdown() self._reader.shutdown()
self._try_shutdown_all() self._try_shutdown_all()
six.reraise(*sys.exc_info()) six.reraise(*sys.exc_info())
finally:
trace_event.end()
def _shutdown_thread(self): def _shutdown_thread(self):
if self._thread: if self._thread:
...@@ -695,6 +702,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -695,6 +702,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._try_shutdown_all(1) self._try_shutdown_all(1)
def __next__(self): def __next__(self):
trace_event = profiler.RecordEvent(
name="_DataLoaderIterMultiProcess",
event_type=profiler.TracerEventType.Dataloader)
trace_event.begin()
try: try:
# _batches_outstanding here records the total batch data number # _batches_outstanding here records the total batch data number
# in 'from after _try_put_indices to before output data', this # in 'from after _try_put_indices to before output data', this
...@@ -743,6 +754,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): ...@@ -743,6 +754,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._reader.shutdown() self._reader.shutdown()
self._try_shutdown_all() self._try_shutdown_all()
six.reraise(*sys.exc_info()) six.reraise(*sys.exc_info())
finally:
trace_event.end()
# python2 compatibility # python2 compatibility
def next(self): def next(self):
......
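The two dataloader hunks above bracket __next__ with the explicit begin()/end() form of paddle.profiler.RecordEvent. A minimal, hedged sketch of that pattern, assuming `loader_iter` is any DataLoader iterator (the event name here is illustrative, not from this diff):

import paddle.profiler as profiler

def timed_next(loader_iter):
    # Mark the data-loading region so it shows up as a Dataloader event.
    trace_event = profiler.RecordEvent(
        name="data_loading",
        event_type=profiler.TracerEventType.Dataloader)
    trace_event.begin()
    try:
        return next(loader_iter)
    finally:
        # end() is always reached, even if next() raises StopIteration.
        trace_event.end()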
...@@ -25,6 +25,7 @@ from copy import deepcopy ...@@ -25,6 +25,7 @@ from copy import deepcopy
import inspect import inspect
import paddle import paddle
import paddle.profiler as profiler
from . import parallel_helper from . import parallel_helper
from .. import unique_name from .. import unique_name
...@@ -905,7 +906,9 @@ class Layer(object): ...@@ -905,7 +906,9 @@ class Layer(object):
self._built = True self._built = True
outputs = self.forward(*inputs, **kwargs) with profiler.RecordEvent(self.full_name(),
profiler.TracerEventType.Forward):
outputs = self.forward(*inputs, **kwargs)
for forward_post_hook in self._forward_post_hooks.values(): for forward_post_hook in self._forward_post_hooks.values():
hook_result = forward_post_hook(self, inputs, outputs) hook_result = forward_post_hook(self, inputs, outputs)
......
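The Layer.__call__ hunk uses RecordEvent as a context manager instead of explicit begin()/end() calls. A hedged usage sketch of that form, with paddle.nn.Linear standing in for an arbitrary dygraph Layer:

import paddle
import paddle.profiler as profiler

x = paddle.ones([4, 8])
my_layer = paddle.nn.Linear(8, 2)   # any Layer works here
# The forward pass is recorded as a Forward event, mirroring the hunk above.
with profiler.RecordEvent(my_layer.full_name(), profiler.TracerEventType.Forward):
    out = my_layer(x)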
...@@ -2986,6 +2986,12 @@ class GroupNorm(layers.Layer): ...@@ -2986,6 +2986,12 @@ class GroupNorm(layers.Layer):
is_bias=True) is_bias=True)
def forward(self, input): def forward(self, input):
if in_dygraph_mode():
attrs = ('epsilon', self._epsilon, 'groups', self._groups)
out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs)
return dygraph_utils._append_activation_in_dygraph(out, self._act)
inputs = {'X': input} inputs = {'X': input}
if self.bias is not None: if self.bias is not None:
inputs['Bias'] = self.bias inputs['Bias'] = self.bias
......
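The GroupNorm hunk adds a dygraph fast path that calls _C_ops.group_norm directly. A hedged usage sketch of the layer that exercises this path, assuming the fluid dygraph GroupNorm signature (channels, groups) used in the tests further below:

import numpy as np
import paddle
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = paddle.to_tensor(np.random.rand(2, 8, 4, 4).astype('float32'))
    group_norm = fluid.dygraph.GroupNorm(channels=8, groups=2)
    # In dygraph mode this now dispatches to _C_ops.group_norm directly.
    out = group_norm(x)
    print(out.shape)   # [2, 8, 4, 4]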
...@@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase ...@@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase
from .parallel import scale_loss from .parallel import scale_loss
from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
import paddle.profiler as profiler
from paddle import _C_ops from paddle import _C_ops
...@@ -199,8 +200,8 @@ def monkey_patch_varbase(): ...@@ -199,8 +200,8 @@ def monkey_patch_varbase():
You can clear gradient by ``Tensor.clear_grad()`` . You can clear gradient by ``Tensor.clear_grad()`` .
Args: Args:
grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None,
the initial gradient values of the current Tensor would be Tensor filled with 1.0; the initial gradient values of the current Tensor would be Tensor filled with 1.0;
if `grad_tensor` is not None, it must have the same length as the current Tensor. if `grad_tensor` is not None, it must have the same length as the current Tensor.
The default value is None. The default value is None.
...@@ -243,6 +244,9 @@ def monkey_patch_varbase(): ...@@ -243,6 +244,9 @@ def monkey_patch_varbase():
""" """
if framework.in_dygraph_mode(): if framework.in_dygraph_mode():
record_event = profiler.RecordEvent(
"Gradient Backward", profiler.TracerEventType.Backward)
record_event.begin()
if grad_tensor is not None: if grad_tensor is not None:
if core._in_eager_mode(): if core._in_eager_mode():
assert isinstance( assert isinstance(
...@@ -278,6 +282,7 @@ def monkey_patch_varbase(): ...@@ -278,6 +282,7 @@ def monkey_patch_varbase():
core.dygraph_run_backward([self], [grad_tensor], core.dygraph_run_backward([self], [grad_tensor],
retain_graph, retain_graph,
framework._dygraph_tracer()) framework._dygraph_tracer())
record_event.end()
else: else:
raise ValueError( raise ValueError(
"Variable.backward() is only available in DyGraph mode") "Variable.backward() is only available in DyGraph mode")
...@@ -476,7 +481,7 @@ def monkey_patch_varbase(): ...@@ -476,7 +481,7 @@ def monkey_patch_varbase():
def grad(self): def grad(self):
""" """
.. warning:: .. warning::
This API will return the tensor value of the gradient. If you want This API will return the tensor value of the gradient. If you want
to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`. to get the numpy value of the gradient, you can use :code:`x.grad.numpy()`.
Get the Gradient of Current Tensor. Get the Gradient of Current Tensor.
...@@ -515,7 +520,7 @@ def monkey_patch_varbase(): ...@@ -515,7 +520,7 @@ def monkey_patch_varbase():
def item(self, *args): def item(self, *args):
""" """
Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a Convert element at specific position in Tensor into Python scalars. If the position is not specified, the Tensor must be a
single-element Tensor. single-element Tensor.
Args: Args:
...@@ -526,7 +531,7 @@ def monkey_patch_varbase(): ...@@ -526,7 +531,7 @@ def monkey_patch_varbase():
Raises: Raises:
ValueError: If the Tensor has more than one element, there must be coordinates. ValueError: If the Tensor has more than one element, there must be coordinates.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -588,7 +593,7 @@ def monkey_patch_varbase(): ...@@ -588,7 +593,7 @@ def monkey_patch_varbase():
import paddle import paddle
x = paddle.rand([2, 5]) x = paddle.rand([2, 5])
print(x) print(x)
# Tensor(shape=[2, 5], dtype=float32, place=CPUPlace, # Tensor(shape=[2, 5], dtype=float32, place=CPUPlace,
# [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436], # [[0.30574632, 0.55739117, 0.30902600, 0.39413780, 0.44830436],
# [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]]) # [0.79010487, 0.53972793, 0.09495186, 0.44267157, 0.72112119]])
...@@ -611,7 +616,7 @@ def monkey_patch_varbase(): ...@@ -611,7 +616,7 @@ def monkey_patch_varbase():
import copy import copy
x = paddle.to_tensor(2.) x = paddle.to_tensor(2.)
y = copy.deepcopy(x) y = copy.deepcopy(x)
print(x) print(x)
# Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True, # Tensor(shape=[1], dtype=float32, place=CPUPlace, stop_gradient=True,
# [2.]) # [2.])
...@@ -655,7 +660,7 @@ def monkey_patch_varbase(): ...@@ -655,7 +660,7 @@ def monkey_patch_varbase():
def __array__(self, dtype=None): def __array__(self, dtype=None):
""" """
Returns a numpy array that shows the value of the current Tensor. Returns a numpy array that shows the value of the current Tensor.
Returns: Returns:
ndarray: The numpy value of current Tensor. ndarray: The numpy value of current Tensor.
...@@ -763,8 +768,11 @@ def monkey_patch_varbase(): ...@@ -763,8 +768,11 @@ def monkey_patch_varbase():
return _setitem_impl_(self, item, value) return _setitem_impl_(self, item, value)
else: else:
# Call c++ func __setitem_varbase__ to speedup. if core._in_eager_mode():
return self.__setitem_varbase__(item, value) return self.__setitem_eager_tensor__(item, value)
else:
# Call c++ func __setitem_varbase__ to speedup.
return self.__setitem_varbase__(item, value)
@framework.dygraph_only @framework.dygraph_only
def _grad_ivar(self): def _grad_ivar(self):
......
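The backward() hunk above brackets the dygraph backward pass with a "Gradient Backward" record event. A small, hedged sketch of the patched method itself (standard dygraph autograd usage, not specific to this commit):

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = (x * x).sum()
# With grad_tensor=None the initial gradient defaults to ones, as the
# docstring above describes; this call is what the RecordEvent now wraps.
y.backward()
print(x.grad)   # [2., 4., 6.]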
...@@ -270,9 +270,10 @@ def generate_activation_fn(op_type): ...@@ -270,9 +270,10 @@ def generate_activation_fn(op_type):
op_type) op_type)
else: else:
# abs exp square ops support dtype(int32, int64, float16, float32, float64) # abs exp square ops support dtype(int32, int64, float16, float32, float64)
check_variable_and_dtype( check_variable_and_dtype(x, 'x', [
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'int32', 'int64', 'float16', 'float32', 'float64', 'complex64',
op_type) 'complex128'
], op_type)
helper = LayerHelper(op_type, **locals()) helper = LayerHelper(op_type, **locals())
......
...@@ -5616,9 +5616,10 @@ def transpose(x, perm, name=None): ...@@ -5616,9 +5616,10 @@ def transpose(x, perm, name=None):
out, _ = _C_ops.transpose2(x, 'axis', perm) out, _ = _C_ops.transpose2(x, 'axis', perm)
return out return out
check_variable_and_dtype( check_variable_and_dtype(x, 'x', [
x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
'transpose') 'complex128'
], 'transpose')
check_type(perm, 'perm', (list, tuple), 'transpose') check_type(perm, 'perm', (list, tuple), 'transpose')
if isinstance(perm, tuple): if isinstance(perm, tuple):
perm = list(perm) perm = list(perm)
...@@ -6410,10 +6411,10 @@ def squeeze(input, axes, name=None): ...@@ -6410,10 +6411,10 @@ def squeeze(input, axes, name=None):
return out return out
helper = LayerHelper("squeeze", **locals()) helper = LayerHelper("squeeze", **locals())
check_variable_and_dtype( check_variable_and_dtype(input, 'input', [
input, 'input', 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'], 'complex64', 'complex128'
'squeeze') ], 'squeeze')
check_type(axes, 'axis/axes', (list, tuple), 'squeeze') check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
out = helper.create_variable_for_type_inference(dtype=input.dtype) out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
...@@ -6471,8 +6472,16 @@ def unsqueeze(input, axes, name=None): ...@@ -6471,8 +6472,16 @@ def unsqueeze(input, axes, name=None):
check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
check_variable_and_dtype(input, 'input', [ check_variable_and_dtype(input, 'input', [
'float16', 'float32', 'float64', 'bool', 'int8', 'int16', 'int32', 'float16',
'int64' 'float32',
'float64',
'bool',
'int8',
'int16',
'int32',
'int64',
'complex64',
'complex128',
], 'unsqueeze') ], 'unsqueeze')
helper = LayerHelper("unsqueeze2", **locals()) helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input} inputs = {"X": input}
...@@ -11180,8 +11189,8 @@ def slice(input, axes, starts, ends): ...@@ -11180,8 +11189,8 @@ def slice(input, axes, starts, ends):
ends_tensor.stop_gradient = True ends_tensor.stop_gradient = True
infer_flags = list(-1 for i in range(len(axes))) infer_flags = list(-1 for i in range(len(axes)))
return _C_ops.slice(input, starts_tensor, ends_tensor, 'axes', axes, return _C_ops.slice(input, starts_tensor, ends_tensor, None, None,
'infer_flags', infer_flags, *attrs) 'axes', axes, 'infer_flags', infer_flags, *attrs)
if not isinstance(starts, (list, tuple, Variable)): if not isinstance(starts, (list, tuple, Variable)):
raise ValueError( raise ValueError(
......
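Several of the hunks above extend dtype whitelists with 'complex64' and 'complex128'. A hedged sketch of what that enables, assuming complex tensors created via paddle.to_tensor:

import paddle

x = paddle.to_tensor([[1 + 2j, 3 + 4j]], dtype='complex64')
# transpose (and squeeze/unsqueeze) now pass the dtype check for complex input.
y = paddle.transpose(x, perm=[1, 0])
print(y.shape)   # [2, 1]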
...@@ -632,7 +632,7 @@ def assign(input, output=None): ...@@ -632,7 +632,7 @@ def assign(input, output=None):
dtype = VarDesc.VarType.FP32 dtype = VarDesc.VarType.FP32
if dtype == VarDesc.VarType.BOOL: if dtype == VarDesc.VarType.BOOL:
value_name = "bool_values" value_name = "bool_values"
values = [bool(v) for v in input.flat] values = [int(v) for v in input.flat]
elif dtype == VarDesc.VarType.FP32: elif dtype == VarDesc.VarType.FP32:
value_name = "fp32_values" value_name = "fp32_values"
values = [float(v) for v in input.flat] values = [float(v) for v in input.flat]
...@@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): ...@@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
check_shape(shape) check_shape(shape)
check_dtype(dtype, 'dtype', [ check_dtype(dtype, 'dtype', [
'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32', 'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32',
'int64' 'int64', 'complex64', 'complex128'
], 'fill_constant') ], 'fill_constant')
check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant')
......
...@@ -20,6 +20,8 @@ import os ...@@ -20,6 +20,8 @@ import os
import six import six
import sys import sys
from paddle.utils.deprecated import deprecated
__all__ = [ __all__ = [
'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler', 'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
'stop_profiler' 'stop_profiler'
...@@ -36,10 +38,16 @@ NVPROF_CONFIG = [ ...@@ -36,10 +38,16 @@ NVPROF_CONFIG = [
] ]
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
@signature_safe_contextmanager @signature_safe_contextmanager
def cuda_profiler(output_file, output_mode=None, config=None): def cuda_profiler(output_file, output_mode=None, config=None):
""" """
API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`. API cuda_profiler has been abandoned. If you have relevant requirements, you can use `paddle.utils.profiler.start_profiler` and `paddle.utils.profiler.stop_profiler`.
The relevant reference documents are as follows: The relevant reference documents are as follows:
<https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/start_profiler_en.html#start-profiler> <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/start_profiler_en.html#start-profiler>
<https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/stop_profiler_en.html#stop-profiler> <https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/utils/profiler/stop_profiler_en.html#stop-profiler>
...@@ -54,18 +62,18 @@ def cuda_profiler(output_file, output_mode=None, config=None): ...@@ -54,18 +62,18 @@ def cuda_profiler(output_file, output_mode=None, config=None):
def npu_profiler(output_file, config=None): def npu_profiler(output_file, config=None):
""" """
The NPU profiler. The NPU profiler.
This function is used to profile an NPU program by the NPU runtime application This function is used to profile an NPU program by the NPU runtime application
programming interface. The profiling result will be written into programming interface. The profiling result will be written into
`output_file`. Users can set the NPU profiling config by the `config` argument. `output_file`. Users can set the NPU profiling config by the `config` argument.
After getting the profiling result file, users can use After getting the profiling result file, users can use
`tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ `tools provided by Ascend <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_
to load this output file to visualize results. to load this output file to visualize results.
Args: Args:
output_file (str) : The output file name, the result will be output_file (str) : The output file name, the result will be
written into this file. It should be absolute path. written into this file. It should be absolute path.
config (list<str>, optional) : NPU profile config. For more details, please config (list<str>, optional) : NPU profile config. For more details, please
refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ . refer to `User Guide <https://support.huaweicloud.com/tg-Inference-cann/atlasprofiling_16_0006.html>`_ .
...@@ -109,6 +117,12 @@ def npu_profiler(output_file, config=None): ...@@ -109,6 +117,12 @@ def npu_profiler(output_file, config=None):
core.npu_prof_finalize() core.npu_prof_finalize()
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def reset_profiler(): def reset_profiler():
""" """
Clear the previous time record. It works for Clear the previous time record. It works for
...@@ -131,31 +145,38 @@ def reset_profiler(): ...@@ -131,31 +145,38 @@ def reset_profiler():
core.reset_profiler() core.reset_profiler()
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def start_profiler(state, tracer_option='Default'): def start_profiler(state, tracer_option='Default'):
""" """
Enable the profiler. Users can use `fluid.profiler.start_profiler` and Enable the profiler. Users can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to profile, which is equal to the usage `fluid.profiler.stop_profiler` to profile, which is equal to the usage
of `fluid.profiler.profiler` interface. of `fluid.profiler.profiler` interface.
Args: Args:
state (str) : The profiling state, which should be one of 'CPU', 'GPU' state (str) : The profiling state, which should be one of 'CPU', 'GPU'
or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
both CPU and GPU; 'All' means profiling both CPU and GPU, and both CPU and GPU; 'All' means profiling both CPU and GPU, and
generates timeline as well. generates timeline as well.
tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it
can control the profile level and print the different level profile result. `Default` option print can control the profile level and print the different level profile result. `Default` option print
the different Op type profiling result and the `OpDetail` option print the detail profiling the different Op type profiling result and the `OpDetail` option print the detail profiling
result of different op types such as compute and data transform, `AllOpDetail` option result of different op types such as compute and data transform, `AllOpDetail` option
print the detail profiling result of different op name same as `OpDetail`. print the detail profiling result of different op name same as `OpDetail`.
Raises: Raises:
ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option` ValueError: If `state` is not in ['CPU', 'GPU', 'All'] or `tracer_option`
is not in ['Default', 'OpDetail', 'AllOpDetail']. is not in ['Default', 'OpDetail', 'AllOpDetail'].
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
...@@ -165,7 +186,7 @@ def start_profiler(state, tracer_option='Default'): ...@@ -165,7 +186,7 @@ def start_profiler(state, tracer_option='Default'):
profiler.reset_profiler() profiler.reset_profiler()
# except each iteration # except each iteration
profiler.stop_profiler('total', '/tmp/profile') profiler.stop_profiler('total', '/tmp/profile')
profiler.start_profiler('GPU', "OpDetail") profiler.start_profiler('GPU', "OpDetail")
for iter in range(10): for iter in range(10):
if iter == 2: if iter == 2:
...@@ -198,14 +219,20 @@ def start_profiler(state, tracer_option='Default'): ...@@ -198,14 +219,20 @@ def start_profiler(state, tracer_option='Default'):
core.enable_profiler(prof_state) core.enable_profiler(prof_state)
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
""" """
Stop the profiler. Users can use `fluid.profiler.start_profiler` and Stop the profiler. Users can use `fluid.profiler.start_profiler` and
`fluid.profiler.stop_profiler` to profile, which is equal to the usage `fluid.profiler.stop_profiler` to profile, which is equal to the usage
of `fluid.profiler.profiler` interface. of `fluid.profiler.profiler` interface.
Args: Args:
sorted_key (str, optional) : The order of profiling results, which sorted_key (str, optional) : The order of profiling results, which
should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
Default is None, means the profiling results will be printed Default is None, means the profiling results will be printed
in the order of first end time of events. in the order of first end time of events.
...@@ -214,7 +241,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -214,7 +241,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
The `max` means sorting by the maximum execution time. The `max` means sorting by the maximum execution time.
The `min` means sorting by the minimum execution time. The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
and write it into `profile_path`. The default profile_path is `/tmp/profile`. and write it into `profile_path`. The default profile_path is `/tmp/profile`.
profile_path (str, optional) : If state == 'All', it will generate timeline, profile_path (str, optional) : If state == 'All', it will generate timeline,
Raises: Raises:
...@@ -225,6 +252,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -225,6 +252,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
.. code-block:: python .. code-block:: python
# required: gpu
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
...@@ -254,6 +282,12 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): ...@@ -254,6 +282,12 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
core.disable_profiler(key_map[sorted_key], profile_path) core.disable_profiler(key_map[sorted_key], profile_path)
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
@signature_safe_contextmanager @signature_safe_contextmanager
def profiler(state, def profiler(state,
sorted_key=None, sorted_key=None,
...@@ -265,9 +299,9 @@ def profiler(state, ...@@ -265,9 +299,9 @@ def profiler(state,
Args: Args:
state (str) : The profiling state, which should be one of 'CPU', 'GPU' state (str) : The profiling state, which should be one of 'CPU', 'GPU'
or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling or 'All'. 'CPU' means only profiling CPU; 'GPU' means profiling
both CPU and GPU; 'All' means profiling both CPU and GPU, and both CPU and GPU; 'All' means profiling both CPU and GPU, and
generates timeline as well. generates timeline as well.
sorted_key (str, optional) : The order of profiling results, which sorted_key (str, optional) : The order of profiling results, which
should be one of None, 'calls', 'total', 'max', 'min' or 'ave'. should be one of None, 'calls', 'total', 'max', 'min' or 'ave'.
Default is None, means the profiling results will be printed Default is None, means the profiling results will be printed
in the order of first end time of events. in the order of first end time of events.
...@@ -277,11 +311,11 @@ def profiler(state, ...@@ -277,11 +311,11 @@ def profiler(state,
The `min` means sorting by the minimum execution time. The `min` means sorting by the minimum execution time.
The `ave` means sorting by the average execution time. The `ave` means sorting by the average execution time.
profile_path (str, optional) : If state == 'All', it will generate timeline, profile_path (str, optional) : If state == 'All', it will generate timeline,
and write it into `profile_path`. The default profile_path is `/tmp/profile`. and write it into `profile_path`. The default profile_path is `/tmp/profile`.
tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it tracer_option (str, optional) : tracer_option can be one of ['Default', 'OpDetail', 'AllOpDetail'], it
can control the profile level and print the different level profile result. `Default` option print can control the profile level and print the different level profile result. `Default` option print
the different Op type profiling result and the `OpDetail` option print the detail profiling the different Op type profiling result and the `OpDetail` option print the detail profiling
result of different op types such as compute and data transform, `AllOpDetail` option result of different op types such as compute and data transform, `AllOpDetail` option
print the detail profiling result of different op name same as `OpDetail`. print the detail profiling result of different op name same as `OpDetail`.
Raises: Raises:
...@@ -319,7 +353,7 @@ def profiler(state, ...@@ -319,7 +353,7 @@ def profiler(state,
#### Examples Results #### #### Examples Results ####
#### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' #### #### 1) sorted_key = 'total', 'calls', 'max', 'min', 'ave' ####
# The only difference in 5 sorted_key results is the following sentence: # The only difference in 5 sorted_key results is the following sentence:
# "Sorted by number of xxx in descending order in the same thread." # "Sorted by number of xxx in descending order in the same thread."
# The reason is that in this example, above 5 columns are already sorted. # The reason is that in this example, above 5 columns are already sorted.
-------------------------> Profiling Report <------------------------- -------------------------> Profiling Report <-------------------------
...@@ -339,7 +373,7 @@ def profiler(state, ...@@ -339,7 +373,7 @@ def profiler(state,
#### 2) sorted_key = None #### #### 2) sorted_key = None ####
# Since the profiling results are printed in the order of first end time of Ops, # Since the profiling results are printed in the order of first end time of Ops,
# the printed order is feed->conv2d->elementwise_add # the printed order is feed->conv2d->elementwise_add
-------------------------> Profiling Report <------------------------- -------------------------> Profiling Report <-------------------------
Place: CPU Place: CPU
...@@ -366,7 +400,7 @@ def _nvprof_range(iter_id, start, end, exit_after_prof=True): ...@@ -366,7 +400,7 @@ def _nvprof_range(iter_id, start, end, exit_after_prof=True):
Examples: Examples:
.. code-block:: python .. code-block:: python
model = Model() model = Model()
for i in range(max_iter): for i in range(max_iter):
with paddle.fluid.profiler._nvprof_range(i, 10, 20): with paddle.fluid.profiler._nvprof_range(i, 10, 20):
......
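Every public entry point in this fluid profiler module is now marked @deprecated in favor of paddle.profiler.Profiler. A hedged sketch of the suggested replacement, assuming the Profiler start/step/stop interface exported by the new paddle.profiler package (the exact arguments are not part of this diff):

import paddle
import paddle.profiler as profiler

prof = profiler.Profiler(targets=[profiler.ProfilerTarget.CPU])
prof.start()
for step in range(10):
    x = paddle.rand([8, 8])
    y = paddle.matmul(x, x)      # stand-in for a real training step
    prof.step()                  # advance the profiler's step counter
prof.stop()
prof.summary()                   # print aggregated statistics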
...@@ -71,7 +71,7 @@ class TestAssignValueNPUOp4(TestAssignValueNPUOp): ...@@ -71,7 +71,7 @@ class TestAssignValueNPUOp4(TestAssignValueNPUOp):
def init_data(self): def init_data(self):
self.value = numpy.random.choice( self.value = numpy.random.choice(
a=[False, True], size=(2, 5)).astype(numpy.bool) a=[False, True], size=(2, 5)).astype(numpy.bool)
self.attrs["bool_values"] = [bool(v) for v in self.value.flat] self.attrs["bool_values"] = [int(v) for v in self.value.flat]
class TestAssignApi(unittest.TestCase): class TestAssignApi(unittest.TestCase):
......
...@@ -58,7 +58,7 @@ class TestAssignValueOp4(TestAssignValueOp): ...@@ -58,7 +58,7 @@ class TestAssignValueOp4(TestAssignValueOp):
def init_data(self): def init_data(self):
self.value = numpy.random.choice( self.value = numpy.random.choice(
a=[False, True], size=(2, 5)).astype(numpy.bool) a=[False, True], size=(2, 5)).astype(numpy.bool)
self.attrs["bool_values"] = [bool(v) for v in self.value.flat] self.attrs["bool_values"] = [int(v) for v in self.value.flat]
class TestAssignApi(unittest.TestCase): class TestAssignApi(unittest.TestCase):
......
...@@ -24,6 +24,7 @@ from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 ...@@ -24,6 +24,7 @@ from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
class ElementwiseDivOp(OpTest): class ElementwiseDivOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "elementwise_div" self.op_type = "elementwise_div"
self.python_api = paddle.divide
self.dtype = np.float64 self.dtype = np.float64
self.init_dtype() self.init_dtype()
""" Warning """ Warning
...@@ -37,8 +38,11 @@ class ElementwiseDivOp(OpTest): ...@@ -37,8 +38,11 @@ class ElementwiseDivOp(OpTest):
} }
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])} self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
def check_eager(self):
return (self.use_mkldnn == False and self.axis == -1)
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output(check_eager=False)
def test_check_grad_normal(self): def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
......
...@@ -182,7 +182,7 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -182,7 +182,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.func_auto_prune2() self.func_auto_prune2()
# TODO(jiabin): Support this when we support better split tensor # TODO(jiabin): Support this when we support better split tensor
def test_auto_prune3(self): def func_auto_prune3(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
case3 = AutoPruneLayer3(input_size=784) case3 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32") value1 = np.arange(784).reshape(1, 784).astype("float32")
...@@ -194,7 +194,12 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -194,7 +194,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case3.linear.weight._grad_ivar() is not None) self.assertTrue(case3.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all()) self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune4(self): def test_auto_prune3(self):
with _test_eager_guard():
self.func_auto_prune3()
self.func_auto_prune3()
def func_auto_prune4(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
case4 = AutoPruneLayer3(input_size=784) case4 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32") value1 = np.arange(784).reshape(1, 784).astype("float32")
...@@ -206,7 +211,12 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -206,7 +211,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case4.linear.weight._grad_ivar() is not None) self.assertTrue(case4.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 1).all()) self.assertTrue((part2.gradient() == 1).all())
def test_auto_prune5(self): def test_auto_prune4(self):
with _test_eager_guard():
self.func_auto_prune4()
self.func_auto_prune4()
def func_auto_prune5(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
case4 = AutoPruneLayer3(input_size=784) case4 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32") value1 = np.arange(784).reshape(1, 784).astype("float32")
...@@ -218,6 +228,11 @@ class TestImperativeAutoPrune(unittest.TestCase): ...@@ -218,6 +228,11 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case4.linear.weight._grad_ivar() is not None) self.assertTrue(case4.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all()) self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune5(self):
with _test_eager_guard():
self.func_auto_prune5()
self.func_auto_prune5()
def func_auto_prune6(self): def func_auto_prune6(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32") value0 = np.arange(26).reshape(2, 13).astype("float32")
......
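The test changes above (and in several files below) all follow the same refactor: the original test body becomes func_xxx, and test_xxx runs it once under the eager guard and once in legacy dygraph. A minimal sketch of the pattern with a hypothetical test class:

import unittest
from paddle.fluid.framework import _test_eager_guard

class ExampleEagerTest(unittest.TestCase):   # hypothetical, for illustration
    def func_case(self):
        self.assertTrue(True)                # original test body goes here

    def test_case(self):
        # run under the eager dygraph mode first, then the legacy mode
        with _test_eager_guard():
            self.func_case()
        self.func_case()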
...@@ -1819,7 +1819,7 @@ class TestLayer(LayerTest): ...@@ -1819,7 +1819,7 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, static_ret2))
def test_group_norm(self): def func_group_norm(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
else: else:
...@@ -1873,7 +1873,6 @@ class TestLayer(LayerTest): ...@@ -1873,7 +1873,6 @@ class TestLayer(LayerTest):
with_lod=True)[0] with_lod=True)[0]
with self.dynamic_graph(): with self.dynamic_graph():
# TODO(wuweilong): Add with _test_eager_guard():
groupNorm = nn.GroupNorm( groupNorm = nn.GroupNorm(
channels=shape[1], channels=shape[1],
groups=2, groups=2,
...@@ -1886,6 +1885,11 @@ class TestLayer(LayerTest): ...@@ -1886,6 +1885,11 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, dy_rlt_value)) self.assertTrue(np.allclose(static_ret, dy_rlt_value))
self.assertTrue(np.allclose(static_ret, static_ret2)) self.assertTrue(np.allclose(static_ret, static_ret2))
def test_group_norm(self):
with _test_eager_guard():
self.func_group_norm()
self.func_group_norm()
def test_instance_norm(self): def test_instance_norm(self):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
...@@ -2348,7 +2352,7 @@ class TestLayer(LayerTest): ...@@ -2348,7 +2352,7 @@ class TestLayer(LayerTest):
with self.assertRaises(TypeError): with self.assertRaises(TypeError):
layers.eye(num_rows=3, batch_shape=[-1]) layers.eye(num_rows=3, batch_shape=[-1])
def test_while_loop(self): def func_while_loop(self):
with self.static_graph(): with self.static_graph():
i = layers.fill_constant(shape=[1], dtype='int64', value=0) i = layers.fill_constant(shape=[1], dtype='int64', value=0)
ten = layers.fill_constant(shape=[1], dtype='int64', value=10) ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
...@@ -2363,7 +2367,6 @@ class TestLayer(LayerTest): ...@@ -2363,7 +2367,6 @@ class TestLayer(LayerTest):
static_ret = self.get_static_graph_result(feed={}, fetch_list=out) static_ret = self.get_static_graph_result(feed={}, fetch_list=out)
with self.dynamic_graph(): with self.dynamic_graph():
# TODO(wuweilong): Add with _test_eager_guard():
i = layers.fill_constant(shape=[1], dtype='int64', value=0) i = layers.fill_constant(shape=[1], dtype='int64', value=0)
ten = layers.fill_constant(shape=[1], dtype='int64', value=10) ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
...@@ -2384,6 +2387,11 @@ class TestLayer(LayerTest): ...@@ -2384,6 +2387,11 @@ class TestLayer(LayerTest):
self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy())) self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy()))
def test_while_loop(self):
with _test_eager_guard():
self.func_while_loop()
self.func_while_loop()
def test_compare(self): def test_compare(self):
value_a = np.arange(3) value_a = np.arange(3)
value_b = np.arange(3) value_b = np.arange(3)
......
...@@ -21,6 +21,7 @@ from paddle.fluid import core ...@@ -21,6 +21,7 @@ from paddle.fluid import core
from op_test import OpTest from op_test import OpTest
import numpy as np import numpy as np
from paddle.fluid.framework import _test_eager_guard from paddle.fluid.framework import _test_eager_guard
import os
def sample_output_one_dimension(out, dim): def sample_output_one_dimension(out, dim):
...@@ -250,6 +251,60 @@ class TestMultinomialError(unittest.TestCase): ...@@ -250,6 +251,60 @@ class TestMultinomialError(unittest.TestCase):
self.assertRaises(ValueError, test_dim_less_than_1) self.assertRaises(ValueError, test_dim_less_than_1)
class TestRandomValue(unittest.TestCase):
def test_fixed_random_number(self):
# Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
if not paddle.is_compiled_with_cuda():
return
# Different GPUs generate different random values. Only test V100 here.
if not "V100" in paddle.device.cuda.get_device_name():
return
if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
return
print("Test Fixed Random number on V100 GPU------>")
paddle.disable_static()
paddle.set_device('gpu')
paddle.seed(100)
x = paddle.randint(0, 100, [1024, 10000]).astype('float32')
y = paddle.multinomial(x, 1, replacement=False).numpy()
self.assertEqual(np.sum(y), 5187793)
self.assertEqual(np.mean(y), 5066.2041015625)
expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628]
self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect))
y = paddle.multinomial(x, 5000, replacement=False).numpy()
self.assertEqual(np.sum(y), 25603962316)
self.assertEqual(np.mean(y), 5000.77388984375)
expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916]
self.assertTrue(np.array_equal(y[100, 1000:1010], expect))
y = paddle.multinomial(x, 5000, replacement=False).numpy()
self.assertEqual(np.sum(y), 25592855710)
self.assertEqual(np.mean(y), 4998.604630859375)
expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385]
self.assertTrue(np.array_equal(y[300, 3000:3010], expect))
y = paddle.multinomial(x, 20000, replacement=True).numpy()
self.assertEqual(np.sum(y), 102371362581)
self.assertEqual(np.mean(y), 4998.60168852539)
self.assertEqual(np.std(y), 2886.316308500771)
expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156]
self.assertTrue(np.array_equal(y[100, 0:10], expect))
y = paddle.multinomial(x, 20000, replacement=True).numpy()
self.assertEqual(np.sum(y), 102400672117)
self.assertEqual(np.mean(y), 5000.032818212891)
self.assertEqual(np.std(y), 2886.913426124017)
expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911]
self.assertTrue(np.array_equal(y[100, 0:10], expect))
paddle.enable_static()
if __name__ == "__main__": if __name__ == "__main__":
paddle.enable_static() paddle.enable_static()
unittest.main() unittest.main()
...@@ -56,7 +56,15 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -56,7 +56,15 @@ class TestProfilerStatistic(unittest.TestCase):
mobilenet_node = HostPythonNode( mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001) 'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode( yolonet_node = HostPythonNode(
'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001) 'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
userdefined_node = HostPythonNode('Communication Time',
profiler.TracerEventType.UserDefined,
100, 110, 1000, 1001)
communication_node = HostPythonNode(
'Communication', profiler.TracerEventType.Communication, 105, 110,
1000, 1001)
backward_node = HostPythonNode('Gradient Backward', backward_node = HostPythonNode('Gradient Backward',
profiler.TracerEventType.Backward, 120, profiler.TracerEventType.Backward, 120,
200, 1000, 1001) 200, 1000, 1001)
...@@ -114,7 +122,9 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -114,7 +122,9 @@ class TestProfilerStatistic(unittest.TestCase):
optimization_node optimization_node
]) ])
mobilenet_node.children_node.append(conv2d_node) mobilenet_node.children_node.append(conv2d_node)
yolonet_node.children_node.append(sync_batch_norm_node) yolonet_node.children_node.extend(
[sync_batch_norm_node, userdefined_node])
userdefined_node.children_node.append(communication_node)
conv2d_node.children_node.extend( conv2d_node.children_node.extend(
[conv2d_infer_shape, conv2d_compute, conv2d_MemCpy]) [conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
conv2d_compute.runtime_node.append(conv2d_launchkernel) conv2d_compute.runtime_node.append(conv2d_launchkernel)
...@@ -145,7 +155,7 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -145,7 +155,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400) profiler.TracerEventType.ProfileStep), 400)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 90) profiler.TracerEventType.Forward), 100)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80) profiler.TracerEventType.Backward), 80)
...@@ -169,15 +179,18 @@ class TestProfilerStatistic(unittest.TestCase): ...@@ -169,15 +179,18 @@ class TestProfilerStatistic(unittest.TestCase):
0, profiler.TracerEventType.Memcpy), 60) 0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual( self.assertEqual(
time_range_summary.get_cpu_range_sum( time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 15) profiler.TracerEventType.UserDefined), 25)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Communication), 5)
self.assertEqual(len(event_summary.items), 2) self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 0) self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 3) self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.memory_manipulation_items), 1) self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15) self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25) self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 90) event_summary.model_perspective_items['Forward'].cpu_time, 100)
self.assertEqual( self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 135) event_summary.model_perspective_items['Forward'].gpu_time, 135)
self.assertEqual( self.assertEqual(
......
...@@ -116,7 +116,7 @@ class PS_Test(unittest.TestCase): ...@@ -116,7 +116,7 @@ class PS_Test(unittest.TestCase):
return proc return proc
def test_ps_1(self): def test_ps_1(self):
args = "--mode ps" args = "--run_mode ps"
p = self.pdrun(args) p = self.pdrun(args)
p.wait() p.wait()
self.assertTrue(p.poll() == 0) self.assertTrue(p.poll() == 0)
......
...@@ -22,6 +22,7 @@ import numpy as np ...@@ -22,6 +22,7 @@ import numpy as np
import paddle import paddle
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
from functools import reduce from functools import reduce
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
class TestSetValueBase(unittest.TestCase): class TestSetValueBase(unittest.TestCase):
...@@ -69,7 +70,7 @@ class TestSetValueApi(TestSetValueBase): ...@@ -69,7 +70,7 @@ class TestSetValueApi(TestSetValueBase):
paddle.enable_static() paddle.enable_static()
return out return out
def test_api(self): def func_test_api(self):
static_out = self._run_static() static_out = self._run_static()
dynamic_out = self._run_dynamic() dynamic_out = self._run_dynamic()
self._get_answer() self._get_answer()
...@@ -82,6 +83,11 @@ class TestSetValueApi(TestSetValueBase): ...@@ -82,6 +83,11 @@ class TestSetValueApi(TestSetValueBase):
(self.data == dynamic_out).all(), (self.data == dynamic_out).all(),
msg=error_msg.format("dynamic", self.data, dynamic_out)) msg=error_msg.format("dynamic", self.data, dynamic_out))
def test_api(self):
with _test_eager_guard():
self.func_test_api()
self.func_test_api()
# 1. Test different type of item: int, Python slice, Paddle Tensor # 1. Test different type of item: int, Python slice, Paddle Tensor
# 1.1 item is int # 1.1 item is int
...@@ -995,9 +1001,9 @@ class TestBackward(unittest.TestCase): ...@@ -995,9 +1001,9 @@ class TestBackward(unittest.TestCase):
fetch_list=[var.name + "@GRAD", z.name + "@GRAD"]) fetch_list=[var.name + "@GRAD", z.name + "@GRAD"])
self.assertTrue((var_grad == z_grad[0, :]).all()) self.assertTrue((var_grad == z_grad[0, :]).all())
def test_dynamic(self):
paddle.disable_static() paddle.disable_static()
def func_test_dynamic(self):
model = Model() model = Model()
x = paddle.ones([1, 12, 3, 3]).astype("float32") x = paddle.ones([1, 12, 3, 3]).astype("float32")
y = paddle.ones([1, 12, 3, 3]).astype("float32") y = paddle.ones([1, 12, 3, 3]).astype("float32")
...@@ -1006,11 +1012,18 @@ class TestBackward(unittest.TestCase): ...@@ -1006,11 +1012,18 @@ class TestBackward(unittest.TestCase):
self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape) self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape)
# #
self.assertTrue((0 == x.grad[0, :, 0, 0]).all()) # TODO(pangyoki) add inplace and delete if
if not _in_eager_mode():
self.assertTrue((0 == x.grad[0, :, 0, 0]).all())
def test_dynamic(self):
with _test_eager_guard():
self.func_test_dynamic()
self.func_test_dynamic()
class TestGradientTruncated(unittest.TestCase): class TestGradientTruncated(unittest.TestCase):
def test_consistent_with_competitor(self): def func_test_consistent_with_competitor(self):
paddle.disable_static() paddle.disable_static()
def set_value(t, value): def set_value(t, value):
...@@ -1182,6 +1195,11 @@ class TestGradientTruncated(unittest.TestCase): ...@@ -1182,6 +1195,11 @@ class TestGradientTruncated(unittest.TestCase):
self.assertTrue(~x.stop_gradient) self.assertTrue(~x.stop_gradient)
self.assertTrue(~x.is_leaf) self.assertTrue(~x.is_leaf)
def test_consistent_with_competitor(self):
with _test_eager_guard():
self.func_test_consistent_with_competitor()
self.func_test_consistent_with_competitor()
def test_static_graph(self): def test_static_graph(self):
paddle.enable_static() paddle.enable_static()
...@@ -1328,6 +1346,7 @@ class TestGradientTruncated(unittest.TestCase): ...@@ -1328,6 +1346,7 @@ class TestGradientTruncated(unittest.TestCase):
self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all()) self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all())
array = array[0] array = array[0]
paddle.disable_static()
class TestSetValueInplace(unittest.TestCase): class TestSetValueInplace(unittest.TestCase):
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numpy.lib.stride_tricks import as_strided
import paddle
import unittest
from op_test import OpTest
def frame_from_librosa(x, frame_length, hop_length, axis=-1):
if axis == -1 and not x.flags["C_CONTIGUOUS"]:
x = np.ascontiguousarray(x)
elif axis == 0 and not x.flags["F_CONTIGUOUS"]:
x = np.asfortranarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * x.itemsize]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * x.itemsize] + list(strides)
else:
raise ValueError("Frame axis={} must be either 0 or -1".format(axis))
return as_strided(x, shape=shape, strides=strides)
def stft_np(x, n_fft, hop_length, **kwargs):
frames = frame_from_librosa(x, n_fft, hop_length)
res = np.fft.rfft(frames, axis=1)
return res
class TestStftOp(OpTest):
def setUp(self):
self.op_type = "stft"
self.shape, self.type, self.attrs = self.initTestCase()
self.inputs = {
'X': np.random.random(size=self.shape).astype(self.type),
}
self.outputs = {'Out': stft_np(x=self.inputs['X'], **self.attrs)}
def initTestCase(self):
input_shape = (2, 100)
input_type = 'float64'
attrs = {
'n_fft': 50,
'hop_length': 15,
'normalized': False,
'onesided': True,
}
return input_shape, input_type, attrs
def test_check_output(self):
paddle.enable_static()
self.check_output()
paddle.disable_static()
def test_check_grad_normal(self):
paddle.enable_static()
self.check_grad(['X'], 'Out')
paddle.disable_static()
if __name__ == '__main__':
unittest.main()
...@@ -22,6 +22,7 @@ import copy ...@@ -22,6 +22,7 @@ import copy
import paddle import paddle
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
class TestVarBase(unittest.TestCase): class TestVarBase(unittest.TestCase):
...@@ -874,7 +875,7 @@ class TestVarBase(unittest.TestCase): ...@@ -874,7 +875,7 @@ class TestVarBase(unittest.TestCase):
col = np.array([2, 1, 3]) col = np.array([2, 1, 3])
self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy())) self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy()))
def test_slice(self): def func_test_slice(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
self._test_slice() self._test_slice()
self._test_slice_for_tensor_attr() self._test_slice_for_tensor_attr()
...@@ -899,6 +900,11 @@ class TestVarBase(unittest.TestCase): ...@@ -899,6 +900,11 @@ class TestVarBase(unittest.TestCase):
mask = np.array([1, 0, 1, 0], dtype=bool) mask = np.array([1, 0, 1, 0], dtype=bool)
var[paddle.to_tensor([0, 1]), mask] var[paddle.to_tensor([0, 1]), mask]
def test_slice(self):
with _test_eager_guard():
self.func_test_slice()
self.func_test_slice()
def test_var_base_to_np(self): def test_var_base_to_np(self):
with fluid.dygraph.guard(): with fluid.dygraph.guard():
var = fluid.dygraph.to_variable(self.array) var = fluid.dygraph.to_variable(self.array)
...@@ -1125,7 +1131,6 @@ class TestVarBase(unittest.TestCase): ...@@ -1125,7 +1131,6 @@ class TestVarBase(unittest.TestCase):
class TestVarBaseSetitem(unittest.TestCase): class TestVarBaseSetitem(unittest.TestCase):
def setUp(self): def setUp(self):
paddle.disable_static()
self.set_dtype() self.set_dtype()
self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype)) self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype))
self.np_value = np.random.random((2, 3)).astype(self.dtype) self.np_value = np.random.random((2, 3)).astype(self.dtype)
...@@ -1135,12 +1140,13 @@ class TestVarBaseSetitem(unittest.TestCase): ...@@ -1135,12 +1140,13 @@ class TestVarBaseSetitem(unittest.TestCase):
self.dtype = "int32" self.dtype = "int32"
def _test(self, value): def _test(self, value):
paddle.disable_static() if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 0) self.assertEqual(self.tensor_x.inplace_version, 0)
id_origin = id(self.tensor_x) id_origin = id(self.tensor_x)
self.tensor_x[0] = value self.tensor_x[0] = value
self.assertEqual(self.tensor_x.inplace_version, 1) if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 1)
if isinstance(value, (six.integer_types, float)): if isinstance(value, (six.integer_types, float)):
result = np.zeros((2, 3)).astype(self.dtype) + value result = np.zeros((2, 3)).astype(self.dtype) + value
...@@ -1152,27 +1158,47 @@ class TestVarBaseSetitem(unittest.TestCase): ...@@ -1152,27 +1158,47 @@ class TestVarBaseSetitem(unittest.TestCase):
self.assertEqual(id_origin, id(self.tensor_x)) self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[1:2] = value self.tensor_x[1:2] = value
self.assertEqual(self.tensor_x.inplace_version, 2) if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 2)
self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result)) self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x)) self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[...] = value self.tensor_x[...] = value
self.assertEqual(self.tensor_x.inplace_version, 3) if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 3)
self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result)) self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x)) self.assertEqual(id_origin, id(self.tensor_x))
def test_value_tensor(self): def func_test_value_tensor(self):
paddle.disable_static()
self._test(self.tensor_value) self._test(self.tensor_value)
def test_value_numpy(self): def test_value_tensor(self):
paddle.disable_static() with _test_eager_guard():
self.setUp()
self.func_test_value_tensor()
self.setUp()
self.func_test_value_tensor()
def func_test_value_numpy(self):
self._test(self.np_value) self._test(self.np_value)
def test_value_int(self): def test_value_numpy(self):
paddle.disable_static() with _test_eager_guard():
self.setUp()
self.func_test_value_numpy()
self.setUp()
self.func_test_value_numpy()
def func_test_value_int(self):
self._test(10) self._test(10)
def test_value_int(self):
with _test_eager_guard():
self.setUp()
self.func_test_value_int()
self.setUp()
self.func_test_value_int()
class TestVarBaseSetitemInt64(TestVarBaseSetitem): class TestVarBaseSetitemInt64(TestVarBaseSetitem):
def set_dtype(self): def set_dtype(self):
......
...@@ -382,7 +382,7 @@ def _getitem_impl_(var, item): ...@@ -382,7 +382,7 @@ def _getitem_impl_(var, item):
idx = assign(np.array(slice_item).astype("int32")) idx = assign(np.array(slice_item).astype("int32"))
return index_select(var, index=idx, axis=0) return index_select(var, index=idx, axis=0)
elif isinstance(slice_item, (Variable)): elif isinstance(slice_item, (Variable, core.eager.Tensor)):
if len(item) == 1: if len(item) == 1:
from ..tensor import index_select, gather_nd from ..tensor import index_select, gather_nd
...@@ -636,7 +636,7 @@ def _setitem_impl_(var, item, value): ...@@ -636,7 +636,7 @@ def _setitem_impl_(var, item, value):
shape = list(value.shape) shape = list(value.shape)
if dtype == core.VarDesc.VarType.BOOL: if dtype == core.VarDesc.VarType.BOOL:
value_name = "bool_values" value_name = "bool_values"
values = [bool(v) for v in value.flat] values = [int(v) for v in value.flat]
elif dtype == core.VarDesc.VarType.FP32: elif dtype == core.VarDesc.VarType.FP32:
value_name = "fp32_values" value_name = "fp32_values"
values = [float(v) for v in value.flat] values = [float(v) for v in value.flat]
...@@ -657,7 +657,7 @@ def _setitem_impl_(var, item, value): ...@@ -657,7 +657,7 @@ def _setitem_impl_(var, item, value):
attrs[value_name] = values attrs[value_name] = values
attrs["shape"] = shape attrs["shape"] = shape
elif isinstance(value, Variable): elif isinstance(value, (Variable, core.eager.Tensor)):
inputs["ValueTensor"] = value inputs["ValueTensor"] = value
else: else:
raise TypeError( raise TypeError(
...@@ -665,7 +665,9 @@ def _setitem_impl_(var, item, value): ...@@ -665,7 +665,9 @@ def _setitem_impl_(var, item, value):
"paddle.Tensor to a paddle.Tensor, but received {}".format( "paddle.Tensor to a paddle.Tensor, but received {}".format(
type(value))) type(value)))
if paddle.fluid.framework.in_dygraph_mode(
) and not paddle.fluid.framework._in_eager_mode():
# TODO(pangyoki) add inplace(BumpInplaceVersion) if need
var._bump_inplace_version() var._bump_inplace_version()
cur_block = default_main_program().current_block() cur_block = default_main_program().current_block()
......
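A hedged sketch of the ValueTensor branch in _setitem_impl_ above: assigning a Tensor (rather than a scalar or numpy value) to a slice, which in dygraph also passes through the inplace-version bump unless eager mode is active.

import paddle

x = paddle.zeros([4, 3], dtype="float32")
value = paddle.full([3], 7.0, dtype="float32")
x[1] = value      # Tensor value -> handled via inputs["ValueTensor"] in the static-graph path
print(x.numpy())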
...@@ -20,7 +20,7 @@ from .utils import RecordEvent, load_profiler_result ...@@ -20,7 +20,7 @@ from .utils import RecordEvent, load_profiler_result
from .profiler_statistic import SortedKeys from .profiler_statistic import SortedKeys
__all__ = [ __all__ = [
'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler', 'ProfilerState', 'ProfilerTarget', 'make_scheduler',
'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent', 'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
'load_profiler_result', 'SortedKeys' 'load_profiler_result', 'SortedKeys'
] ]
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -24,7 +24,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, ...@@ -24,7 +24,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
TracerEventType) TracerEventType)
from .utils import RecordEvent, wrap_optimizers from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import SortedKeys from .profiler_statistic import StatisticData, _build_table, SortedKeys
class ProfilerState(Enum): class ProfilerState(Enum):
...@@ -32,21 +32,28 @@ class ProfilerState(Enum): ...@@ -32,21 +32,28 @@ class ProfilerState(Enum):
Profiler state that can be specified to control profiler action. Profiler state that can be specified to control profiler action.
CLOSED: The profilers are closed. CLOSED: The profilers are closed.
READY: The profilers are open, but the data will not be recorded. READY: The profilers are open, but the data will not be recorded.
This state is used for reducing overhead influence when profilers start. This state is used for reducing overhead influence when profilers start.
RECORD: The profilers are open, and the data will be recorded. RECORD: The profilers are open, and the data will be recorded.
    RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period,
        the collected data will be returned.
""" """
CLOSED = 0 CLOSED = 0
READY = 1 READY = 1
RECORD = 2 RECORD = 2
RECORD_AND_RETURN = 3 # the last step of RECORD RECORD_AND_RETURN = 3 # the last step of RECORD
class ProfilerTarget(Enum): class ProfilerTarget(Enum):
r""" r"""
Target device for profiling. Target device for profiling.
CPU: Profile events on CPU.
GPU: Profile events on GPU.
""" """
CPU = 0 CPU = 0
GPU = 1 GPU = 1
...@@ -62,17 +69,19 @@ def make_scheduler(*, ...@@ -62,17 +69,19 @@ def make_scheduler(*,
Return a scheduler function, which scheduler the state according to the setting. Return a scheduler function, which scheduler the state according to the setting.
The state transform confirms to: The state transform confirms to:
    .. code-block:: text

        (CLOSED)  (CLOSED)    (CLOSED)  (READY)    (RECORD,last RETURN)      (CLOSED)
        START -> skip_first -> closed -> ready    ->    record       ->       END
                                |                        |
                                |                        | (if has_repeated < repeat)
                                - - - - - - - - - - - -

    Note that repeat <= 0 means the cycle will continue until the profiler exits.
Parameters: Parameters:
closed(int): The number of steps in state ProfilerState.CLOSED. closed(int): The number of steps in state ProfilerState.CLOSED.
ready(int): The number of steps in state ProfilerState.READY. ready(int): The number of steps in state ProfilerState.READY.
record(int): The number of steps in state ProfilerState.RECORD. record(int): The number of steps in state ProfilerState.RECORD.
repeat(int): The number of cycles to repeat above state transform. repeat(int): The number of cycles to repeat above state transform.
skip_first(int): The number of first steps to drop, not participate in the state transform. skip_first(int): The number of first steps to drop, not participate in the state transform.
...@@ -81,13 +90,23 @@ def make_scheduler(*, ...@@ -81,13 +90,23 @@ def make_scheduler(*,
    Examples:
        1. profiling range [2, 5]
        batch 0: closed, batch 1: ready, batch [2, 5] record

            .. code-block:: python

                import paddle.profiler as profiler
                profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)

        2. profiling range [3,6], [9,12], [15,18]...
        batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat

            .. code-block:: python

                import paddle.profiler as profiler
                profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
""" """
def getScheduleState(step: int) -> ProfilerState: def getScheduleState(step: int) -> ProfilerState:
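The scheduler contract shown above is simply "callable from step number to ProfilerState"; a hand-written scheduler such as the hedged sketch below can be passed wherever make_scheduler's result is accepted.

import paddle.profiler as profiler

def my_scheduler(step: int) -> profiler.ProfilerState:
    # Record batches [2, 5) and return the collected data on batch 4.
    if step < 1:
        return profiler.ProfilerState.CLOSED
    if step < 2:
        return profiler.ProfilerState.READY
    if step < 4:
        return profiler.ProfilerState.RECORD
    if step == 4:
        return profiler.ProfilerState.RECORD_AND_RETURN
    return profiler.ProfilerState.CLOSED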
...@@ -138,15 +157,16 @@ def export_chrome_tracing(dir_name: str, ...@@ -138,15 +157,16 @@ def export_chrome_tracing(dir_name: str,
    Examples:
        .. code-block:: python

            # required: gpu
            import paddle.profiler as profiler
            with profiler.Profiler(
                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
                    scheduler = (3, 10),
                    on_trace_ready=profiler.export_protobuf('./log')) as p:
                for iter in range(10):
                    #train()
                    p.step()
""" """
if not os.path.exists(dir_name): if not os.path.exists(dir_name):
try: try:
...@@ -181,15 +201,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable: ...@@ -181,15 +201,16 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
    Examples:
        .. code-block:: python

            # required: gpu
            import paddle.profiler as profiler
            with profiler.Profiler(
                    targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
                    scheduler = (3, 10),
                    on_trace_ready = profiler.export_protobuf('./log')) as p:
                for iter in range(10):
                    #train()
                    p.step()
""" """
if not os.path.exists(dir_name): if not os.path.exists(dir_name):
try: try:
...@@ -216,7 +237,7 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: ...@@ -216,7 +237,7 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
r""" r"""
Get the current supported profiler target in the system. Get the current supported profiler target in the system.
""" """
if paddle.device.is_compiled_with_cuda(): if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU] return [ProfilerTarget.CPU, ProfilerTarget.GPU]
return [ProfilerTarget.CPU] return [ProfilerTarget.CPU]
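Since _get_supported_targets now asks the tracer (CUPTI) directly, a defensive way for user code to request targets is sketched below; paddle.device.is_compiled_with_cuda() is only a coarse public proxy and is an assumption of this sketch, not part of the change.

import paddle
import paddle.profiler as profiler

targets = [profiler.ProfilerTarget.CPU]
if paddle.device.is_compiled_with_cuda():   # coarse check; CUPTI support may still be absent
    targets.append(profiler.ProfilerTarget.GPU)

prof = profiler.Profiler(targets=targets, scheduler=(3, 7))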
...@@ -226,48 +247,56 @@ class Profiler: ...@@ -226,48 +247,56 @@ class Profiler:
Profiler context manager, user interface to manage profile process. Profiler context manager, user interface to manage profile process.
    Parameters:
        targets (iterable): list of tracing targets, currently supported values, ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU`` .
        scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``.
            If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
            which means profiling range [start_batch, end_batch).
        on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing.
            This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
    Examples:
        1. profiling range [2, 5)

            .. code-block:: python

                # required: gpu
                import paddle.profiler as profiler
                with profiler.Profiler(
                        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
                        scheduler = (2, 5),
                        on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
                    for iter in range(10):
                        #train()
                        p.step()

        2. profiling range [2,4], [7, 9], [11,13]

            .. code-block:: python

                # required: gpu
                import paddle.profiler as profiler
                with profiler.Profiler(
                        targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
                        scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3),
                        on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
                    for iter in range(10):
                        #train()
                        p.step()

        3. Use profiler without context manager, and use default parameters

            .. code-block:: python

                # required: gpu
                import paddle.profiler as profiler
                p = profiler.Profiler()
                p.start()
                for iter in range(10):
                    #train()
                    p.step()
                p.stop()
                p.summary()
""" """
def __init__( def __init__(
...@@ -334,7 +363,22 @@ class Profiler: ...@@ -334,7 +363,22 @@ class Profiler:
def start(self): def start(self):
r''' r'''
Start profiler and enter the first profiler step(0). Start profiler and enter the first profiler step(0).
State transformed from CLOSED to self.current_state and trigger corresponding action. State transformed from CLOSED to self.current_state and trigger corresponding action.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (1, 9),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
''' '''
# CLOSED -> self.current_state # CLOSED -> self.current_state
if self.current_state == ProfilerState.READY: if self.current_state == ProfilerState.READY:
...@@ -354,6 +398,21 @@ class Profiler: ...@@ -354,6 +398,21 @@ class Profiler:
r''' r'''
Stop profiler and State transformed from self.current_state to CLOSED. Stop profiler and State transformed from self.current_state to CLOSED.
Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists. Trigger corresponding action and post-process profiler result using self.on_trace_ready if result exists.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (1, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
''' '''
# self.current_state -> CLOSED # self.current_state -> CLOSED
# In this situation, RECORD state is regarded as RECORD_AND_RETURN # In this situation, RECORD state is regarded as RECORD_AND_RETURN
...@@ -375,6 +434,22 @@ class Profiler: ...@@ -375,6 +434,22 @@ class Profiler:
r""" r"""
Signals the profiler that the next profiling step has started. Signals the profiler that the next profiling step has started.
Get the new ProfilerState and trigger corresponding action. Get the new ProfilerState and trigger corresponding action.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
""" """
if self.record_event: if self.record_event:
self.record_event.end() self.record_event.end()
...@@ -448,6 +523,21 @@ class Profiler: ...@@ -448,6 +523,21 @@ class Profiler:
def export(self, path="", format="json"): def export(self, path="", format="json"):
r""" r"""
Exports the tracing data in Chrome tracing data format. Exports the tracing data in Chrome tracing data format.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
prof.export(path="./profiler_data.json", format="json")
""" """
if self.profiler_result: if self.profiler_result:
self.profiler_result.save(path, format) self.profiler_result.save(path, format)
...@@ -461,9 +551,35 @@ class Profiler: ...@@ -461,9 +551,35 @@ class Profiler:
Print the Summary table. Print the Summary table.
Parameters: Parameters:
        sorted_by(SortedKeys): how to rank the op table items.
        op_detail(bool): expand each operator detail information.
        thread_sep(bool): print op table each thread.
        time_unit(str): can be chosen form ['s', 'ms', 'us', 'ns']
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
prof.summary(sorted_by=profiler.SortedKeys.CPUTotal, op_detail=True, thread_sep=False, time_unit='ms')
""" """
if self.profiler_result:
statistic_data = StatisticData(
self.profiler_result.get_data(),
self.profiler_result.get_extra_info())
print(
_build_table(
statistic_data,
sorted_by=sorted_by,
op_detail=op_detail,
thread_sep=thread_sep,
time_unit=time_unit))
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -34,6 +34,22 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc'] ...@@ -34,6 +34,22 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum): class SortedKeys(Enum):
r""" r"""
Sorted keys for printing summary table. Sorted keys for printing summary table.
CPUTotal: Sorted by CPU total time.
CPUAvg: Sorted by CPU average time.
CPUMax: Sorted by CPU max time.
CPUMin: Sorted by CPU min time.
GPUTotal: Sorted by GPU total time.
GPUAvg: Sorted by GPU average time.
GPUMax: Sorted by GPU max time.
GPUMin: Sorted by GPU min time.
""" """
CPUTotal = 0 CPUTotal = 0
CPUAvg = 1 CPUAvg = 1
...@@ -642,6 +658,171 @@ def _build_table(statistic_data, ...@@ -642,6 +658,171 @@ def _build_table(statistic_data,
append('') append('')
append('') append('')
###### Print Model Summary Report ######
model_perspective_items = statistic_data.event_summary.model_perspective_items
if model_perspective_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Model Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
accmulation_time = 0
row_values = [
'Total Time', '-', '{} / - / - / - / {}'.format(
format_time(
total_time, unit=time_unit), format_ratio(1)),
'- / - / - / -/ -'
]
append(row_format.format(*row_values))
for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']:
if name in model_perspective_items:
item = model_perspective_items[name]
row_values = [
' {}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time))
]
append(row_format.format(*row_values))
accmulation_time += item.cpu_time
other_time = total_time - accmulation_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(
other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'- / - / - / - / -'
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
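The Model Summary rows above lean on the module's format_time and format_ratio helpers; the sketch below is only a plausible stand-in (assuming nanosecond inputs) to show what kind of strings the table expects, not the actual implementations.

def format_time(time_ns, unit='ms'):
    # Assumed helper: scale a nanosecond duration into the requested unit.
    scale = {'s': 1e9, 'ms': 1e6, 'us': 1e3, 'ns': 1.0}[unit]
    return '{:.2f}'.format(time_ns / scale)

def format_ratio(ratio):
    # Assumed helper: render a 0-1 ratio as a percentage string.
    return '{:.2f}'.format(ratio * 100)

print(format_time(1.5e6, unit='ms'), format_ratio(0.25))  # -> 1.50 25.00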
###### Print Distribution Summary Report ######
if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange:
headers = [
'Name',
'Total Time',
'Ratio (%)',
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
DEFAULT_COLUMN_WIDTH = 20
for _ in headers:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Distribution Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
cpu_communication_time_range = []
gpu_communication_time_range = []
cpu_communication_time_range = merge_ranges(
statistic_data.time_range_summary.CPUTimeRange[
TracerEventType.Communication], cpu_communication_time_range)
kernel_time_range = []
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
kernel_time_range = merge_ranges(
device_time_ranges[TracerEventType.Kernel],
kernel_time_range,
is_sorted=True)
gpu_communication_time_range = merge_ranges(
device_time_ranges[TracerEventType.Communication],
gpu_communication_time_range,
is_sorted=True)
communication_time_range = merge_ranges(
cpu_communication_time_range,
gpu_communication_time_range,
is_sorted=True)
computation_time_range = subtract_ranges(kernel_time_range,
gpu_communication_time_range)
overlap_time_range = intersection_ranges(communication_time_range,
computation_time_range)
communication_time = sum_ranges(communication_time_range)
computation_time = sum_ranges(computation_time_range)
overlap_time = sum_ranges(overlap_time_range)
row_values = [
'Communication', format_time(
communication_time, unit=time_unit),
format_ratio(float(communication_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
'Computation', format_time(
computation_time, unit=time_unit),
format_ratio(float(computation_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
'Overlap', format_time(
overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, substract kernels belong to communication op.\n"
"Overlap time: Communication time intersect with computation time.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
" GPU: |______________|\n"
" Total: |_________________| |______________|\n"
"Computation time(Kernel):\n"
" GPU: |________________|\n"
"Overlap time: |___________|\n")
append('-' * line_length)
append('')
append('')
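The Distribution Summary above is built from interval arithmetic over [start, end) time ranges (merge_ranges, intersection_ranges, sum_ranges, ...). The standalone sketch below mirrors those names to show the idea; it is not the module's actual implementation.

def merge_ranges(a, b):
    # Union of two lists of [start, end) ranges, coalescing overlaps.
    merged = []
    for start, end in sorted(a + b):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

def intersection_ranges(a, b):
    # Overlap of two sorted, non-overlapping range lists.
    out, i, j = [], 0, 0
    while i < len(a) and j < len(b):
        start, end = max(a[i][0], b[j][0]), min(a[i][1], b[j][1])
        if start < end:
            out.append((start, end))
        if a[i][1] < b[j][1]:
            i += 1
        else:
            j += 1
    return out

def sum_ranges(ranges):
    return sum(end - start for start, end in ranges)

communication = merge_ranges([(0, 4)], [(3, 6)])                     # [(0, 6)]
computation = [(5, 9)]
print(sum_ranges(intersection_ranges(communication, computation)))   # overlap = 1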
###### Print Operator Summary Report ###### ###### Print Operator Summary Report ######
if statistic_data.event_summary.items: if statistic_data.event_summary.items:
headers = [ headers = [
...@@ -708,11 +889,6 @@ def _build_table(statistic_data, ...@@ -708,11 +889,6 @@ def _build_table(statistic_data,
sorted_items = sorted( sorted_items = sorted(
items.items(), key=lambda x: x[1].min_gpu_time) items.items(), key=lambda x: x[1].min_gpu_time)
total_cpu_time = 0
total_gpu_time = 0
for name, item in sorted_items:
total_cpu_time += item.cpu_time
total_gpu_time += item.gpu_time
for name, item in sorted_items: for name, item in sorted_items:
row_values = [ row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format( name, item.call, '{} / {} / {} / {} / {}'.format(
...@@ -724,7 +900,7 @@ def _build_table(statistic_data, ...@@ -724,7 +900,7 @@ def _build_table(statistic_data,
item.max_cpu_time, unit=time_unit), item.max_cpu_time, unit=time_unit),
format_time( format_time(
item.min_cpu_time, unit=time_unit), item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_cpu_time)), format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format( '{} / {} / {} / {} / {}'.format(
format_time( format_time(
item.gpu_time, unit=time_unit), item.gpu_time, unit=time_unit),
...@@ -734,7 +910,7 @@ def _build_table(statistic_data, ...@@ -734,7 +910,7 @@ def _build_table(statistic_data,
item.max_gpu_time, unit=time_unit), item.max_gpu_time, unit=time_unit),
format_time( format_time(
item.min_gpu_time, unit=time_unit), item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_gpu_time)) format_ratio(float(item.gpu_time) / total_time))
] ]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
if op_detail: if op_detail:
...@@ -752,8 +928,7 @@ def _build_table(statistic_data, ...@@ -752,8 +928,7 @@ def _build_table(statistic_data,
format_time( format_time(
innerop_node.min_cpu_time, unit=time_unit), innerop_node.min_cpu_time, unit=time_unit),
format_ratio( format_ratio(
float(innerop_node.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format( '{} / {} / {} / {} / {}'.format(
format_time( format_time(
innerop_node.gpu_time, unit=time_unit), innerop_node.gpu_time, unit=time_unit),
...@@ -764,8 +939,7 @@ def _build_table(statistic_data, ...@@ -764,8 +939,7 @@ def _build_table(statistic_data,
format_time( format_time(
innerop_node.min_gpu_time, unit=time_unit), innerop_node.min_gpu_time, unit=time_unit),
format_ratio( format_ratio(
float(innerop_node.gpu_time) / total_time))
] ]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
for device_node_name, devicenode in innerop_node.devices.items( for device_node_name, devicenode in innerop_node.devices.items(
...@@ -792,7 +966,7 @@ def _build_table(statistic_data, ...@@ -792,7 +966,7 @@ def _build_table(statistic_data,
unit=time_unit), unit=time_unit),
format_ratio( format_ratio(
float(devicenode.gpu_time) /
total_time))
] ]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
for device_node_name, device_node in item.devices.items(): for device_node_name, device_node in item.devices.items():
...@@ -814,11 +988,160 @@ def _build_table(statistic_data, ...@@ -814,11 +988,160 @@ def _build_table(statistic_data,
format_time( format_time(
devicenode.min_gpu_time, unit=time_unit), devicenode.min_gpu_time, unit=time_unit),
format_ratio( format_ratio(
float(devicenode.gpu_time) / total_time))
] ]
append(row_format.format(*row_values)) append(row_format.format(*row_values))
append(header_sep) append(header_sep)
append('') append('')
append('') append('')
###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
for name, item in memory_manipulation_items.items():
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time)),
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "UserDefined Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
if thread_sep == True:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
userdefined_thread_items = {
'All threads merged':
statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
append(add_title(line_length, "Thread: {}".format(thread_id)))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].cpu_time, reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].gpu_time, reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_gpu_time)
for name, item in sorted_items:
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time)),
]
append(row_format.format(*row_values))
append(header_sep)
return ''.join(result) return ''.join(result)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0
# #
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from paddle.fluid.core import (_RecordEvent, TracerEventType,
load_profiler_result)
from typing import Any from typing import Any
from warnings import warn from warnings import warn
import functools import functools
from contextlib import ContextDecorator from contextlib import ContextDecorator
from paddle.fluid.core import (_RecordEvent, TracerEventType)
import paddle.fluid.core as core
_AllowedEventTypeList = [ _AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep, TracerEventType.Dataloader, TracerEventType.ProfileStep,
TracerEventType.UserDefined, TracerEventType.Forward, TracerEventType.UserDefined, TracerEventType.Forward,
...@@ -32,14 +33,28 @@ class RecordEvent(ContextDecorator): ...@@ -32,14 +33,28 @@ class RecordEvent(ContextDecorator):
Interface for recording a time range. Interface for recording a time range.
Parameters: Parameters:
name(str): Name of the record event name(str): Name of the record event
event_type(TracerEventType): Type of the record event, can be used for statistics.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
import paddle.profiler as profiler
# method1: using context manager
with profiler.RecordEvent("record_add"):
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 + data2
# method2: call begin() and end()
record_event = profiler.RecordEvent("record_add")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 + data2
record_event.end()
Note:
RecordEvent will take effect only when profiler is on and at the state of RECORD.
""" """
def __init__(self, def __init__(self,
...@@ -57,6 +72,20 @@ class RecordEvent(ContextDecorator): ...@@ -57,6 +72,20 @@ class RecordEvent(ContextDecorator):
self.end() self.end()
def begin(self): def begin(self):
r"""
Record the time of begining.
.. code-block:: python
import paddle
import paddle.profiler as profiler
record_event = profiler.RecordEvent("record_sub")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 - data2
record_event.end()
"""
if self.event_type not in _AllowedEventTypeList: if self.event_type not in _AllowedEventTypeList:
warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
can be recorded.".format(*_AllowedEventTypeList)) can be recorded.".format(*_AllowedEventTypeList))
...@@ -67,10 +96,51 @@ class RecordEvent(ContextDecorator): ...@@ -67,10 +96,51 @@ class RecordEvent(ContextDecorator):
self.event = _RecordEvent(self.name, self.event_type) self.event = _RecordEvent(self.name, self.event_type)
def end(self): def end(self):
r'''
Record the time of ending.
.. code-block:: python
import paddle
import paddle.profiler as profiler
record_event = profiler.RecordEvent("record_mul")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 * data2
record_event.end()
'''
if self.event: if self.event:
self.event.end() self.event.end()
def load_profiler_result(filename: str):
r"""
Load dumped profiler data back to memory.
Parameters:
filename(str): Name of the exported protobuf file of profiler data.
Returns:
ProfilerResult object.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 10)) as p:
for iter in range(10):
#train()
p.step()
p.export('test_export_protobuf.pb', format='pb')
profiler_result = profiler.load_profiler_result('test_export_protobuf.pb')
"""
return core.load_profiler_result(filename)
def wrap_optimizers(): def wrap_optimizers():
def optimizer_warpper(func): def optimizer_warpper(func):
@functools.wraps(func) @functools.wraps(func)
......
...@@ -119,10 +119,11 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): ...@@ -119,10 +119,11 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
f'Unexpected hop_length: {hop_length}. It should be an positive integer.' f'Unexpected hop_length: {hop_length}. It should be an positive integer.'
) )
    if in_dygraph_mode():
        if frame_length > x.shape[axis]:
            raise ValueError(
                f'Attribute frame_length should be less equal than sequence length, '
                f'but got ({frame_length}) > ({x.shape[axis]}).')
op_type = 'frame' op_type = 'frame'
...@@ -306,8 +307,7 @@ def stft(x, ...@@ -306,8 +307,7 @@ def stft(x,
y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372]
""" """
    check_variable_and_dtype(
        x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft')
x_rank = len(x.shape) x_rank = len(x.shape)
assert x_rank in [1, 2], \ assert x_rank in [1, 2], \
...@@ -325,8 +325,9 @@ def stft(x, ...@@ -325,8 +325,9 @@ def stft(x,
if win_length is None: if win_length is None:
win_length = n_fft win_length = n_fft
    if in_dygraph_mode():
        assert 0 < n_fft <= x.shape[-1], \
            f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
assert 0 < win_length <= n_fft, \ assert 0 < win_length <= n_fft, \
f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.'
...@@ -359,7 +360,7 @@ def stft(x, ...@@ -359,7 +360,7 @@ def stft(x,
x_frames = x_frames.transpose( x_frames = x_frames.transpose(
perm=[0, 2, perm=[0, 2,
1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
x_frames = x_frames * window x_frames = paddle.multiply(x_frames, window)
norm = 'ortho' if normalized else 'backward' norm = 'ortho' if normalized else 'backward'
if is_complex(x_frames): if is_complex(x_frames):
...@@ -495,18 +496,22 @@ def istft(x, ...@@ -495,18 +496,22 @@ def istft(x,
n_frames = x.shape[-1] n_frames = x.shape[-1]
fft_size = x.shape[-2] fft_size = x.shape[-2]
    if in_dygraph_mode():
        if onesided:
            assert (fft_size == n_fft // 2 + 1), \
                'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
        else:
            assert (fft_size == n_fft), \
                'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size)

    if window is not None:
        assert len(window.shape) == 1 and len(window) == win_length, \
            'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape)
    else:
        window_dtype = paddle.float32 if x.dtype in [
            paddle.float32, paddle.complex64
        ] else paddle.float64
        window = paddle.ones(shape=(win_length, ), dtype=window_dtype)
if win_length < n_fft: if win_length < n_fft:
pad_left = (n_fft - win_length) // 2 pad_left = (n_fft - win_length) // 2
...@@ -534,15 +539,15 @@ def istft(x, ...@@ -534,15 +539,15 @@ def istft(x,
x = x[:, :, :n_fft // 2 + 1] x = x[:, :, :n_fft // 2 + 1]
out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
    out = paddle.multiply(out, window).transpose(
        perm=[0, 2, 1])  # (batch, n_fft, num_frames)
    out = overlap_add(
        x=out, hop_length=hop_length, axis=-1)  # (batch, seq_length)

    window_envelop = overlap_add(
        x=paddle.tile(
            x=paddle.multiply(window, window).unsqueeze(0),
            repeat_times=[n_frames, 1]).transpose(
                perm=[1, 0]),  # (n_fft, num_frames)
        hop_length=hop_length,
        axis=-1)  # (seq_length, )
...@@ -561,7 +566,7 @@ def istft(x, ...@@ -561,7 +566,7 @@ def istft(x,
window_envelop = window_envelop[start:start + length] window_envelop = window_envelop[start:start + length]
# Check whether the Nonzero Overlap Add (NOLA) constraint is met. # Check whether the Nonzero Overlap Add (NOLA) constraint is met.
if window_envelop.abs().min().item() < 1e-11: if in_dygraph_mode() and window_envelop.abs().min().item() < 1e-11:
raise ValueError( raise ValueError(
'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).' 'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).'
) )
......
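The NOLA guard above can be anticipated before calling istft; the hedged sketch below checks the same window / n_fft / hop_length combination with scipy.signal.check_NOLA (scipy is only referenced here for verification, it is not used by the Paddle code itself).

import numpy as np
from scipy.signal import check_NOLA

n_fft = 512
hop_length = 128
window = np.hanning(n_fft)

# noverlap is the number of samples shared by consecutive frames
print(check_NOLA(window, nperseg=n_fft, noverlap=n_fft - hop_length))  # True for this setup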
...@@ -147,7 +147,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None): ...@@ -147,7 +147,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
var_names = {'x': x, 'y': y} var_names = {'x': x, 'y': y}
for name, val in var_names.items(): for name, val in var_names.items():
        check_variable_and_dtype(
            val, name,
            ['float16', 'float32', 'float64', 'complex64', 'complex128'],
            'matmul')
__check_input(x, y) __check_input(x, y)
......
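A hedged usage sketch for the widened dtype check above: matmul on complex inputs, which the added 'complex64'/'complex128' entries are meant to admit.

import paddle

a = paddle.to_tensor([[1 + 2j, 3 + 4j]], dtype="complex64")    # shape [1, 2]
b = paddle.to_tensor([[5 + 6j], [7 + 8j]], dtype="complex64")  # shape [2, 1]
print(paddle.matmul(a, b))                                     # shape [1, 1] complex result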
...@@ -243,8 +243,8 @@ def add(x, y, name=None): ...@@ -243,8 +243,8 @@ def add(x, y, name=None):
""" """
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
#if _in_eager_mode(): if _in_eager_mode():
#return _C_ops.final_state_add(x, y) return _C_ops.final_state_add( x, y)
return _C_ops.elementwise_add(x, y) return _C_ops.elementwise_add(x, y)
return _elementwise_op(LayerHelper('elementwise_add', **locals())) return _elementwise_op(LayerHelper('elementwise_add', **locals()))
...@@ -324,8 +324,8 @@ def subtract(x, y, name=None): ...@@ -324,8 +324,8 @@ def subtract(x, y, name=None):
axis = -1 axis = -1
act = None act = None
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
# if _in_eager_mode(): if _in_eager_mode():
# return _C_ops.final_state_subtract( x, y) return _C_ops.final_state_subtract(x, y)
return _elementwise_op_in_dygraph( return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type) x, y, axis=axis, act=act, op_name=op_type)
return _elementwise_op(LayerHelper(op_type, **locals())) return _elementwise_op(LayerHelper(op_type, **locals()))
...@@ -383,6 +383,8 @@ def divide(x, y, name=None): ...@@ -383,6 +383,8 @@ def divide(x, y, name=None):
axis = -1 axis = -1
act = None act = None
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
if _in_eager_mode():
return _C_ops.final_state_divide( x, y)
return _elementwise_op_in_dygraph( return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type) x, y, axis=axis, act=act, op_name=op_type)
...@@ -512,6 +514,8 @@ def multiply(x, y, name=None): ...@@ -512,6 +514,8 @@ def multiply(x, y, name=None):
axis = -1 axis = -1
if paddle.in_dynamic_mode(): if paddle.in_dynamic_mode():
if _in_eager_mode():
return _C_ops.final_state_multiply(x, y)
return _elementwise_op_in_dygraph( return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type) x, y, axis=axis, act=act, op_name=op_type)
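From the public API side the eager-mode dispatch above is transparent: the same elementwise calls are expected to route either to the final_state kernels (eager) or to the legacy _C_ops path. A minimal sketch:

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0])
y = paddle.to_tensor([4.0, 5.0, 6.0])
print(paddle.add(x, y))       # [5., 7., 9.]
print(paddle.subtract(x, y))  # [-3., -3., -3.]
print(paddle.multiply(x, y))  # [4., 10., 18.]
print(paddle.divide(x, y))    # [0.25, 0.4, 0.5]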
...@@ -3801,13 +3805,13 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): ...@@ -3801,13 +3805,13 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
attrs_1 += ('starts', starts_1) attrs_1 += ('starts', starts_1)
ends_1 = [dim_len - 1] ends_1 = [dim_len - 1]
attrs_1 += ('ends', ends_1) attrs_1 += ('ends', ends_1)
input_front = _C_ops.slice(new_input, None, None, 'axes', axes, \ input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
'infer_flags', infer_flags, *attrs_1) 'infer_flags', infer_flags, *attrs_1)
starts_2 = [1] starts_2 = [1]
attrs_2 += ('starts', starts_2) attrs_2 += ('starts', starts_2)
ends_2 = [dim_len] ends_2 = [dim_len]
attrs_2 += ('ends', ends_2) attrs_2 += ('ends', ends_2)
input_back = _C_ops.slice(new_input, None, None, 'axes', axes, \ input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
'infer_flags', infer_flags, *attrs_2) 'infer_flags', infer_flags, *attrs_2)
if x.dtype == paddle.bool: if x.dtype == paddle.bool:
......
...@@ -317,7 +317,7 @@ def tensor_to_string(tensor, prefix='Tensor'): ...@@ -317,7 +317,7 @@ def tensor_to_string(tensor, prefix='Tensor'):
_template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})" _template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"
if not tensor._is_initialized(): if not tensor._is_dense_tensor_hold_allocation():
return "Tensor(Not initialized)" return "Tensor(Not initialized)"
if tensor.is_sparse(): if tensor.is_sparse():
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
func : ElementwiseInferMeta func : ElementwiseInferMeta
kernel : kernel :
func : add func : add
# backward : add_grad backward : add_grad
- api : cast - api : cast
args : (Tensor x, DataType out_dtype) args : (Tensor x, DataType out_dtype)
...@@ -47,6 +47,7 @@ ...@@ -47,6 +47,7 @@
func : ElementwiseInferMeta func : ElementwiseInferMeta
kernel : kernel :
func : divide func : divide
backward : divide_grad
- api : dot - api : dot
args : (Tensor x, Tensor y) args : (Tensor x, Tensor y)
...@@ -136,6 +137,7 @@ ...@@ -136,6 +137,7 @@
func : ElementwiseInferMeta func : ElementwiseInferMeta
kernel : kernel :
func : multiply func : multiply
backward : multiply_grad
- api : ones_like - api : ones_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={}) args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
...@@ -208,6 +210,7 @@ ...@@ -208,6 +210,7 @@
func : ElementwiseInferMeta func : ElementwiseInferMeta
kernel : kernel :
func : subtract func : subtract
backward : subtract_grad
- api : sum - api : sum
args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false) args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
...@@ -1314,7 +1317,7 @@ ...@@ -1314,7 +1317,7 @@
func : AdamaxInferMeta func : AdamaxInferMeta
kernel : kernel :
func : adamax func : adamax
- api : where - api : where
...@@ -1370,7 +1373,7 @@ ...@@ -1370,7 +1373,7 @@
func : CompareInferMeta func : CompareInferMeta
kernel : kernel :
func : equal func : equal
- api : not_equal - api : not_equal
args : (Tensor x, Tensor y, int axis = -1) args : (Tensor x, Tensor y, int axis = -1)
output : Tensor output : Tensor
......
...@@ -25,10 +25,9 @@ ...@@ -25,10 +25,9 @@
output : Tensor(x_grad) output : Tensor(x_grad)
invoke : scale(out_grad, scale, bias, bias_after_scale) invoke : scale(out_grad, scale, bias, bias_after_scale)
- backward_api : add_grad - backward_api : add_grad
forward : add (Tensor x, Tensor y) -> Tensor(out) forward : add (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad) output : Tensor(x_grad), Tensor(y_grad)
infer_meta : infer_meta :
func : GeneralBinaryGradInferMeta func : GeneralBinaryGradInferMeta
...@@ -36,6 +35,37 @@ ...@@ -36,6 +35,37 @@
kernel : kernel :
func : add_grad func : add_grad
- backward_api : subtract_grad
forward : subtract (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : subtract_grad
- backward_api : multiply_grad
forward : multiply (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : multiply_grad
- backward_api : divide_grad
forward : divide (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : divide_grad
- backward_api : digamma_grad - backward_api : digamma_grad
forward : digamma (Tensor x) -> Tensor(out) forward : digamma (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out_grad) args : (Tensor x, Tensor out_grad)
...@@ -490,7 +520,7 @@ ...@@ -490,7 +520,7 @@
# param : [out, out_grad, axis] # param : [out, out_grad, axis]
# kernel : # kernel :
# func : gumbel_softmax_grad # func : gumbel_softmax_grad
- backward_api : transpose_grad - backward_api : transpose_grad
forward : transpose (Tensor x, int[] axis) -> Tensor(out) forward : transpose (Tensor x, int[] axis) -> Tensor(out)
...@@ -501,7 +531,7 @@ ...@@ -501,7 +531,7 @@
param : [out_grad, axis] param : [out_grad, axis]
kernel : kernel :
func : transpose_grad func : transpose_grad
# - backward_api : lerp_grad # - backward_api : lerp_grad
# forward : transpose (Tensor x, Tensor y, Tensor weight) -> Tensor(out) # forward : transpose (Tensor x, Tensor y, Tensor weight) -> Tensor(out)
# args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad) # args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad)
......
...@@ -733,7 +733,7 @@ with redirect_stdout(): ...@@ -733,7 +733,7 @@ with redirect_stdout():
}, },
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'fleetrun = paddle.distributed.launch.__main__:launch' 'fleetrun = paddle.distributed.launch.main:launch'
] ]
}, },
classifiers=[ classifiers=[
......