Commit da478d1e authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

......@@ -258,6 +258,12 @@ copy(inference_lib_dist
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
......
......@@ -39,7 +39,8 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
}
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeAccumulation::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT
bool create_graph) {
VLOG(3) << "Running Eager Backward Node: GradNodeAccumulation";
PADDLE_ENFORCE(grads.size() == 1,
......
......@@ -35,7 +35,7 @@ class GradNodeAccumulation : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
......
......@@ -145,7 +145,8 @@ void GradNodeScale::SetTensorWrappers_X(
void GradNodeScale::SetAttributes_scale(float scale) { scale_ = scale; }
std::vector<std::vector<paddle::experimental::Tensor>> GradNodeScale::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
operator()(
std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT
bool create_graph) {
// 1. Check Output Size
PADDLE_ENFORCE(
......
......@@ -39,7 +39,7 @@ class GradNodeScale : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT
bool create_graph = false) override;
void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; }
......
......@@ -47,6 +47,9 @@ std::unordered_map<std::string, std::vector<std::string>>
static std::unordered_map<std::string, paddle::framework::AttributeMap>
operators_with_attrs = {};
static std::unordered_set<std::string> ops_to_fill_zero_for_empty_grads = {
"split"};
/* --- Black Ops list that does NOT need code generation --- */
static std::unordered_set<std::string> black_ops_list = {"run_program"};
......@@ -2243,11 +2246,21 @@ static std::string GenerateGradNodeCCContents(
// [Generation] Get Full Grad Function
const char* GRAD_FUNCTION_TEMPLATE =
"std::vector<std::vector<paddle::experimental::Tensor>> "
"GradNode%s::operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, "
"bool create_graph) {\n%s\n}";
std::string grad_function_str = paddle::string::Sprintf(
GRAD_FUNCTION_TEMPLATE, fwd_op_type, generated_grad_function_body);
"GradNode%s::operator()("
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
"create_graph) {\n"
"%s"
"%s"
"\n}";
std::string fill_zero_str = "";
if (ops_to_fill_zero_for_empty_grads.count(fwd_op_type)) {
fill_zero_str =
"egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, "
"this->InputMeta());\n";
}
std::string grad_function_str =
paddle::string::Sprintf(GRAD_FUNCTION_TEMPLATE, fwd_op_type,
fill_zero_str, generated_grad_function_body);
VLOG(6) << "Generated returns";
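For ops listed in ops_to_fill_zero_for_empty_grads (currently just "split"), the Sprintf above would expand roughly to the sketch below; the class-name casing follows the generator's GradNode%s pattern and the body is elided, so this is illustrative rather than the exact generated source:

std::vector<std::vector<paddle::experimental::Tensor>> GradNodesplit::operator()(
    std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {
  egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());
  // ... generated grad computation body ...
}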
......@@ -2279,9 +2292,9 @@ static std::string GenerateGradNodeHeaderContents(
" ~GradNode%s() override { VLOG(6) << \" Destruct GradNode%s \"; }\n"
"\n"
" virtual std::vector<std::vector<paddle::experimental::Tensor>> "
"operator()(const "
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, const "
"bool create_graph = false) "
"operator()("
"std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool "
"create_graph = false) "
"override;\n"
"\n"
" void ClearTensorWrappers() override { \n"
......
......@@ -17,6 +17,8 @@ import re
import argparse
import os
ops_to_fill_zero_for_empty_grads = set(["split"])
# For API dispatch used at python-level
# { op_name : [arg_name, ...] }
core_ops_returns_info = {}
......@@ -599,7 +601,8 @@ class {} : public egr::GradNodeBase {{
~{}() override = default;
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph = false) override;
std::string name() override {{ return \" {} \"; }}
void ClearTensorWrappers() override {{
......@@ -657,10 +660,11 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
for _, (ttype, fwd_position,
grad_api_position) in backward_grad_input_map.items():
if IsPlainTensorType(ttype):
grad_api_args[grad_api_position] = f"grads[{fwd_position}][0]"
grad_api_args[
grad_api_position] = f"hooked_grads[{fwd_position}][0]"
else:
assert IsVectorTensorType(ttype)
grad_api_args[grad_api_position] = f"grads[{fwd_position}]"
grad_api_args[grad_api_position] = f"hooked_grads[{fwd_position}]"
for name, _, _, grad_api_position in backward_attrs_list:
saved_attribute_name = GetSavedName(name)
......@@ -688,23 +692,30 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map,
grad_node_name = GetGradNodeName(fwd_api_name)
fill_zero_str = ""
if fwd_api_name in ops_to_fill_zero_for_empty_grads:
fill_zero_str = "egr::EagerUtils::FillZeroForEmptyGradInputs(&grads, this->InputMeta());\n"
if len(namespace) > 0:
grad_api_namespace = f"paddle::experimental::{namespace}"
else:
grad_api_namespace = f"paddle::experimental"
FUNCTION_TEMPLATE = """
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
std::vector<std::vector<paddle::experimental::Tensor>> {}::operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads, bool create_graph) {{
{}
auto hooked_grads = ApplyGradientHooks(grads);
// Call grad_api function
VLOG(3) << \"Finally State Running: \" << \"{}\";
VLOG(3) << \"Final State Running: \" << \"{}\";
auto grad_api_returns = {}::{}({});
{}
}}
"""
node_definition_str = FUNCTION_TEMPLATE.format(
grad_node_name, grad_node_name, grad_api_namespace, bwd_api_name,
grad_api_args_str, returns_str)
grad_node_name, fill_zero_str, grad_node_name, grad_api_namespace,
bwd_api_name, grad_api_args_str, returns_str)
return node_definition_str
......@@ -799,8 +810,15 @@ def GenerateNodeCreationCodes(
# SetAttributes
set_attributes_list = []
for name, _, _, _ in backward_attrs_list:
forward_attrs_name_set = set()
for name, _, _, _ in forward_attrs_list:
forward_attrs_name_set.add(name)
for name, _, default_val_attr, _ in backward_attrs_list:
if name in forward_attrs_name_set:
set_attributes = f" grad_node->SetAttribute{name}({name});"
else:
set_attributes = f" grad_node->SetAttribute{name}({default_val_attr});"
set_attributes_list.append(set_attributes)
set_attributes_str = "\n".join(set_attributes_list)
......
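To make the branch above concrete, assume a hypothetical op whose backward takes an attr axis that also exists in the forward API and an attr use_addto that is backward-only with a recorded default of false; the loop would then emit lines like:

  grad_node->SetAttributeaxis(axis);
  grad_node->SetAttributeuse_addto(false);

The f-string concatenates the attr name directly onto SetAttribute, so these method names come from the codegen rather than from a hand-written API.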
......@@ -20,8 +20,8 @@
namespace egr {
std::vector<std::vector<paddle::experimental::Tensor>> RunCustomOpNode::
operator()(const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) {
operator()(std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) { // NOLINT
paddle::CustomOpKernelContext ctx;
auto grad_inputs_name = paddle::framework::OpMetaInfoHelper::GetInputs(
egr::Controller::Instance().GetOpMetaInfoMap().at(op_type_)[1]);
......
......@@ -37,8 +37,9 @@ class RunCustomOpNode : public GradNodeBase {
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph) override;
std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) // NOLINT
override;
std::string name() {
return paddle::string::Sprintf("RunCustomOpNode: %s_grad", op_type_);
......
......@@ -102,6 +102,7 @@ const std::vector<std::vector<GradSlotMeta>>& GradNodeBase::OutputMeta() const {
void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
size_t slot_rank) {
VLOG(6) << "Set GradSlotMeta for Grad Inputs";
auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out);
PADDLE_ENFORCE_LE(
slot_rank, (bwd_in_meta_.size() - 1),
......@@ -117,6 +118,12 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
auto& meta = metas[0];
meta.SetStopGradient(fwd_out_meta->StopGradient());
if (!fwd_out.is_initialized()) {
VLOG(6)
<< "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
return;
}
// Record TensorMeta
if (phi::DenseTensor::classof(fwd_out.impl().get())) {
// Only Copy Meta
......@@ -128,7 +135,9 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
paddle::platform::errors::Fatal(
"Attempting to copy DenseTensorMeta with phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_out.inner_place());
if (paddle::framework::IsComplexType(
paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
......@@ -143,6 +152,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
void GradNodeBase::SetGradInMeta(
const std::vector<paddle::experimental::Tensor>& fwd_out,
size_t slot_rank) {
VLOG(6) << "Set GradSlotMeta for Grad Inputs";
size_t slot_size = fwd_out.size();
PADDLE_ENFORCE_LE(
slot_rank, (bwd_in_meta_.size() - 1),
......@@ -172,6 +182,12 @@ void GradNodeBase::SetGradInMeta(
meta.SetStopGradient(fwd_out_meta->StopGradient());
}
if (!fwd_out_tensor.is_initialized()) {
VLOG(6)
<< "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor";
return;
}
// Record TensorMeta
if (phi::DenseTensor::classof(fwd_out_tensor.impl().get())) {
// Only Copy Meta
......@@ -184,6 +200,8 @@ void GradNodeBase::SetGradInMeta(
"with phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_out_tensor.inner_place());
if (paddle::framework::IsComplexType(
paddle::framework::TransToProtoVarType(dense_tensor->type()))) {
need_complex_to_real_ = true;
......@@ -228,6 +246,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in,
"with phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_in.inner_place());
}
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
......@@ -272,6 +291,7 @@ void GradNodeBase::SetGradOutMeta(
"phi::DataType::UNDEFINED,"
"which is illegal."));
meta.SetTensorMeta(dense_tensor->meta());
meta.SetPlace(fwd_in_tensor.inner_place());
}
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta "
......
......@@ -76,8 +76,12 @@ class GradSlotMeta {
return *meta_.get();
}
void SetPlace(const phi::Place& place) { place_ = place; }
const phi::Place& GetPlace() const { return place_; }
private:
bool stop_gradient_{false};
phi::Place place_;
std::shared_ptr<phi::DenseTensorMeta> meta_ = nullptr;
};
......@@ -102,7 +106,7 @@ class GradNodeBase {
* is better choice to fit this format.
* **/
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
std::vector<std::vector<paddle::experimental::Tensor>>& grads, // NOLINT
bool create_graph = false) = 0;
virtual void ClearTensorWrappers() = 0;
......
......@@ -53,7 +53,7 @@ class GradTensorHolder {
return buffer_[pos];
}
const std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
std::vector<std::vector<paddle::experimental::Tensor>>& Buffers() {
return buffer_;
}
......
......@@ -80,13 +80,15 @@ TEST(AccumulationNode, Tensor) {
grad_meta->SetStopGradient(false);
// operator()
paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0];
std::vector<std::vector<paddle::experimental::Tensor>> et0_vec = {{et0}};
paddle::experimental::Tensor ret_et0 = node->operator()(et0_vec)[0][0];
auto* ret_et0_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et0.impl())
->data<paddle::platform::float16>();
CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f));
paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0];
std::vector<std::vector<paddle::experimental::Tensor>> et1_vec = {{et1}};
paddle::experimental::Tensor ret_et1 = node->operator()(et1_vec)[0][0];
auto* ret_et1_ptr =
std::dynamic_pointer_cast<phi::DenseTensor>(ret_et1.impl())
......@@ -121,7 +123,7 @@ TEST(AccumulationNode, Tensor) {
std::make_shared<egr::CppTensorVoidHook>(reduce_hook_1));
// operator()
paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0];
paddle::experimental::Tensor _ret = node->operator()(et0_vec)[0][0];
// Check operator() result, should be 36.0
auto* _ret_ptr = std::dynamic_pointer_cast<phi::DenseTensor>(_ret.impl())
......
......@@ -32,7 +32,7 @@ class GradTestNode : public egr::GradNodeBase {
GradTestNode() : GradNodeBase() { val_ = 1.0; }
std::string name() override { return "GradTestNode"; }
std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>>& grads,
std::vector<std::vector<paddle::experimental::Tensor>>& grads,
bool create_graph = false) override {
val_ = std::dynamic_pointer_cast<phi::DenseTensor>(grads[0][0].impl())
->data<float>()[0];
......
......@@ -247,4 +247,20 @@ TEST(EagerUtils, GetGradAccumulationNode) {
ASSERT_ANY_THROW(egr::EagerUtils::GetGradAccumulationNode(t0));
}
TEST(EagerUtils, FillZeroForEmptyGradInputs) {
std::vector<std::vector<paddle::experimental::Tensor>> grads = {
std::vector<paddle::experimental::Tensor>(1)};
std::vector<std::vector<GradSlotMeta>> slot_metas = {
std::vector<GradSlotMeta>(1)};
phi::DenseTensorMeta tensor_meta;
tensor_meta.dtype = paddle::experimental::DataType::FLOAT32;
tensor_meta.dims = {2, 4};
slot_metas[0][0].SetTensorMeta(tensor_meta);
slot_metas[0][0].SetPlace(phi::CPUPlace());
EagerUtils::FillZeroForEmptyGradInputs(&grads, slot_metas);
eager_test::CompareTensorWithValue<float>(grads[0][0], 0.0);
}
} // namespace egr
......@@ -370,7 +370,7 @@ class GradNodeRunProgram : public egr::GradNodeBase {
~GradNodeRunProgram() override = default;
// Functor: perform backward computations
virtual std::vector<std::vector<paddle::experimental::Tensor>> operator()(
const std::vector<std::vector<paddle::experimental::Tensor>> &grads,
std::vector<std::vector<paddle::experimental::Tensor>> &grads, // NOLINT
bool create_graph) override {
VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram";
PADDLE_ENFORCE_EQ(
......
......@@ -20,6 +20,7 @@
#include "paddle/phi/api/all.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/fluid/framework/data_layout.h"
......@@ -392,4 +393,28 @@ std::shared_ptr<egr::GradNodeBase> EagerUtils::GetGradAccumulationNode(
}
}
void EagerUtils::FillZeroForEmptyGradInputs(
std::vector<std::vector<paddle::experimental::Tensor>>* in_grads,
const std::vector<std::vector<GradSlotMeta>>& grad_in_metas) {
for (size_t i = 0; i < in_grads->size(); i++) {
for (size_t j = 0; j < (*in_grads)[0].size(); j++) {
paddle::experimental::Tensor& grad = (*in_grads)[i][j];
if (!grad.is_initialized()) {
const GradSlotMeta& grad_in_meta = grad_in_metas[i][j];
PADDLE_ENFORCE(
grad_in_meta.HasTensorMeta(),
paddle::platform::errors::Fatal(
"Unable to fill empty grad inputs due to empty GradSlotMeta"));
const auto& tensor_meta = grad_in_meta.GetTensorMeta();
phi::Place place = grad_in_meta.GetPlace();
auto tensor_with_zero = paddle::experimental::full(
phi::vectorize(tensor_meta.dims), 0.0, tensor_meta.dtype, place);
grad.set_impl(tensor_with_zero.impl());
}
}
}
}
} // namespace egr
......@@ -217,6 +217,13 @@ class EagerUtils {
const std::vector<paddle::experimental::Tensor>& tensors);
static std::shared_ptr<egr::GradNodeBase> GetGradAccumulationNode(
const paddle::experimental::Tensor& tensor);
/**
* Fill Zero
* **/
static void FillZeroForEmptyGradInputs(
std::vector<std::vector<paddle::experimental::Tensor>>* out_grads,
const std::vector<std::vector<GradSlotMeta>>& grad_out_metas);
};
} // namespace egr
......@@ -176,6 +176,20 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap<VarType>& ins,
const std::map<std::string, std::string>& inplace_map,
paddle::framework::AttributeMap* passed_default_attrs_,
bool use_default_attr_map) {
TraceOpImpl<VarType>(type, ins, outs, attrs, place, trace_backward,
inplace_map, passed_default_attrs_,
use_default_attr_map);
}
template <typename VarType>
void Tracer::TraceOpImpl(const std::string& type,
const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs,
framework::AttributeMap& attrs,
const platform::Place& place, bool trace_backward,
const std::map<std::string, std::string>& inplace_map,
paddle::framework::AttributeMap* passed_default_attrs_,
bool use_default_attr_map) {
platform::RecordEvent op_type_record_event(
type + " trace_op", platform::TracerEventType::Operator, 1);
platform::ScopedFlushDenormal flush;
......@@ -340,25 +354,33 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs,
paddle::framework::AttributeMap attrs,
paddle::framework::AttributeMap& attrs,
const paddle::platform::Place& place,
paddle::framework::AttributeMap* default_attrs,
bool use_default_attr_map,
const std::map<std::string, std::string>& inplace_map) {
VLOG(6) << "Running On Eager TraceOp with use_default_attr_map: "
<< use_default_attr_map;
TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs), place, false,
inplace_map, default_attrs, use_default_attr_map);
TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, place, false,
inplace_map, default_attrs,
use_default_attr_map);
}
void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs,
paddle::framework::AttributeMap attrs) {
VLOG(6) << "Running On Eager TraceOp(4 args): ";
TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
false, {}, nullptr, true);
}
void Tracer::TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs,
paddle::framework::AttributeMap attrs,
paddle::framework::AttributeMap& attrs,
const std::map<std::string, std::string>& inplace_map) {
VLOG(6) << "Running On Eager TraceOp(less): ";
TraceOp<egr::EagerVariable>(type, ins, outs, std::move(attrs),
expected_place_, false, inplace_map, nullptr,
true);
TraceOpImpl<egr::EagerVariable>(type, ins, outs, attrs, expected_place_,
false, inplace_map, nullptr, true);
}
void Tracer::SetExpectedPlace(platform::Place place) {
......
......@@ -74,16 +74,32 @@ class Tracer {
paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
bool use_default_attr_map = true);
template <typename VarType>
void TraceOpImpl(
const std::string& type, const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs,
framework::AttributeMap& attrs, // NOLINT
const platform::Place& place, bool trace_backward,
const std::map<std::string, std::string>& inplace_map = {},
paddle::framework::AttributeMap* passed_default_attrs_ = nullptr,
bool use_default_attr_map = true);
void TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const std::map<std::string, std::string>& inplace_map = {});
void TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
const NameTensorMap& outs,
paddle::framework::AttributeMap& attrs, // NOLINT
const std::map<std::string, std::string>& inplace_map = {});
void TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs, paddle::framework::AttributeMap attrs,
const NameTensorMap& outs,
paddle::framework::AttributeMap attrs);
void TraceOp(const std::string& type, const NameTensorMap& ins,
const NameTensorMap& outs,
paddle::framework::AttributeMap& attrs, // NOLINT
const paddle::platform::Place& place,
paddle::framework::AttributeMap* default_attrs,
bool use_default_attr_map,
......
......@@ -34,6 +34,7 @@
#include "paddle/fluid/memory/allocation/thread_local_allocator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/device/gpu/cuda/cuda_graph.h"
......@@ -210,13 +211,28 @@ class AllocatorFacadePrivate {
InitNaiveBestFitCPUAllocator();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
allow_free_idle_chunk_ = allow_free_idle_chunk;
if (!FLAGS_use_stream_safe_cuda_allocator) {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount();
++dev_id) {
for (int dev_id = 0; dev_id < platform::GetGPUDeviceCount(); ++dev_id) {
InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id),
allow_free_idle_chunk_);
}
// Note(Ruibiao): For the GPU multi-stream case, the 'allocators_' map
// (place -> Allocator) holds the StreamSafeCUDAAllocator related to the
// default stream (i.e., the stream directly got from the DeviceContext),
// while the 'cuda_allocators_' map (place -> map(stream -> Allocator))
// holds the StreamSafeCUDAAllocators related to non-default streams
// (i.e., the streams users pass in). The default-stream Allocator is
// built during construction of AllocatorFacadePrivate, while the
// non-default-stream ones are built in a delayed manner in the
// GetAllocator function with 'create_if_not_found = true'. We treat the
// default stream specially for performance reasons: since most Alloc
// calls in applications are for the default stream, handling it
// separately avoids much of the overhead of acquiring the default stream
// and taking the read-write lock.
if (FLAGS_use_stream_safe_cuda_allocator) {
WrapStreamSafeCUDAAllocatorForDefault();
}
InitNaiveBestFitCUDAPinnedAllocator();
#endif
#ifdef PADDLE_WITH_ASCEND_CL
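A minimal sketch of the two-level lookup that the note above describes, with illustrative member and function names rather than the class's actual declarations:

// Default-stream allocators: plain map, read without locking.
std::map<platform::Place, std::shared_ptr<Allocator>> allocators_;
// Non-default-stream allocators: created lazily, guarded by a read-write lock.
std::map<platform::CUDAPlace,
         std::map<gpuStream_t, std::shared_ptr<Allocator>>> cuda_allocators_;
std::shared_timed_mutex cuda_allocator_mutex_;

const std::shared_ptr<Allocator>& Lookup(const platform::CUDAPlace& place,
                                         const gpuStream_t& stream) {
  if (stream == GetDefaultStream(place)) {
    return allocators_.at(place);  // fast path: no lock acquired
  }
  std::shared_lock<std::shared_timed_mutex> guard(cuda_allocator_mutex_);
  return cuda_allocators_.at(place).at(stream);  // built on miss elsewhere
}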
......@@ -301,7 +317,8 @@ class AllocatorFacadePrivate {
CheckAllocThreadSafe();
#ifdef PADDLE_WITH_CUDA
if (UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
if (FLAGS_use_stream_safe_cuda_allocator == false &&
UNLIKELY(platform::CUDAGraph::IsThisThreadCapturing())) {
WrapCUDAGraphAllocator();
}
#endif
......@@ -341,7 +358,12 @@ class AllocatorFacadePrivate {
const std::shared_ptr<Allocator>& GetAllocator(
const platform::CUDAPlace& place, const gpuStream_t& stream,
bool create_if_not_found = false) {
{ // shared_lock_guard
if (stream == GetDefaultStream(place)) {
VLOG(7) << "Get Allocator by passing in a default stream";
return GetAllocator(place, /* A non-zero num to choose allocator_ */ 1);
}
/* shared_lock_guard */ {
std::shared_lock<std::shared_timed_mutex> lock_guard(
cuda_allocator_mutex_);
if (LIKELY(HasCUDAAllocator(place, stream))) {
......@@ -355,7 +377,7 @@ class AllocatorFacadePrivate {
}
}
{ // unique_lock_guard
/* unique_lock_guard */ {
std::unique_lock<std::shared_timed_mutex> lock_guard(
cuda_allocator_mutex_);
InitStreamSafeCUDAAllocator(place, stream);
......@@ -363,9 +385,40 @@ class AllocatorFacadePrivate {
}
}
gpuStream_t GetDefaultStream(const platform::CUDAPlace& place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
return static_cast<platform::CUDADeviceContext*>(pool.Get(place))->stream();
const std::shared_ptr<StreamSafeCUDAAllocator>
GetDefaultStreamSafeCUDAAllocator(const platform::CUDAPlace& place) const {
const auto iter = default_stream_safe_cuda_allocators_.find(place);
PADDLE_ENFORCE_NE(
iter, default_stream_safe_cuda_allocators_.end(),
platform::errors::NotFound(
"No StreamSafeCUDAAllocator found for the place, %s", place));
return iter->second;
}
const gpuStream_t& GetDefaultStream(const platform::CUDAPlace& place) const {
const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
GetDefaultStreamSafeCUDAAllocator(place);
return allocator->GetDefaultStream();
}
void SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
const std::shared_ptr<StreamSafeCUDAAllocator>& allocator =
GetDefaultStreamSafeCUDAAllocator(place);
allocator->SetDefaultStream(stream);
VLOG(8) << "Set default stream to " << stream
<< " for StreamSafeCUDAAllocator(" << allocator.get() << ") in "
<< place;
}
void SetDefaultStreamFromDeviceContext() {
VLOG(8) << "Set default stream from DeviceContext";
for (auto& pair : default_stream_safe_cuda_allocators_) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
pair.second->SetDefaultStream(
static_cast<phi::GPUContext*>(pool.Get(pair.first))->stream());
}
}
void RecordStream(std::shared_ptr<phi::Allocation> allocation,
......@@ -635,6 +688,26 @@ class AllocatorFacadePrivate {
/* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
}
void WrapStreamSafeCUDAAllocatorForDefault() {
for (auto& pair : allocators_) {
auto& place = pair.first;
if (platform::is_gpu_place(place)) {
std::shared_ptr<StreamSafeCUDAAllocator>&& allocator =
std::make_shared<StreamSafeCUDAAllocator>(
pair.second, place, /* default_stream = */ nullptr,
/* in_cuda_graph_capturing = */ !allow_free_idle_chunk_);
pair.second = allocator;
// NOTE(Ruibiao): A tricky implementation that gives StreamSafeCUDAAllocator
// the ability to interact with the outside world, i.e., to have its
// default stream changed from outside
default_stream_safe_cuda_allocators_[place] = allocator;
VLOG(8) << "WrapStreamSafeCUDAAllocator for " << place
<< ", allocator address = " << pair.second.get();
}
}
}
void WrapCUDARetryAllocator(platform::CUDAPlace p, gpuStream_t stream,
size_t retry_time) {
PADDLE_ENFORCE_GT(
......@@ -813,7 +886,6 @@ class AllocatorFacadePrivate {
#endif
}
// NOTE(Ruibiao): Old single-stream version, will be removed later
void WrapCUDARetryAllocator(size_t retry_time) {
PADDLE_ENFORCE_GT(
retry_time, 0,
......@@ -828,6 +900,8 @@ class AllocatorFacadePrivate {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// a standalone CUDA allocator to support multi-stream GC in new executor
std::map<platform::Place, std::shared_ptr<StreamSafeCUDAAllocator>>
default_stream_safe_cuda_allocators_;
CUDAAllocatorMap cuda_allocators_;
std::shared_timed_mutex cuda_allocator_mutex_;
#endif
......@@ -870,15 +944,6 @@ AllocatorFacadePrivate* AllocatorFacade::GetPrivate() const {
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
AllocatorFacadePrivate* m = GetPrivate();
platform::CUDAPlace cuda_place(place.GetDeviceId());
return m->GetAllocator(cuda_place, m->GetDefaultStream(cuda_place));
}
#endif
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
......@@ -898,19 +963,6 @@ void* AllocatorFacade::GetBasePtr(
return GetPrivate()->GetBasePtr(allocation);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return GetPrivate()->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
#endif
const std::shared_ptr<Allocator>& AllocatorFacade::GetZeroAllocator(
const platform::Place& place) {
return GetPrivate()->GetAllocator(place, /* zero size */ 0);
......@@ -923,26 +975,10 @@ std::shared_ptr<phi::Allocation> AllocatorFacade::AllocShared(
AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
size > 0 && FLAGS_use_system_allocator == false) {
platform::CUDAPlace cuda_place(place.GetDeviceId());
phi::Stream default_stream = phi::Stream(reinterpret_cast<phi::StreamId>(
GetPrivate()->GetDefaultStream(cuda_place)));
return Alloc(cuda_place, size, default_stream);
}
#endif
return GetPrivate()->GetAllocator(place, size)->Allocate(size);
}
uint64_t AllocatorFacade::Release(const platform::Place& place) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
platform::CUDAPlace cuda_place(place.GetDeviceId());
return Release(cuda_place, GetPrivate()->GetDefaultStream(cuda_place));
}
#endif
return GetPrivate()
->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1)
->Release(place);
......@@ -1028,6 +1064,17 @@ void AllocatorFacade::RecordStream(std::shared_ptr<phi::Allocation> allocation,
GetPrivate()->RecordStream(allocation, stream);
}
const std::shared_ptr<Allocator>& AllocatorFacade::GetAllocator(
const platform::Place& place, const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator && platform::is_gpu_place(place) &&
FLAGS_use_system_allocator == false) {
return GetPrivate()->GetAllocator(place, stream,
/*create_if_not_found=*/true);
}
return GetPrivate()->GetAllocator(
place, /* A non-zero num to choose allocator_ */ 1);
}
const gpuStream_t& AllocatorFacade::GetStream(
const std::shared_ptr<phi::Allocation>& allocation) const {
PADDLE_ENFORCE_EQ(
......@@ -1040,6 +1087,13 @@ const gpuStream_t& AllocatorFacade::GetStream(
return GetPrivate()->GetStream(allocation);
}
void AllocatorFacade::SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream) {
if (FLAGS_use_stream_safe_cuda_allocator) {
GetPrivate()->SetDefaultStream(place, stream);
}
}
#ifdef PADDLE_WITH_CUDA
void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
PADDLE_ENFORCE_EQ(GetAllocatorStrategy(), AllocatorStrategy::kAutoGrowth,
......@@ -1055,6 +1109,8 @@ void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) {
"The memory pool of the CUDA Graph with ID %d has been prepared.",
id));
allocator.reset(new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false));
allocator->SetDefaultStreamFromDeviceContext();
VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id;
}
......
......@@ -55,11 +55,6 @@ class AllocatorFacade {
void* GetBasePtr(const std::shared_ptr<Allocation>& allocation);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
const gpuStream_t& stream);
#endif
const std::shared_ptr<Allocator>& GetZeroAllocator(
const platform::Place& place);
......@@ -86,8 +81,12 @@ class AllocatorFacade {
uint64_t Release(const platform::CUDAPlace& place, const gpuStream_t& stream);
void RecordStream(std::shared_ptr<Allocation> allocation,
const gpuStream_t& stream);
const std::shared_ptr<Allocator>& GetAllocator(const platform::Place& place,
const gpuStream_t& stream);
const gpuStream_t& GetStream(
const std::shared_ptr<Allocation>& allocation) const;
void SetDefaultStream(const platform::CUDAPlace& place,
const gpuStream_t& stream);
#endif
#ifdef PADDLE_WITH_CUDA
......
......@@ -154,6 +154,14 @@ StreamSafeCUDAAllocator::~StreamSafeCUDAAllocator() {
bool StreamSafeCUDAAllocator::IsAllocThreadSafe() const { return true; }
const gpuStream_t& StreamSafeCUDAAllocator::GetDefaultStream() const {
return default_stream_;
}
void StreamSafeCUDAAllocator::SetDefaultStream(const gpuStream_t& stream) {
default_stream_ = stream;
}
phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
platform::RecordEvent("StreamSafeCUDAAllocator::Allocate",
platform::TracerEventType::UserDefined, 9 /*level*/);
......@@ -187,12 +195,8 @@ void StreamSafeCUDAAllocator::FreeImpl(phi::Allocation* allocation) {
platform::RecordEvent("StreamSafeCUDAAllocator::Free",
platform::TracerEventType::UserDefined, 9 /*level*/);
StreamSafeCUDAAllocation* stream_safe_cuda_allocation =
dynamic_cast<StreamSafeCUDAAllocation*>(allocation);
PADDLE_ENFORCE_NOT_NULL(stream_safe_cuda_allocation,
platform::errors::InvalidArgument(
"Failed to dynamic cast %p from Allocation* to "
"StreamSafeCUDAAllocation*",
allocation));
static_cast<StreamSafeCUDAAllocation*>(allocation);
VLOG(8) << "Try free allocation " << stream_safe_cuda_allocation->ptr();
if (stream_safe_cuda_allocation->CanBeFreed()) {
VLOG(9) << "Directly delete allocation";
......@@ -221,6 +225,12 @@ uint64_t StreamSafeCUDAAllocator::ReleaseImpl(const platform::Place& place) {
}
void StreamSafeCUDAAllocator::ProcessUnfreedAllocations() {
// NOTE(Ruibiao): This condition is to reduce lock contention. It does not
// need to be thread-safe, since occasional misjudgments are permissible
// here.
if (unfreed_allocations_.empty()) {
return;
}
std::lock_guard<SpinLock> lock_guard(unfreed_allocation_lock_);
for (auto it = unfreed_allocations_.begin();
it != unfreed_allocations_.end();) {
......
......@@ -64,7 +64,10 @@ class StreamSafeCUDAAllocator
platform::CUDAPlace place, gpuStream_t default_stream,
bool in_cuda_graph_capturing = false);
~StreamSafeCUDAAllocator();
bool IsAllocThreadSafe() const override;
const gpuStream_t &GetDefaultStream() const;
void SetDefaultStream(const gpuStream_t &stream);
protected:
phi::Allocation *AllocateImpl(size_t size) override;
......
......@@ -24,7 +24,9 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
// The cinn-graph may have no input, since CINN now supports fill_constant
// and all of its inputs may be generated by fill_constant instead of fetch.
// OP_INOUT_CHECK(ctx->HasInputs(kX), "Input", kX, "CinnInstructionRun");
OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
"CinnInstructionRun");
const CinnCompiledObject& compiled_object =
......@@ -43,6 +45,53 @@ class CinnInstructionRunOp : public framework::OperatorWithKernel {
});
ctx->SetOutputsDim(kOutputs, output_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
// Why do we need to override GetExpectedKernelType?
// A cinn-graph may have no input var; if we used the base function,
// it would check whether the input tensors are initialized. Here we
// rewrite the function so that we can infer the kernel type from the
// output data type.
if (ctx.InputSize(kX)) {
// If the instruction has input, infer the kernel type from the input data type:
return OperatorWithKernel::GetExpectedKernelType(ctx);
}
// Else infer the kernel type from the output data type.
// `OutputVar` checks that kOutputs has exactly one output var.
const framework::Variable* var = ctx.OutputVar(kOutputs);
PADDLE_ENFORCE_NE(
var, nullptr,
platform::errors::InvalidArgument(
"The cinn_instruction_run Op's Output Variable should not be empty."));
const framework::Tensor* tensor = nullptr;
if (var->IsType<framework::Tensor>()) {
tensor = &var->Get<framework::Tensor>();
} else if (var->IsType<framework::LoDTensor>()) {
tensor = &var->Get<framework::LoDTensor>();
} else if (var->IsType<phi::SelectedRows>()) {
tensor = &(var->Get<phi::SelectedRows>().value());
} else if (var->IsType<framework::LoDTensorArray>()) {
auto t_arr = &var->Get<framework::LoDTensorArray>();
PADDLE_ENFORCE_EQ(t_arr->size(), 1UL,
platform::errors::InvalidArgument(
"The cinn_instruction_run Op should have exactly one "
"Output when Input is empty."));
tensor = &(t_arr->front());
}
PADDLE_ENFORCE_NE(
tensor, nullptr,
platform::errors::InvalidArgument(
"The cinn_instruction_run Op's Output Tensor should not be empty."));
VLOG(4) << "The tensor [" << ctx.OutputName(kOutputs) << "]'s dtype is "
<< paddle::framework::DataType2String(tensor->dtype());
auto output_type = paddle::framework::TransToProtoVarType(tensor->dtype());
return framework::OpKernelType(output_type, ctx.device_context());
}
};
class CinnInstructionRunOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -87,9 +87,12 @@ class CinnLaunchOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
"Input", string::format_string("%s|%s", kX, kNoNeedBufferX),
"CinnLaunchOp");
// The cinn-graph may have no input, since CINN now supports fill_constant
// and all of its inputs may be generated by fill_constant instead of fetch.
// OP_INOUT_CHECK(ctx->HasInputs(kX) || ctx->HasInputs(kNoNeedBufferX),
// "Input", string::format_string("%s|%s", kX,
// kNoNeedBufferX),
// "CinnLaunchOp");
OP_INOUT_CHECK(ctx->HasOutputs(kOutputs), "Output", kOutputs,
"CinnLaunchOp");
}
......
......@@ -35,143 +35,99 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/functors.h"
namespace paddle {
namespace operators {
template <typename T1, typename T2 = T1, typename OutT = T1>
struct DstMaskGenerator {
const float dropout_prob_;
const bool is_upscale_in_train_;
using MT = typename details::MPTypeTrait<T1>::Type;
MT factor;
HOSTDEVICE inline DstMaskGenerator(const float dropout_prob,
const bool is_upscale_in_train)
: dropout_prob_(dropout_prob), is_upscale_in_train_(is_upscale_in_train) {
factor = static_cast<MT>(1.0f / (1.0f - dropout_prob_));
}
template <typename T, typename MaskType>
__global__ void RandomGenerator(const size_t n, uint64_t seed,
const float dropout_prob, const T* src,
MaskType* mask, T* dst,
bool is_upscale_in_train, uint64_t increment) {
using MT = typename details::MPTypeTrait<T>::Type;
int idx = blockDim.x * blockIdx.x + threadIdx.x;
#ifdef PADDLE_WITH_HIP
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx, increment, &state);
#else
curandStatePhilox4_32_10_t state;
curand_init(seed, idx, increment, &state);
#endif
MaskType mask_val;
T dst_val;
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (; idx < n; idx += blockDim.x * gridDim.x) {
T src_val = src[idx];
#ifdef PADDLE_WITH_HIP
if (hiprand_uniform(&state) < dropout_prob) {
#else
if (curand_uniform(&state) < dropout_prob) {
#endif
mask_val = 0;
dst_val = 0;
HOSTDEVICE inline void operator()(OutT* dst, const T1* src_val,
const T2* rand, int num) const {
static constexpr int kCount =
phi::funcs::uniform_distribution<T2>::kReturnsCount;
// 0 ~ kCount - 1 is dst, kCount ~ 2 * kCount - 1 is mask
#pragma unroll
for (int i = 0; i < kCount; i++) {
if (rand[i] < dropout_prob_) {
dst[i] = static_cast<T1>(0);
dst[i + kCount] = dst[i];
} else {
mask_val = 1;
dst_val = is_upscale_in_train
? static_cast<T>(static_cast<MT>(src_val) * factor)
: src_val;
dst[i] = is_upscale_in_train_
? static_cast<T1>(static_cast<MT>(src_val[i]) * factor)
: static_cast<T1>(src_val[i]);
dst[i + kCount] = static_cast<T1>(1);
}
mask[idx] = mask_val;
dst[idx] = dst_val;
}
}
}
};
template <typename T, typename MaskType, int VecSize>
template <typename T, typename MaskType>
__global__ void VectorizedRandomGenerator(const size_t n, uint64_t seed,
const float dropout_prob,
const T* src, MaskType* mask, T* dst,
bool is_upscale_in_train,
uint64_t increment) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
uint64_t increment,
size_t main_offset) {
size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
static constexpr int kCount =
phi::funcs::uniform_distribution<float>::kReturnsCount;
size_t stride = BLOCK_NUM_X * GRID_NUM_X * kCount;
#ifdef PADDLE_WITH_HIP
int64_t idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
hiprandStatePhilox4_32_10_t state;
hiprand_init(seed, idx, increment, &state);
hiprand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = hiprandStatePhilox4_32_10_t;
#else
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
curandStatePhilox4_32_10_t state;
curand_init(seed, idx, increment, &state);
curand_init(seed, idx + THREAD_ID_X, increment, &state);
using SType = curandStatePhilox4_32_10_t;
#endif
MT factor = static_cast<MT>(1.0f / (1.0f - dropout_prob));
for (int i = idx * VecSize; i < n; i += blockDim.x * gridDim.x * VecSize) {
LoadT src_val;
phi::Load<T, VecSize>(&src[i], &src_val);
#ifdef PADDLE_WITH_HIP
float4 rand = hiprand_uniform4(&state);
#else
float4 rand = curand_uniform4(&state);
#endif
LoadT dst_val;
MaskLoadT mask_val;
#pragma unroll
for (int j = 0; j < VecSize; j++) {
if ((&rand.x)[j] < dropout_prob) {
dst_val[j] = 0;
mask_val[j] = 0;
} else {
dst_val[j] = is_upscale_in_train
? static_cast<T>(static_cast<MT>(src_val[j]) * factor)
: src_val[j];
mask_val[j] = 1;
}
}
phi::Store<T, VecSize>(dst_val, &dst[i]);
phi::Store<MaskType, VecSize>(mask_val, &mask[i]);
}
}
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
__device__ __forceinline__ T operator()(const T dout,
const MaskType mask) const {
return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
factor_);
T dst_mask[kCount * 2];  // 0 ~ kCount - 1: dst; kCount ~ 2 * kCount - 1: mask
float rands[kCount];
MaskType mask_result[kCount];
using Rand = phi::funcs::uniform_distribution<float>;
using Cast = kps::IdentityFunctor<T>;
int deal_size = BLOCK_NUM_X * kCount;
auto dst_functor =
DstMaskGenerator<T, float>(dropout_prob, is_upscale_in_train);
size_t fix = idx * kCount;
for (; fix < main_offset; fix += stride) {
kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
&state);
// dst
kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, 1, false>(mask + fix, &mask_result[0],
deal_size);
}
private:
MT factor_;
};
template <typename T, typename MaskType, int VecSize>
__global__ void DropoutGradCUDAKernel(
const T* dout, const MaskType* mask,
const typename details::MPTypeTrait<T>::Type factor, const int64_t size,
T* dx) {
using MT = typename details::MPTypeTrait<T>::Type;
using LoadT = phi::AlignedVector<T, VecSize>;
using MaskLoadT = phi::AlignedVector<MaskType, VecSize>;
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int i = idx * VecSize; i < size; i += blockDim.x * gridDim.x * VecSize) {
LoadT dout_val;
phi::Load<T, VecSize>(&dout[i], &dout_val);
MaskLoadT mask_val;
phi::Load<MaskType, VecSize>(&mask[i], &mask_val);
LoadT dx_val;
#pragma unroll
for (int j = 0; j < VecSize; j++) {
dx_val[j] = static_cast<T>(static_cast<MT>(dout_val[j]) *
static_cast<MT>(mask_val[j]) * factor);
}
phi::Store<T, VecSize>(dx_val, &dx[i]);
int remainder = n - fix;
if (remainder > 0) {
kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(&rands[0], Rand(),
&state);
// dst
kps::OperatorTernary<T, float, T, DstMaskGenerator<T, float>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, 1, true>(mask + fix, &mask_result[0],
remainder);
}
}
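The main_offset split is easiest to see with concrete numbers (assumed for illustration, not taken from the patch):

// Driver-side arithmetic, sketched:
constexpr int kVecSize = 4;   // uniform_distribution<float>::kReturnsCount
size_t size = 10000;
size_t block_size = 256;
size_t main_offset = size / (block_size * kVecSize) * (block_size * kVecSize);
// main_offset == 9216: elements [0, 9216) take the IsBoundary = false
// ReadData/WriteData path with no per-element bounds checks; the remaining
// 784 elements fall into the IsBoundary = true tail at the end of the kernel.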
......@@ -218,42 +174,21 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
uint64_t seed_data;
uint64_t increment;
// VectorizedRandomGenerator uses curand_uniform4, so we only support
// vec_size is 4;
int vec_size = (phi::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
// kVecSize is 4;
constexpr int kVecSize =
phi::funcs::uniform_distribution<float>::kReturnsCount;
auto gpu_config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, vec_size);
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_numel, kVecSize);
auto offset =
((x_numel - 1) / (gpu_config.GetThreadNum() * vec_size) + 1) * vec_size;
((x_numel - 1) / (gpu_config.GetThreadNum() * kVecSize) + 1) * kVecSize;
GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset,
&seed_data, &increment);
#ifdef __HIPCC__
if (vec_size == 4 && size % 4 == 0) {
hipLaunchKernelGGL(
HIP_KERNEL_NAME(VectorizedRandomGenerator<T, uint8_t, 4>),
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream, size,
seed_data, dropout_prob, x_data, mask_data, y_data, upscale_in_train,
increment);
} else {
hipLaunchKernelGGL(HIP_KERNEL_NAME(RandomGenerator<T, uint8_t>),
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0,
stream, size, seed_data, dropout_prob, x_data,
mask_data, y_data, upscale_in_train, increment);
}
#else
if (vec_size == 4 && size % 4 == 0) {
VectorizedRandomGenerator<T, uint8_t, 4><<<
gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
} else {
RandomGenerator<T, uint8_t><<<gpu_config.block_per_grid,
gpu_config.thread_per_block, 0, stream>>>(
size_t main_offset = size / (gpu_config.GetBlockSize() * kVecSize) *
(gpu_config.GetBlockSize() * kVecSize);
VectorizedRandomGenerator<T, uint8_t><<<
gpu_config.GetGridSize(), gpu_config.GetBlockSize(), 0, stream>>>(
size, seed_data, dropout_prob, x_data, mask_data, y_data,
upscale_in_train, increment);
}
#endif
upscale_in_train, increment, main_offset);
} else {
if (upscale_in_train) {
// todo: can y share with data with x directly?
......@@ -278,6 +213,22 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test,
}
}
template <typename T, typename MaskType>
struct CudaDropoutGradFunctor {
using MT = typename details::MPTypeTrait<T>::Type;
explicit CudaDropoutGradFunctor(const MT factor) : factor_(factor) {}
__device__ __forceinline__ T operator()(const T dout,
const MaskType mask) const {
return static_cast<T>(static_cast<MT>(dout) * static_cast<MT>(mask) *
factor_);
}
private:
MT factor_;
};
template <typename T>
void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
const std::string dropout_implementation,
......
......@@ -58,19 +58,15 @@ __global__ void DequantizeOneScaleQuantAxis0(const T* in, const T* scale,
}
template <typename T>
__global__ void DequantizeOneScaleQuantAxis1(const T* in, const T* scale,
T max_range, const int num,
const int cin, const int cout,
T* out) {
int bid = blockIdx.x;
T s = scale[bid % cout];
int wh_size = num / (cin * cout);
const T* in_current = in + bid * wh_size;
T* out_current = out + bid * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
out_current[i] = in_current[i] * s / max_range;
__global__ void DequantizeOneScaleQuantAxisN(const T* in, const T* scale,
const T max_range,
const int64_t num,
const int n_scales,
const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % n_scales];
out[i] = in[i] * s / max_range;
}
}
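The scale-index arithmetic generalizes the old axis-0 and axis-1 kernels: quant_stride is the product of the dims after quant_axis, so (i / quant_stride) % n_scales recovers the quantization channel of flat element i. A host-side sketch with an assumed weight shape of {8, 4, 3, 3}:

int ScaleIndex(int64_t i, int quant_stride, int n_scales) {
  return static_cast<int>((i / quant_stride) % n_scales);
}
// quant_axis == 0: quant_stride = 4 * 3 * 3 = 36, n_scales = 8,
//   so ScaleIndex(35, 36, 8) == 0 and ScaleIndex(36, 36, 8) == 1.
// quant_axis == 1: quant_stride = 3 * 3 = 9, n_scales = 4,
//   and the modulo wraps the index every 36 elements, once per Cout block.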
......@@ -98,20 +94,32 @@ struct ChannelDequantizeFunctor<platform::CUDADeviceContext, T> {
const T* in_data = in->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
if (scale_num == 1) {
int num = in->numel();
int64_t num = in->numel();
const T* scale_factor = scales[0]->data<T>();
if (quant_axis == 0) {
int grid = in_dims[0];
int block = 1024;
DequantizeOneScaleQuantAxis0<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[0], out_data);
} else if (quant_axis == 1) {
// Dequantize weight of Cin * Cout * W * H
int grid = in_dims[0] * in_dims[1];
int block = 1024;
DequantizeOneScaleQuantAxis1<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[0], in_dims[1],
out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size = std::min(
num, static_cast<int64_t>(dev_ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
dev_ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(
((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
DequantizeOneScaleQuantAxisN<
T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, in_dims[quant_axis],
quant_stride, out_data);
}
} else if (scale_num == 2) {
// Not need to consider quant_axis
......
......@@ -273,18 +273,18 @@ struct ClipAndFakeQuantDequantFunctor<platform::CUDADeviceContext, T> {
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
const int bin_cnt,
const int n, const int c,
T* out) {
const int64_t n,
const int c, T* out) {
int tid = threadIdx.x;
int channel_size = n / c;
int64_t channel_size = n / c;
const T* in_c = in + blockIdx.x * channel_size;
T* out_c = out + blockIdx.x * channel_size;
T s = scale[blockIdx.x];
T inv_s = inverse(s);
for (int i = tid; i < channel_size; i += blockDim.x) {
for (int64_t i = tid; i < channel_size; i += blockDim.x) {
T x = in_c[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
......@@ -293,25 +293,20 @@ __global__ void ChannelClipAndQuantKernelQuantAxis0(const T* in, const T* scale,
}
}
// ChannelClipAndQuantKernel for quant_axis is 1
// ChannelClipAndQuantKernel for quant_axis is N
template <typename T>
__global__ void ChannelClipAndQuantKernelQuantAxis1(const T* in, const T* scale,
const int bin_cnt,
const int n, const int cin,
const int cout, T* out) {
T s = scale[blockIdx.x % cout];
T inv_s = inverse(s);
int wh_size = n / (cin * cout);
const T* in_c = in + blockIdx.x * wh_size;
T* out_c = out + blockIdx.x * wh_size;
for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
T x = in_c[i];
__global__ void ChannelClipAndQuantKernelQuantAxisN(
const T* in, const T* scale, const int bin_cnt, const int64_t n,
const int nScale, const int quant_stride, T* out) {
int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
for (int64_t i = idx; i < n; i += blockDim.x * gridDim.x) {
T s = scale[(i / quant_stride) % nScale];
T inv_s = 1.0 / s;
T x = in[i];
T v = x > s ? s : x;
v = v < -s ? -s : v;
v = bin_cnt * inv_s * v;
out_c[i] = round(v);
out[i] = round(v);
}
}
......@@ -327,7 +322,7 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
"the received is %d",
quant_axis));
int num = in.numel();
int64_t num = in.numel();
auto in_dims = in.dims();
const T* in_data = in.data<T>();
const T* scale_data = scale.data<T>();
......@@ -338,11 +333,24 @@ struct ChannelClipAndFakeQuantFunctor<platform::CUDADeviceContext, T> {
int block = 1024;
ChannelClipAndQuantKernelQuantAxis0<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], out_data);
} else if (quant_axis == 1) {
int grid = in_dims[0] * in_dims[1];
int block = 1024;
ChannelClipAndQuantKernelQuantAxis1<T><<<grid, block, 0, ctx.stream()>>>(
in_data, scale_data, bin_cnt, num, in_dims[0], in_dims[1], out_data);
} else {
int quant_stride = 1;
for (int i = quant_axis + 1; i < in_dims.size(); i++) {
quant_stride *= in_dims[i];
}
int64_t block_size =
std::min(num, static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
int64_t max_threads =
ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM
const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1),
static_cast<int64_t>(1));
const int64_t grid_size =
std::min(max_blocks, (num + block_size - 1) / block_size);
ChannelClipAndQuantKernelQuantAxisN<T><<<grid_size, block_size>>>(
in_data, scale_data, bin_cnt, num, in_dims[quant_axis], quant_stride,
out_data);
}
}
};
......
......@@ -64,18 +64,26 @@ class FrameOp : public framework::OperatorWithKernel {
end_axis = x_rank - 2;
}
bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
bool check = ctx->IsRuntime() || !contain_unknown_dim;
if (check) {
PADDLE_ENFORCE_LE(frame_length, seq_length,
platform::errors::InvalidArgument(
"Attribute(frame_length) of FrameOp should be less "
"equal than sequence length, but got (%s) > (%s).",
frame_length, seq_length));
}
// It won't go into the for loop when x_rank == 1U.
for (int i = start_axis; i <= end_axis; i++) {
output_shape.push_back(x_dims[i]);
}
if (seq_length == -1) {
n_frames = -1;
} else {
n_frames = 1 + (seq_length - frame_length) / hop_length;
}
if (axis == 0) {
// (n_frames, frame_length, ...)
......
......@@ -98,9 +98,17 @@ REGISTER_OP_CPU_KERNEL(
mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanKernel<paddle::platform::CPUDeviceContext, double>,
ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
paddle::platform::bfloat16>,
ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::MeanKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CPU_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
paddle::platform::bfloat16>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::MeanGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>);
......@@ -102,10 +102,17 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::MeanCUDAKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
mean_grad,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
paddle::platform::complex<float>>,
ops::MeanCUDAGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
......@@ -54,6 +54,7 @@ class OverlapAddOp : public framework::OperatorWithKernel {
std::vector<int64_t> output_shape;
int n_frames;
int frame_length;
int seq_length;
int start_axis;
int end_axis;
......@@ -69,14 +70,22 @@ class OverlapAddOp : public framework::OperatorWithKernel {
end_axis = x_rank - 3;
}
bool contain_unknown_dim = phi::contain_unknown_dim(x_dims);
bool check = ctx->IsRuntime() || !contain_unknown_dim;
if (check) {
PADDLE_ENFORCE_LE(
hop_length, frame_length,
platform::errors::InvalidArgument(
"Attribute(hop_length) of OverlapAddOp should be less or equal "
"than frame_length, but got hop_length(%s) > frame_length(%s).",
hop_length, frame_length));
}
const int seq_length = (n_frames - 1) * hop_length + frame_length;
if (n_frames == -1) {
seq_length = -1;
} else {
seq_length = (n_frames - 1) * hop_length + frame_length;
}
// It won't go into the for loop when x_rank == 2U.
for (int i = start_axis; i <= end_axis; i++) {
......
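This is the inverse of FrameOp's relation; a sketch with the same assumed numbers as above (NumFrames is the hypothetical helper from the FrameOp example):

int SeqLength(int n_frames, int frame_length, int hop_length) {
  return n_frames == -1 ? -1 : (n_frames - 1) * hop_length + frame_length;
}
// SeqLength(4, 4, 2) == 10, matching NumFrames(10, 4, 2) == 4;
// an unknown n_frames (-1) likewise propagates seq_length = -1.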
......@@ -16,451 +16,469 @@
#include "paddle/fluid/operators/spectral_op.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#if defined(PADDLE_WITH_ONEMKL)
#include "paddle/phi/backends/dynload/mklrt.h"
#elif defined(PADDLE_WITH_POCKETFFT)
#include "extern_pocketfft/pocketfft_hdronly.h"
#endif
namespace paddle {
namespace operators {
using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct FFTConfigKey {
// between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t signal_ndim_;
// These include additional batch dimension as well.
int64_t sizes_[kMaxDataNdim];
int64_t input_shape_[kMaxDataNdim];
int64_t output_shape_[kMaxDataNdim];
FFTTransformType fft_type_;
ScalarType value_type_;
FFTConfigKey() = default;
FFTConfigKey(const std::vector<int64_t>& in_shape,
const std::vector<int64_t>& out_shape,
const std::vector<int64_t>& signal_size,
FFTTransformType fft_type, ScalarType value_type) {
// Padding bits must be zeroed for hashing
memset(this, 0, sizeof(*this));
signal_ndim_ = signal_size.size() - 1;
fft_type_ = fft_type;
value_type_ = value_type;
std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
}
};
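The memset in the constructor above is load-bearing: the key is hashed and compared as raw bytes, so compiler-inserted padding must be deterministic. A minimal sketch of the idiom, independent of Paddle:

#include <cstring>

// Why FFTConfigKey's constructor memset()s the whole object: the key is
// hashed and compared bytewise, so padding bytes must be zeroed.
struct PodKey {
  int a;        // 4 bytes; the compiler may pad before the next member
  long long b;  // 8 bytes
};

int main() {
  PodKey x, y;
  std::memset(&x, 0, sizeof(x));  // zero padding bytes, as FFTConfigKey does
  std::memset(&y, 0, sizeof(y));
  x.a = y.a = 1;
  x.b = y.b = 2;
  // With padding zeroed, bytewise comparison (KeyEqual's memcmp) is reliable.
  return std::memcmp(&x, &y, sizeof(PodKey)) == 0 ? 0 : 1;
}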
#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
::cufftHandle handle_;
public:
CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
}
CuFFTHandle(const CuFFTHandle& other) = delete;
CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
using Tensor = framework::Tensor;
CuFFTHandle(CuFFTHandle&& other) = delete;
CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
// FFT Functors
#if defined(PADDLE_WITH_ONEMKL)
::cufftHandle& get() { return handle_; }
const ::cufftHandle& get() const { return handle_; }
#define MKL_DFTI_CHECK(expr) \
do { \
MKL_LONG status = (expr); \
if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \
PADDLE_THROW( \
platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
} while (0);
~CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_));
struct DftiDescriptorDeleter {
void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
if (handle != nullptr) {
MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
}
}
};
using plan_size_type = long long int; // NOLINT
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
// An RAII wrapper for MKL_DESCRIPTOR*
class DftiDescriptor {
public:
  // Only move semantics is enough for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
// sizes are full signal, including batch size and always two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
cudaDataType itype, otype, exec_type;
const auto complex_input = has_complex_input(fft_type);
const auto complex_output = has_complex_output(fft_type);
if (dtype == framework::proto::VarType::FP32) {
itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
exec_type = CUDA_C_32F;
} else if (dtype == framework::proto::VarType::FP64) {
itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
exec_type = CUDA_C_64F;
} else if (dtype == framework::proto::VarType::FP16) {
itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
exec_type = CUDA_C_16F;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"cuFFT only support transforms of type float16, float32 and "
"float64"));
void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
MKL_LONG signal_ndim, MKL_LONG* sizes) {
PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
platform::errors::AlreadyExists(
"DftiDescriptor has already been initialized."));
DFTI_DESCRIPTOR* raw_desc;
MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
&raw_desc, precision, signal_type, signal_ndim, sizes));
desc_.reset(raw_desc);
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
batch, &ws_size_t, exec_type));
ws_size = ws_size_t;
DFTI_DESCRIPTOR* get() const {
DFTI_DESCRIPTOR* raw_desc = desc_.get();
PADDLE_ENFORCE_NOT_NULL(raw_desc,
platform::errors::PreconditionNotMet(
"DFTI DESCRIPTOR has not been initialized."));
return raw_desc;
}
FFTConfig(const FFTConfig& other) = delete;
FFTConfig& operator=(const FFTConfig& other) = delete;
FFTConfig(FFTConfig&& other) = delete;
FFTConfig& operator=(FFTConfig&& other) = delete;
const cufftHandle& plan() const { return plan_ptr.get(); }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private:
CuFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
};
#elif defined(PADDLE_WITH_HIP)
// An RAII encapsulation of hipfftHandle
class HIPFFTHandle {
::hipfftHandle handle_;
public:
HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_));
static DftiDescriptor _plan_mkl_fft(
const framework::proto::VarType::Type& in_dtype,
const framework::proto::VarType::Type& out_dtype,
const framework::DDim& in_strides, const framework::DDim& out_strides,
const std::vector<int>& signal_sizes, FFTNormMode normalization,
bool forward) {
const DFTI_CONFIG_VALUE precision = [&] {
switch (in_dtype) {
case framework::proto::VarType::FP32:
return DFTI_SINGLE;
case framework::proto::VarType::COMPLEX64:
return DFTI_SINGLE;
case framework::proto::VarType::FP64:
return DFTI_DOUBLE;
case framework::proto::VarType::COMPLEX128:
return DFTI_DOUBLE;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input datatype (%s), input data type should be FP32, "
"FP64, COMPLEX64 or COMPLEX128.",
framework::DataTypeToString(in_dtype)));
}
}();
HIPFFTHandle(const HIPFFTHandle& other) = delete;
HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
HIPFFTHandle(HIPFFTHandle&& other) = delete;
HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
::hipfftHandle& get() { return handle_; }
const ::hipfftHandle& get() const { return handle_; }
~HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
}
};
using plan_size_type = int;
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public:
  // Only move semantics is enough for this class. Although we already use
  // unique_ptr for the plan, still remove copy constructor and assignment op so
  // we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
// sizes are full signal, including batch size and always two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
hipfftType exec_type = [&] {
if (dtype == framework::proto::VarType::FP32) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_C2C;
case FFTTransformType::R2C:
return HIPFFT_R2C;
case FFTTransformType::C2R:
return HIPFFT_C2R;
// C2C, R2C, C2R
const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
const DFTI_CONFIG_VALUE domain =
(fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
DftiDescriptor descriptor;
std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
const MKL_LONG signal_ndim = fft_sizes.size() - 1;
descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
// placement inplace or not inplace
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
DFTI_NOT_INPLACE));
// number of transformations
const MKL_LONG batch_size = fft_sizes[0];
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
// input & output distance
const MKL_LONG idist = in_strides[0];
const MKL_LONG odist = out_strides[0];
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
DFTI_OUTPUT_DISTANCE, odist));
// input & output stride
std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
for (MKL_LONG i = 1; i <= signal_ndim; i++) {
mkl_in_stride[i] = in_strides[i];
mkl_out_stride[i] = out_strides[i];
}
} else if (dtype == framework::proto::VarType::FP64) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_Z2Z;
case FFTTransformType::R2C:
return HIPFFT_D2Z;
case FFTTransformType::C2R:
return HIPFFT_Z2D;
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
// conjugate even storage
if (!(fft_type == FFTTransformType::C2C)) {
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
}
MKL_LONG signal_numel =
std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
std::multiplies<MKL_LONG>());
if (normalization != FFTNormMode::none) {
const double scale =
((normalization == FFTNormMode::by_sqrt_n)
? 1.0 / std::sqrt(static_cast<double>(signal_numel))
: 1.0 / static_cast<double>(signal_numel));
const auto scale_direction = [&]() {
if (fft_type == FFTTransformType::R2C ||
(fft_type == FFTTransformType::C2C && forward)) {
return DFTI_FORWARD_SCALE;
} else {
// (fft_type == FFTTransformType::C2R ||
// (fft_type == FFTTransformType::C2C && !forward))
return DFTI_BACKWARD_SCALE;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}();
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
// commit the descriptor
MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
return descriptor;
}
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
batch, &ws_size_t));
// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
const std::vector<int64_t>& axes, FFTNormMode normalization,
bool forward) {
const framework::DDim& in_sizes = x->dims();
const int ndim = in_sizes.size();
const int signal_ndim = axes.size();
const int batch_ndim = ndim - signal_ndim;
const framework::DDim& out_sizes = out->dims();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), 0);
std::vector<bool> is_transformed_dim(ndim, false);
for (const auto& d : axes) {
is_transformed_dim[d] = true;
}
const auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](size_t axis) { return !is_transformed_dim[axis]; });
std::copy(axes.cbegin(), axes.cend(), batch_end);
// transpose input according to that permutation
framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
std::vector<int64_t> transposed_input_shape_ =
phi::vectorize(transposed_input_shape);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
const auto place = ctx.GetPlace();
transposed_input.mutable_data<Ti>(place);
TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
dim_permute);
  // make a collapsed input: collapse batch axes for input
const int batch_size = std::accumulate(
transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
1L, std::multiplies<int64_t>());
std::vector<int> collapsed_input_shape_(1 + signal_ndim);
collapsed_input_shape_[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_ndim,
transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
const framework::DDim collapsed_input_shape =
phi::make_ddim(collapsed_input_shape_);
transposed_input.Resize(collapsed_input_shape);
framework::Tensor& collapsed_input = transposed_input;
// make a collapsed output
std::vector<int> collapsed_output_shape_(1 + signal_ndim);
collapsed_output_shape_[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
}
const framework::DDim collapsed_output_shape =
phi::make_ddim(collapsed_output_shape_);
framework::Tensor collapsed_output;
collapsed_output.Resize(collapsed_output_shape);
collapsed_output.mutable_data(place, out->type());
// signal sizes
std::vector<int> signal_sizes(1 + signal_ndim);
signal_sizes[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
signal_sizes[1 + i] =
std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
}
ws_size = ws_size_t;
// input & output stride
const framework::DDim input_stride = phi::stride(collapsed_input_shape);
const framework::DDim output_stride = phi::stride(collapsed_output_shape);
// make a DFTI_DESCRIPTOR
DftiDescriptor desc =
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->dtype()), input_stride,
output_stride, signal_sizes, normalization, forward);
const FFTTransformType fft_type =
GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->type()));
if (fft_type == FFTTransformType::C2R && forward) {
framework::Tensor collapsed_input_conj(collapsed_input.dtype());
collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
ctx.GetPlace());
// conjugate the input
platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
collapsed_input.numel(),
collapsed_input_conj.data<Ti>());
for_range(functor);
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
} else if (fft_type == FFTTransformType::R2C && !forward) {
framework::Tensor collapsed_output_conj(collapsed_output.dtype());
collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
ctx.GetPlace());
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
// conjugate the output
platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
collapsed_output.numel(),
collapsed_output.data<To>());
for_range(functor);
} else {
if (forward) {
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
} else {
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
}
}
const hipfftHandle& plan() const { return plan_ptr.get(); }
// resize for the collapsed output
framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
collapsed_output.Resize(transposed_output_shape);
framework::Tensor& transposed_output = collapsed_output;
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
// reverse the transposition
std::vector<int> reverse_dim_permute(ndim);
for (int i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
out, reverse_dim_permute);
}
private:
HIPFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
#endif
// Hashing machinery for Key
// Fowler–Noll–Vo hash function
// see
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
template <typename Key>
struct KeyHash {
// Key must be a POD because we read out its memory
  // contents as char* when hashing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
size_t operator()(const Key& params) const {
auto ptr = reinterpret_cast<const uint8_t*>(&params);
uint32_t value = 0x811C9DC5;
for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
value ^= ptr[i];
value *= 0x01000193;
}
return static_cast<size_t>(value);
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
template <typename Key>
struct KeyEqual {
// Key must be a POD because we read out its memory
  // contents as char* when comparing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
bool operator()(const Key& a, const Key& b) const {
auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.size() > 1) {
const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
Tensor temp;
temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
const std::vector<int64_t> new_axes{axes.back()};
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
normalization, forward);
} else {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
}
};
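KeyHash above is FNV-1a over the raw bytes of the POD key. The same scheme in isolation (offset basis 0x811C9DC5, prime 0x01000193):

#include <cstddef>
#include <cstdint>
#include <iostream>

// FNV-1a over an arbitrary byte range -- the scheme KeyHash applies
// to the raw bytes of FFTConfigKey.
uint32_t fnv1a(const void* data, size_t n) {
  const uint8_t* p = static_cast<const uint8_t*>(data);
  uint32_t value = 0x811C9DC5u;  // FNV offset basis
  for (size_t i = 0; i < n; ++i) {
    value ^= p[i];
    value *= 0x01000193u;  // FNV prime
  }
  return value;
}

int main() {
  const char msg[] = "fft";
  std::cout << std::hex << fnv1a(msg, sizeof(msg) - 1) << std::endl;
  return 0;
}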
#if CUDA_VERSION < 10000
// Note that the max plan number for CUDA version < 10 has to be 1023
// due to a bug that fails on the 1024th plan
constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
#else
constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
// The default max cache size chosen for CUDA version > 10 is arbitrary.
// This number puts a limit on how big a plan cache we should maintain by
// default. Users can always configure it via cufft_set_plan_cache_max_size.
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
#endif
static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
"CUFFT_MAX_PLAN_NUM not in size_t range");
static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
"CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
// This cache assumes that the mapping from key to value never changes.
// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
// value returned from try_emplace_value.
// The contract of using this cache is that try_emplace_value should only be
// used when the max_size is positive.
class FFTConfigCache {
public:
using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
using map_t = typename std::unordered_map<
std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
using map_kkv_iter_t = typename map_t::iterator;
FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
FFTConfigCache(const FFTConfigCache& other) = delete;
FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
FFTConfigCache(FFTConfigCache&& other) noexcept
: _usage_list(std::move(other._usage_list)),
_cache_map(std::move(other._cache_map)),
_max_size(other._max_size) {}
FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
_usage_list = std::move(other._usage_list);
_cache_map = std::move(other._cache_map);
_max_size = other._max_size;
return *this;
#elif defined(PADDLE_WITH_POCKETFFT)
template <typename T>
T compute_factor(int64_t size, FFTNormMode normalization) {
constexpr auto one = static_cast<T>(1);
switch (normalization) {
case FFTNormMode::none:
return one;
case FFTNormMode::by_n:
return one / static_cast<T>(size);
case FFTNormMode::by_sqrt_n:
return one / std::sqrt(static_cast<T>(size));
}
PADDLE_THROW(
platform::errors::InvalidArgument("Unsupported normalization type"));
}
// If key is in this cache, return the cached config. Otherwise, emplace the
// config in this cache and return it.
FFTConfig& lookup(FFTConfigKey params) {
PADDLE_ENFORCE_GT(_max_size, 0,
platform::errors::InvalidArgument(
"The max size of FFTConfigCache must be great than 0,"
"But received is [%d]",
_max_size));
map_kkv_iter_t map_it = _cache_map.find(params);
// Hit, put to list front
if (map_it != _cache_map.end()) {
_usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
return map_it->second->second;
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = typename Ti::value_type;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
// Miss
// remove if needed
if (_usage_list.size() >= _max_size) {
auto last = _usage_list.end();
last--;
_cache_map.erase(last->first);
_usage_list.pop_back();
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
out_data, factor);
}
};
// construct new plan at list front, then insert into _cache_map
_usage_list.emplace_front(std::piecewise_construct,
std::forward_as_tuple(params),
std::forward_as_tuple(params));
auto kv_it = _usage_list.begin();
_cache_map.emplace(std::piecewise_construct,
std::forward_as_tuple(kv_it->first),
std::forward_as_tuple(kv_it));
return kv_it->second;
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = Ti;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
void clear() {
_cache_map.clear();
_usage_list.clear();
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
void resize(int64_t new_size) {
_set_max_size(new_size);
auto cur_size = _usage_list.size();
if (cur_size > _max_size) {
auto delete_it = _usage_list.end();
for (size_t i = 0; i < cur_size - _max_size; i++) {
delete_it--;
_cache_map.erase(delete_it->first);
}
_usage_list.erase(delete_it, _usage_list.end());
const auto* in_data = x->data<R>();
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
size_t size() const { return _cache_map.size(); }
size_t max_size() const noexcept { return _max_size; }
std::mutex mutex;
private:
  // Only sets the size and does a value check. Does not resize the data structures.
void _set_max_size(int64_t new_size) {
// We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
  // CUFFT_MAX_PLAN_NUM is of type size_t, we need to do the non-negativity
  // check first.
PADDLE_ENFORCE_GE(
new_size, 0,
platform::errors::InvalidArgument(
"cuFFT plan cache size must be non-negative, But received is [%d]",
new_size));
PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
platform::errors::InvalidArgument(
"cuFFT plan cache size can not be larger than [%d], "
"But received is [%d]",
CUFFT_MAX_PLAN_NUM, new_size));
_max_size = static_cast<size_t>(new_size);
}
std::list<kv_t> _usage_list;
map_t _cache_map;
size_t _max_size;
};
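FFTConfigCache is a textbook LRU: a std::list keeps key/value pairs in recency order and an unordered_map indexes into it, so a hit is an O(1) splice to the front and a miss evicts from the back. The idiom in a minimal generic form (illustration only, not Paddle code):

#include <iostream>
#include <list>
#include <string>
#include <unordered_map>

// Minimal LRU illustrating the list+map idiom used by FFTConfigCache.
class LruCache {
  using KV = std::pair<int, std::string>;
  std::list<KV> usage_;                                  // front = most recent
  std::unordered_map<int, std::list<KV>::iterator> map_;
  size_t max_size_;

 public:
  explicit LruCache(size_t max_size) : max_size_(max_size) {}

  std::string& lookup(int key) {
    auto it = map_.find(key);
    if (it != map_.end()) {            // hit: move to front; iterators stay valid
      usage_.splice(usage_.begin(), usage_, it->second);
      return it->second->second;
    }
    if (usage_.size() >= max_size_) {  // miss: evict the least recently used
      map_.erase(usage_.back().first);
      usage_.pop_back();
    }
    usage_.emplace_front(key, "plan-" + std::to_string(key));
    map_[key] = usage_.begin();
    return usage_.front().second;
  }
};

int main() {
  LruCache cache(2);
  std::cout << cache.lookup(1) << " " << cache.lookup(2) << " "
            << cache.lookup(1) << " " << cache.lookup(3) << std::endl;
  return 0;
}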
static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
static std::mutex plan_caches_mutex;
static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
std::lock_guard<std::mutex> guard(plan_caches_mutex);
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = To;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
if (device_index >= plan_caches.size()) {
plan_caches.resize(device_index + 1);
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
if (!plan_caches[device_index]) {
plan_caches[device_index] = std::make_unique<FFTConfigCache>();
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = out->data<R>();
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= out_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
};
return *plan_caches[device_index];
}
#endif
} // namespace operators
} // namespace paddle
......@@ -13,28 +13,7 @@
// limitations under the License.
#include "paddle/fluid/operators/spectral_op.h"
#include <algorithm>
#include <functional>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#if defined(PADDLE_WITH_ONEMKL)
#include "paddle/phi/backends/dynload/mklrt.h"
#elif defined(PADDLE_WITH_POCKETFFT)
#include "extern_pocketfft/pocketfft_hdronly.h"
#endif
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/fluid/operators/spectral_helper.h"
namespace paddle {
namespace operators {
......@@ -355,465 +334,6 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) {
norm));
}
// FFT Functors
#if defined(PADDLE_WITH_ONEMKL)
#define MKL_DFTI_CHECK(expr) \
do { \
MKL_LONG status = (expr); \
if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \
PADDLE_THROW( \
platform::errors::External(phi::dynload::DftiErrorMessage(status))); \
} while (0);
namespace {
struct DftiDescriptorDeleter {
void operator()(DFTI_DESCRIPTOR_HANDLE handle) {
if (handle != nullptr) {
MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle));
}
}
};
// An RAII wrapper for MKL_DESCRIPTOR*
class DftiDescriptor {
public:
void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type,
MKL_LONG signal_ndim, MKL_LONG* sizes) {
PADDLE_ENFORCE_EQ(desc_.get(), nullptr,
platform::errors::AlreadyExists(
"DftiDescriptor has already been initialized."));
DFTI_DESCRIPTOR* raw_desc;
MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX(
&raw_desc, precision, signal_type, signal_ndim, sizes));
desc_.reset(raw_desc);
}
DFTI_DESCRIPTOR* get() const {
DFTI_DESCRIPTOR* raw_desc = desc_.get();
PADDLE_ENFORCE_NOT_NULL(raw_desc,
platform::errors::PreconditionNotMet(
"DFTI DESCRIPTOR has not been initialized."));
return raw_desc;
}
private:
std::unique_ptr<DFTI_DESCRIPTOR, DftiDescriptorDeleter> desc_;
};
DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype,
const framework::proto::VarType::Type& out_dtype,
const framework::DDim& in_strides,
const framework::DDim& out_strides,
const std::vector<int>& signal_sizes,
FFTNormMode normalization, bool forward) {
const DFTI_CONFIG_VALUE precision = [&] {
switch (in_dtype) {
case framework::proto::VarType::FP32:
return DFTI_SINGLE;
case framework::proto::VarType::COMPLEX64:
return DFTI_SINGLE;
case framework::proto::VarType::FP64:
return DFTI_DOUBLE;
case framework::proto::VarType::COMPLEX128:
return DFTI_DOUBLE;
default:
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid input datatype (%s), input data type should be FP32, "
"FP64, COMPLEX64 or COMPLEX128.",
framework::DataTypeToString(in_dtype)));
}
}();
// C2C, R2C, C2R
const FFTTransformType fft_type = GetFFTTransformType(in_dtype, out_dtype);
const DFTI_CONFIG_VALUE domain =
(fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL;
DftiDescriptor descriptor;
std::vector<MKL_LONG> fft_sizes(signal_sizes.cbegin(), signal_sizes.cend());
const MKL_LONG signal_ndim = fft_sizes.size() - 1;
descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1);
// placement inplace or not inplace
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT,
DFTI_NOT_INPLACE));
// number of transformations
const MKL_LONG batch_size = fft_sizes[0];
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size));
// input & output distance
const MKL_LONG idist = in_strides[0];
const MKL_LONG odist = out_strides[0];
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(),
DFTI_OUTPUT_DISTANCE, odist));
// input & output stride
std::vector<MKL_LONG> mkl_in_stride(1 + signal_ndim, 0);
std::vector<MKL_LONG> mkl_out_stride(1 + signal_ndim, 0);
for (MKL_LONG i = 1; i <= signal_ndim; i++) {
mkl_in_stride[i] = in_strides[i];
mkl_out_stride[i] = out_strides[i];
}
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data()));
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data()));
// conjugate even storage
if (!(fft_type == FFTTransformType::C2C)) {
MKL_DFTI_CHECK(phi::dynload::DftiSetValue(
descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX));
}
MKL_LONG signal_numel =
std::accumulate(fft_sizes.cbegin() + 1, fft_sizes.cend(), 1UL,
std::multiplies<MKL_LONG>());
if (normalization != FFTNormMode::none) {
const double scale =
((normalization == FFTNormMode::by_sqrt_n)
? 1.0 / std::sqrt(static_cast<double>(signal_numel))
: 1.0 / static_cast<double>(signal_numel));
const auto scale_direction = [&]() {
if (fft_type == FFTTransformType::R2C ||
(fft_type == FFTTransformType::C2C && forward)) {
return DFTI_FORWARD_SCALE;
} else {
// (fft_type == FFTTransformType::C2R ||
// (fft_type == FFTTransformType::C2C && !forward))
return DFTI_BACKWARD_SCALE;
}
}();
MKL_DFTI_CHECK(
phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale));
}
// commit the descriptor
MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get()));
return descriptor;
}
// Execute a general fft operation (can be c2c, onesided r2c or onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out,
const std::vector<int64_t>& axes, FFTNormMode normalization,
bool forward) {
const framework::DDim& in_sizes = x->dims();
const int ndim = in_sizes.size();
const int signal_ndim = axes.size();
const int batch_ndim = ndim - signal_ndim;
const framework::DDim& out_sizes = out->dims();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), 0);
std::vector<bool> is_transformed_dim(ndim, false);
for (const auto& d : axes) {
is_transformed_dim[d] = true;
}
const auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](size_t axis) { return !is_transformed_dim[axis]; });
std::copy(axes.cbegin(), axes.cend(), batch_end);
// transpose input according to that permutation
framework::DDim transposed_input_shape = in_sizes.transpose(dim_permute);
std::vector<int64_t> transposed_input_shape_ =
phi::vectorize(transposed_input_shape);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
const auto place = ctx.GetPlace();
transposed_input.mutable_data<Ti>(place);
TransCompute<platform::CPUDeviceContext, Ti>(ndim, ctx, *x, &transposed_input,
dim_permute);
  // make a collapsed input: collapse batch axes for input
const int batch_size = std::accumulate(
transposed_input_shape.Get(), transposed_input_shape.Get() + batch_ndim,
1L, std::multiplies<int64_t>());
std::vector<int> collapsed_input_shape_(1 + signal_ndim);
collapsed_input_shape_[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_ndim,
transposed_input_shape_.end(), collapsed_input_shape_.begin() + 1);
const framework::DDim collapsed_input_shape =
phi::make_ddim(collapsed_input_shape_);
transposed_input.Resize(collapsed_input_shape);
framework::Tensor& collapsed_input = transposed_input;
// make a collapsed output
std::vector<int> collapsed_output_shape_(1 + signal_ndim);
collapsed_output_shape_[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
collapsed_output_shape_[1 + i] = out_sizes[axes[i]];
}
const framework::DDim collapsed_output_shape =
phi::make_ddim(collapsed_output_shape_);
framework::Tensor collapsed_output;
collapsed_output.Resize(collapsed_output_shape);
collapsed_output.mutable_data(place, out->type());
// signal sizes
std::vector<int> signal_sizes(1 + signal_ndim);
signal_sizes[0] = batch_size;
for (int i = 0; i < signal_ndim; i++) {
signal_sizes[1 + i] =
std::max(collapsed_input_shape[1 + i], collapsed_output_shape[1 + i]);
}
// input & output stride
const framework::DDim input_stride = phi::stride(collapsed_input_shape);
const framework::DDim output_stride = phi::stride(collapsed_output_shape);
// make a DFTI_DESCRIPTOR
DftiDescriptor desc =
_plan_mkl_fft(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->dtype()), input_stride,
output_stride, signal_sizes, normalization, forward);
const FFTTransformType fft_type =
GetFFTTransformType(framework::TransToProtoVarType(x->dtype()),
framework::TransToProtoVarType(out->type()));
if (fft_type == FFTTransformType::C2R && forward) {
framework::Tensor collapsed_input_conj(collapsed_input.dtype());
collapsed_input_conj.mutable_data<Ti>(collapsed_input.dims(),
ctx.GetPlace());
// conjugate the input
platform::ForRange<DeviceContext> for_range(ctx, collapsed_input.numel());
phi::funcs::ConjFunctor<Ti> functor(collapsed_input.data<Ti>(),
collapsed_input.numel(),
collapsed_input_conj.data<Ti>());
for_range(functor);
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input_conj.data(), collapsed_output.data()));
} else if (fft_type == FFTTransformType::R2C && !forward) {
framework::Tensor collapsed_output_conj(collapsed_output.dtype());
collapsed_output_conj.mutable_data<To>(collapsed_output.dims(),
ctx.GetPlace());
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output_conj.data()));
// conjugate the output
platform::ForRange<DeviceContext> for_range(ctx, collapsed_output.numel());
phi::funcs::ConjFunctor<To> functor(collapsed_output_conj.data<To>(),
collapsed_output.numel(),
collapsed_output.data<To>());
for_range(functor);
} else {
if (forward) {
MKL_DFTI_CHECK(phi::dynload::DftiComputeForward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
} else {
MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward(
desc.get(), collapsed_input.data(), collapsed_output.data()));
}
}
// resize for the collapsed output
framework::DDim transposed_output_shape = out_sizes.transpose(dim_permute);
collapsed_output.Resize(transposed_output_shape);
framework::Tensor& transposed_output = collapsed_output;
// reverse the transposition
std::vector<int> reverse_dim_permute(ndim);
for (int i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<platform::CPUDeviceContext, To>(ndim, ctx, transposed_output,
out, reverse_dim_permute);
}
} // anonymous namespace
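exec_fft above reduces an arbitrary-axis FFT to a contiguous batched one: permute the transformed axes to the back, collapse the leading batch axes into a single dimension, transform, then invert the permutation. The index bookkeeping in isolation, with hypothetical shapes:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // Hypothetical shape [2, 3, 4, 5] with transform axes {1, 3}.
  std::vector<int64_t> dims{2, 3, 4, 5};
  std::vector<int64_t> axes{1, 3};
  const int ndim = static_cast<int>(dims.size());
  const int signal_ndim = static_cast<int>(axes.size());

  // Partition non-transformed axes to the front (exec_fft uses
  // std::partition; std::stable_partition here keeps the example
  // deterministic), then overwrite the tail with the transform axes.
  std::vector<int> perm(ndim);
  std::iota(perm.begin(), perm.end(), 0);
  std::vector<bool> transformed(ndim, false);
  for (auto a : axes) transformed[a] = true;
  auto batch_end = std::stable_partition(
      perm.begin(), perm.end(), [&](int d) { return !transformed[d]; });
  std::copy(axes.begin(), axes.end(), batch_end);
  assert((perm == std::vector<int>{0, 2, 1, 3}));

  // Collapse leading batch axes into one dim: [2, 4, 3, 5] -> [8, 3, 5].
  int64_t batch = 1;
  for (int i = 0; i < ndim - signal_ndim; ++i) batch *= dims[perm[i]];
  assert(batch == 8);
  return 0;
}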
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.size() > 1) {
const std::vector<int64_t> c2c_dims(axes.begin(), axes.end() - 1);
Tensor temp;
temp.mutable_data<Ti>(x->dims(), ctx.GetPlace());
FFTC2CFunctor<platform::CPUDeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, x, &temp, c2c_dims, normalization, forward);
const std::vector<int64_t> new_axes{axes.back()};
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, &temp, out, new_axes,
normalization, forward);
} else {
exec_fft<platform::CPUDeviceContext, Ti, To>(ctx, x, out, axes,
normalization, forward);
}
}
};
#elif defined(PADDLE_WITH_POCKETFFT)
namespace {
template <typename T>
T compute_factor(int64_t size, FFTNormMode normalization) {
constexpr auto one = static_cast<T>(1);
switch (normalization) {
case FFTNormMode::none:
return one;
case FFTNormMode::by_n:
return one / static_cast<T>(size);
case FFTNormMode::by_sqrt_n:
return one / std::sqrt(static_cast<T>(size));
}
PADDLE_THROW(
platform::errors::InvalidArgument("Unsupported normalization type"));
}
} // anonymous namespace
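compute_factor above encodes the three normalization conventions: for a transform over n points the factor is 1 (none), 1/sqrt(n) (by_sqrt_n, the "ortho" convention), or 1/n (by_n). Numerically, for n = 8:

#include <cassert>
#include <cmath>

int main() {
  const double n = 8.0;
  // FFTNormMode::none      -> 1
  // FFTNormMode::by_sqrt_n -> 1 / sqrt(n)
  // FFTNormMode::by_n      -> 1 / n
  assert(std::abs(1.0 / std::sqrt(n) - 0.35355339059327373) < 1e-12);
  assert(1.0 / n == 0.125);
  return 0;
}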
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = typename Ti::value_type;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2c(in_sizes, in_strides, in_strides, axes_, forward, in_data,
out_data, factor);
}
};
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = Ti;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(R);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(C);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto* in_data = x->data<R>();
auto* out_data = reinterpret_cast<C*>(out->data<To>());
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= in_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::r2c(in_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CPUDeviceContext, Ti, To> {
void operator()(const platform::CPUDeviceContext& ctx, const Tensor* x,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
using R = To;
using C = std::complex<R>;
const auto& input_dim = x->dims();
const std::vector<size_t> in_sizes = phi::vectorize<size_t>(input_dim);
std::vector<std::ptrdiff_t> in_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(input_dim));
{
const int64_t data_size = sizeof(C);
std::transform(in_strides.begin(), in_strides.end(), in_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto& output_dim = out->dims();
const std::vector<size_t> out_sizes = phi::vectorize<size_t>(output_dim);
std::vector<std::ptrdiff_t> out_strides =
phi::vectorize<std::ptrdiff_t>(phi::stride(output_dim));
{
const int64_t data_size = sizeof(R);
std::transform(out_strides.begin(), out_strides.end(),
out_strides.begin(),
[&](std::ptrdiff_t s) { return s * data_size; });
}
const auto* in_data = reinterpret_cast<const C*>(x->data<Ti>());
auto* out_data = out->data<R>();
// pocketfft requires std::vector<size_t>
std::vector<size_t> axes_(axes.size());
std::copy(axes.begin(), axes.end(), axes_.begin());
    // compute normalization factor
int64_t signal_numel = 1;
for (auto i : axes) {
signal_numel *= out_sizes[i];
}
R factor = compute_factor<R>(signal_numel, normalization);
pocketfft::c2r(out_sizes, in_strides, out_strides, axes_, forward, in_data,
out_data, factor);
}
};
#endif
} // namespace operators
} // namespace paddle
......
......@@ -8,496 +8,9 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <functional>
#include <list>
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
#include "paddle/fluid/operators/spectral_op.cu.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
namespace paddle {
namespace operators {
namespace {
// Calculates the normalization constant
double fft_normalization_scale(FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dims) {
// auto norm = static_cast<fft_norm_mode>(normalization);
if (normalization == FFTNormMode::none) {
return static_cast<double>(1.0);
}
int64_t signal_numel = 1;
for (auto dim : dims) {
signal_numel *= sizes[dim];
}
const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
? std::sqrt(signal_numel)
: static_cast<double>(signal_numel);
return static_cast<double>(1.0 / scale_denom);
}
template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& axes) {
double scale = fft_normalization_scale(normalization, sizes, axes);
if (scale != 1.0) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto dev = ctx.eigen_device();
EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
static_cast<T>(scale),
static_cast<T>(0), false);
} else {
framework::TensorCopy(*in, ctx.GetPlace(), out);
}
}
#if defined(PADDLE_WITH_CUDA)
FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Create the transform plan (either from cache or locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.dtype()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
// execute transform plan
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_cufft_plan_raw(config, input->data(), output->data(), forward);
}
}
#elif defined(PADDLE_WITH_HIP)
FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
// Create the transform plan (either from cache or locally)
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.type()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
auto value_type = config.data_type();
if (value_type == framework::proto::VarType::FP32) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
plan, static_cast<hipfftReal*>(in_data),
static_cast<hipfftComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftReal*>(out_data)));
return;
}
}
} else if (value_type == framework::proto::VarType::FP64) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
plan, static_cast<hipfftDoubleReal*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleReal*>(out_data)));
return;
}
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
}
}
#endif
// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
const std::vector<int64_t>& dim, bool forward) {
const auto x_dims = phi::vectorize(X->dims());
const int64_t ndim = static_cast<int64_t>(X->dims().size());
auto tensor_place = ctx.GetPlace();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), int{0});
std::vector<bool> is_transformed_dim(ndim);
for (const auto& d : dim) {
is_transformed_dim[d] = true;
}
auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](int64_t d) { return !is_transformed_dim[d]; });
std::sort(dim_permute.begin(), batch_end);
std::copy(dim.cbegin(), dim.cend(), batch_end);
// transpose input according to dim permutation
auto transposed_input_shape = X->dims().transpose(dim_permute);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
transposed_input.mutable_data<Ti>(tensor_place);
TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
dim_permute);
// Reshape batch dimensions into a single dimension
const int64_t signal_ndim = static_cast<int64_t>(dim.size());
std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
const int64_t batch_dims = ndim - signal_ndim;
auto batch_size =
std::accumulate(transposed_input_shape_.begin(),
transposed_input_shape_.begin() + batch_dims,
static_cast<int>(1), std::multiplies<int>());
collapsed_input_shape[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_dims,
transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
framework::Tensor& collapsed_input = transposed_input;
collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
  // make a collapsed output
const auto out_dims = phi::vectorize(out->dims());
std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
collapsed_output_shape[0] = batch_size;
for (size_t i = 0; i < dim.size(); ++i) {
collapsed_output_shape[i + 1] = out_dims[dim[i]];
}
framework::Tensor collapsed_output;
collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
collapsed_output.mutable_data<To>(tensor_place);
FFTConfig* config = nullptr;
#if defined(PADDLE_WITH_CUDA)
std::unique_ptr<FFTConfig> config_ = nullptr;
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
bool using_cache = false;
#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
using_cache = true;
#endif
if (using_cache) {
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
} else {
config_ = std::make_unique<FFTConfig>(key);
config = config_.get();
}
// prepare cufft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#elif defined(PADDLE_WITH_HIP)
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
  // prepare hipfft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#endif
  // Restore the original batch and dimension order of the output by reshape
  // and transpose
auto transposed_out_shape = out->dims().transpose(dim_permute);
collapsed_output.Resize(transposed_out_shape);
auto& transposed_output = collapsed_output;
std::vector<int> reverse_dim_permute(ndim);
for (size_t i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
reverse_dim_permute);
}
} // anonymous namespace
// Use the optimized path to perform single R2C or C2R if transformation dim is
// supported by cuFFT
bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
  // For performance reasons, when axes starts with (0, 1), do not use the
  // optimized path.
if (axes.size() > kMaxFFTNdim ||
(axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
return false;
} else {
return true;
}
}
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.empty()) {
framework::TensorCopy(*X, ctx.GetPlace(), out);
return;
}
framework::Tensor* p_out = out;
std::vector<int64_t> out_dims = phi::vectorize(X->dims());
std::vector<int64_t> working_axes(axes.begin(), axes.end());
std::vector<int64_t> first_dims;
size_t max_dims;
framework::Tensor working_tensor;
working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::Tensor* p_working_tensor = &working_tensor;
framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
while (true) {
max_dims =
std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
first_dims.assign(working_axes.end() - max_dims, working_axes.end());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
p_out, first_dims, forward);
working_axes.resize(working_axes.size() - max_dims);
first_dims.clear();
if (working_axes.empty()) {
break;
}
std::swap(p_out, p_working_tensor);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, p_out, out, normalization, out_dims, axes);
}
};
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
std::vector<int64_t> in_dims = phi::vectorize(X->dims());
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
if (use_optimized_fft_path(axes)) {
framework::Tensor x_copy(X->type());
x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
forward);
} else {
framework::Tensor temp_tensor;
temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
{axes.back()}, forward);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, out, out, normalization, out_dims, axes);
}
};
// n-dimensional real-to-complex FFT using the cuFFT library
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
// Step1: R2C transform on the last dimension
framework::Tensor* r2c_out = out;
const std::vector<int64_t> last_dim{axes.back()};
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
forward);
// Step2: C2C transform on the remaining dimension
framework::Tensor c2c_out;
if (axes.size() > 1) {
c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
forward);
}
const auto in_sizes = phi::vectorize(X->dims());
framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
exec_normalization<platform::CUDADeviceContext, To>(
ctx, norm_tensor, out, normalization, in_sizes, axes);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <list>
#include <memory>
#include <mutex>
#include <numeric>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/spectral_op.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#ifdef PADDLE_WITH_HIP
#include "paddle/fluid/platform/dynload/hipfft.h"
#endif
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cufft.h"
#endif
namespace paddle {
namespace operators {
using ScalarType = framework::proto::VarType::Type;
const int64_t kMaxFFTNdim = 3;
const int64_t kMaxDataNdim = kMaxFFTNdim + 1;
// This struct is used to easily compute hashes of the
// parameters. It will be the **key** to the plan cache.
struct FFTConfigKey {
// between 1 and kMaxFFTNdim, i.e., 1 <= signal_ndim <= 3
int64_t signal_ndim_;
// These include additional batch dimension as well.
int64_t sizes_[kMaxDataNdim];
int64_t input_shape_[kMaxDataNdim];
int64_t output_shape_[kMaxDataNdim];
FFTTransformType fft_type_;
ScalarType value_type_;
FFTConfigKey() = default;
FFTConfigKey(const std::vector<int64_t>& in_shape,
const std::vector<int64_t>& out_shape,
const std::vector<int64_t>& signal_size,
FFTTransformType fft_type, ScalarType value_type) {
// Padding bits must be zeroed for hashing
memset(this, 0, sizeof(*this));
signal_ndim_ = signal_size.size() - 1;
fft_type_ = fft_type;
value_type_ = value_type;
std::copy(signal_size.cbegin(), signal_size.cend(), sizes_);
std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_);
std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_);
}
};
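// Illustrative example (not part of the original file): a batch of 4 signals,
// each 8x8, transformed over two dimensions gives signal_size = {4, 8, 8},
// hence signal_ndim_ = 2; input_shape_ and output_shape_ record the full
// (possibly one-sided) tensor shapes. The memset above zeroes padding bytes,
// so the byte-wise KeyHash/KeyEqual defined below compare keys
// deterministically.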
#if defined(PADDLE_WITH_CUDA)
// An RAII encapsulation of cuFFTHandle
class CuFFTHandle {
::cufftHandle handle_;
public:
CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftCreate(&handle_));
}
CuFFTHandle(const CuFFTHandle& other) = delete;
CuFFTHandle& operator=(const CuFFTHandle& other) = delete;
CuFFTHandle(CuFFTHandle&& other) = delete;
CuFFTHandle& operator=(CuFFTHandle&& other) = delete;
::cufftHandle& get() { return handle_; }
const ::cufftHandle& get() const { return handle_; }
~CuFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftDestroy(handle_));
}
};
using plan_size_type = long long int; // NOLINT
// This class contains all the information needed to execute a cuFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public:
  // Move semantics alone is enough for this class. Although the plan handle
  // is already non-copyable, still delete the copy constructor and assignment
  // op so we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
  // sizes is the full signal shape (batch size included) and is always
  // two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
cudaDataType itype, otype, exec_type;
const auto complex_input = has_complex_input(fft_type);
const auto complex_output = has_complex_output(fft_type);
if (dtype == framework::proto::VarType::FP32) {
itype = complex_input ? CUDA_C_32F : CUDA_R_32F;
otype = complex_output ? CUDA_C_32F : CUDA_R_32F;
exec_type = CUDA_C_32F;
} else if (dtype == framework::proto::VarType::FP64) {
itype = complex_input ? CUDA_C_64F : CUDA_R_64F;
otype = complex_output ? CUDA_C_64F : CUDA_R_64F;
exec_type = CUDA_C_64F;
} else if (dtype == framework::proto::VarType::FP16) {
itype = complex_input ? CUDA_C_16F : CUDA_R_16F;
otype = complex_output ? CUDA_C_16F : CUDA_R_16F;
exec_type = CUDA_C_16F;
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"cuFFT only support transforms of type float16, float32 and "
"float64"));
}
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype,
batch, &ws_size_t, exec_type));
ws_size = ws_size_t;
}
FFTConfig(const FFTConfig& other) = delete;
FFTConfig& operator=(const FFTConfig& other) = delete;
FFTConfig(FFTConfig&& other) = delete;
FFTConfig& operator=(FFTConfig&& other) = delete;
const cufftHandle& plan() const { return plan_ptr.get(); }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private:
CuFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
};
#elif defined(PADDLE_WITH_HIP)
// An RAII encapsulation of hipfftHandle
class HIPFFTHandle {
::hipfftHandle handle_;
public:
HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftCreate(&handle_));
}
HIPFFTHandle(const HIPFFTHandle& other) = delete;
HIPFFTHandle& operator=(const HIPFFTHandle& other) = delete;
HIPFFTHandle(HIPFFTHandle&& other) = delete;
HIPFFTHandle& operator=(HIPFFTHandle&& other) = delete;
::hipfftHandle& get() { return handle_; }
const ::hipfftHandle& get() const { return handle_; }
~HIPFFTHandle() {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftDestroy(handle_));
}
};
using plan_size_type = int;
// This class contains all the information needed to execute a hipFFT plan:
// 1. the plan
// 2. the workspace size needed
class FFTConfig {
public:
  // Move semantics alone is enough for this class. Although the plan handle
  // is already non-copyable, still delete the copy constructor and assignment
  // op so we don't accidentally copy and take a perf hit.
explicit FFTConfig(const FFTConfigKey& plan_key)
: FFTConfig(
std::vector<int64_t>(plan_key.sizes_,
plan_key.sizes_ + plan_key.signal_ndim_ + 1),
plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {}
  // sizes is the full signal shape (batch size included) and is always
  // two-sided
FFTConfig(const std::vector<int64_t>& sizes, const int64_t signal_ndim,
FFTTransformType fft_type, ScalarType dtype)
: fft_type_(fft_type), value_type_(dtype) {
// signal sizes (excluding batch dim)
std::vector<plan_size_type> signal_sizes(sizes.begin() + 1, sizes.end());
// input batch size
const auto batch = static_cast<plan_size_type>(sizes[0]);
// const int64_t signal_ndim = sizes.size() - 1;
PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1,
platform::errors::InvalidArgument(
"The signal_ndim must be equal to sizes.size() - 1,"
"But signal_ndim is: [%d], sizes.size() - 1 is: [%d]",
signal_ndim, sizes.size() - 1));
hipfftType exec_type = [&] {
if (dtype == framework::proto::VarType::FP32) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_C2C;
case FFTTransformType::R2C:
return HIPFFT_R2C;
case FFTTransformType::C2R:
return HIPFFT_C2R;
}
} else if (dtype == framework::proto::VarType::FP64) {
switch (fft_type) {
case FFTTransformType::C2C:
return HIPFFT_Z2Z;
case FFTTransformType::R2C:
return HIPFFT_D2Z;
case FFTTransformType::C2R:
return HIPFFT_Z2D;
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}();
// disable auto allocation of workspace to use allocator from the framework
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetAutoAllocation(
plan(), /* autoAllocate */ 0));
size_t ws_size_t;
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftMakePlanMany(
plan(), signal_ndim, signal_sizes.data(),
/* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1,
/* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type,
batch, &ws_size_t));
ws_size = ws_size_t;
}
const hipfftHandle& plan() const { return plan_ptr.get(); }
FFTTransformType transform_type() const { return fft_type_; }
ScalarType data_type() const { return value_type_; }
size_t workspace_size() const { return ws_size; }
private:
HIPFFTHandle plan_ptr;
size_t ws_size;
FFTTransformType fft_type_;
ScalarType value_type_;
};
#endif
// Hashing machinery for Key
// Fowler–Noll–Vo hash function
// see
// https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
template <typename Key>
struct KeyHash {
// Key must be a POD because we read out its memory
  // contents as char* when hashing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
size_t operator()(const Key& params) const {
auto ptr = reinterpret_cast<const uint8_t*>(&params);
uint32_t value = 0x811C9DC5;
for (int i = 0; i < static_cast<int>(sizeof(Key)); ++i) {
value ^= ptr[i];
value *= 0x01000193;
}
return static_cast<size_t>(value);
}
};
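// Illustrative sketch of how the 32-bit FNV-1a loop above evolves, assuming a
// key whose first bytes are {0x01, 0x02}:
//   value = 0x811C9DC5;                    // FNV offset basis
//   value = (value ^ 0x01) * 0x01000193;   // xor byte, then multiply by prime
//   value = (value ^ 0x02) * 0x01000193;
// Usage: size_t h = KeyHash<FFTConfigKey>{}(key);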
template <typename Key>
struct KeyEqual {
// Key must be a POD because we read out its memory
  // contents as char* when comparing
static_assert(std::is_pod<Key>::value, "Key must be plain old data type");
bool operator()(const Key& a, const Key& b) const {
auto ptr1 = reinterpret_cast<const uint8_t*>(&a);
auto ptr2 = reinterpret_cast<const uint8_t*>(&b);
return memcmp(ptr1, ptr2, sizeof(Key)) == 0;
}
};
#if CUDA_VERSION < 10000
// Note that the max plan number for CUDA version < 10 has to be 1023
// due to a bug that fails on the 1024th plan
constexpr size_t CUFFT_MAX_PLAN_NUM = 1023;
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = CUFFT_MAX_PLAN_NUM;
#else
constexpr size_t CUFFT_MAX_PLAN_NUM = std::numeric_limits<size_t>::max();
// The default max cache size chosen for CUDA version >= 10 is arbitrary.
// This number puts a limit on how big a plan cache we maintain by default.
// Users can always configure it via cufft_set_plan_cache_max_size.
constexpr size_t CUFFT_DEFAULT_CACHE_SIZE = 4096;
#endif
static_assert(CUFFT_MAX_PLAN_NUM >= 0 &&
CUFFT_MAX_PLAN_NUM <= std::numeric_limits<size_t>::max(),
"CUFFT_MAX_PLAN_NUM not in size_t range");
static_assert(CUFFT_DEFAULT_CACHE_SIZE >= 0 &&
CUFFT_DEFAULT_CACHE_SIZE <= CUFFT_MAX_PLAN_NUM,
"CUFFT_DEFAULT_CACHE_SIZE not in [0, CUFFT_MAX_PLAN_NUM] range");
// This cache assumes that the mapping from key to value never changes.
// This is **NOT** thread-safe. Please use a mutex when using it **AND** the
// value returned from try_emplace_value.
// The contract of using this cache is that try_emplace_value should only be
// used when the max_size is positive.
class FFTConfigCache {
public:
using kv_t = typename std::pair<FFTConfigKey, FFTConfig>;
using map_t = typename std::unordered_map<
std::reference_wrapper<FFTConfigKey>, typename std::list<kv_t>::iterator,
KeyHash<FFTConfigKey>, KeyEqual<FFTConfigKey>>;
using map_kkv_iter_t = typename map_t::iterator;
FFTConfigCache() : FFTConfigCache(CUFFT_DEFAULT_CACHE_SIZE) {}
explicit FFTConfigCache(int64_t max_size) { _set_max_size(max_size); }
FFTConfigCache(const FFTConfigCache& other) = delete;
FFTConfigCache& operator=(const FFTConfigCache& other) = delete;
FFTConfigCache(FFTConfigCache&& other) noexcept
: _usage_list(std::move(other._usage_list)),
_cache_map(std::move(other._cache_map)),
_max_size(other._max_size) {}
FFTConfigCache& operator=(FFTConfigCache&& other) noexcept {
_usage_list = std::move(other._usage_list);
_cache_map = std::move(other._cache_map);
_max_size = other._max_size;
return *this;
}
// If key is in this cache, return the cached config. Otherwise, emplace the
// config in this cache and return it.
FFTConfig& lookup(FFTConfigKey params) {
PADDLE_ENFORCE_GT(_max_size, 0,
platform::errors::InvalidArgument(
"The max size of FFTConfigCache must be great than 0,"
"But received is [%d]",
_max_size));
map_kkv_iter_t map_it = _cache_map.find(params);
// Hit, put to list front
if (map_it != _cache_map.end()) {
_usage_list.splice(_usage_list.begin(), _usage_list, map_it->second);
return map_it->second->second;
}
// Miss
// remove if needed
if (_usage_list.size() >= _max_size) {
auto last = _usage_list.end();
last--;
_cache_map.erase(last->first);
_usage_list.pop_back();
}
// construct new plan at list front, then insert into _cache_map
_usage_list.emplace_front(std::piecewise_construct,
std::forward_as_tuple(params),
std::forward_as_tuple(params));
auto kv_it = _usage_list.begin();
_cache_map.emplace(std::piecewise_construct,
std::forward_as_tuple(kv_it->first),
std::forward_as_tuple(kv_it));
return kv_it->second;
}
void clear() {
_cache_map.clear();
_usage_list.clear();
}
void resize(int64_t new_size) {
_set_max_size(new_size);
auto cur_size = _usage_list.size();
if (cur_size > _max_size) {
auto delete_it = _usage_list.end();
for (size_t i = 0; i < cur_size - _max_size; i++) {
delete_it--;
_cache_map.erase(delete_it->first);
}
_usage_list.erase(delete_it, _usage_list.end());
}
}
size_t size() const { return _cache_map.size(); }
size_t max_size() const noexcept { return _max_size; }
std::mutex mutex;
private:
// Only sets size and does value check. Does not resize the data structures.
void _set_max_size(int64_t new_size) {
// We check that 0 <= new_size <= CUFFT_MAX_PLAN_NUM here. Since
// CUFFT_MAX_PLAN_NUM is of type size_t, we need to do non-negativity check
// first.
PADDLE_ENFORCE_GE(
new_size, 0,
platform::errors::InvalidArgument(
"cuFFT plan cache size must be non-negative, But received is [%d]",
new_size));
PADDLE_ENFORCE_LE(new_size, CUFFT_MAX_PLAN_NUM,
platform::errors::InvalidArgument(
"cuFFT plan cache size can not be larger than [%d], "
"But received is [%d]",
CUFFT_MAX_PLAN_NUM, new_size));
_max_size = static_cast<size_t>(new_size);
}
std::list<kv_t> _usage_list;
map_t _cache_map;
size_t _max_size;
};
static std::vector<std::unique_ptr<FFTConfigCache>> plan_caches;
static std::mutex plan_caches_mutex;
static inline FFTConfigCache& get_fft_plan_cache(int64_t device_index) {
std::lock_guard<std::mutex> guard(plan_caches_mutex);
if (device_index >= plan_caches.size()) {
plan_caches.resize(device_index + 1);
}
if (!plan_caches[device_index]) {
plan_caches[device_index] = std::make_unique<FFTConfigCache>();
}
return *plan_caches[device_index];
}
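// Typical lookup pattern (a sketch; exec_fft below uses std::unique_lock, but
// the idea is the same). The cache is not thread-safe, so the mutex must be
// held both for lookup() and while the returned config is in use, since
// another thread could otherwise evict and destroy the plan:
//   FFTConfigCache& cache = get_fft_plan_cache(device_id);
//   std::lock_guard<std::mutex> guard(cache.mutex);
//   FFTConfig& config = cache.lookup(key);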
// Calculates the normalization constant
static double fft_normalization_scale(FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& dims) {
// auto norm = static_cast<fft_norm_mode>(normalization);
if (normalization == FFTNormMode::none) {
return static_cast<double>(1.0);
}
int64_t signal_numel = 1;
for (auto dim : dims) {
signal_numel *= sizes[dim];
}
const double scale_denom = (normalization == FFTNormMode::by_sqrt_n)
? std::sqrt(signal_numel)
: static_cast<double>(signal_numel);
return static_cast<double>(1.0 / scale_denom);
}
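// Worked example (illustrative): for sizes = {4, 8, 8} and dims = {1, 2},
// signal_numel is 8 * 8 = 64, so by_sqrt_n yields 1 / sqrt(64) = 0.125 and
// by_n yields 1 / 64 = 0.015625; FFTNormMode::none always yields 1.0.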
template <typename DeviceContext, typename T>
void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out,
FFTNormMode normalization,
const std::vector<int64_t>& sizes,
const std::vector<int64_t>& axes) {
double scale = fft_normalization_scale(normalization, sizes, axes);
if (scale != 1.0) {
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto dev = ctx.eigen_device();
EigenScale<Eigen::GpuDevice, T>::Eval(*dev, eigen_out, eigen_in,
static_cast<T>(scale),
static_cast<T>(0), false);
} else {
framework::TensorCopy(*in, ctx.GetPlace(), out);
}
}
#if defined(PADDLE_WITH_CUDA)
static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
  // Build the key that identifies the transform plan, used either to look it
  // up in the cache or to create it locally
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.dtype()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_cufft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftXtExec(
plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_cufft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
// execute transform plan
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_cufft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_cufft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_cufft_plan_raw(config, input->data(), output->data(), forward);
}
}
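// Note on the conjugation trick above: cuFFT exposes C2R only as an inverse
// transform and R2C only as a forward transform. For unnormalized DFTs,
// idft(x) == conj(dft(conj(x))), and conjugation is a no-op on the real side
// of the transform, so a forward C2R equals an inverse C2R of the conjugated
// input, and an inverse R2C equals the conjugate of a forward R2C's output.
// The hipFFT variant below relies on the same identity.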
#elif defined(PADDLE_WITH_HIP)
static FFTConfigKey create_fft_configkey(const framework::Tensor& input,
const framework::Tensor& output,
int signal_ndim) {
  // Build the key that identifies the transform plan, used either to look it
  // up in the cache or to create it locally
const auto value_type =
framework::IsComplexType(framework::TransToProtoVarType(input.dtype()))
? framework::ToRealType(framework::TransToProtoVarType(input.dtype()))
: framework::TransToProtoVarType(input.dtype());
auto fft_type =
GetFFTTransformType(framework::TransToProtoVarType(input.dtype()),
framework::TransToProtoVarType(output.type()));
// signal sizes
std::vector<int64_t> signal_size(signal_ndim + 1);
signal_size[0] = input.dims()[0];
for (int64_t i = 1; i <= signal_ndim; ++i) {
auto in_size = input.dims()[i];
auto out_size = output.dims()[i];
signal_size[i] = std::max(in_size, out_size);
}
FFTConfigKey key(phi::vectorize(input.dims()), phi::vectorize(output.dims()),
signal_size, fft_type, value_type);
return key;
}
// Execute a pre-planned transform
static void exec_hipfft_plan_raw(const FFTConfig& config, void* in_data,
void* out_data, bool forward) {
auto& plan = config.plan();
auto value_type = config.data_type();
if (value_type == framework::proto::VarType::FP32) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2C(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecR2C(
plan, static_cast<hipfftReal*>(in_data),
static_cast<hipfftComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecC2R(
plan, static_cast<hipfftComplex*>(in_data),
static_cast<hipfftReal*>(out_data)));
return;
}
}
} else if (value_type == framework::proto::VarType::FP64) {
switch (config.transform_type()) {
case FFTTransformType::C2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2Z(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data),
forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD));
return;
}
case FFTTransformType::R2C: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecD2Z(
plan, static_cast<hipfftDoubleReal*>(in_data),
static_cast<hipfftDoubleComplex*>(out_data)));
return;
}
case FFTTransformType::C2R: {
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftExecZ2D(
plan, static_cast<hipfftDoubleComplex*>(in_data),
static_cast<hipfftDoubleReal*>(out_data)));
return;
}
}
}
PADDLE_THROW(platform::errors::InvalidArgument(
"hipFFT only support transforms of type float32 and float64"));
}
template <typename DeviceContext, typename Ti, typename To>
void exec_hipfft_plan(const DeviceContext& ctx, const FFTConfig& config,
framework::Tensor* input, framework::Tensor* output,
bool forward) {
auto fft_type = config.transform_type();
if (fft_type == FFTTransformType::C2R && forward) {
forward = false;
framework::Tensor input_conj(input->type());
input_conj.mutable_data<Ti>(input->dims(), ctx.GetPlace());
platform::ForRange<DeviceContext> for_range(ctx, input->numel());
phi::funcs::ConjFunctor<Ti> functor(input->data<Ti>(), input->numel(),
input_conj.data<Ti>());
for_range(functor);
exec_hipfft_plan_raw(config, input_conj.data(), output->data(), forward);
} else if (fft_type == FFTTransformType::R2C && !forward) {
forward = true;
framework::Tensor out_conj(output->type());
out_conj.mutable_data<To>(output->dims(), ctx.GetPlace());
exec_hipfft_plan_raw(config, input->data(), out_conj.data(), forward);
platform::ForRange<DeviceContext> for_range(ctx, output->numel());
phi::funcs::ConjFunctor<To> functor(out_conj.data<To>(), output->numel(),
output->data<To>());
for_range(functor);
} else {
exec_hipfft_plan_raw(config, input->data(), output->data(), forward);
}
}
#endif
// Execute a general unnormalized fft operation (can be c2c, onesided r2c or
// onesided c2r)
template <typename DeviceContext, typename Ti, typename To>
void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out,
const std::vector<int64_t>& dim, bool forward) {
const auto x_dims = phi::vectorize(X->dims());
const int64_t ndim = static_cast<int64_t>(X->dims().size());
auto tensor_place = ctx.GetPlace();
// make a dim permutation
std::vector<int> dim_permute(ndim);
std::iota(dim_permute.begin(), dim_permute.end(), int{0});
std::vector<bool> is_transformed_dim(ndim);
for (const auto& d : dim) {
is_transformed_dim[d] = true;
}
auto batch_end =
std::partition(dim_permute.begin(), dim_permute.end(),
[&](int64_t d) { return !is_transformed_dim[d]; });
std::sort(dim_permute.begin(), batch_end);
std::copy(dim.cbegin(), dim.cend(), batch_end);
// transpose input according to dim permutation
auto transposed_input_shape = X->dims().transpose(dim_permute);
framework::Tensor transposed_input;
transposed_input.Resize(transposed_input_shape);
transposed_input.mutable_data<Ti>(tensor_place);
TransCompute<DeviceContext, Ti>(ndim, ctx, *X, &transposed_input,
dim_permute);
// Reshape batch dimensions into a single dimension
const int64_t signal_ndim = static_cast<int64_t>(dim.size());
std::vector<int64_t> collapsed_input_shape(signal_ndim + 1);
auto transposed_input_shape_ = phi::vectorize(transposed_input_shape);
const int64_t batch_dims = ndim - signal_ndim;
auto batch_size =
std::accumulate(transposed_input_shape_.begin(),
transposed_input_shape_.begin() + batch_dims,
static_cast<int>(1), std::multiplies<int>());
collapsed_input_shape[0] = batch_size;
std::copy(transposed_input_shape_.begin() + batch_dims,
transposed_input_shape_.end(), collapsed_input_shape.begin() + 1);
framework::Tensor& collapsed_input = transposed_input;
collapsed_input.Resize(phi::make_ddim(collapsed_input_shape));
  // make a collapsed output
const auto out_dims = phi::vectorize(out->dims());
std::vector<int64_t> collapsed_output_shape(1 + signal_ndim);
collapsed_output_shape[0] = batch_size;
for (size_t i = 0; i < dim.size(); ++i) {
collapsed_output_shape[i + 1] = out_dims[dim[i]];
}
framework::Tensor collapsed_output;
collapsed_output.Resize(phi::make_ddim(collapsed_output_shape));
collapsed_output.mutable_data<To>(tensor_place);
FFTConfig* config = nullptr;
#if defined(PADDLE_WITH_CUDA)
std::unique_ptr<FFTConfig> config_ = nullptr;
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
bool using_cache = false;
#if !defined(CUFFT_VERSION) || (CUFFT_VERSION < 10200)
using_cache = true;
#endif
if (using_cache) {
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
} else {
config_ = std::make_unique<FFTConfig>(key);
config = config_.get();
}
// prepare cufft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cufftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cufftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_cufft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#elif defined(PADDLE_WITH_HIP)
// create plan
FFTConfigKey key =
create_fft_configkey(collapsed_input, collapsed_output, signal_ndim);
const int64_t device_id = static_cast<int64_t>(
reinterpret_cast<const platform::CUDAPlace*>(&collapsed_input.place())
->GetDeviceId());
FFTConfigCache& plan_cache = get_fft_plan_cache(device_id);
std::unique_lock<std::mutex> guard(plan_cache.mutex, std::defer_lock);
guard.lock();
config = &(plan_cache.lookup(key));
  // prepare hipfft for execution
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::hipfftSetStream(config->plan(), ctx.stream()));
framework::Tensor workspace_tensor;
workspace_tensor.mutable_data<To>(tensor_place, config->workspace_size());
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::hipfftSetWorkArea(
config->plan(), workspace_tensor.data<To>()));
// execute transform plan
exec_hipfft_plan<DeviceContext, Ti, To>(ctx, *config, &collapsed_input,
&collapsed_output, forward);
#endif
  // Restore the original batch and dimension order of the output by reshape
  // and transpose
auto transposed_out_shape = out->dims().transpose(dim_permute);
collapsed_output.Resize(transposed_out_shape);
auto& transposed_output = collapsed_output;
std::vector<int> reverse_dim_permute(ndim);
for (size_t i = 0; i < ndim; i++) {
reverse_dim_permute[dim_permute[i]] = i;
}
TransCompute<DeviceContext, To>(ndim, ctx, transposed_output, out,
reverse_dim_permute);
}
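// Illustrative example of the permutation above: for X of shape [2, 3, 8, 8]
// and dim = {1, 3}, the batch dims {0, 2} are partitioned to the front
// (sorted) and the transformed dims appended, giving dim_permute =
// {0, 2, 1, 3}; the transposed shape [2, 8, 3, 8] is collapsed to [16, 3, 8]
// (batch_size = 2 * 8), the cuFFT/hipFFT plan runs on the trailing signal
// dims, and the inverse permutation restores the original layout.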
// Use the optimized path to perform single R2C or C2R if transformation dim is
// supported by cuFFT
static bool use_optimized_fft_path(const std::vector<int64_t>& axes) {
  // For performance reasons, when axes starts with (0, 1), do not use the
  // optimized path.
if (axes.size() > kMaxFFTNdim ||
(axes.size() >= 2 && axes[0] == 0 && axes[1] == 1)) {
return false;
} else {
return true;
}
}
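// Illustrative examples with kMaxFFTNdim = 3: axes = {2} -> true, and
// axes = {1, 2, 3} -> true; but axes = {0, 1} -> false (starts with (0, 1)),
// and axes = {1, 2, 3, 4} -> false (more than kMaxFFTNdim dims).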
template <typename Ti, typename To>
struct FFTC2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
if (axes.empty()) {
framework::TensorCopy(*X, ctx.GetPlace(), out);
return;
}
framework::Tensor* p_out = out;
std::vector<int64_t> out_dims = phi::vectorize(X->dims());
std::vector<int64_t> working_axes(axes.begin(), axes.end());
std::vector<int64_t> first_dims;
size_t max_dims;
framework::Tensor working_tensor;
working_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::Tensor* p_working_tensor = &working_tensor;
framework::TensorCopy(*X, ctx.GetPlace(), &working_tensor);
while (true) {
max_dims =
std::min(static_cast<size_t>(kMaxFFTNdim), working_axes.size());
first_dims.assign(working_axes.end() - max_dims, working_axes.end());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, p_working_tensor,
p_out, first_dims, forward);
working_axes.resize(working_axes.size() - max_dims);
first_dims.clear();
if (working_axes.empty()) {
break;
}
std::swap(p_out, p_working_tensor);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, p_out, out, normalization, out_dims, axes);
}
};
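// Illustrative walk-through of the loop above: with kMaxFFTNdim = 3 and
// axes = {0, 1, 2, 3, 4}, the first exec_fft transforms the trailing axes
// {2, 3, 4} and the second transforms {0, 1}, ping-ponging between `out` and
// `working_tensor`; exec_normalization then writes the final result to `out`.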
template <typename Ti, typename To>
struct FFTC2RFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
std::vector<int64_t> in_dims = phi::vectorize(X->dims());
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
if (use_optimized_fft_path(axes)) {
framework::Tensor x_copy(X->type());
x_copy.mutable_data<Ti>(X->dims(), ctx.GetPlace());
framework::TensorCopy(*X, ctx.GetPlace(), &x_copy);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &x_copy, out, axes,
forward);
} else {
framework::Tensor temp_tensor;
temp_tensor.mutable_data<Ti>(X->dims(), ctx.GetPlace());
const std::vector<int64_t> dims(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, Ti, Ti> c2c_functor;
c2c_functor(ctx, X, &temp_tensor, dims, FFTNormMode::none, forward);
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, &temp_tensor, out,
{axes.back()}, forward);
}
exec_normalization<platform::CUDADeviceContext, To>(
ctx, out, out, normalization, out_dims, axes);
}
};
// n-dimensional real-to-complex FFT using the cuFFT library
template <typename Ti, typename To>
struct FFTR2CFunctor<platform::CUDADeviceContext, Ti, To> {
void operator()(const platform::CUDADeviceContext& ctx, const Tensor* X,
Tensor* out, const std::vector<int64_t>& axes,
FFTNormMode normalization, bool forward) {
// Step1: R2C transform on the last dimension
framework::Tensor* r2c_out = out;
const std::vector<int64_t> last_dim{axes.back()};
std::vector<int64_t> out_dims = phi::vectorize(out->dims());
exec_fft<platform::CUDADeviceContext, Ti, To>(ctx, X, r2c_out, last_dim,
forward);
// Step2: C2C transform on the remaining dimension
framework::Tensor c2c_out;
if (axes.size() > 1) {
c2c_out.mutable_data<To>(out->dims(), ctx.GetPlace());
std::vector<int64_t> remain_dim(axes.begin(), axes.end() - 1);
FFTC2CFunctor<platform::CUDADeviceContext, To, To> fft_c2c_func;
fft_c2c_func(ctx, r2c_out, &c2c_out, remain_dim, FFTNormMode::none,
forward);
}
const auto in_sizes = phi::vectorize(X->dims());
framework::Tensor* norm_tensor = axes.size() > 1 ? &c2c_out : r2c_out;
exec_normalization<platform::CUDADeviceContext, To>(
ctx, norm_tensor, out, normalization, in_sizes, axes);
}
};
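// Illustrative example: for axes = {1, 2, 3}, Step1 runs a one-sided R2C over
// axis 3 (shrinking it to n/2 + 1 bins) and Step2 runs a C2C over the
// remaining axes {1, 2}; normalization uses the input sizes because the
// one-sided output no longer carries the full signal length on axis 3.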
} // namespace operators
} // namespace paddle
......@@ -11,8 +11,11 @@
#pragma once
#define NOMINMAX // to use std::min std::max correctly on windows
#include <algorithm>
#include <functional>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>
#include "paddle/fluid/framework/convert_utils.h"
......@@ -23,8 +26,10 @@
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conj_op.h"
#include "paddle/fluid/operators/eigen/eigen_function.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/padding.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "thrust/device_vector.h"
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/stft_op.h"
#include "paddle/fluid/operators/spectral_helper.h"
namespace paddle {
namespace operators {
class StftOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "frame");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "frame");
const int n_fft = ctx->Attrs().Get<int>("n_fft");
const int hop_length = ctx->Attrs().Get<int>("hop_length");
const auto x_dims = ctx->GetInputDim("X");
const int x_rank = x_dims.size();
const bool onesided = ctx->Attrs().Get<bool>("onesided");
PADDLE_ENFORCE_EQ(
x_rank, 2,
platform::errors::InvalidArgument(
"Input(X) of StftOp should be a tensor with shape [N, T], "
"but got rank %s.",
x_rank));
PADDLE_ENFORCE_GT(
hop_length, 0,
platform::errors::InvalidArgument(
"Attribute(hop_length) should be greater than 0, but got %s.",
hop_length));
int seq_length = x_dims[x_rank - 1];
int n_frames = 1 + (seq_length - n_fft) / hop_length;
PADDLE_ENFORCE_LE(n_fft, seq_length,
platform::errors::InvalidArgument(
"Attribute(frame_length) should be less equal than "
"sequence length, but got (%s) > (%s).",
n_fft, seq_length));
std::vector<int64_t> output_shape;
output_shape.push_back(x_dims[0]);
if (onesided) {
output_shape.push_back(n_fft / 2 + 1);
} else {
output_shape.push_back(n_fft);
}
output_shape.push_back(n_frames);
ctx->SetOutputDim("Out", phi::make_ddim(output_shape));
}
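  // Worked example of the shape inference above (illustrative sizes): for
  // x_dims = [N, 8000], n_fft = 400 and hop_length = 200,
  // n_frames = 1 + (8000 - 400) / 200 = 39, so the output is [N, 201, 39]
  // when onesided (201 = 400 / 2 + 1) and [N, 400, 39] otherwise.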
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const auto in_dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
return framework::OpKernelType(in_dtype, ctx.GetPlace());
}
};
class StftOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input waveforms with shape (N, T)");
AddOutput("Out",
"The complex STFT output tensor with shape (N, n_fft, "
"num_frames) or (N, n_fft/2 + 1, num_frames)");
AddAttr<int>("n_fft", "The number of input samples to perform FFT");
AddAttr<int>("hop_length", "Number of samples between adjacent frames");
AddAttr<bool>("normalized",
"Control whether to scale the output by 1/sqrt(n_fft)");
AddAttr<bool>("onesided",
"Control whether to return half of the FFT output");
AddComment(R"DOC(
Short-time Fourier transform (STFT).
)DOC");
}
};
template <typename T>
class StftGradOpMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> grad_op) const override {
grad_op->SetType("stft_grad");
grad_op->SetInput("X", this->Input("X"));
grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
grad_op->SetAttrMap(this->Attrs());
}
};
class StftGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
const auto out_grad_name = framework::GradVarName("Out");
OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name,
"stft_grad");
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "stft_grad");
const auto x_grad_name = framework::GradVarName("X");
OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name,
"stft_grad");
ctx->ShareDim("X", /*->*/ x_grad_name);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
const auto in_dtype = OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out"));
const auto kernel_dtype = framework::ToRealType(in_dtype);
return framework::OpKernelType(kernel_dtype, ctx.GetPlace());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(stft, ops::StftOp, ops::StftOpMaker,
ops::StftGradOpMaker<paddle::framework::OpDesc>,
ops::StftGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(stft_grad, ops::StftGradOp);
REGISTER_OP_CPU_KERNEL(
stft, ops::StftKernel<paddle::platform::CPUDeviceContext, float>,
ops::StftKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(
stft_grad, ops::StftGradKernel<paddle::platform::CPUDeviceContext, float>,
ops::StftGradKernel<paddle::platform::CPUDeviceContext, double>);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/spectral_op.cu.h"
#include "paddle/fluid/operators/stft_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
stft, ops::StftKernel<paddle::platform::CUDADeviceContext, float>,
ops::StftKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
stft_grad, ops::StftGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::StftGradKernel<paddle::platform::CUDADeviceContext, double>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/frame_op.h"
#include "paddle/fluid/operators/spectral_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename DeviceContext, typename T>
class StftKernel : public framework::OpKernel<T> {
public:
/*
Batch Signals (N, T) -> Frames (N, n_fft, num_frames) -> FFTR2C -> (N,
n_fft/2 + 1, num_frames) or (N, n_fft, num_frames)
*/
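  // Illustrative example (assumed sizes, not from the original file): x of
  // shape (8, 16000) with n_fft = 512 and hop_length = 160 gives frames of
  // shape (8, 512, 97), since n_frames = 1 + (16000 - 512) / 160 = 97; the
  // one-sided R2C then yields out of shape (8, 257, 97) with 257 = 512/2 + 1.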
void Compute(const framework::ExecutionContext& ctx) const override {
using C = paddle::platform::complex<T>;
const Tensor* x = ctx.Input<Tensor>("X");
Tensor* out = ctx.Output<Tensor>("Out");
out->mutable_data<C>(ctx.GetPlace());
const size_t x_rank = x->dims().size();
const size_t out_rank = out->dims().size();
const int n_fft = ctx.Attr<int>("n_fft");
const int hop_length = ctx.Attr<int>("hop_length");
const bool normalized = ctx.Attr<bool>("normalized");
const bool onesided = ctx.Attr<bool>("onesided");
const int n_frames = out->dims()[out_rank - 1];
const int seq_length = x->dims()[x_rank - 1];
auto& dev_ctx = ctx.device_context<DeviceContext>();
std::vector<int64_t> axes = {1};
// Frame
Tensor frames;
framework::DDim frames_dims(out->dims());
frames_dims.at(axes.back()) = n_fft;
frames.mutable_data<T>(frames_dims, ctx.GetPlace());
FrameFunctor<DeviceContext, T>()(dev_ctx, x, &frames, seq_length, n_fft,
n_frames, hop_length, /*is_grad*/ false);
// FFTR2C
FFTNormMode normalization;
if (normalized) {
normalization = get_norm_from_string("ortho", true);
} else {
normalization = get_norm_from_string("backward", true);
}
FFTR2CFunctor<DeviceContext, T, C> fft_r2c_func;
if (onesided) {
fft_r2c_func(dev_ctx, &frames, out, axes, normalization, true);
} else {
framework::DDim onesided_dims(out->dims());
const int64_t onesided_axis_size = out->dims().at(axes.back()) / 2 + 1;
onesided_dims.at(axes.back()) = onesided_axis_size;
Tensor onesided_out;
onesided_out.mutable_data<C>(onesided_dims, ctx.GetPlace());
fft_r2c_func(dev_ctx, &frames, &onesided_out, axes, normalization, true);
fill_conj<DeviceContext, C>(dev_ctx, &onesided_out, out, axes);
}
}
};
template <typename DeviceContext, typename T>
class StftGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using C = paddle::platform::complex<T>;
auto& dev_ctx = ctx.device_context<DeviceContext>();
const auto* dy = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
dx->mutable_data<T>(ctx.GetPlace());
const size_t dy_rank = dy->dims().size();
const size_t dx_rank = dx->dims().size();
const int n_fft = ctx.Attr<int>("n_fft");
const int hop_length = ctx.Attr<int>("hop_length");
const bool normalized = ctx.Attr<bool>("normalized");
const bool onesided = ctx.Attr<bool>("onesided");
const int n_frames = dy->dims()[dy_rank - 1];
const int seq_length = dx->dims()[dx_rank - 1];
std::vector<int64_t> axes = {1};
Tensor d_frames;
framework::DDim d_frames_dims(dy->dims());
d_frames_dims.at(axes.back()) = n_fft;
d_frames.mutable_data<T>(d_frames_dims, ctx.GetPlace());
Tensor complex_d_frames;
complex_d_frames.mutable_data<C>(d_frames_dims, ctx.GetPlace());
// dy -> d_frames
FFTNormMode normalization;
if (normalized) {
normalization = get_norm_from_string("ortho", true);
} else {
normalization = get_norm_from_string("backward", true);
}
FFTC2CFunctor<DeviceContext, C, C> fft_c2c_func;
if (!onesided) {
fft_c2c_func(dev_ctx, dy, &complex_d_frames, axes, normalization, false);
} else {
Tensor full_dy;
full_dy.mutable_data<C>(d_frames_dims, ctx.GetPlace());
auto zero_length = static_cast<int>(full_dy.dims().at(axes.back()) -
dy->dims().at(axes.back()));
auto rank = dy->dims().size();
std::vector<int> pads(rank * 2, 0);
pads[axes.back() * 2 + 1] = zero_length;
phi::funcs::PaddingFunctor<DeviceContext, C>(
rank, ctx.template device_context<DeviceContext>(), pads,
static_cast<C>(0), *dy, &full_dy);
fft_c2c_func(dev_ctx, &full_dy, &complex_d_frames, axes, normalization,
false);
}
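    // Illustrative arithmetic for the padding above: with n_fft = 512 and a
    // one-sided dy of 257 bins along axis 1, zero_length = 512 - 257 = 255,
    // and pads[axes.back() * 2 + 1] pads only the high side of that axis, so
    // full_dy holds all n_fft bins before the inverse C2C runs.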
framework::TransComplexToReal(
framework::TransToProtoVarType(d_frames.dtype()),
framework::TransToProtoVarType(complex_d_frames.dtype()),
complex_d_frames, &d_frames);
// d_frames -> dx
FrameFunctor<DeviceContext, T>()(dev_ctx, &d_frames, dx, seq_length, n_fft,
n_frames, hop_length, /*is_grad*/ true);
}
};
} // namespace operators
} // namespace paddle
......@@ -159,10 +159,8 @@ inline void EmplaceDeviceContext(
cuda_ctx,
platform::errors::InvalidArgument(
"Failed to dynamic_cast dev_ctx into CUDADeviceContext."));
// Note: A trick method to init context, why GetAllocator interface
// needs a stream parameter?
dev_ctx->SetAllocator(memory::allocation::AllocatorFacade::Instance()
.GetAllocator(p, cuda_ctx->stream())
.GetAllocator(p)
.get());
cuda_ctx->PartialInitWithAllocator();
dev_ctx->SetGenerator(
......@@ -517,10 +515,10 @@ CUDAContext::~CUDAContext() {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : phi::GPUContext(place) {
phi::GPUContext::PartialInitWithoutAllocator();
cuda_stream_.reset(new stream::CUDAStream(phi::GPUContext::stream(), place));
workspace_.reset(new phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(place, phi::GPUContext::stream())
.get()));
auto& instance = memory::allocation::AllocatorFacade::Instance();
instance.SetDefaultStream(place, phi::GPUContext::stream());
workspace_.reset(
new phi::DnnWorkspaceHandle(instance.GetAllocator(place).get()));
}
CUDADeviceContext::~CUDADeviceContext() = default;
......@@ -618,7 +616,7 @@ phi::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
// return workspace_.get();
return phi::DnnWorkspaceHandle(
memory::allocation::AllocatorFacade::Instance()
.GetAllocator(GetPlace(), phi::GPUContext::stream())
.GetAllocator(GetPlace())
.get());
}
return phi::GPUContext::cudnn_workspace_handle();
......
......@@ -118,8 +118,9 @@ float CpuUtilization::GetCpuUtilization() {
float busy_time = (system_kernel_time_end - system_kernel_time_start) +
(system_user_time_end - system_user_time_start);
float idle_time = system_idle_time_end - system_idle_time_start;
if (busy_time + idle_time != 0) {
cpu_utilization = busy_time / (busy_time + idle_time);
}
#elif defined(__linux__)
float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) +
(system_tms_end_.tms_stime - system_tms_start_.tms_stime) +
......@@ -127,7 +128,9 @@ float CpuUtilization::GetCpuUtilization() {
(irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) +
(steal_end_ - steal_start_);
float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_);
if (busy_time + idle_time != 0) {
cpu_utilization = busy_time / (busy_time + idle_time);
}
#else
LOG(WARNING)
<< "Current System is not supported to get system cpu utilization"
......@@ -148,13 +151,16 @@ float CpuUtilization::GetCpuCurProcessUtilization() {
uint64_t end = FileTimeToUint64(end_);
float busy_time = (process_kernel_time_end - process_kernel_time_start) +
(process_user_time_end - process_user_time_start);
if (end - start != 0) {
cpu_process_utilization = busy_time / (end - start);
LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl;
}
#elif defined(__linux__)
float busy_time =
(process_tms_end_.tms_utime - process_tms_start_.tms_utime) +
(process_tms_end_.tms_stime - process_tms_start_.tms_stime);
if (end_ - start_ != 0) {
cpu_process_utilization = busy_time / (end_ - start_);
}
#else
LOG(WARNING)
<< "Current System is not supported to get process cpu utilization"
......
......@@ -44,6 +44,14 @@ std::unique_ptr<Profiler> Profiler::Create(const ProfilerOptions& options) {
return std::unique_ptr<Profiler>(new Profiler(options));
}
bool Profiler::IsCuptiSupported() {
bool supported = false;
#ifdef PADDLE_WITH_CUPTI
supported = true;
#endif
return supported;
}
Profiler::Profiler(const ProfilerOptions& options) {
options_ = options;
std::bitset<32> trace_switch(options_.trace_switch);
......
......@@ -43,6 +43,8 @@ class Profiler {
public:
static std::unique_ptr<Profiler> Create(const ProfilerOptions& options);
static bool IsCuptiSupported();
void Prepare();
void Start();
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "glog/logging.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/dynload/cupti.h"
namespace paddle {
namespace platform {
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <ctime>
#include <string>
#include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/os_info.h"
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/eager/hooks.h"
#include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/enforce.h"
......@@ -32,12 +33,14 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/slice_utils.h"
#include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "pybind11/detail/internals.h"
namespace paddle {
namespace pybind {
......@@ -150,12 +153,22 @@ bool PyCheckTensor(PyObject* obj) {
static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
PADDLE_ENFORCE_EQ(
self->tensor.initialized(), true,
platform::errors::InvalidArgument(
"Tensor data of %s is Empty that indicates we have null tensor for "
"now, please check if it has no data and initialize it first.",
self->tensor.name()));
auto& api = pybind11::detail::npy_api::get();
if (!self->tensor.impl()) {
Py_intptr_t py_dims[paddle::framework::DDim::kMaxRank];
Py_intptr_t py_strides[paddle::framework::DDim::kMaxRank];
py_dims[0] = 0;
py_strides[0] = 0;
PyObject* array = api.PyArray_NewFromDescr_(
api.PyArray_Type_,
api.PyArray_DescrFromType_(pybind11::detail::npy_api::NPY_FLOAT_), 1,
py_dims, py_strides, nullptr,
pybind11::detail::npy_api::NPY_ARRAY_ALIGNED_ |
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr);
return array;
}
auto tensor_dims = self->tensor.shape();
auto numpy_dtype = TensorDtype2NumpyDtype(self->tensor.type());
auto sizeof_dtype = paddle::framework::DataTypeSize(self->tensor.type());
......@@ -167,7 +180,7 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
py_strides[i] = sizeof_dtype * numel;
numel *= py_dims[i];
}
auto& api = pybind11::detail::npy_api::get();
PyObject* array = api.PyArray_NewFromDescr_(
api.PyArray_Type_, api.PyArray_DescrFromType_(numpy_dtype),
tensor_dims.size(), py_dims, py_strides, nullptr,
......@@ -175,6 +188,10 @@ static PyObject* tensor_method_numpy(TensorObject* self, PyObject* args,
pybind11::detail::npy_api::NPY_ARRAY_WRITEABLE_,
nullptr);
if (!self->tensor.impl()->initialized()) {
return array;
}
if (self->tensor.is_cpu() || self->tensor.is_gpu_pinned()) {
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
......@@ -213,6 +230,20 @@ static PyObject* tensor_method__is_initialized(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method__is_dense_tensor_hold_allocation(
TensorObject* self, PyObject* args, PyObject* kwargs) {
EAGER_TRY
auto dense_tensor =
std::dynamic_pointer_cast<phi::DenseTensor>(self->tensor.impl());
if (dense_tensor) {
return ToPyObject(dense_tensor->IsInitialized());
} else {
return ToPyObject(false);
}
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -552,10 +583,13 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
}
if (op_type == "slice") {
out = slice_dygraph_function(self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(),
paddle::experimental::Tensor(), {}, {},
std::move(attrs));
} else if (op_type == "strided_slice") {
out = strided_slice_dygraph_function(self->tensor, attrs);
out = strided_slice_dygraph_function(
self->tensor, paddle::experimental::Tensor(),
paddle::experimental::Tensor(), paddle::experimental::Tensor(), {},
{}, {}, attrs);
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Slice is only support slice and strided_slice, but we got %s which "
......@@ -604,6 +638,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
auto select_index = paddle::experimental::Tensor(
egr::Controller::Instance().GenerateUniqueName());
auto idx_tensor = std::make_shared<phi::DenseTensor>();
select_index.set_impl(idx_tensor);
auto* dev_ctx = platform::DeviceContextPool::Instance().Get(
egr::Controller::Instance().GetExpectedPlace());
paddle::framework::TensorFromVector(list_select_idxs, *dev_ctx,
......@@ -617,6 +652,216 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self,
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self,
PyObject* args,
PyObject* kwargs) {
EAGER_TRY
VLOG(4) << "Call __setitem_eager_tensor";
auto self_tensor = static_cast<phi::DenseTensor*>(self->tensor.impl().get());
PyObject* _index = PyTuple_GET_ITEM(args, 0);
PyObject* value_obj = PyTuple_GET_ITEM(args, 1);
  // NOTE(zhiqiu): PyTuple_Pack returns a new reference and INCREFs its
  // arguments, so the packed tuple is DECREF'ed in the scope guard below.
// https://github.com/python/cpython/blob/24b63c695ae0a95b06379eaadace66735abac1e2/Objects/tupleobject.c#L251
PyObject* index_ptr =
!PyTuple_Check(_index) ? PyTuple_Pack(1, _index) : _index;
DEFINE_PADDLE_SCOPE_GUARD([index_ptr, &_index]() {
if (!PyTuple_Check(_index)) {
Py_DECREF(index_ptr);
VLOG(4) << "Call Py_DECREF";
}
});
// TODO(pangyoki) add inplace(BumpInplaceVersion) if need
  // 1. Check arguments
bool parse_index = true;
// Check whether _index can be parsed.
const int size = PyTuple_GET_SIZE(index_ptr);
for (int dim = 0; dim < size; ++dim) {
PyObject* slice_item = PyTuple_GetItem(index_ptr, dim);
if (!(PyCheckInteger(slice_item) || PySlice_Check(slice_item) ||
slice_item == Py_Ellipsis || slice_item == Py_None)) {
parse_index = false;
break;
}
}
// 2. Call op set_value to speed up if the condition is met,
// otherwise call TensorToPyArray.
  // TODO(liym27): Try not to call TensorToPyArray because it always
  // copies data to cpu place, which reduces performance.
if (parse_index) {
std::vector<int> axes, starts, ends, steps, decrease_axes, none_axes,
infer_flags, list_select_idxs;
// if index is a list, list_select_flag will be true
bool list_select_flag = false;
ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, &steps,
&decrease_axes, &none_axes, &infer_flags,
&list_select_idxs, &list_select_flag);
framework::AttributeMap attrs = {{"axes", axes},
{"starts", starts},
{"ends", ends},
{"steps", steps},
{"decrease_axes", decrease_axes},
{"none_axes", none_axes}};
if (egr::Controller::Instance().HasGrad()) {
PADDLE_ENFORCE_EQ(
egr::egr_utils_api::IsLeafTensor(self->tensor) &&
!egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient(),
false, platform::errors::InvalidArgument(
"Leaf Tensor (%s) that doesn't stop gradient can't use "
"inplace strategy.",
self->tensor.name()));
}
paddle::experimental::Tensor value_tensor;
if (PyCheckTensor(value_obj)) {
value_tensor = reinterpret_cast<TensorObject*>(value_obj)->tensor;
// pass the stop_gradient from value to tensor
if (!egr::EagerUtils::autograd_meta(&value_tensor)->StopGradient() &&
egr::EagerUtils::autograd_meta(&self->tensor)->StopGradient()) {
egr::EagerUtils::autograd_meta(&self->tensor)->SetStopGradient(false);
}
} else if (py::isinstance<py::array>(value_obj)) {
paddle::experimental::Tensor value_tensor_tmp(
std::make_shared<phi::DenseTensor>(),
egr::Controller::Instance().GenerateUniqueName());
py::object value_obj_tmp(py::handle(value_obj), true);
py::object value = value_obj_tmp;
if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
if (!py::isinstance<py::array_t<float>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<float>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::FLOAT64) {
if (!py::isinstance<py::array_t<double>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<double>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT32) {
if (!py::isinstance<py::array_t<int32_t>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<int32_t>(value_obj_tmp);
}
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT64) {
if (!py::isinstance<py::array_t<int64_t>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<int64_t>(value_obj_tmp);
}
} else if (self->tensor.dtype() == paddle::experimental::DataType::BOOL) {
if (!py::isinstance<py::array_t<bool>>(value_obj_tmp)) {
value = pybind11::detail::CastNumpyArray<bool>(value_obj_tmp);
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"When assign a numpy.np value to a paddle.Tensor, "
"the data type of the paddle.Tensor must be bool, "
"float32, int32 or int64, "
"please check the type of tensor."));
}
if (value_tensor_tmp.place() == paddle::PlaceType::kUNK) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, platform::Place(platform::CUDAPlace(0)), false);
#else
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, platform::Place(platform::CPUPlace()), false);
#endif
} else {
SetTensorFromPyArray(
static_cast<phi::DenseTensor*>(value_tensor_tmp.impl().get()),
value, value_tensor_tmp.inner_place(), false);
}
value_tensor = value_tensor_tmp;
} else {
py::object value_obj_tmp(py::handle(value_obj), true);
// convert the value to self data type
if (py::isinstance<py::float_>(value_obj_tmp) ||
py::isinstance<py::int_>(value_obj_tmp) ||
py::isinstance<py::bool_>(value_obj_tmp)) {
if (self->tensor.dtype() == paddle::experimental::DataType::FLOAT32) {
attrs["fp32_values"] =
std::vector<float>{value_obj_tmp.cast<float>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::FLOAT64) {
attrs["fp64_values"] =
std::vector<double>{value_obj_tmp.cast<double>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT32) {
attrs["int32_values"] =
std::vector<int32_t>{value_obj_tmp.cast<int32_t>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::INT64) {
attrs["int64_values"] =
std::vector<int64_t>{value_obj_tmp.cast<int64_t>()};
} else if (self->tensor.dtype() ==
paddle::experimental::DataType::BOOL) {
attrs["bool_values"] = std::vector<int>{value_obj_tmp.cast<bool>()};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"When assign a value to a paddle.Tensor, "
"the data type of the paddle.Tensor must be bool, "
"float32, int32 or int64, "
"please check the type of tensor."));
}
attrs["shape"] = std::vector<int64_t>{1};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Value type error. The assign value allows "
"numpy.ndarray, integer, float or bool, "
"but received %s.",
Py_TYPE(value_obj)));
}
}
{
// Release gil and do tracing
py::gil_scoped_release release;
self->tensor = set_value_dygraph_function(self->tensor, value_tensor, {},
{}, {}, attrs);
}
} else {
auto self_numpy = TensorToPyArray(*self_tensor);
VLOG(4) << "parse_index is false";
if (PyCheckTensor(_index)) {
VLOG(4) << "index is tensor";
auto index_tensor = static_cast<phi::DenseTensor*>(
reinterpret_cast<TensorObject*>(_index)->tensor.impl().get());
auto index_numpy = TensorToPyArray(*index_tensor);
self_numpy[index_numpy] = py::object(py::handle(value_obj), true);
} else {
VLOG(4) << "index is not tensor";
self_numpy[_index] = py::object(py::handle(value_obj), true);
}
if (self->tensor.place() == paddle::PlaceType::kUNK) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
SetTensorFromPyArray(self_tensor, self_numpy,
platform::Place(platform::CUDAPlace(0)), false);
#else
SetTensorFromPyArray(self_tensor, self_numpy,
platform::Place(platform::CPUPlace()), false);
#endif
} else {
SetTensorFromPyArray(self_tensor, self_numpy, self->tensor.inner_place(),
false);
}
}
Py_INCREF(Py_None);
return Py_None;
EAGER_CATCH_AND_THROW_RETURN_NULL
}
static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args,
PyObject* kwargs) {
EAGER_TRY
......@@ -825,6 +1070,10 @@ PyMethodDef variable_methods[] = {
{"_is_initialized",
(PyCFunction)(void (*)(void))tensor_method__is_initialized,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_is_dense_tensor_hold_allocation",
(PyCFunction)(
void (*)(void))tensor_method__is_dense_tensor_hold_allocation,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_copy_to", (PyCFunction)(void (*)(void))tensor_method__copy_to,
METH_VARARGS | METH_KEYWORDS, NULL},
{"copy_", (PyCFunction)(void (*)(void))tensor_method_copy_,
......@@ -857,6 +1106,9 @@ PyMethodDef variable_methods[] = {
{"_getitem_index_not_tensor",
(PyCFunction)(void (*)(void))tensor__getitem_index_not_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"__setitem_eager_tensor__",
(PyCFunction)(void (*)(void))tensor_method__setitem_eager_tensor,
METH_VARARGS | METH_KEYWORDS, NULL},
{"_register_grad_hook",
(PyCFunction)(void (*)(void))tensor_register_grad_hook,
METH_VARARGS | METH_KEYWORDS, NULL},
......
......@@ -52,6 +52,12 @@ PyObject* tensor_properties_get_type(TensorObject* self, void* closure) {
EAGER_CATCH_AND_THROW_RETURN_NULL
}
PyObject* tensor_properties_is_leaf(TensorObject* self, void* closure) {
EAGER_TRY
return ToPyObject(egr::egr_utils_api::IsLeafTensor(self->tensor));
EAGER_CATCH_AND_THROW_RETURN_NULL
}
int tensor_properties_set_name(TensorObject* self, PyObject* value,
void* closure) {
EAGER_TRY
......@@ -179,6 +185,7 @@ struct PyGetSetDef variable_properties[] = {
nullptr},
{"dtype", (getter)tensor_properties_get_dtype, nullptr, nullptr, nullptr},
{"type", (getter)tensor_properties_get_type, nullptr, nullptr, nullptr},
{"is_leaf", (getter)tensor_properties_is_leaf, nullptr, nullptr, nullptr},
{nullptr, nullptr, nullptr, nullptr, nullptr}};
} // namespace pybind
......
......@@ -386,46 +386,6 @@ GetVarBaseListFromPyHandle(const py::handle &handle) {
return result;
}
// cast numpy type from S to T, this may allocate new memory
template <class T, class S>
static py::array_t<T> CastNumpyType(py::array_t<S> array) {
if (std::is_same<T, S>::value) {
return array;
}
auto dim = array.ndim();
std::vector<py::ssize_t> result_shape(dim);
for (auto i = 0; i < dim; i++) {
result_shape[i] = array.shape(i);
}
py::array_t<T> result(result_shape);
return py::vectorize([](S s) { return static_cast<T>(s); })(array);
}
template <class T>
static py::array_t<T> CastNumpyArray(const py::object &array) {
if (py::isinstance<py::array_t<float>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<float>>());
} else if (py::isinstance<py::array_t<double>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<double>>());
} else if (py::isinstance<py::array_t<int32_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
} else if (py::isinstance<py::array_t<int64_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
} else if (py::isinstance<py::array_t<bool>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<bool>>());
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Value type error. The assign numpy value allows integer, float, "
"double and bool, "
"but received %s.",
Py_TYPE(array.ptr())->tp_name));
}
// can't reach here
return py::array_t<T>();
}
static imperative::NameVarBaseMap ConvertToNameVarBaseMap(
const PyNameVarBaseMap &map) {
imperative::NameVarBaseMap result;
......@@ -854,27 +814,29 @@ void BindImperative(py::module *m_ptr) {
py::object value = value_obj;
if (self->DataType() == framework::proto::VarType::FP32) {
if (!py::isinstance<py::array_t<float>>(value_obj)) {
value = CastNumpyArray<float>(value_obj);
value = pybind11::detail::CastNumpyArray<float>(value_obj);
}
} else if (self->DataType() ==
framework::proto::VarType::FP64) {
if (!py::isinstance<py::array_t<double>>(value_obj)) {
value = CastNumpyArray<double>(value_obj);
value = pybind11::detail::CastNumpyArray<double>(value_obj);
}
} else if (self->DataType() ==
framework::proto::VarType::INT32) {
if (!py::isinstance<py::array_t<int32_t>>(value_obj)) {
value = CastNumpyArray<int32_t>(value_obj);
value =
pybind11::detail::CastNumpyArray<int32_t>(value_obj);
}
} else if (self->DataType() ==
framework::proto::VarType::INT64) {
if (!py::isinstance<py::array_t<int64_t>>(value_obj)) {
value = CastNumpyArray<int64_t>(value_obj);
value =
pybind11::detail::CastNumpyArray<int64_t>(value_obj);
}
} else if (self->DataType() ==
framework::proto::VarType::BOOL) {
if (!py::isinstance<py::array_t<bool>>(value_obj)) {
value = CastNumpyArray<bool>(value_obj);
value = pybind11::detail::CastNumpyArray<bool>(value_obj);
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
......
......@@ -38,7 +38,15 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"assign", {"X"}},
{"reshape2", {"X", "Shape"}},
{"expand", {"X", "ExpandTimes"}},
{"slice", {"Input", "StartsTensor", "EndsTensor"}},
{"slice",
{"Input", "StartsTensor", "EndsTensor", "StartsTensorList",
"EndsTensorList"}},
{"strided_slice",
{"Input", "StartsTensor", "EndsTensor", "StridesTensor",
"StartsTensorList", "EndsTensorList", "StridesTensorList"}},
{"set_value",
{"Input", "ValueTensor", "StartsTensorList", "EndsTensorList",
"StepsTensorList"}},
{"fake_quantize_dequantize_moving_average_abs_max",
{"X", "InScale", "InAccum", "InState"}},
{"nll_loss", {"X", "Label", "Weight"}},
......@@ -89,6 +97,7 @@ std::map<std::string, std::set<std::string>> op_ins_map = {
{"Input", "Label", "Weight", "Bias", "SampleWeight", "CustomDistProbs",
"CustomDistAlias", "CustomDistAliasProbs"}},
{"check_finite_and_unscale", {"X", "Scale", "FloatStatus"}},
{"group_norm", {"X", "Scale", "Bias"}},
};
// NOTE(zhiqiu): Like op_ins_map.
......
......@@ -3322,6 +3322,7 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<paddle::platform::Profiler>(m, "_Profiler")
.def("create", &paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("prepare",
[](paddle::platform::Profiler *profiler) {
platform::EnableHostEventRecorder();
......
......@@ -52,6 +52,46 @@ constexpr int NPY_UINT16_ = 4;
constexpr int NPY_COMPLEX64 = 14;
constexpr int NPY_COMPLEX128 = 15;
// cast numpy type from S to T, this may allocate new memory
template <class T, class S>
static py::array_t<T> CastNumpyType(py::array_t<S> array) {
if (std::is_same<T, S>::value) {
return array;
}
auto dim = array.ndim();
std::vector<py::ssize_t> result_shape(dim);
for (auto i = 0; i < dim; i++) {
result_shape[i] = array.shape(i);
}
py::array_t<T> result(result_shape);
return py::vectorize([](S s) { return static_cast<T>(s); })(array);
}
template <class T>
static py::array_t<T> CastNumpyArray(const py::object &array) {
if (py::isinstance<py::array_t<float>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<float>>());
} else if (py::isinstance<py::array_t<double>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<double>>());
} else if (py::isinstance<py::array_t<int32_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int32_t>>());
} else if (py::isinstance<py::array_t<int64_t>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<int64_t>>());
} else if (py::isinstance<py::array_t<bool>>(array)) {
return CastNumpyType<T>(array.cast<py::array_t<bool>>());
} else {
PADDLE_THROW(paddle::platform::errors::InvalidArgument(
"Value type error. The assign numpy value allows integer, float, "
"double and bool, "
"but received %s.",
Py_TYPE(array.ptr())->tp_name));
}
// can't reach here
return py::array_t<T>();
}
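// A minimal usage sketch of the helper above (illustrative only; assumes an
// embedded interpreter set up via pybind11/embed.h, names are hypothetical):
//
//   py::array_t<int64_t> src({3});             // int64 array of shape (3,)
//   auto buf = src.mutable_unchecked<1>();
//   for (py::ssize_t i = 0; i < 3; ++i) buf(i) = i + 1;
//   // dtype is dispatched at runtime, then the cast is vectorized:
//   py::array_t<float> dst = CastNumpyArray<float>(src);
//   // dst now holds {1.f, 2.f, 3.f}; src is unchanged.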
// Note: Since float16 is not a builtin type in C++, we register
// paddle::platform::float16 as numpy.float16.
// Ref: https://github.com/pybind/pybind11/issues/1776
......
......@@ -15,6 +15,7 @@
#include "paddle/phi/kernels/pad3d_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -574,5 +575,13 @@ void Pad3dKernel(const Context& dev_ctx,
} // namespace phi
PD_REGISTER_KERNEL(
pad3d, CPU, ALL_LAYOUT, phi::Pad3dKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(pad3d,
CPU,
ALL_LAYOUT,
phi::Pad3dKernel,
float,
double,
int,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -50,11 +50,15 @@ struct exponential_transform {
HOSTDEVICE inline T operator()(T val) const {
#if defined(__NVCC__) || defined(__HIPCC__)
T log = -std::numeric_limits<T>::epsilon() / 2;
if (val < static_cast<T>(1.) - std::numeric_limits<T>::epsilon() / 2) {
if (std::is_same<T, double>::value) {
return static_cast<T>(-1.0) / lambda_ * log(val);
log = logf(val);
} else {
return static_cast<T>(-1.0) / lambda_ * __logf(val);
log = __logf(val);
}
}
return static_cast<T>(-1.0) / lambda_ * log;
#else
return static_cast<T>(-1.0) / lambda_ * std::log(static_cast<T>(1.0) - val);
#endif
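  // Both branches implement inverse-CDF sampling: for u ~ Uniform(0,1),
  //   -log(u) / lambda ~ Exp(lambda).
  // On device, values of val within eps/2 of 1 fall back to the constant
  // -eps/2 in place of log(val) so the log stays finite; the CPU branch
  // samples with (1 - val), which is also Uniform(0,1).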
......@@ -114,13 +118,19 @@ struct normal_transform {
namespace kps = phi::kps;
/*********************** Distribution Function *************************/
template <typename T>
struct uniform_distribution;
template <typename T>
struct normal_distribution;
#if defined(__NVCC__)
template <typename T>
struct uniform_distribution {
__device__ inline T operator()(curandStatePhilox4_32_10_t *state) const {
return static_cast<T>(curand_uniform(state));
}
static constexpr int kReturnsCount = 1;
};
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const {
......@@ -177,6 +187,14 @@ struct normal_distribution<double> {
};
#else
template <typename T>
struct uniform_distribution {
__device__ inline T operator()(hiprandStatePhilox4_32_10_t *state) const {
return hiprand_uniform(state);
}
static constexpr int kReturnsCount = 1;
};
template <>
struct uniform_distribution<float> {
__device__ inline float4 operator()(
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include <thrust/device_ptr.h>
#include <thrust/iterator/reverse_iterator.h>
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/kernels/funcs/for_range.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/malloc.h"
namespace phi {
namespace funcs {
template <typename T>
struct IsComplex : public std::false_type {};
template <>
struct IsComplex<::phi::dtype::complex<float>> : public std::true_type {};
template <>
struct IsComplex<::phi::dtype::complex<double>> : public std::true_type {};
template <typename InputIterator, typename OutputIterator, typename BinaryOp>
static void CubInclusiveScan(InputIterator x_iter,
OutputIterator y_iter,
size_t n,
BinaryOp op,
const phi::GPUContext &dev_ctx) {
paddle::memory::allocation::AllocationPtr allocation;
void *temp_storage = nullptr;
size_t temp_storage_bytes = 0;
for (size_t i = 0; i < 2; ++i) {
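    // Pass 0 (temp_storage == nullptr) only queries temp_storage_bytes;
    // pass 1 runs the actual scan once the workspace below is allocated.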
PADDLE_ENFORCE_GPU_SUCCESS(
cub::DeviceScan::InclusiveScan(temp_storage,
temp_storage_bytes,
x_iter,
y_iter,
op,
static_cast<int>(n),
dev_ctx.stream()));
if (i == 0 && temp_storage_bytes > 0) {
allocation =
paddle::memory::Alloc(dev_ctx.GetPlace(), temp_storage_bytes);
temp_storage = allocation->ptr();
}
}
}
template <typename T>
static auto MakeThrustReverseIterator(T *x) {
return thrust::reverse_iterator<thrust::device_ptr<T>>(
thrust::device_pointer_cast(x));
}
template <typename T, typename BinaryOp, bool kReverse>
struct InclusiveScanOuterOrMidDimFunctor {
HOSTDEVICE InclusiveScanOuterOrMidDimFunctor(
const T *x, T *y, size_t mid_dim, size_t inner_dim, T init, BinaryOp op)
: x_(x),
y_(y),
mid_dim_(mid_dim),
inner_dim_(inner_dim),
init_(init),
op_(op) {}
HOSTDEVICE void operator()(size_t idx) const {
auto outer_idx = idx / inner_dim_;
auto inner_idx = idx % inner_dim_;
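    // idx enumerates (outer, inner) pairs; the loop below then walks the mid
    // dimension with stride inner_dim_, starting from the last mid element
    // when kReverse is set.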
if (kReverse) {
idx = outer_idx * mid_dim_ * inner_dim_ + (mid_dim_ - 1) * inner_dim_ +
inner_idx;
} else {
idx = outer_idx * mid_dim_ * inner_dim_ + inner_idx;
}
auto x_ptr = x_ + idx;
auto y_ptr = y_ + idx;
T acc_value = init_;
for (size_t i = 0; i < mid_dim_; ++i) {
acc_value = op_(acc_value, *x_ptr);
*y_ptr = acc_value;
if (kReverse) {
x_ptr -= inner_dim_;
y_ptr -= inner_dim_;
} else {
x_ptr += inner_dim_;
y_ptr += inner_dim_;
}
}
}
private:
const T *x_;
T *y_;
size_t mid_dim_;
size_t inner_dim_;
T init_;
BinaryOp op_;
};
template <typename T,
typename BinaryOp,
size_t kThreadNumX,
size_t kThreadNumY,
bool kReverse>
static __global__ void InclusiveScanInnerDimCUDAKernel(
const T *x, T *y, size_t num_rows, size_t row_size, T init, BinaryOp op) {
using RealT = phi::dtype::Real<T>;
constexpr auto kSharedBufferSize =
IsComplex<T>::value ? 4 * kThreadNumX : 2 * kThreadNumX;
__shared__ RealT sbuf[kThreadNumY][kSharedBufferSize];
T *row_buf = reinterpret_cast<T *>(sbuf[threadIdx.y]);
size_t block_row = static_cast<size_t>(blockIdx.x * kThreadNumY);
size_t block_row_stride = static_cast<size_t>(gridDim.x * kThreadNumY);
for (; block_row < num_rows; block_row += block_row_stride) {
size_t row = block_row + threadIdx.y;
T block_total = init;
const T *row_x = x + row * row_size;
T *row_y = y + row * row_size;
for (size_t block_col = 0; block_col < row_size;
block_col += 2 * kThreadNumX) {
size_t col1, col2;
if (kReverse) {
col1 = row_size - 1 - block_col - threadIdx.x;
col2 = col1 - kThreadNumX;
} else {
col1 = block_col + threadIdx.x;
col2 = col1 + kThreadNumX;
}
if (row < num_rows) {
if (col1 < row_size) {
row_buf[threadIdx.x] = row_x[col1];
} else {
row_buf[threadIdx.x] = init;
}
if (col2 < row_size) {
row_buf[kThreadNumX + threadIdx.x] = row_x[col2];
} else {
row_buf[kThreadNumX + threadIdx.x] = init;
}
if (threadIdx.x == 0) {
row_buf[0] = op(row_buf[0], block_total);
}
}
__syncthreads();
for (size_t s = kThreadNumX, d = 1; s >= 1; s >>= 1, d <<= 1) {
if (row < num_rows && threadIdx.x < s) {
size_t offset = (2 * threadIdx.x + 1) * d - 1;
row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
}
__syncthreads();
}
for (size_t s = 2, d = kThreadNumX / 2; d >= 1; s <<= 1, d >>= 1) {
if (row < num_rows && threadIdx.x < s - 1) {
size_t offset = 2 * (threadIdx.x + 1) * d - 1;
row_buf[offset + d] = op(row_buf[offset], row_buf[offset + d]);
}
__syncthreads();
}
if (row < num_rows) {
if (col1 < row_size) row_y[col1] = row_buf[threadIdx.x];
if (col2 < row_size) row_y[col2] = row_buf[kThreadNumX + threadIdx.x];
}
block_total = row_buf[2 * kThreadNumX - 1];
__syncthreads();
}
}
}
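// For intuition, a host-side sketch of the same two-phase tree scan the
// kernel performs on each row buffer (power-of-two length n, associative
// op); illustrative only, not part of this header:
//
//   template <typename T, typename BinaryOp>
//   void TreeInclusiveScan(T *buf, size_t n, BinaryOp op) {
//     // Upsweep: combine pairs at growing stride d.
//     for (size_t s = n / 2, d = 1; s >= 1; s >>= 1, d <<= 1)
//       for (size_t t = 0; t < s; ++t) {
//         size_t offset = (2 * t + 1) * d - 1;
//         buf[offset + d] = op(buf[offset], buf[offset + d]);
//       }
//     // Downsweep: propagate prefixes into the remaining slots.
//     for (size_t s = 2, d = n / 4; d >= 1; s <<= 1, d >>= 1)
//       for (size_t t = 0; t + 1 < s; ++t) {
//         size_t offset = 2 * (t + 1) * d - 1;
//         buf[offset + d] = op(buf[offset], buf[offset + d]);
//       }
//   }
//
// e.g. {1,2,3,4,5,6,7,8} scans to {1,3,6,10,15,21,28,36}.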
template <typename T, typename BinaryOp>
static void InclusiveScanInnerDim(const T *x,
T *y,
size_t outer_dim,
size_t inner_dim,
T init,
BinaryOp op,
bool reverse,
const phi::GPUContext &dev_ctx) {
constexpr size_t kThreadNumX = 16;
constexpr size_t kThreadNumY = 32;
size_t grid_dim = (outer_dim + kThreadNumY - 1) / kThreadNumY;
grid_dim = std::min<size_t>(grid_dim, dev_ctx.GetCUDAMaxGridDimSize()[0]);
dim3 thread_dims(kThreadNumX, kThreadNumY);
if (reverse) {
InclusiveScanInnerDimCUDAKernel<
T,
BinaryOp,
kThreadNumX,
kThreadNumY,
/*kReverse=*/true><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
x, y, outer_dim, inner_dim, init, op);
} else {
InclusiveScanInnerDimCUDAKernel<
T,
BinaryOp,
kThreadNumX,
kThreadNumY,
/*kReverse=*/false><<<grid_dim, thread_dims, 0, dev_ctx.stream()>>>(
x, y, outer_dim, inner_dim, init, op);
}
}
template <typename T, typename BinaryOp>
void InclusiveScan(const T *x,
T *y,
size_t outer_dim,
size_t mid_dim,
size_t inner_dim,
T init,
BinaryOp op,
bool reverse,
const phi::GPUContext &dev_ctx) {
if (outer_dim == 0 || mid_dim == 0 || inner_dim == 0) return;
if (outer_dim == 1 && inner_dim == 1) {
if (reverse) {
auto x_reverse_iter = MakeThrustReverseIterator(x + mid_dim);
auto y_reverse_iter = MakeThrustReverseIterator(y + mid_dim);
CubInclusiveScan(x_reverse_iter, y_reverse_iter, mid_dim, op, dev_ctx);
} else {
CubInclusiveScan(x, y, mid_dim, op, dev_ctx);
}
} else if (inner_dim != 1) {
phi::funcs::ForRange<phi::GPUContext> for_range(dev_ctx,
outer_dim * inner_dim);
if (reverse) {
for_range(
InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/true>(
x, y, mid_dim, inner_dim, init, op));
} else {
for_range(
InclusiveScanOuterOrMidDimFunctor<T, BinaryOp, /*kReverse=*/false>(
x, y, mid_dim, inner_dim, init, op));
}
} else {
InclusiveScanInnerDim<T, BinaryOp>(
x, y, outer_dim, mid_dim, init, op, reverse, dev_ctx);
}
}
} // namespace funcs
} // namespace phi
......@@ -17,11 +17,10 @@
#include <thrust/reverse.h>
#include <thrust/scan.h>
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/select_impl.cu.h"
#include "paddle/phi/kernels/masked_select_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename MT, typename InT, typename OutT>
......@@ -50,7 +49,7 @@ void MaskedSelectGradKernel(const Context& dev_ctx,
const DenseTensor& mask,
DenseTensor* x_grad) {
auto mask_size = mask.numel();
auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace());
dev_ctx.template Alloc<T>(x_grad);
if (mask_size <= 0) return;
using Functor = MaskedSelectGradFunctor<bool, T, T>;
phi::funcs::SelectKernel<bool, T, T, 2, Functor>(
......
......@@ -23,11 +23,32 @@ limitations under the License. */
#include <thrust/scan.h>
#include <thrust/transform.h>
#include "paddle/fluid/platform/transform.h"
#ifdef __NVCC__
#include "cub/cub.cuh"
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/arg_min_max_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/distribution_helper.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/inclusive_scan.h"
#include "paddle/phi/kernels/funcs/multinomial_functor.h"
#include "paddle/phi/kernels/top_k_kernel.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/transform.h"
DECLARE_bool(use_curand);
namespace phi {
......@@ -57,12 +78,12 @@ template <typename T>
__global__ void GetCumulativeProbs(T* norm_probs_data,
int64_t num_distributions,
int64_t num_categories,
T* cumulative_probs) {
T* cumulative_probs_data) {
int id = blockIdx.x;
thrust::inclusive_scan(thrust::device,
norm_probs_data + id * num_categories,
norm_probs_data + (id + 1) * num_categories,
cumulative_probs + id * num_categories);
cumulative_probs_data + id * num_categories);
}
template <typename T>
......@@ -80,7 +101,7 @@ struct RandomGeneratorCudaFunctor {
};
template <typename T>
__device__ int binarySearchFunctor(T* cumulative_probs,
__device__ int binarySearchFunctor(T* cumulative_probs_data,
T* norm_probs_data,
int num_categories,
T rng_number) {
......@@ -90,7 +111,7 @@ __device__ int binarySearchFunctor(T* cumulative_probs,
while (right - left > 0) {
int mid = left + (right - left) / 2;
T temp_prob = cumulative_probs[mid];
T temp_prob = cumulative_probs_data[mid];
if (temp_prob < rng_number) {
left = mid + 1;
} else {
......@@ -114,27 +135,36 @@ __global__ void sampleMultinomialWithReplacement(
int64_t* out_data,
const int64_t num_distributions,
const int64_t num_categories,
T* cumulative_probs,
T* norm_probs_data) {
T* cumulative_probs_data,
T* norm_probs_data,
uint64_t seed,
uint64_t offset,
bool use_curand) {
// use binary search to get the selected category sample id.
// let cumulative_probs[id-1] < rng_data < cumulative_probs[id].
// let cumulative_probs_data[id-1] < rng_data < cumulative_probs_data[id].
size_t idx = gridDim.x * blockDim.x * blockIdx.y + blockDim.x * blockIdx.x +
threadIdx.x;
curandStatePhilox4_32_10_t state;
curand_init(seed, idx, offset, &state);
// for every distribution
int dist = blockIdx.y;
// for every sample
int sample = blockIdx.x * blockDim.x + threadIdx.x;
for (int dist = blockIdx.y; dist < num_distributions; dist += gridDim.y) {
if (sample < num_samples) {
T rng_number = rng_data[sample + dist * num_samples];
if (use_curand) {
rng_number = static_cast<T>(curand_uniform4(&state).x);
}
// Find the bucket that a uniform random number lies in
int selected_category =
binarySearchFunctor<T>(cumulative_probs + dist * num_categories,
binarySearchFunctor<T>(cumulative_probs_data + dist * num_categories,
norm_probs_data + dist * num_categories,
num_categories,
rng_number);
out_data[sample + dist * num_samples] = selected_category;
}
}
}
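// The kernel draws u ~ Uniform(0,1) per sample and binary-searches the
// cumulative distribution: it returns the smallest i with
//   c_i >= u,  where c_i = p_0 + ... + p_i,
// so category i is selected with probability p_i.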
template <typename T, typename Context>
......@@ -172,6 +202,54 @@ void MultinomialKernel(const Context& dev_ctx,
in_data_numel * sizeof(T),
cudaMemcpyDeviceToHost);
#endif
if (FLAGS_use_curand) {
for (size_t i = 0; i < num_distributions; ++i) {
int zero_num = 0;
for (size_t j = 0; j < num_categories; ++j) {
          T weight = cpu_in_data[i * num_categories + j];
PADDLE_ENFORCE_GE(
weight,
0,
errors::InvalidArgument(
"Each element of multinomial'input must >= 0, but got %f.",
weight));
if (weight == static_cast<T>(0)) {
zero_num++;
}
}
int valid_samples = num_categories - zero_num;
PADDLE_ENFORCE_LE(
num_samples,
valid_samples,
errors::InvalidArgument("When replacement=False, 'num_samples' "
"must less than or eaqual to the number of "
"positive item of input"));
}
// Refer to [gumbel softmax algorithm]
DenseTensor rand = EmptyLike<T, Context>(dev_ctx, x);
T* rand_data = rand.data<T>();
funcs::uniform_distribution<T> dist;
funcs::exponential_transform<T> trans(1.0);
funcs::distribution_and_transform<T>(dev_ctx, &rand, dist, trans);
funcs::ForRange<Context> for_range(dev_ctx, x.numel());
for_range([rand_data, in_data] __device__(size_t idx) {
rand_data[idx] = in_data[idx] / rand_data[idx];
});
if (num_samples == 1) {
ArgMaxKernel<T, Context>(
dev_ctx, rand, -1, true, false, 3 /*proto::VarType::INT64*/, out);
} else {
std::vector<int64_t> out_dim_vec = vectorize<int64_t>(out->dims());
DenseTensor value =
Empty<T, Context>(dev_ctx, ScalarArray(out_dim_vec));
TopkKernel<T, Context>(
dev_ctx, rand, Scalar(num_samples), -1, true, true, &value, out);
}
return;
}
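    // The curand branch above is the exponential-race form of the Gumbel
    // trick: with independent E_i ~ Exp(1), E_i / w_i ~ Exp(w_i), and the
    // minimum of independent exponentials picks index k with probability
    //   w_k / sum_j w_j,
    // so argmax_i (w_i / E_i) (or top-k for several samples) draws from the
    // normalized weights.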
funcs::MultinomialFunctor<T>(dev_ctx,
cpu_out_data,
......@@ -228,7 +306,8 @@ void MultinomialKernel(const Context& dev_ctx,
auto* norm_probs_data = dev_ctx.template Alloc<T>(&norm_probs_tensor);
// number of threads in a block is min(num_categories, 512)
dim3 block_norm(num_categories < 512 ? num_categories : 512);
int block_size = num_categories < 512 ? num_categories : 512;
dim3 block_norm(block_size);
dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1);
NormalizeProbability<T><<<grid_norm, block_norm, 0, dev_ctx.stream()>>>(
norm_probs_data,
......@@ -238,16 +317,34 @@ void MultinomialKernel(const Context& dev_ctx,
num_categories);
// Get cumulative probability of each distribution. It's the same function
// of
// ``cumsum`` op.
// of ``cumsum`` op.
DenseTensor cumulative_probs_tensor;
cumulative_probs_tensor.Resize({num_distributions, num_categories});
auto* cumulative_probs = dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
auto* cumulative_probs_data =
dev_ctx.template Alloc<T>(&cumulative_probs_tensor);
if (FLAGS_use_curand) {
// 'phi::funcs::InclusiveScan' has higher accuracy than
// 'thrust::inclusive_scan'
funcs::InclusiveScan<T, std::plus<T>>(
/*in*/ norm_probs_data,
/*out*/ cumulative_probs_data,
/*outer_dim*/ static_cast<size_t>(num_distributions),
/*mid_dim*/ static_cast<size_t>(num_categories),
/*inner_dim*/ static_cast<size_t>(1),
/*init*/ static_cast<T>(0),
std::plus<T>(),
/*reverse=*/false,
dev_ctx);
} else {
dim3 block_cumsum(1);
dim3 grid_cumsum(num_distributions);
GetCumulativeProbs<T><<<grid_cumsum, block_cumsum, 0, dev_ctx.stream()>>>(
norm_probs_data, num_distributions, num_categories, cumulative_probs);
norm_probs_data,
num_distributions,
num_categories,
cumulative_probs_data);
}
// Generate random number for each sample.
std::random_device rd;
......@@ -266,16 +363,30 @@ void MultinomialKernel(const Context& dev_ctx,
RandomGeneratorCudaFunctor<T>(seed));
// Sample the multinomial distributions.
dim3 block_sample(128);
dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions);
sampleMultinomialWithReplacement<
T><<<grid_sample, block_sample, 0, dev_ctx.stream()>>>(rng_data,
dim3 block(128);
int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
const auto& prop = phi::backends::gpu::GetDeviceProperties(device_id);
int grid_y = std::min<int64_t>(num_distributions, prop.maxGridSize[1]);
dim3 grid((num_samples - 1) / block.x + 1, grid_y);
auto gen_cuda = dev_ctx.GetGenerator();
size_t curand4_loop_times =
(num_distributions + 4 * grid_y - 1) / (4 * grid_y);
  // 'increment' should be a multiple of 4
uint64_t increment = curand4_loop_times * 4;
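  // e.g. num_distributions = 100 with grid_y = 8 gives
  // curand4_loop_times = ceil(100 / 32) = 4, so increment = 16.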
auto seed_offset = gen_cuda->IncrementOffset(increment);
sampleMultinomialWithReplacement<T><<<grid, block, 0, dev_ctx.stream()>>>(
rng_data,
num_samples,
out_data,
num_distributions,
num_categories,
cumulative_probs,
norm_probs_data);
cumulative_probs_data,
norm_probs_data,
seed_offset.first,
seed_offset.second,
FLAGS_use_curand);
}
} // namespace phi
......
......@@ -19,6 +19,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
......@@ -585,4 +586,6 @@ PD_REGISTER_KERNEL(pad3d,
float,
double,
int,
int64_t) {}
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
......@@ -13,7 +13,7 @@
# limitations under the License.
from .spawn import spawn # noqa: F401
from .fleet.launch import launch # noqa: F401
from .launch.main import launch # noqa: F401
from .parallel import init_parallel_env # noqa: F401
from .parallel import get_rank # noqa: F401
......
......@@ -1482,3 +1482,512 @@ register_distributed_operator_impl("matmul_v2",
DistributedMatmulV2Impl1("row_parallel"))
register_distributed_operator_impl(
"matmul_v2", DistributedMatmulV2Impl2("replicate_parallel"))
class DistributedMul(DistributedOperatorImplContainer):
def __init__(self, op_type):
super(DistributedMul, self).__init__(op_type)
register_distributed_operator_impl_container(DistributedMul("mul"))
# ColumnParallel
class DistributedMulImpl0(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl0, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_dim_shard(y_dims_mapping[-2]) or is_dim_replicate(y_dims_mapping[
-1]):
return False
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_replicate(out_dims_mapping[-1]):
return False
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
"""
kwargs: inputname_mapping & outputname_mapping
"""
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
src_op = dist_op_context.cur_src_op
rank_id = dist_op_context.rank_id
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert op_dist_attr is not None, "op [{}] doesn't have a dist attribute!".format(
str(src_op))
# FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
if rank_id not in op_dist_attr.process_mesh.processes:
rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
rank_id)
        # check validity of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, "input [{}] is not given".format(
                input_name)
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), "number of tensors for input [{}] does not match".format(
                input_name)
        for output_name in src_op.desc.output_names():
            assert output_name in kwargs, "output [{}] is not given".format(
                output_name)
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), "number of tensors for output [{}] does not match".format(
                output_name)
X_var = main_block.var(kwargs['X'][0])
Weight_var = main_block._var_recursive(kwargs['Y'][0])
Out_var = main_block.var(kwargs['Out'][0])
# TODO infer logic comm presentation
matmul_col_dim_mapping = op_dist_attr.get_input_dims_mapping(
Weight_var.name)[-1]
        assert matmul_col_dim_mapping >= 0, "col_parallel_matmul's column should be sharded on a specific mesh axis, but got [{}]".format(
matmul_col_dim_mapping)
process_mesh_shape = op_dist_attr.process_mesh.topology
process_mesh_group = op_dist_attr.process_mesh.processes
parallel_axis = matmul_col_dim_mapping
group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
parallel_axis, rank_id)
group = new_process_group(group_ranks)
# infer new var shape with op dist attr
x_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(X_var)
assert x_tensor_dist_attr is not None
identity_var_dist_attr = op_dist_attr.get_input_dist_attr(X_var.name)
assert identity_var_dist_attr is not None
ref_shape_x = infer_shape(main_block, X_var, x_tensor_dist_attr,
identity_var_dist_attr)
# infer out var shape with op dist attr
out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
assert out_tensor_dist_attr is not None
out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert out_var_dist_attr is not None
ref_shape_out = infer_shape(main_block, Out_var, out_tensor_dist_attr,
out_var_dist_attr)
intermediate_var_0 = main_block.create_var(
name=unique_name.generate_with_ignorable_key(".".join(
["c_identity", 'tmp'])),
dtype=X_var.dtype,
shape=X_var.shape,
type=core.VarDesc.VarType.LOD_TENSOR,
persistable=False,
stop_gradient=X_var.stop_gradient)
# set intermediate_var_0's dist_attr with X_var's dist_attr
ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
identity_var_dist_attr)
check_variable_and_dtype(
X_var, 'tensor',
['float16', 'float32', 'float64', 'int32', 'int64'], '_c_identity')
c_identity_op = main_block.append_op(
type='c_identity',
inputs={'X': [X_var]},
outputs={'Out': intermediate_var_0},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
'use_model_parallel': True,
})
if intermediate_var_0.shape != ref_shape_x:
intermediate_var_0.desc.set_shape(ref_shape_x)
check_variable_and_dtype(intermediate_var_0, 'x',
['float16', 'float32', 'float64'], 'linear')
check_dtype(intermediate_var_0.dtype, 'dtype',
['float16', 'float32', 'float64'], 'linear')
# attrs = {'trans_x': False, 'trans_y': False}
attrs = {
"x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
"y_num_col_dims": src_op.desc.attr("y_num_col_dims")
}
inputs = {'X': [intermediate_var_0], 'Y': [Weight_var]}
mul_op = main_block.append_op(
type='mul', inputs=inputs, outputs={'Out': Out_var}, attrs=attrs)
if Out_var.shape != ref_shape_out:
Out_var.desc.set_shape(ref_shape_out)
# set dist op's dist_attr with serial op's dist_attr
# c_identity
identity_op_dist_attr = OperatorDistributedAttribute()
identity_op_dist_attr.process_mesh = op_dist_attr.process_mesh
identity_op_dist_attr.impl_type = op_dist_attr.impl_type
identity_op_dist_attr.impl_idx = op_dist_attr.impl_idx
# input
input_varname = c_identity_op.desc.input_arg_names()[0]
input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
identity_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
# output
output_varname = c_identity_op.desc.output_arg_names()[0]
identity_op_dist_attr.set_output_dist_attr(output_varname,
input_dist_attr)
ctx.set_op_dist_attr_for_program(c_identity_op, identity_op_dist_attr)
# matmulv2
matmulv2_op_dist_attr = OperatorDistributedAttribute()
matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in mul_op.desc.input_arg_names():
if input_varname in src_op.desc.input_arg_names():
input_dist_attr = op_dist_attr.get_input_dist_attr(
input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
else:
input_var = main_block.var(input_varname)
tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(
input_var)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
tensor_dist_attr)
for output_varname in mul_op.desc.output_arg_names():
output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
# init param sync
if Weight_var.is_parameter and not op_dist_attr.is_recompute:
_init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
rank_id)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
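# Column-parallel mul shards W along its last axis: with
#   Y = X W = X [W_0 | W_1 | ... | W_{p-1}],  each rank computes Y_k = X W_k,
# so c_identity replicates X across the model-parallel group and the forward
# pass needs no reduction; the column blocks Y_k together form Y.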
# RowParallel
class DistributedMulImpl1(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl1, self).__init__(name)
self._forward_implemented = True
self._backward_implemented = True
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_replicate(x_dims_mapping[-1]):
return False
if is_dim_replicate(y_dims_mapping[-2]) or is_dim_shard(y_dims_mapping[
-1]):
return False
        # Other dimensions must be replicated except the batch dimension
for mapping in x_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_shard(out_dims_mapping[-1]):
return False
        # Other dimensions must be replicated except the batch dimension
for mapping in out_dims_mapping[1:-1]:
if is_dim_shard(mapping):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
"""
kwargs: inputname_mapping & outputname_mapping
"""
dist_op_context = ctx.dist_op_context
main_block = dist_op_context.work_block
startup_block = dist_op_context.startup_block
src_op = dist_op_context.cur_src_op
rank_id = dist_op_context.rank_id
op_dist_attr = ctx.get_op_dist_attr_for_program(src_op)
        assert op_dist_attr is not None, "op [{}] doesn't have a dist attribute!".format(
str(src_op))
# FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism
if rank_id not in op_dist_attr.process_mesh.processes:
rank_id = _get_corresponding_rank(ctx, op_dist_attr.process_mesh,
rank_id)
        # check validity of inputs / outputs
        for input_name in src_op.desc.input_names():
            assert input_name in kwargs, "input [{}] is not given".format(
                input_name)
            assert len(kwargs[input_name]) == len(
                src_op.desc.input(input_name)
            ), "number of tensors for input [{}] does not match".format(
                input_name)
        for output_name in src_op.desc.output_names():
            assert output_name in kwargs, "output [{}] is not given".format(
                output_name)
            assert len(kwargs[output_name]) == len(
                src_op.desc.output(output_name)
            ), "number of tensors for output [{}] does not match".format(
                output_name)
X_var = main_block.var(kwargs['X'][0])
Weight_var = main_block._var_recursive(kwargs['Y'][0])
Out_var = main_block.var(kwargs['Out'][0])
# TODO infer logic comm presentation
matmul_row_dim_mapping = op_dist_attr.get_input_dims_mapping(
Weight_var.name)[-2]
        assert matmul_row_dim_mapping >= 0, "row_parallel_matmul's row should be sharded on a specific mesh axis, but got [{}]".format(
matmul_row_dim_mapping)
process_mesh_shape = op_dist_attr.process_mesh.topology
process_mesh_group = op_dist_attr.process_mesh.processes
parallel_axis = matmul_row_dim_mapping
group_ranks = _get_comm_group(process_mesh_group, process_mesh_shape,
parallel_axis, rank_id)
group = new_process_group(group_ranks)
check_variable_and_dtype(X_var, 'x', ['float16', 'float32', 'float64'],
'linear')
check_dtype(X_var.dtype, 'dtype', ['float16', 'float32', 'float64'],
'linear')
# attrs = {'trans_x': False, 'trans_y': False}
attrs = {
"x_num_col_dims": src_op.desc.attr("x_num_col_dims"),
"y_num_col_dims": src_op.desc.attr("y_num_col_dims")
}
inputs = {'X': X_var, 'Y': Weight_var}
# infer out var shape with op dist attr
out_tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(Out_var)
assert out_tensor_dist_attr is not None
out_var_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert out_var_dist_attr is not None
ref_shape = infer_shape(main_block, Out_var, out_tensor_dist_attr,
out_var_dist_attr)
intermediate_var_0 = main_block.create_var(
shape=Out_var.shape,
dtype=Out_var.dtype,
type=Out_var.type,
lod_level=Out_var.lod_level,
persistable=False,
is_data=False,
need_check_feed=Out_var.desc.need_check_feed())
# set intermediate_var_0's dist_attr with Out_var's dist_attr
ctx.set_tensor_dist_attr_for_program(intermediate_var_0,
out_var_dist_attr)
mul_op = main_block.append_op(
type='mul',
inputs=inputs,
outputs={'Out': intermediate_var_0},
attrs=attrs)
if intermediate_var_0.shape != ref_shape:
intermediate_var_0.desc.set_shape(ref_shape)
c_allreduce_sum_op = main_block.append_op(
type='c_allreduce_sum',
inputs={'X': intermediate_var_0},
outputs={'Out': Out_var},
attrs={
'ring_id': group.id,
'use_calc_stream': True,
'use_model_parallel': True
})
if Out_var.shape != ref_shape:
Out_var.desc.set_shape(ref_shape)
# set dist op's dist_attr with serial op's dist_attr
# matmulv2
matmulv2_op_dist_attr = OperatorDistributedAttribute()
matmulv2_op_dist_attr.process_mesh = op_dist_attr.process_mesh
matmulv2_op_dist_attr.impl_type = op_dist_attr.impl_type
matmulv2_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in mul_op.desc.input_arg_names():
input_dist_attr = op_dist_attr.get_input_dist_attr(input_varname)
assert input_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_input_dist_attr(input_varname,
input_dist_attr)
output_varname = mul_op.desc.output_arg_names()[0]
output_dist_attr = op_dist_attr.get_output_dist_attr(Out_var.name)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
matmulv2_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(mul_op, matmulv2_op_dist_attr)
# allreduce
allreduce_op_dist_attr = OperatorDistributedAttribute()
allreduce_op_dist_attr.process_mesh = op_dist_attr.process_mesh
allreduce_op_dist_attr.impl_type = op_dist_attr.impl_type
allreduce_op_dist_attr.impl_idx = op_dist_attr.impl_idx
for input_varname in c_allreduce_sum_op.desc.input_arg_names():
input_var = main_block.var(input_varname)
tensor_dist_attr = ctx.get_tensor_dist_attr_for_program(input_var)
assert tensor_dist_attr is not None
allreduce_op_dist_attr.set_input_dist_attr(input_varname,
tensor_dist_attr)
for output_varname in c_allreduce_sum_op.desc.output_arg_names():
output_dist_attr = op_dist_attr.get_output_dist_attr(output_varname)
assert output_dist_attr is not None, "dist_attr is {}".format(
op_dist_attr)
allreduce_op_dist_attr.set_output_dist_attr(output_varname,
output_dist_attr)
ctx.set_op_dist_attr_for_program(c_allreduce_sum_op,
allreduce_op_dist_attr)
# init param sync
if Weight_var.is_parameter and not op_dist_attr.is_recompute:
_init_param_sync(Weight_var, dist_op_context, startup_block, ctx,
rank_id)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
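# Row-parallel mul shards W along its second-to-last axis (and, implicitly,
# X along its last axis): Y = X W = sum_k X_k W_k, which is the local mul
# into intermediate_var_0 followed by the c_allreduce_sum above.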
# ReplicateParallel
class DistributedMulImpl2(DistributedOperatorImpl):
def __init__(self, name):
super(DistributedMulImpl2, self).__init__(name)
def is_input_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
x_name = op_desc.input('X')[0]
y_name = op_desc.input('Y')[0]
x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name)
y_dims_mapping = op_dist_attr.get_input_dims_mapping(y_name)
if is_dim_shard(x_dims_mapping[-1]):
return False
if is_valid_list_index(x_dims_mapping,
-2) and is_dim_shard(x_dims_mapping[-2]):
return False
if is_dim_shard(y_dims_mapping[-1]):
return False
if is_valid_list_index(y_dims_mapping,
-2) and is_dim_shard(y_dims_mapping[-2]):
return False
return True
def is_output_compatible(self, dist_op):
op_desc = dist_op.serial_op.desc
op_dist_attr = dist_op.dist_attr
out_name = op_desc.output('Out')[0]
out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name)
if is_dim_shard(out_dims_mapping[-1]):
return False
if is_valid_list_index(out_dims_mapping,
-2) and is_dim_shard(out_dims_mapping[-2]):
return False
return True
def is_auto_compatible(self, dist_op):
if (not self.is_input_compatible(dist_op)) or \
(not self.is_output_compatible(dist_op)):
return False
if not _is_auto_compatible_for_matmul(dist_op):
return False
return True
def update_dims_mapping(self, dist_op):
changed = False
dim_changed = _update_dims_mapping_for_matmul(dist_op)
if dim_changed:
changed = True
return changed
@staticmethod
def forward(ctx, *args, **kwargs):
DistributedDefaultImpl0.forward(ctx, *args, **kwargs)
@staticmethod
def backward(ctx, *args, **kwargs):
_right_operand_parameter_matmul_backward(ctx, *args, **kwargs)
register_distributed_operator_impl("mul",
DistributedMulImpl0("column_parallel"))
register_distributed_operator_impl("mul", DistributedMulImpl1("row_parallel"))
register_distributed_operator_impl("mul",
DistributedMulImpl2("replicate_parallel"))
......@@ -13,69 +13,3 @@
# limitations under the License.
__all__ = []
'''
Paddle distributed training entry ``python -m paddle.distributed.launch``.
Help
# for arg usage and explanation, try the following command
# python -m paddle.distributed.launch -h
Collective Mode
Case 1: 1 node
use all visible devices
# python -m paddle.distributed.launch train.py
use specified devices
# python -m paddle.distributed.launch --devices=0,1,2,3 train.py
Case 2: multi-node, auto detect ip/port
# python -m paddle.distributed.launch --nnodes 2 train.py
# the following command is printed automatically
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --nnodes 2 demo.py
# then copy and paste the above command to the other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.launch --nnodes 2 --master 10.0.0.1:2379 train.py
# the master ip must belong to one of the nodes and the port must be available
Parameter Server Mode
Case 1.1: 1 node, 1 ps, 1 trainer
# python -m paddle.distributed.launch --mode ps train.py
# python -m paddle.distributed.launch --server_num=1 --trainer_num=1 train.py
Case 1.2: 1 node, 2 ps, 2 trainer
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 train.py
Case 2: 2 node, 2 ps, 2 trainer per node
# python -m paddle.distributed.launch --server_num=2 --trainer_num=2 --nnodes 2 train.py
# the following command is printed automatically
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# then copy and paste the above command to the other nodes
Case 3: multi-node, specified master/rendezvous server
# python -m paddle.distributed.launch --master 10.0.0.1:13538 --server_num=2 --trainer_num=2 --nnodes 2 train.py
# the master ip must belong to one of the nodes and the port must be available
Case 4: specified servers and trainers in each node
python -m paddle.distributed.launch --servers 127.0.0.1:8900,127.0.0.1:8901 --trainers 127.0.0.1:8902,127.0.0.1:8903 train.py
Elastic Mode
# run the following command on 3 nodes to start immediately, or on 2 nodes to start after elastic_timeout
# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:3 train.py
# while the number of peers stays within 2:3, the elastic strategy holds
'''
......@@ -12,31 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .context import Context
from . import controllers
from .main import launch
def launch():
# initialize the context to run
ctx = Context()
if ctx.is_legacy_mode():
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
# manager or just wait pod
c.finalize()
if __name__ == "__main__":
launch()
launch()
......@@ -82,6 +82,12 @@ class Context(object):
logger.addHandler(ch)
return logger
def continous_log(self) -> bool:
if self.args.log_level.upper() in ['DEBUG', 'ERROR']:
return True
else:
return False
def set_env_in_args(self):
for k, v in env_args_mapping.items():
if k in self.envs:
......
......@@ -20,7 +20,7 @@ env_args_mapping = {
'PADDLE_MASTER': 'master',
'PADDLE_DEVICES': 'devices',
'PADDLE_NNODES': 'nnodes',
'PADDLE_MODE': 'mode',
'PADDLE_RUN_MODE': 'run_mode',
'PADDLE_LOG_LEVEL': 'log_level',
'PADDLE_NPROC_PER_NODE': 'nproc_per_node',
'PADDLE_JOB_ID': 'job_id',
......@@ -60,7 +60,7 @@ def parse_args():
"--legacy", type=bool, default=False, help="use legacy launch")
base_group.add_argument(
"--rank", type=int, default=-1, help="the peer rank")
"--rank", type=int, default=-1, help="the node rank")
base_group.add_argument(
"--log_level", type=str, default="INFO", help="log level. Default INFO")
......@@ -69,7 +69,7 @@ def parse_args():
"--nnodes",
type=str,
default="1",
help="the number of peers, i.e. pod/node number")
help="the number of nodes, i.e. pod/node number")
base_group.add_argument(
"--nproc_per_node",
......@@ -83,7 +83,7 @@ def parse_args():
default="log",
help="the path for each process's log. Default ./log")
base_group.add_argument(
"--mode",
"--run_mode",
type=str,
default="collective",
help="run mode of the job, collective/ps/ps-heter")
......@@ -146,6 +146,6 @@ def parse_args():
"--elastic_timeout",
type=int,
default=30,
help="seconds to wait before elastic perform training")
help="seconds to wait before elastic job begin to train")
return parser.parse_known_args()
......@@ -115,46 +115,6 @@ class CollectiveElasticController(CollectiveController):
self.master.register_heartbeat(self.job.id, self.pod.name)
def watch(self) -> bool:
'''
watch self and peer status, return true to exit
'''
self.ctx.logger.info("Watching {}".format(self.pod))
while not self.ctx.status.is_done():
# self status
status = self.pod.watch(timeout=2)
self.ctx.logger.debug("Pod status {}, Ctx status {}".format(
status, self.ctx.status.current()))
# completed
if status == self.ctx.status.COMPLETED:
self.master.set_status(status)
self.ctx.status.complete()
self.ctx.logger.info("Pod complete {}".format(status))
return True
# self failure
elif status == self.ctx.status.FAILED:
self.master.set_status(status)
self.master.restart_peer()
self.ctx.logger.info("Pod failed {}".format(status))
self.pod.stop()
if self.ctx.args.elastic_level <= 0:
return True
else:
return False
# peer failure
if self.ctx.status.is_restarting() and self.master.get_status(
) != self.ctx.status.COMPLETED:
self.pod.stop()
return False
#peers = self.master.fetch_peer_alive()
#print("peers {}".format(peers))
def run(self):
timeout = self.ctx.args.elastic_timeout if self.job.elastic else self.ctx.args.elastic_timeout * 10
......@@ -164,6 +124,8 @@ class CollectiveElasticController(CollectiveController):
self.build_job()
self.ctx.logger.info("Waiting peer ready...")
ok, replicas = self.master.wait_peer_ready(
self.job.replicas_min, self.job.replicas_max, timeout)
if ok:
......
......@@ -40,7 +40,7 @@ class ControllerBase(object):
self.master = Master.factory(self.ctx)
self.job = Job(nnodes=self.ctx.args.nnodes,
mode=self.ctx.args.mode,
mode=self.ctx.args.run_mode,
jid=self.ctx.args.job_id)
self.pod = Pod()
......@@ -65,19 +65,52 @@ class ControllerBase(object):
self.watch()
def watch(self) -> bool:
'''
watch self and peer status, return true to exit
'''
#TODO(kuizhiqing) unify ctx.status and master status
self.ctx.logger.info("Watching {}".format(self.pod))
status = self.pod.watch()
while not self.ctx.status.is_done():
status = self.pod.watch(timeout=2)
if self.ctx.continuous_log():
self.pod.logs()
# completed
if status == self.ctx.status.COMPLETED:
self.ctx.status.complete()
self.master.set_status(status)
self.ctx.logger.info("Pod {}".format(status))
return True
# self failure
elif status == self.ctx.status.FAILED:
self.ctx.status.fail()
self.master.set_status(status)
self.master.restart_peer()
fc = self.pod.failed_container()
self.ctx.logger.info("Pod {}".format(status))
self.ctx.logger.error("Container failed !!!\n{}".format(fc[0]))
fc[0].tail()
self.pod.stop()
if self.ctx.args.elastic_level <= 0:
return True
else:
return False
# peer failure
if self.ctx.status.is_restarting() and self.master.get_status(
) != self.ctx.status.COMPLETED:
self.pod.stop()
return False
def stop(self, sigint=None):
self.ctx.logger.debug("Controller stop")
self.master.stop()
......
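The watch() contract above (return True to exit, return False to redeploy) implies a driver loop roughly like the following; a sketch under assumptions, since the real caller is outside this hunk and deploy_pod is a hypothetical name:

def drive(controller):
    while True:
        controller.deploy_pod()  # hypothetical: (re)start the local processes
        if controller.watch():
            break  # completed, or failed with elastic_level <= 0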
......@@ -43,6 +43,15 @@ class Master(object):
def stop(self):
raise NotImplementedError
def set_status(self, status):
pass
def get_status(self):
return None
def restart_peer(self):
pass
def sync_peers(self, prefix, key, value, size, rank=-1) -> (list, int):
raise NotImplementedError
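The sync_peers contract implied above, and by the HTTP/ETCD masters below, is: each peer publishes its value under prefix/key and receives the full ordered peer list plus its own rank. A toy single-node stand-in, purely illustrative and not part of this patch:

class LocalMaster(Master):
    def stop(self):
        pass

    def sync_peers(self, prefix, key, value, size, rank=-1):
        # mirror the size < 2 short-circuit both real masters use
        if size < 2:
            return [value], 0
        raise NotImplementedError("multi-node sync needs a real store")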
......@@ -122,7 +131,7 @@ class HTTPMaster(Master):
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
self.ctx.logger.info("Waiting peer start...")
self.lazy_init()
......@@ -184,7 +193,7 @@ class ETCDMaster(Master):
if size < 2:
return [value], 0
self.ctx.logger.info("Waiting peer ready...")
self.ctx.logger.info("Waiting peer start...")
path = "{}/{}/{}".format(prefix, key, rank)
......
......@@ -21,11 +21,11 @@ import os, shutil
class PSController(Controller):
@classmethod
def enable(cls, ctx):
if ctx.args.mode == ControleMode.PS or ctx.args.server_num or len(
if ctx.args.run_mode == ControleMode.PS or ctx.args.server_num or len(
ctx.args.servers) > 0 or ctx.args.trainer_num or len(
ctx.args.trainers) > 0:
ctx.logger.debug("{} enabled".format(cls.__name__))
ctx.args.mode = ControleMode.PS
ctx.args.run_mode = ControleMode.PS
return True
else:
return False
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .context import Context
def launch():
"""
Paddle distributed training entry ``python -m paddle.distributed.launch``.
Usage:
.. code-block:: bash
:name: code-block-bash1
python -m paddle.distributed.launch [-h] [--master MASTER] [--rank RANK]
[--log_level LOG_LEVEL] [--nnodes NNODES]
[--nproc_per_node NPROC_PER_NODE] [--log_dir LOG_DIR]
[--run_mode RUN_MODE] [--job_id JOB_ID] [--devices DEVICES]
[--host HOST] [--servers SERVERS] [--trainers TRAINERS]
[--trainer_num TRAINER_NUM] [--server_num SERVER_NUM]
[--gloo_port GLOO_PORT] [--with_gloo WITH_GLOO]
[--max_restart MAX_RESTART] [--elastic_level ELASTIC_LEVEL]
[--elastic_timeout ELASTIC_TIMEOUT]
training_script ...
Base Parameters:
- ``--master``: The master/rendezvous server, supporting http:// and etcd://; http:// is assumed when no scheme is given. e.g., ``--master=127.0.0.1:8080``. Default ``--master=None``.
- ``--rank``: The rank of the node, can be auto assigned by master. Default ``--rank=-1``.
- ``--log_level``: The log level to set for logging.setLevel. Default ``--log_level=INFO``.
- ``--nnodes``: The number of nodes for a distributed job; it can be a range in elastic mode, e.g., ``--nnodes=2:3``. Default ``--nnodes=1``.
- ``--nproc_per_node``: The number of processes to launch on a node. In gpu training, it should be less than or equal to the number of gpus in your system. e.g., ``--nproc_per_node=8``
- ``--log_dir``: The path for each process's log. e.g., ``--log_dir=output_dir``. Default ``--log_dir=log``.
- ``--run_mode``: The run mode of the job, one of collective/ps/ps-heter. e.g., ``--run_mode=ps``. Default ``--run_mode=collective``.
- ``--job_id``: The job unique id, it affects the log files' name. e.g., ``--job_id=job1``. Default ``--job_id=default``.
- ``--devices``: The selected accelerator devices on the nodes, can be gpu/xpu/npu/mlu etc. e.g., ``--devices=0,1,2,3`` will launch four training processes, each bound to one device.
- ``training_script``: The full path to the single-GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py``
- ``training_script_args``: The args of training_script. e.g., ``--lr=0.1``
Collective Parameters:
- ``--ips``: [DEPRECATED] Paddle cluster nodes ips, e.g., ``--ips=192.168.0.16,192.168.0.17``. Default ``--ips=127.0.0.1``.
Parameter-Server Parameters:
- ``--servers``: User defined servers ip:port, e.g., ``--servers="192.168.0.16:6170,192.168.0.17:6170"``
- ``--trainers``: User defined trainers ip:port, e.g., ``--trainers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172"``
- ``--workers``: [DEPRECATED] The same as trainers.
- ``--trainer_num``: Number of trainers on each node, can be 0.
- ``--worker_num``: [DEPRECATED] The same as trainer_num.
- ``--server_num``: Number of servers on each node, can be 0.
- ``--heter_workers``: User defined heter workers ip1:port1;ip2:port2, e.g., ``--heter_workers="192.168.0.16:6172;192.168.0.17:6172"``
- ``--heter_worker_num``: Number of heter_workers in each stage (recommended when emulating a distributed environment on a single node)
- ``--heter_devices``: Type of heter_device in each stage
- ``--gloo_port``: Gloo http Port. Default ``--gloo_port=6767``.
- ``--with_gloo``: Using gloo or not. Default ``--with_gloo=0``.
Elastic Parameters:
- ``--max_restart``: The maximum restart times for an elastic job. Default ``--max_restart=3``.
- ``--elastic_level``: The elastic level: -1: disable, 0: failed exit, peers hold, 1: internal restart. Default ``--elastic_level=-1``.
- ``--elastic_timeout``: Seconds to wait before the elastic job begins to train. Default ``--elastic_timeout=30``.
Returns:
``None``
Examples 0 (master, ip/port auto detection):
.. code-block:: bash
:name: code-block-example-bash0
# For training on multiple nodes, run the following command on one of the nodes
python -m paddle.distributed.launch --nnodes 2 train.py
# Then the following info will be printed
# Copy the following command to other nodes to run.
# --------------------------------------------------------------------------------
# python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
# --------------------------------------------------------------------------------
# Following the instruction above and pasting the command on the other nodes launches a multi-node training job.
# There are two ways to launch a job with the same command for multi-node training
# 1) use the following command on every node; make sure the ip belongs to one of the training nodes and the port is available on that node
# python -m paddle.distributed.launch --master 10.0.0.1:38714 --nnodes 2 train.py
# 2) use the following command on every node, with an independent etcd service
# python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2 train.py
# This works for both collective and ps mode, and with other arguments as well.
Examples 1 (collective, single node):
.. code-block:: bash
:name: code-block-example-bash1
# For training on single node using 4 gpus.
python -m paddle.distributed.launch --devices=0,1,2,3 train.py --lr=0.01
Examples 2 (collective, multi node):
.. code-block:: bash
:name: code-block-example-bash2
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17
# On 192.168.0.16:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
# On 192.168.0.17:
python -m paddle.distributed.launch --devices=0,1,2,3 --master=192.168.0.16:8090 train.py --lr=0.01
Examples 3 (ps, cpu, single node):
.. code-block:: bash
:name: code-block-example-bash3
# To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers.
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 4 (ps, cpu, multi node):
.. code-block:: bash
:name: code-block-example-bash4
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server and 2 workers.
# On 192.168.0.16:
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
# Or with master, the following command runs 2 servers and 2 trainers on each node.
python -m paddle.distributed.launch --master 192.168.0.16:9090 --server_num=2 --trainer_num=2 --nnodes 2 train.py
Examples 5 (ps, gpu, single node):
.. code-block:: bash
:name: code-block-example-bash5
# To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers, each worker using a single gpu.
export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m paddle.distributed.launch --server_num=2 --worker_num=4 train.py --lr=0.01
Examples 6 (ps, gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash6
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server and 2 workers.
# On 192.168.0.16:
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.16:6172,192.168.0.17:6171,192.168.0.17:6172" train.py --lr=0.01
Examples 7 (ps-heter, cpu + gpu, single node):
.. code-block:: bash
:name: code-block-example-bash7
# To simulate a distributed environment using a single node, e.g., 2 servers and 4 workers; two workers use gpu, two use cpu.
export CUDA_VISIBLE_DEVICES=0,1
python -m paddle.distributed.launch --server_num=2 --worker_num=2 --heter_worker_num=2 train.py --lr=0.01
Examples 8 (ps-heter, cpu + gpu, multi node):
.. code-block:: bash
:name: code-block-example-bash8
# For training on multiple nodes, e.g., 192.168.0.16, 192.168.0.17, where each node runs 1 server, 1 gpu worker and 1 cpu worker.
# On 192.168.0.16:
export CUDA_VISIBLE_DEVICES=0
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
# On 192.168.0.17:
export CUDA_VISIBLE_DEVICES=0
python -m paddle.distributed.launch --servers="192.168.0.16:6170,192.168.0.17:6170" --workers="192.168.0.16:6171,192.168.0.17:6171" --heter_workers="192.168.0.16:6172,192.168.0.17:6172" train.py --lr=0.01
Examples 9 (elastic):
.. code-block:: bash
:name: code-block-example-bash9
# With the following command, the job will begin to run immediately if 4 nodes are ready,
# or it will run after elastic_timeout if only 2 or 3 nodes are ready
python -m paddle.distributed.launch --master etcd://10.0.0.1:2379 --nnodes 2:4 train.py
# as long as the number of nodes stays within 2:4 during training, the elastic strategy holds
"""
# initialize the context to run
ctx = Context()
if ctx.is_legacy_mode():
# legacy mode
from paddle.distributed.fleet import launch
launch.launch()
else:
from . import controllers
# initialize the selected controller
c = controllers.init(ctx)
# run the pods
c.run()
# manage the pods, or just wait for them
c.finalize()
if __name__ == "__main__":
launch()
......@@ -30,6 +30,7 @@ from paddle.fluid.framework import _set_expected_place, _current_expected_place,
import queue
import paddle
import paddle.profiler as profiler
from .. import core, layers
from ..framework import in_dygraph_mode, _in_eager_mode
from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL, CleanupFuncRegistrar
......@@ -250,6 +251,10 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
self._exit_thread_expectedly()
def __next__(self):
trace_event = profiler.RecordEvent(
name="_DataLoaderIterSingleProcess",
event_type=profiler.TracerEventType.Dataloader)
trace_event.begin()
try:
if in_dygraph_mode():
if _in_eager_mode():
......@@ -283,6 +288,8 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase):
self._reader.shutdown()
self._try_shutdown_all()
six.reraise(*sys.exc_info())
finally:
trace_event.end()
def _shutdown_thread(self):
if self._thread:
......@@ -695,6 +702,10 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._try_shutdown_all(1)
def __next__(self):
trace_event = profiler.RecordEvent(
name="_DataLoaderIterMultiProcess",
event_type=profiler.TracerEventType.Dataloader)
trace_event.begin()
try:
# _batches_outstanding here records the total number of batches
# from after _try_put_indices to before the data is output; this
......@@ -743,6 +754,8 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase):
self._reader.shutdown()
self._try_shutdown_all()
six.reraise(*sys.exc_info())
finally:
trace_event.end()
# python2 compatibility
def next(self):
......
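Both __next__ hooks above follow the same discipline: begin a RecordEvent before the work, and end it in a finally block so the event is closed even when iteration raises (including StopIteration). The same pattern as a standalone sketch; the event name here is illustrative:

import paddle.profiler as profiler

def profiled_next(iterator):
    trace_event = profiler.RecordEvent(
        name="profiled_next",
        event_type=profiler.TracerEventType.Dataloader)
    trace_event.begin()
    try:
        return next(iterator)
    finally:
        # runs on normal return and on exceptions alike
        trace_event.end()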
......@@ -25,6 +25,7 @@ from copy import deepcopy
import inspect
import paddle
import paddle.profiler as profiler
from . import parallel_helper
from .. import unique_name
......@@ -905,6 +906,8 @@ class Layer(object):
self._built = True
with profiler.RecordEvent(self.full_name(),
profiler.TracerEventType.Forward):
outputs = self.forward(*inputs, **kwargs)
for forward_post_hook in self._forward_post_hooks.values():
......
......@@ -2986,6 +2986,12 @@ class GroupNorm(layers.Layer):
is_bias=True)
def forward(self, input):
if in_dygraph_mode():
attrs = ('epsilon', self._epsilon, 'groups', self._groups)
out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, *attrs)
return dygraph_utils._append_activation_in_dygraph(out, self._act)
inputs = {'X': input}
if self.bias is not None:
inputs['Bias'] = self.bias
......
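For reference, the dygraph fast path added above is exercised whenever a GroupNorm layer is called under a dygraph guard; a hedged sketch, assuming the fluid.dygraph.nn.GroupNorm signature used in the tests below:

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import nn

with fluid.dygraph.guard():
    gn = nn.GroupNorm(channels=4, groups=2)
    x = fluid.dygraph.to_variable(
        np.random.rand(1, 4, 3, 3).astype('float32'))
    y = gn(x)  # in_dygraph_mode() holds, so the _C_ops.group_norm path runs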
......@@ -28,6 +28,7 @@ from .math_op_patch import monkey_patch_math_varbase
from .parallel import scale_loss
from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE
import paddle.utils.deprecated as deprecated
import paddle.profiler as profiler
from paddle import _C_ops
......@@ -243,6 +244,9 @@ def monkey_patch_varbase():
"""
if framework.in_dygraph_mode():
record_event = profiler.RecordEvent(
"Gradient Backward", profiler.TracerEventType.Backward)
record_event.begin()
if grad_tensor is not None:
if core._in_eager_mode():
assert isinstance(
......@@ -278,6 +282,7 @@ def monkey_patch_varbase():
core.dygraph_run_backward([self], [grad_tensor],
retain_graph,
framework._dygraph_tracer())
record_event.end()
else:
raise ValueError(
"Variable.backward() is only available in DyGraph mode")
......@@ -762,6 +767,9 @@ def monkey_patch_varbase():
# Call _setitem_impl_ when item contains tensor or list.
return _setitem_impl_(self, item, value)
else:
if core._in_eager_mode():
return self.__setitem_eager_tensor__(item, value)
else:
# Call c++ func __setitem_varbase__ to speedup.
return self.__setitem_varbase__(item, value)
......
......@@ -270,9 +270,10 @@ def generate_activation_fn(op_type):
op_type)
else:
# abs exp square ops support dtype(int32, int64, float16, float32, float64, complex64, complex128)
check_variable_and_dtype(
x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'],
op_type)
check_variable_and_dtype(x, 'x', [
'int32', 'int64', 'float16', 'float32', 'float64', 'complex64',
'complex128'
], op_type)
helper = LayerHelper(op_type, **locals())
......
......@@ -5616,9 +5616,10 @@ def transpose(x, perm, name=None):
out, _ = _C_ops.transpose2(x, 'axis', perm)
return out
check_variable_and_dtype(
x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
'transpose')
check_variable_and_dtype(x, 'x', [
'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64',
'complex128'
], 'transpose')
check_type(perm, 'perm', (list, tuple), 'transpose')
if isinstance(perm, tuple):
perm = list(perm)
......@@ -6410,10 +6411,10 @@ def squeeze(input, axes, name=None):
return out
helper = LayerHelper("squeeze", **locals())
check_variable_and_dtype(
input, 'input',
['float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64'],
'squeeze')
check_variable_and_dtype(input, 'input', [
'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64',
'complex64', 'complex128'
], 'squeeze')
check_type(axes, 'axis/axes', (list, tuple), 'squeeze')
out = helper.create_variable_for_type_inference(dtype=input.dtype)
x_shape = helper.create_variable_for_type_inference(dtype=input.dtype)
......@@ -6471,8 +6472,16 @@ def unsqueeze(input, axes, name=None):
check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze')
check_variable_and_dtype(input, 'input', [
'float16', 'float32', 'float64', 'bool', 'int8', 'int16', 'int32',
'int64'
'float16',
'float32',
'float64',
'bool',
'int8',
'int16',
'int32',
'int64',
'complex64',
'complex128',
], 'unsqueeze')
helper = LayerHelper("unsqueeze2", **locals())
inputs = {"X": input}
......@@ -11180,8 +11189,8 @@ def slice(input, axes, starts, ends):
ends_tensor.stop_gradient = True
infer_flags = list(-1 for i in range(len(axes)))
return _C_ops.slice(input, starts_tensor, ends_tensor, 'axes', axes,
'infer_flags', infer_flags, *attrs)
return _C_ops.slice(input, starts_tensor, ends_tensor, None, None,
'axes', axes, 'infer_flags', infer_flags, *attrs)
if not isinstance(starts, (list, tuple, Variable)):
raise ValueError(
......
......@@ -632,7 +632,7 @@ def assign(input, output=None):
dtype = VarDesc.VarType.FP32
if dtype == VarDesc.VarType.BOOL:
value_name = "bool_values"
values = [bool(v) for v in input.flat]
values = [int(v) for v in input.flat]
elif dtype == VarDesc.VarType.FP32:
value_name = "fp32_values"
values = [float(v) for v in input.flat]
......@@ -756,7 +756,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None):
check_shape(shape)
check_dtype(dtype, 'dtype', [
'bool', 'float16', 'float32', 'float64', 'uint8', 'int16', 'int32',
'int64'
'int64', 'complex64', 'complex128'
], 'fill_constant')
check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant')
......
......@@ -20,6 +20,8 @@ import os
import six
import sys
from paddle.utils.deprecated import deprecated
__all__ = [
'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
'stop_profiler'
......@@ -36,6 +38,12 @@ NVPROF_CONFIG = [
]
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
@signature_safe_contextmanager
def cuda_profiler(output_file, output_mode=None, config=None):
"""
......@@ -109,6 +117,12 @@ def npu_profiler(output_file, config=None):
core.npu_prof_finalize()
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def reset_profiler():
"""
Clear the previous time record. It works for
......@@ -131,6 +145,12 @@ def reset_profiler():
core.reset_profiler()
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def start_profiler(state, tracer_option='Default'):
"""
Enable the profiler. Users can use `fluid.profiler.start_profiler` and
......@@ -156,6 +176,7 @@ def start_profiler(state, tracer_option='Default'):
.. code-block:: python
# required: gpu
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
......@@ -198,6 +219,12 @@ def start_profiler(state, tracer_option='Default'):
core.enable_profiler(prof_state)
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
"""
Stop the profiler. Users can use `fluid.profiler.start_profiler` and
......@@ -225,6 +252,7 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
.. code-block:: python
# required: gpu
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
......@@ -254,6 +282,12 @@ def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
core.disable_profiler(key_map[sorted_key], profile_path)
@deprecated(
since="2.3.0",
update_to="paddle.profiler.Profiler",
level=1,
reason="Please use new profiler tool, this profiler tool is no longer maintained."
)
@signature_safe_contextmanager
def profiler(state,
sorted_key=None,
......
......@@ -71,7 +71,7 @@ class TestAssignValueNPUOp4(TestAssignValueNPUOp):
def init_data(self):
self.value = numpy.random.choice(
a=[False, True], size=(2, 5)).astype(numpy.bool)
self.attrs["bool_values"] = [bool(v) for v in self.value.flat]
self.attrs["bool_values"] = [int(v) for v in self.value.flat]
class TestAssignApi(unittest.TestCase):
......
......@@ -58,7 +58,7 @@ class TestAssignValueOp4(TestAssignValueOp):
def init_data(self):
self.value = numpy.random.choice(
a=[False, True], size=(2, 5)).astype(numpy.bool)
self.attrs["bool_values"] = [bool(v) for v in self.value.flat]
self.attrs["bool_values"] = [int(v) for v in self.value.flat]
class TestAssignApi(unittest.TestCase):
......
......@@ -24,6 +24,7 @@ from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16
class ElementwiseDivOp(OpTest):
def setUp(self):
self.op_type = "elementwise_div"
self.python_api = paddle.divide
self.dtype = np.float64
self.init_dtype()
""" Warning
......@@ -37,8 +38,11 @@ class ElementwiseDivOp(OpTest):
}
self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
def check_eager(self):
return (self.use_mkldnn == False and self.axis == -1)
def test_check_output(self):
self.check_output()
self.check_output(check_eager=False)
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
......
......@@ -182,7 +182,7 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.func_auto_prune2()
# TODO(jiabin): Support this when we support better split tensor
def test_auto_prune3(self):
def func_auto_prune3(self):
with fluid.dygraph.guard():
case3 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32")
......@@ -194,7 +194,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case3.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune4(self):
def test_auto_prune3(self):
with _test_eager_guard():
self.func_auto_prune3()
self.func_auto_prune3()
def func_auto_prune4(self):
with fluid.dygraph.guard():
case4 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32")
......@@ -206,7 +211,12 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case4.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 1).all())
def test_auto_prune5(self):
def test_auto_prune4(self):
with _test_eager_guard():
self.func_auto_prune4()
self.func_auto_prune4()
def func_auto_prune5(self):
with fluid.dygraph.guard():
case4 = AutoPruneLayer3(input_size=784)
value1 = np.arange(784).reshape(1, 784).astype("float32")
......@@ -218,6 +228,11 @@ class TestImperativeAutoPrune(unittest.TestCase):
self.assertTrue(case4.linear.weight._grad_ivar() is not None)
self.assertTrue((part2.gradient() == 0).all())
def test_auto_prune5(self):
with _test_eager_guard():
self.func_auto_prune5()
self.func_auto_prune5()
def func_auto_prune6(self):
with fluid.dygraph.guard():
value0 = np.arange(26).reshape(2, 13).astype("float32")
......
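The renaming pattern above (test_x becomes func_x, with a new test_x that runs it once under _test_eager_guard and once without) repeats across these test files; the same idea expressed as a hypothetical decorator, for illustration only:

from paddle.fluid.framework import _test_eager_guard

def run_in_both_modes(func):
    # hypothetical helper mirroring the repeated test pattern
    def wrapper(self, *args, **kwargs):
        with _test_eager_guard():
            func(self, *args, **kwargs)  # eager-mode pass
        func(self, *args, **kwargs)  # legacy dygraph pass
    return wrapper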
......@@ -1819,7 +1819,7 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_group_norm(self):
def func_group_norm(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
else:
......@@ -1873,7 +1873,6 @@ class TestLayer(LayerTest):
with_lod=True)[0]
with self.dynamic_graph():
# TODO(wuweilong): Add with _test_eager_guard():
groupNorm = nn.GroupNorm(
channels=shape[1],
groups=2,
......@@ -1886,6 +1885,11 @@ class TestLayer(LayerTest):
self.assertTrue(np.allclose(static_ret, dy_rlt_value))
self.assertTrue(np.allclose(static_ret, static_ret2))
def test_group_norm(self):
with _test_eager_guard():
self.func_group_norm()
self.func_group_norm()
def test_instance_norm(self):
if core.is_compiled_with_cuda():
place = core.CUDAPlace(0)
......@@ -2348,7 +2352,7 @@ class TestLayer(LayerTest):
with self.assertRaises(TypeError):
layers.eye(num_rows=3, batch_shape=[-1])
def test_while_loop(self):
def func_while_loop(self):
with self.static_graph():
i = layers.fill_constant(shape=[1], dtype='int64', value=0)
ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
......@@ -2363,7 +2367,6 @@ class TestLayer(LayerTest):
static_ret = self.get_static_graph_result(feed={}, fetch_list=out)
with self.dynamic_graph():
# TODO(wuweilong): Add with _test_eager_guard():
i = layers.fill_constant(shape=[1], dtype='int64', value=0)
ten = layers.fill_constant(shape=[1], dtype='int64', value=10)
......@@ -2384,6 +2387,11 @@ class TestLayer(LayerTest):
self.assertTrue(np.array_equal(static_ret[0], dy_ret[0].numpy()))
def test_while_loop(self):
with _test_eager_guard():
self.func_while_loop()
self.func_while_loop()
def test_compare(self):
value_a = np.arange(3)
value_b = np.arange(3)
......
......@@ -21,6 +21,7 @@ from paddle.fluid import core
from op_test import OpTest
import numpy as np
from paddle.fluid.framework import _test_eager_guard
import os
def sample_output_one_dimension(out, dim):
......@@ -250,6 +251,60 @@ class TestMultinomialError(unittest.TestCase):
self.assertRaises(ValueError, test_dim_less_than_1)
class TestRandomValue(unittest.TestCase):
def test_fixed_random_number(self):
# Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
if not paddle.is_compiled_with_cuda():
return
# Different GPUs generate different random values. Only test V100 here.
if "V100" not in paddle.device.cuda.get_device_name():
return
if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None):
return
print("Test Fixed Random number on V100 GPU------>")
paddle.disable_static()
paddle.set_device('gpu')
paddle.seed(100)
x = paddle.randint(0, 100, [1024, 10000]).astype('float32')
y = paddle.multinomial(x, 1, replacement=False).numpy()
self.assertEqual(np.sum(y), 5187793)
self.assertEqual(np.mean(y), 5066.2041015625)
expect = [9982, 1655, 4741, 1323, 9319, 3298, 6473, 7477, 2507, 2628]
self.assertTrue(np.array_equal(y[100:110, :].flatten(), expect))
y = paddle.multinomial(x, 5000, replacement=False).numpy()
self.assertEqual(np.sum(y), 25603962316)
self.assertEqual(np.mean(y), 5000.77388984375)
expect = [7300, 6055, 8714, 5401, 7360, 161, 5035, 7002, 6788, 2916]
self.assertTrue(np.array_equal(y[100, 1000:1010], expect))
y = paddle.multinomial(x, 5000, replacement=False).numpy()
self.assertEqual(np.sum(y), 25592855710)
self.assertEqual(np.mean(y), 4998.604630859375)
expect = [5700, 6567, 4399, 5688, 7472, 545, 6894, 526, 2124, 385]
self.assertTrue(np.array_equal(y[300, 3000:3010], expect))
y = paddle.multinomial(x, 20000, replacement=True).numpy()
self.assertEqual(np.sum(y), 102371362581)
self.assertEqual(np.mean(y), 4998.60168852539)
self.assertEqual(np.std(y), 2886.316308500771)
expect = [7630, 8235, 8445, 3275, 5580, 4591, 1331, 342, 1662, 7156]
self.assertTrue(np.array_equal(y[100, 0:10], expect))
y = paddle.multinomial(x, 20000, replacement=True).numpy()
self.assertEqual(np.sum(y), 102400672117)
self.assertEqual(np.mean(y), 5000.032818212891)
self.assertEqual(np.std(y), 2886.913426124017)
expect = [4159, 7849, 9305, 5759, 4422, 122, 345, 2897, 5200, 5911]
self.assertTrue(np.array_equal(y[100, 0:10], expect))
paddle.enable_static()
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -56,7 +56,15 @@ class TestProfilerStatistic(unittest.TestCase):
mobilenet_node = HostPythonNode(
'MobileNet', profiler.TracerEventType.Forward, 20, 50, 1000, 1001)
yolonet_node = HostPythonNode(
'Yolov3Net', profiler.TracerEventType.Forward, 50, 100, 1000, 1001)
'Yolov3Net', profiler.TracerEventType.Forward, 50, 110, 1000, 1001)
userdefined_node = HostPythonNode('Communication Time',
profiler.TracerEventType.UserDefined,
100, 110, 1000, 1001)
communication_node = HostPythonNode(
'Communication', profiler.TracerEventType.Communication, 105, 110,
1000, 1001)
backward_node = HostPythonNode('Gradient Backward',
profiler.TracerEventType.Backward, 120,
200, 1000, 1001)
......@@ -114,7 +122,9 @@ class TestProfilerStatistic(unittest.TestCase):
optimization_node
])
mobilenet_node.children_node.append(conv2d_node)
yolonet_node.children_node.append(sync_batch_norm_node)
yolonet_node.children_node.extend(
[sync_batch_norm_node, userdefined_node])
userdefined_node.children_node.append(communication_node)
conv2d_node.children_node.extend(
[conv2d_infer_shape, conv2d_compute, conv2d_MemCpy])
conv2d_compute.runtime_node.append(conv2d_launchkernel)
......@@ -145,7 +155,7 @@ class TestProfilerStatistic(unittest.TestCase):
profiler.TracerEventType.ProfileStep), 400)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Forward), 90)
profiler.TracerEventType.Forward), 100)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Backward), 80)
......@@ -169,15 +179,18 @@ class TestProfilerStatistic(unittest.TestCase):
0, profiler.TracerEventType.Memcpy), 60)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.UserDefined), 15)
profiler.TracerEventType.UserDefined), 25)
self.assertEqual(
time_range_summary.get_cpu_range_sum(
profiler.TracerEventType.Communication), 5)
self.assertEqual(len(event_summary.items), 2)
self.assertEqual(len(event_summary.userdefined_items), 0)
self.assertEqual(len(event_summary.userdefined_items), 1)
self.assertEqual(len(event_summary.model_perspective_items), 3)
self.assertEqual(len(event_summary.memory_manipulation_items), 1)
self.assertEqual(event_summary.items['conv2d'].cpu_time, 15)
self.assertEqual(event_summary.items['conv2d'].gpu_time, 25)
self.assertEqual(
event_summary.model_perspective_items['Forward'].cpu_time, 90)
event_summary.model_perspective_items['Forward'].cpu_time, 100)
self.assertEqual(
event_summary.model_perspective_items['Forward'].gpu_time, 135)
self.assertEqual(
......
......@@ -116,7 +116,7 @@ class PS_Test(unittest.TestCase):
return proc
def test_ps_1(self):
args = "--mode ps"
args = "--run_mode ps"
p = self.pdrun(args)
p.wait()
self.assertTrue(p.poll() == 0)
......
......@@ -22,6 +22,7 @@ import numpy as np
import paddle
from paddle.fluid.layer_helper import LayerHelper
from functools import reduce
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
class TestSetValueBase(unittest.TestCase):
......@@ -69,7 +70,7 @@ class TestSetValueApi(TestSetValueBase):
paddle.enable_static()
return out
def test_api(self):
def func_test_api(self):
static_out = self._run_static()
dynamic_out = self._run_dynamic()
self._get_answer()
......@@ -82,6 +83,11 @@ class TestSetValueApi(TestSetValueBase):
(self.data == dynamic_out).all(),
msg=error_msg.format("dynamic", self.data, dynamic_out))
def test_api(self):
with _test_eager_guard():
self.func_test_api()
self.func_test_api()
# 1. Test different type of item: int, Python slice, Paddle Tensor
# 1.1 item is int
......@@ -995,9 +1001,9 @@ class TestBackward(unittest.TestCase):
fetch_list=[var.name + "@GRAD", z.name + "@GRAD"])
self.assertTrue((var_grad == z_grad[0, :]).all())
def test_dynamic(self):
paddle.disable_static()
def func_test_dynamic(self):
model = Model()
x = paddle.ones([1, 12, 3, 3]).astype("float32")
y = paddle.ones([1, 12, 3, 3]).astype("float32")
......@@ -1006,11 +1012,18 @@ class TestBackward(unittest.TestCase):
self.assertTrue(var.grad.shape == x.grad[0, :, 0, 0].shape)
#
# TODO(pangyoki) add inplace and delete if
if not _in_eager_mode():
self.assertTrue((0 == x.grad[0, :, 0, 0]).all())
def test_dynamic(self):
with _test_eager_guard():
self.func_test_dynamic()
self.func_test_dynamic()
class TestGradientTruncated(unittest.TestCase):
def test_consistent_with_competitor(self):
def func_test_consistent_with_competitor(self):
paddle.disable_static()
def set_value(t, value):
......@@ -1182,6 +1195,11 @@ class TestGradientTruncated(unittest.TestCase):
self.assertTrue(not x.stop_gradient)
self.assertTrue(not x.is_leaf)
def test_consistent_with_competitor(self):
with _test_eager_guard():
self.func_test_consistent_with_competitor()
self.func_test_consistent_with_competitor()
def test_static_graph(self):
paddle.enable_static()
......@@ -1328,6 +1346,7 @@ class TestGradientTruncated(unittest.TestCase):
self.assertTrue((numel(out1[0][0:5:3].shape) == out3[0]).all())
array = array[0]
paddle.disable_static()
class TestSetValueInplace(unittest.TestCase):
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numpy.lib.stride_tricks import as_strided
import paddle
import unittest
from op_test import OpTest
def frame_from_librosa(x, frame_length, hop_length, axis=-1):
if axis == -1 and not x.flags["C_CONTIGUOUS"]:
x = np.ascontiguousarray(x)
elif axis == 0 and not x.flags["F_CONTIGUOUS"]:
x = np.asfortranarray(x)
n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
strides = np.asarray(x.strides)
if axis == -1:
shape = list(x.shape)[:-1] + [frame_length, n_frames]
strides = list(strides) + [hop_length * x.itemsize]
elif axis == 0:
shape = [n_frames, frame_length] + list(x.shape)[1:]
strides = [hop_length * x.itemsize] + list(strides)
else:
raise ValueError("Frame axis={} must be either 0 or -1".format(axis))
return as_strided(x, shape=shape, strides=strides)
def stft_np(x, n_fft, hop_length, **kwargs):
frames = frame_from_librosa(x, n_fft, hop_length)
res = np.fft.rfft(frames, axis=1)
return res
class TestStftOp(OpTest):
def setUp(self):
self.op_type = "stft"
self.shape, self.type, self.attrs = self.initTestCase()
self.inputs = {
'X': np.random.random(size=self.shape).astype(self.type),
}
self.outputs = {'Out': stft_np(x=self.inputs['X'], **self.attrs)}
def initTestCase(self):
input_shape = (2, 100)
input_type = 'float64'
attrs = {
'n_fft': 50,
'hop_length': 15,
'normalized': False,
'onesided': True,
}
return input_shape, input_type, attrs
def test_check_output(self):
paddle.enable_static()
self.check_output()
paddle.disable_static()
def test_check_grad_normal(self):
paddle.enable_static()
self.check_grad(['X'], 'Out')
paddle.disable_static()
if __name__ == '__main__':
unittest.main()
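As a sanity check on the striding logic in frame_from_librosa above, a small shape example (numbers assumed for illustration):

import numpy as np

x = np.arange(16, dtype=np.float64).reshape(2, 8)  # (batch, samples)
frames = frame_from_librosa(x, frame_length=4, hop_length=2, axis=-1)
# n_frames = 1 + (8 - 4) // 2 = 3, so the result has shape (2, 4, 3)
assert frames.shape == (2, 4, 3)
# frame j along the last axis is the window starting at sample j * hop_length
assert np.array_equal(frames[0, :, 1], x[0, 2:6])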
......@@ -22,6 +22,7 @@ import copy
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.framework import _test_eager_guard, _in_eager_mode
class TestVarBase(unittest.TestCase):
......@@ -874,7 +875,7 @@ class TestVarBase(unittest.TestCase):
col = np.array([2, 1, 3])
self.assertTrue(np.array_equal(array[row, col], x[row, col].numpy()))
def test_slice(self):
def func_test_slice(self):
with fluid.dygraph.guard():
self._test_slice()
self._test_slice_for_tensor_attr()
......@@ -899,6 +900,11 @@ class TestVarBase(unittest.TestCase):
mask = np.array([1, 0, 1, 0], dtype=bool)
var[paddle.to_tensor([0, 1]), mask]
def test_slice(self):
with _test_eager_guard():
self.func_test_slice()
self.func_test_slice()
def test_var_base_to_np(self):
with fluid.dygraph.guard():
var = fluid.dygraph.to_variable(self.array)
......@@ -1125,7 +1131,6 @@ class TestVarBase(unittest.TestCase):
class TestVarBaseSetitem(unittest.TestCase):
def setUp(self):
paddle.disable_static()
self.set_dtype()
self.tensor_x = paddle.to_tensor(np.ones((4, 2, 3)).astype(self.dtype))
self.np_value = np.random.random((2, 3)).astype(self.dtype)
......@@ -1135,11 +1140,12 @@ class TestVarBaseSetitem(unittest.TestCase):
self.dtype = "int32"
def _test(self, value):
paddle.disable_static()
if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 0)
id_origin = id(self.tensor_x)
self.tensor_x[0] = value
if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 1)
if isinstance(value, (six.integer_types, float)):
......@@ -1152,27 +1158,47 @@ class TestVarBaseSetitem(unittest.TestCase):
self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[1:2] = value
if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 2)
self.assertTrue(np.array_equal(self.tensor_x[1].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x))
self.tensor_x[...] = value
if not _in_eager_mode():
self.assertEqual(self.tensor_x.inplace_version, 3)
self.assertTrue(np.array_equal(self.tensor_x[3].numpy(), result))
self.assertEqual(id_origin, id(self.tensor_x))
def test_value_tensor(self):
paddle.disable_static()
def func_test_value_tensor(self):
self._test(self.tensor_value)
def test_value_numpy(self):
paddle.disable_static()
def test_value_tensor(self):
with _test_eager_guard():
self.setUp()
self.func_test_value_tensor()
self.setUp()
self.func_test_value_tensor()
def func_test_value_numpy(self):
self._test(self.np_value)
def test_value_int(self):
paddle.disable_static()
def test_value_numpy(self):
with _test_eager_guard():
self.setUp()
self.func_test_value_numpy()
self.setUp()
self.func_test_value_numpy()
def func_test_value_int(self):
self._test(10)
def test_value_int(self):
with _test_eager_guard():
self.setUp()
self.func_test_value_int()
self.setUp()
self.func_test_value_int()
class TestVarBaseSetitemInt64(TestVarBaseSetitem):
def set_dtype(self):
......
......@@ -382,7 +382,7 @@ def _getitem_impl_(var, item):
idx = assign(np.array(slice_item).astype("int32"))
return index_select(var, index=idx, axis=0)
elif isinstance(slice_item, (Variable)):
elif isinstance(slice_item, (Variable, core.eager.Tensor)):
if len(item) == 1:
from ..tensor import index_select, gather_nd
......@@ -636,7 +636,7 @@ def _setitem_impl_(var, item, value):
shape = list(value.shape)
if dtype == core.VarDesc.VarType.BOOL:
value_name = "bool_values"
values = [bool(v) for v in value.flat]
values = [int(v) for v in value.flat]
elif dtype == core.VarDesc.VarType.FP32:
value_name = "fp32_values"
values = [float(v) for v in value.flat]
......@@ -657,7 +657,7 @@ def _setitem_impl_(var, item, value):
attrs[value_name] = values
attrs["shape"] = shape
elif isinstance(value, Variable):
elif isinstance(value, (Variable, core.eager.Tensor)):
inputs["ValueTensor"] = value
else:
raise TypeError(
......@@ -665,7 +665,9 @@ def _setitem_impl_(var, item, value):
"paddle.Tensor to a paddle.Tensor, but received {}".format(
type(value)))
if paddle.fluid.framework.in_dygraph_mode():
if paddle.fluid.framework.in_dygraph_mode(
) and not paddle.fluid.framework._in_eager_mode():
# TODO(pangyoki) add inplace(BumpInplaceVersion) if need
var._bump_inplace_version()
cur_block = default_main_program().current_block()
......
......@@ -20,7 +20,7 @@ from .utils import RecordEvent, load_profiler_result
from .profiler_statistic import SortedKeys
__all__ = [
'ProfilerState', 'ProfilerTarget', 'TracerEventType', 'make_scheduler',
'ProfilerState', 'ProfilerTarget', 'make_scheduler',
'export_chrome_tracing', 'export_protobuf', 'Profiler', 'RecordEvent',
'load_profiler_result', 'SortedKeys'
]
......@@ -24,7 +24,7 @@ from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions,
TracerEventType)
from .utils import RecordEvent, wrap_optimizers
from .profiler_statistic import SortedKeys
from .profiler_statistic import StatisticData, _build_table, SortedKeys
class ProfilerState(Enum):
......@@ -32,9 +32,12 @@ class ProfilerState(Enum):
Profiler state that can be specified to control profiler action.
CLOSED: The profilers are closed.
READY: The profilers are open, but the data will not be recorded.
This state is used to reduce overhead when the profilers start.
RECORD: The profilers are open, and the data will be recorded.
RECORD_AND_RETURN: The profilers are open, and at the last batch of current profiler period,
the collected data will be returned.
"""
......@@ -47,6 +50,10 @@ class ProfilerState(Enum):
class ProfilerTarget(Enum):
r"""
Target device for profiling.
CPU: Profile events on CPU.
GPU: Profile events on GPU.
"""
CPU = 0
GPU = 1
......@@ -62,6 +69,8 @@ def make_scheduler(*,
Return a scheduler function, which schedules the state according to the settings.
The state transitions conform to:
.. code-block:: text
(CLOSED) (CLOSED) (CLOSED) (READY) (RECORD,last RETURN) (CLOSED)
START -> skip_first -> closed -> ready -> record -> END
| |
......@@ -81,13 +90,23 @@ def make_scheduler(*,
Examples:
1. profiling range [2, 5]
batch 0: closed, batch 1: ready, batch [2, 5] record
.. code-block:: python
make_scheduler(closed=1, ready=1, record=4, repeat=1)
import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1)
2. profiling range [3,6], [9,12], [15,18]...
batch 0: skipped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat
.. code-block:: python
make_scheduler(closed=1, ready=1, record=4, skip_first=1)
import paddle.profiler as profiler
profiler.make_scheduler(closed=1, ready=1, record=4, skip_first=1)
"""
def getScheduleState(step: int) -> ProfilerState:
......@@ -138,14 +157,15 @@ def export_chrome_tracing(dir_name: str,
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
profiler.ProfilerTarget.GPU],
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 10),
on_trace_ready = profiler.export_chrome_tracing('./log')
) as p:
for iter in range(N):
train()
on_trace_ready=profiler.export_chrome_tracing('./log')) as p:
for iter in range(10):
#train()
p.step()
"""
if not os.path.exists(dir_name):
......@@ -181,14 +201,15 @@ def export_protobuf(dir_name: str, worker_name: Optional[str]=None) -> Callable:
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
profiler.ProfilerTarget.GPU],
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 10),
on_trace_ready = profiler.export_protobuf('./log')
) as p:
for iter in range(N):
train()
on_trace_ready = profiler.export_protobuf('./log')) as p:
for iter in range(10):
#train()
p.step()
"""
if not os.path.exists(dir_name):
......@@ -216,7 +237,7 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]:
r"""
Get the profiler targets currently supported on this system.
"""
if paddle.device.is_compiled_with_cuda():
if _Profiler.is_cupti_supported():
return [ProfilerTarget.CPU, ProfilerTarget.GPU]
return [ProfilerTarget.CPU]
......@@ -226,48 +247,56 @@ class Profiler:
Profiler context manager, the user interface to manage the profiling process.
Parameters:
targets (iterable): list of tracing targets, currently supported values:
``paddle.profiler.ProfilerTarget.CPU``,
``paddle.profiler.ProfilerTarget.GPU``.
targets (iterable): list of tracing targets; currently supported values: ``ProfilerTarget.CPU``, ``ProfilerTarget.GPU``.
scheduler (callable or tuple): If it is a callable object, it takes a step number as parameter and return the corresponding ``ProfilerState``.
If not provided, the default sheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
If not provided, the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch,
which means profiling range [start_batch, end_batch).
on_trace_ready (callable): callable object, takes the Profiler object as parameter, which provides a way for users to do post-processing.
This callable object will be called when ``sheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``.
Examples:
1. profiling range [2, 5)
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
profiler.ProfilerTarget.GPU],
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (2, 5),
on_trace_ready = profiler.export_chrome_tracing('./log')
) as p:
for iter in range(N):
train()
on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
for iter in range(10):
#train()
p.step()
2. profiling range [2,4], [7, 9], [11,13]
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(targets=[profiler.ProfilerTarget.CPU,
profiler.ProfilerTarget.GPU],
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = profiler.make_scheduler(closed=1, ready=1, record=3, repeat=3),
on_trace_ready = profiler.export_chrome_tracing('./log')
) as p:
for iter in range(N):
train()
on_trace_ready = profiler.export_chrome_tracing('./log')) as p:
for iter in range(10):
#train()
p.step()
3. Use profiler without context manager, and use default parameters
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
p = profiler.Profiler()
p.start()
for iter in range(N):
train()
for iter in range(10):
#train()
p.step()
p.stop()
p.summary()
"""
def __init__(
......@@ -335,6 +364,21 @@ class Profiler:
r'''
Start the profiler and enter the first profiler step (0).
The state transitions from CLOSED to self.current_state, triggering the corresponding action.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (1, 9),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
'''
# CLOSED -> self.current_state
if self.current_state == ProfilerState.READY:
......@@ -354,6 +398,21 @@ class Profiler:
r'''
Stop the profiler; the state transitions from self.current_state to CLOSED.
The corresponding action is triggered, and the profiler result is post-processed via self.on_trace_ready if a result exists.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (1, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
'''
# self.current_state -> CLOSED
# In this situation, RECORD state is regarded as RECORD_AND_RETURN
......@@ -375,6 +434,22 @@ class Profiler:
r"""
Signals the profiler that the next profiling step has started.
Gets the new ProfilerState and triggers the corresponding action.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
"""
if self.record_event:
self.record_event.end()
......@@ -448,6 +523,21 @@ class Profiler:
def export(self, path="", format="json"):
r"""
Exports the tracing data in Chrome tracing data format.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
prof.export(path="./profiler_data.json", format="json")
"""
if self.profiler_result:
self.profiler_result.save(path, format)
......@@ -461,9 +551,35 @@ class Profiler:
Print the Summary table.
Parameters:
sorted_by: how to rank the op table items.
detail: expand each operator detail information.
thread_sep: print op table each thread.
time_unit: can be chosen form ['s', 'ms', 'us', 'ns']
sorted_by(SortedKeys): how to rank the op table items.
op_detail(bool): whether to expand each operator's detail information.
thread_sep(bool): whether to print a separate op table for each thread.
time_unit(str): can be chosen from ['s', 'ms', 'us', 'ns']
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
prof = profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 7),
on_trace_ready = profiler.export_chrome_tracing('./log'))
prof.start()
for iter in range(10):
#train()
prof.step()
prof.stop()
prof.summary(sorted_by=profiler.SortedKeys.CPUTotal, op_detail=True, thread_sep=False, time_unit='ms')
"""
pass
if self.profiler_result:
statistic_data = StatisticData(
self.profiler_result.get_data(),
self.profiler_result.get_extra_info())
print(
_build_table(
statistic_data,
sorted_by=sorted_by,
op_detail=op_detail,
thread_sep=thread_sep,
time_unit=time_unit))
......@@ -34,6 +34,22 @@ _CommunicationOpName = ['reduce', 'broadcast', 'rpc']
class SortedKeys(Enum):
r"""
Sorted keys for printing summary table.
CPUTotal: Sorted by CPU total time.
CPUAvg: Sorted by CPU average time.
CPUMax: Sorted by CPU max time.
CPUMin: Sorted by CPU min time.
GPUTotal: Sorted by GPU total time.
GPUAvg: Sorted by GPU average time.
GPUMax: Sorted by GPU max time.
GPUMin: Sorted by GPU min time.
"""
CPUTotal = 0
CPUAvg = 1
......@@ -642,6 +658,171 @@ def _build_table(statistic_data,
append('')
append('')
###### Print Model Summary Report ######
model_perspective_items = statistic_data.event_summary.model_perspective_items
if model_perspective_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 15
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Model Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
accumulation_time = 0
row_values = [
'Total Time', '-', '{} / - / - / - / {}'.format(
format_time(
total_time, unit=time_unit), format_ratio(1)),
'- / - / - / - / -'
]
append(row_format.format(*row_values))
for name in ['Dataloader', 'Forward', 'Backward', 'Optimization']:
if name in model_perspective_items:
item = model_perspective_items[name]
row_values = [
' {}'.format(name), item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time))
]
append(row_format.format(*row_values))
accumulation_time += item.cpu_time
other_time = total_time - accumulation_time
row_values = [
' Others', '-', '{} / - / - / - / {}'.format(
format_time(
other_time, unit=time_unit),
format_ratio(float(other_time) / total_time)),
'- / - / - / - / -'
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Distribution Summary Report ######
if TracerEventType.Communication in statistic_data.time_range_summary.CPUTimeRange:
headers = [
'Name',
'Total Time',
'Ratio (%)',
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
DEFAULT_COLUMN_WIDTH = 20
for _ in headers:
add_column(DEFAULT_COLUMN_WIDTH)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Distribution Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
cpu_communication_time_range = []
gpu_communication_time_range = []
cpu_communication_time_range = merge_ranges(
statistic_data.time_range_summary.CPUTimeRange[
TracerEventType.Communication], cpu_communication_time_range)
kernel_time_range = []
for device_id, device_time_ranges in statistic_data.time_range_summary.GPUTimeRange.items(
):
kernel_time_range = merge_ranges(
device_time_ranges[TracerEventType.Kernel],
kernel_time_range,
is_sorted=True)
gpu_communication_time_range = merge_ranges(
device_time_ranges[TracerEventType.Communication],
gpu_communication_time_range,
is_sorted=True)
communication_time_range = merge_ranges(
cpu_communication_time_range,
gpu_communication_time_range,
is_sorted=True)
computation_time_range = subtract_ranges(kernel_time_range,
gpu_communication_time_range)
overlap_time_range = intersection_ranges(communication_time_range,
computation_time_range)
communication_time = sum_ranges(communication_time_range)
computation_time = sum_ranges(computation_time_range)
overlap_time = sum_ranges(overlap_time_range)
row_values = [
'Communication', format_time(
communication_time, unit=time_unit),
format_ratio(float(communication_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
'Computation', format_time(
computation_time, unit=time_unit),
format_ratio(float(computation_time) / total_time)
]
append(row_format.format(*row_values))
row_values = [
'Overlap', format_time(
overlap_time, unit=time_unit),
format_ratio(float(overlap_time) / total_time)
]
append(row_format.format(*row_values))
append(header_sep)
append(
"Note:\nCommunication time: Communication Op time and its kernel time on gpu.\n"
"Computation time: Kernel time, substract kernels belong to communication op.\n"
"Overlap time: Communication time intersect with computation time.\n"
"Example:\n"
"Communication:\n"
" CPU: |_________________|\n"
" GPU: |______________|\n"
" Total: |_________________| |______________|\n"
"Computation time(Kernel):\n"
" GPU: |________________|\n"
"Overlap time: |___________|\n")
append('-' * line_length)
append('')
append('')
###### Print Operator Summary Report ######
if statistic_data.event_summary.items:
headers = [
......@@ -708,11 +889,6 @@ def _build_table(statistic_data,
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_gpu_time)
total_cpu_time = 0
total_gpu_time = 0
for name, item in sorted_items:
total_cpu_time += item.cpu_time
total_gpu_time += item.gpu_time
for name, item in sorted_items:
row_values = [
name, item.call, '{} / {} / {} / {} / {}'.format(
......@@ -724,7 +900,7 @@ def _build_table(statistic_data,
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_cpu_time)),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
......@@ -734,7 +910,7 @@ def _build_table(statistic_data,
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_gpu_time))
format_ratio(float(item.gpu_time) / total_time))
]
append(row_format.format(*row_values))
if op_detail:
......@@ -752,8 +928,7 @@ def _build_table(statistic_data,
format_time(
innerop_node.min_cpu_time, unit=time_unit),
format_ratio(
float(innerop_node.cpu_time) /
total_cpu_time)),
float(innerop_node.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
innerop_node.gpu_time, unit=time_unit),
......@@ -764,8 +939,7 @@ def _build_table(statistic_data,
format_time(
innerop_node.min_gpu_time, unit=time_unit),
format_ratio(
float(innerop_node.gpu_time) /
total_gpu_time))
float(innerop_node.gpu_time) / total_time))
]
append(row_format.format(*row_values))
for device_node_name, devicenode in innerop_node.devices.items(
......@@ -792,7 +966,7 @@ def _build_table(statistic_data,
unit=time_unit),
format_ratio(
float(devicenode.gpu_time) /
total_gpu_time))
total_time))
]
append(row_format.format(*row_values))
for device_node_name, device_node in item.devices.items():
......@@ -814,11 +988,160 @@ def _build_table(statistic_data,
format_time(
devicenode.min_gpu_time, unit=time_unit),
format_ratio(
float(devicenode.gpu_time) /
total_gpu_time))
float(devicenode.gpu_time) / total_time))
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print Memory Manipulation Summary Report ######
if statistic_data.event_summary.memory_manipulation_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "Memory Manipulation Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
memory_manipulation_items = statistic_data.event_summary.memory_manipulation_items
for name, item in memory_manipulation_items.items():
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time)),
]
append(row_format.format(*row_values))
append(header_sep)
append('')
append('')
###### Print UserDefined Summary Report ######
if statistic_data.event_summary.userdefined_items:
headers = [
'Name', 'Calls', 'CPU Total / Avg / Max / Min / Ratio(%)',
'GPU Total / Avg / Max / Min / Ratio(%)'
]
row_format_list = [""]
header_sep_list = [""]
line_length_list = [-SPACING_SIZE]
name_column_width = 30
add_column(name_column_width)
add_column(6)
add_column(40)
add_column(40)
row_format = row_format_list[0]
header_sep = header_sep_list[0]
line_length = line_length_list[0]
# construct table string
append(add_title(line_length, "UserDefined Summary"))
append('Time unit: {}'.format(time_unit))
append(header_sep)
append(row_format.format(*headers))
append(header_sep)
if thread_sep:
userdefined_thread_items = statistic_data.event_summary.userdefined_thread_items
else:
userdefined_thread_items = {
'All threads merged':
statistic_data.event_summary.userdefined_items
}
for thread_id, items in userdefined_thread_items.items():
append(add_title(line_length, "Thread: {}".format(thread_id)))
if sorted_by == SortedKeys.CPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].cpu_time, reverse=True)
elif sorted_by == SortedKeys.CPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_cpu_time,
reverse=True)
elif sorted_by == SortedKeys.CPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_cpu_time)
elif sorted_by == SortedKeys.GPUTotal:
sorted_items = sorted(
items.items(), key=lambda x: x[1].gpu_time, reverse=True)
elif sorted_by == SortedKeys.GPUAvg:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].avg_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMax:
sorted_items = sorted(
items.items(),
key=lambda x: x[1].max_gpu_time,
reverse=True)
elif sorted_by == SortedKeys.GPUMin:
sorted_items = sorted(
items.items(), key=lambda x: x[1].min_gpu_time)
for name, item in sorted_items:
row_values = [
name,
item.call,
'{} / {} / {} / {} / {}'.format(
format_time(
item.cpu_time, unit=time_unit),
format_time(
item.avg_cpu_time, unit=time_unit),
format_time(
item.max_cpu_time, unit=time_unit),
format_time(
item.min_cpu_time, unit=time_unit),
format_ratio(float(item.cpu_time) / total_time)),
'{} / {} / {} / {} / {}'.format(
format_time(
item.gpu_time, unit=time_unit),
format_time(
item.avg_gpu_time, unit=time_unit),
format_time(
item.max_gpu_time, unit=time_unit),
format_time(
item.min_gpu_time, unit=time_unit),
format_ratio(float(item.gpu_time) / total_time)),
]
append(row_format.format(*row_values))
append(header_sep)
return ''.join(result)
......@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.fluid.core import (_RecordEvent, TracerEventType,
load_profiler_result)
from typing import Any
from warnings import warn
import functools
from contextlib import ContextDecorator
from paddle.fluid.core import (_RecordEvent, TracerEventType)
import paddle.fluid.core as core
_AllowedEventTypeList = [
TracerEventType.Dataloader, TracerEventType.ProfileStep,
TracerEventType.UserDefined, TracerEventType.Forward,
......@@ -33,13 +34,27 @@ class RecordEvent(ContextDecorator):
Parameters:
name(str): Name of the record event
event_type(TracerEventType): Type of the record event, can be used for statistics.
Examples:
.. code-block:: python
import paddle
import paddle.profiler as profiler
with profiler.RecordEvent(name='op1', event_type=TracerEventType.UserDefined):
op1()
# method1: using context manager
with profiler.RecordEvent("record_add"):
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 + data2
# method2: call begin() and end()
record_event = profiler.RecordEvent("record_add")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 + data2
record_event.end()
Note:
RecordEvent will take effect only when profiler is on and at the state of RECORD.
"""
def __init__(self,
......@@ -57,6 +72,20 @@ class RecordEvent(ContextDecorator):
self.end()
def begin(self):
r"""
Record the start time of the event.
.. code-block:: python
import paddle
import paddle.profiler as profiler
record_event = profiler.RecordEvent("record_sub")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 - data2
record_event.end()
"""
if self.event_type not in _AllowedEventTypeList:
warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\
can be recorded.".format(*_AllowedEventTypeList))
......@@ -67,10 +96,51 @@ class RecordEvent(ContextDecorator):
self.event = _RecordEvent(self.name, self.event_type)
def end(self):
r'''
Record the end time of the event.
.. code-block:: python
import paddle
import paddle.profiler as profiler
record_event = profiler.RecordEvent("record_mul")
record_event.begin()
data1 = paddle.randn(shape=[3])
data2 = paddle.randn(shape=[3])
result = data1 * data2
record_event.end()
'''
if self.event:
self.event.end()
def load_profiler_result(filename: str):
r"""
Load dumped profiler data back to memory.
Parameters:
filename(str): Name of the exported protobuf file of profiler data.
Returns:
ProfilerResult object.
Examples:
.. code-block:: python
# required: gpu
import paddle.profiler as profiler
with profiler.Profiler(
targets=[profiler.ProfilerTarget.CPU, profiler.ProfilerTarget.GPU],
scheduler = (3, 10)) as p:
for iter in range(10):
#train()
p.step()
p.export('test_export_protobuf.pb', format='pb')
profiler_result = profiler.load_profiler_result('test_export_protobuf.pb')
"""
return core.load_profiler_result(filename)
def wrap_optimizers():
def optimizer_wrapper(func):
@functools.wraps(func)
......
......@@ -119,6 +119,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None):
f'Unexpected hop_length: {hop_length}. It should be a positive integer.'
)
if in_dygraph_mode():
if frame_length > x.shape[axis]:
raise ValueError(
f'Attribute frame_length should be less than or equal to the sequence length, '
......@@ -306,8 +307,7 @@ def stft(x,
y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372]
"""
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64', 'complex64', 'complex128'],
'stft')
x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft')
x_rank = len(x.shape)
assert x_rank in [1, 2], \
......@@ -325,6 +325,7 @@ def stft(x,
if win_length is None:
win_length = n_fft
if in_dygraph_mode():
assert 0 < n_fft <= x.shape[-1], \
f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.'
......@@ -359,7 +360,7 @@ def stft(x,
x_frames = x_frames.transpose(
perm=[0, 2,
1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft)
x_frames = x_frames * window
x_frames = paddle.multiply(x_frames, window)
norm = 'ortho' if normalized else 'backward'
if is_complex(x_frames):
......@@ -495,6 +496,7 @@ def istft(x,
n_frames = x.shape[-1]
fft_size = x.shape[-2]
if in_dygraph_mode():
if onesided:
assert (fft_size == n_fft // 2 + 1), \
'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size)
......@@ -506,7 +508,10 @@ def istft(x,
assert len(window.shape) == 1 and len(window) == win_length, \
'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape)
else:
window = paddle.ones(shape=(win_length, ))
window_dtype = paddle.float32 if x.dtype in [
paddle.float32, paddle.complex64
] else paddle.float64
window = paddle.ones(shape=(win_length, ), dtype=window_dtype)
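# The default window must match the real dtype of x: float32 for
# float32/complex64 inputs, float64 for float64/complex128 inputs.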
if win_length < n_fft:
pad_left = (n_fft - win_length) // 2
......@@ -534,15 +539,15 @@ def istft(x,
x = x[:, :, :n_fft // 2 + 1]
out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None)
out = paddle.multiply(out, window).transpose(
perm=[0, 2, 1]) # (batch, n_fft, num_frames)
out = overlap_add(
x=(out * window).transpose(
perm=[0, 2, 1]), # (batch, n_fft, num_frames)
hop_length=hop_length,
axis=-1) # (batch, seq_length)
x=out, hop_length=hop_length, axis=-1) # (batch, seq_length)
window_envelop = overlap_add(
x=paddle.tile(
x=window * window, repeat_times=[n_frames, 1]).transpose(
x=paddle.multiply(window, window).unsqueeze(0),
repeat_times=[n_frames, 1]).transpose(
perm=[1, 0]), # (n_fft, num_frames)
hop_length=hop_length,
axis=-1) # (seq_length, )
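# Weighted overlap-add normalization: dividing the overlap-added frames
# by this window envelope, i.e. the sum over m of window[n - m*hop_length]**2,
# gives the standard least-squares inverse of the STFT.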
......@@ -561,7 +566,7 @@ def istft(x,
window_envelop = window_envelop[start:start + length]
# Check whether the Nonzero Overlap Add (NOLA) constraint is met.
if window_envelop.abs().min().item() < 1e-11:
if in_dygraph_mode() and window_envelop.abs().min().item() < 1e-11:
raise ValueError(
'Abort istft because Nonzero Overlap Add (NOLA) condition failed. For more information about NOLA constraint please see `scipy.signal.check_NOLA`(https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.check_NOLA.html).'
)
......
......@@ -147,7 +147,9 @@ def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
var_names = {'x': x, 'y': y}
for name, val in var_names.items():
check_variable_and_dtype(
val, name, ['float16', 'float32', 'float64'], 'matmul')
val, name,
['float16', 'float32', 'float64', 'complex64', 'complex128'],
'matmul')
__check_input(x, y)
......
......@@ -243,8 +243,8 @@ def add(x, y, name=None):
"""
if paddle.in_dynamic_mode():
#if _in_eager_mode():
#return _C_ops.final_state_add(x, y)
if _in_eager_mode():
return _C_ops.final_state_add(x, y)
return _C_ops.elementwise_add(x, y)
return _elementwise_op(LayerHelper('elementwise_add', **locals()))
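# Reading aid (not part of the change): the dispatch ladder prefers the
# experimental eager "final state" op when eager mode is on, falls back
# to the legacy dygraph C++ op, and otherwise builds a static-graph op
# through LayerHelper.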
......@@ -324,8 +324,8 @@ def subtract(x, y, name=None):
axis = -1
act = None
if paddle.in_dynamic_mode():
# if _in_eager_mode():
# return _C_ops.final_state_subtract( x, y)
if _in_eager_mode():
return _C_ops.final_state_subtract(x, y)
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
return _elementwise_op(LayerHelper(op_type, **locals()))
......@@ -383,6 +383,8 @@ def divide(x, y, name=None):
axis = -1
act = None
if paddle.in_dynamic_mode():
if _in_eager_mode():
return _C_ops.final_state_divide(x, y)
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
......@@ -512,6 +514,8 @@ def multiply(x, y, name=None):
axis = -1
if paddle.in_dynamic_mode():
if _in_eager_mode():
return _C_ops.final_state_multiply(x, y)
return _elementwise_op_in_dygraph(
x, y, axis=axis, act=act, op_name=op_type)
......@@ -3801,13 +3805,13 @@ def diff(x, n=1, axis=-1, prepend=None, append=None, name=None):
attrs_1 += ('starts', starts_1)
ends_1 = [dim_len - 1]
attrs_1 += ('ends', ends_1)
input_front = _C_ops.slice(new_input, None, None, 'axes', axes, \
input_front = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
'infer_flags', infer_flags, *attrs_1)
starts_2 = [1]
attrs_2 += ('starts', starts_2)
ends_2 = [dim_len]
attrs_2 += ('ends', ends_2)
input_back = _C_ops.slice(new_input, None, None, 'axes', axes, \
input_back = _C_ops.slice(new_input, None, None, None, None, 'axes', axes, \
'infer_flags', infer_flags, *attrs_2)
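# Assumption for readers: the two extra None placeholders correspond to
# the slice op's tensor-list inputs (StartsTensorList/EndsTensorList) in
# its updated signature; starts/ends are still passed as attributes.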
if x.dtype == paddle.bool:
......
......@@ -317,7 +317,7 @@ def tensor_to_string(tensor, prefix='Tensor'):
_template = "{prefix}(shape={shape}, dtype={dtype}, place={place}, stop_gradient={stop_gradient},\n{indent}{data})"
if not tensor._is_initialized():
if not tensor._is_dense_tensor_hold_allocation():
return "Tensor(Not initialized)"
if tensor.is_sparse():
......
......@@ -5,7 +5,7 @@
func : ElementwiseInferMeta
kernel :
func : add
# backward : add_grad
backward : add_grad
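# Re-enabling `backward` here links the final-state add api to the
# add_grad entry registered in the backward api yaml below, making the
# eager op differentiable.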
- api : cast
args : (Tensor x, DataType out_dtype)
......@@ -47,6 +47,7 @@
func : ElementwiseInferMeta
kernel :
func : divide
backward : divide_grad
- api : dot
args : (Tensor x, Tensor y)
......@@ -136,6 +137,7 @@
func : ElementwiseInferMeta
kernel :
func : multiply
backward : multiply_grad
- api : ones_like
args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={})
......@@ -208,6 +210,7 @@
func : ElementwiseInferMeta
kernel :
func : subtract
backward : subtract_grad
- api : sum
args : (Tensor x, int64[] axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
......
......@@ -25,10 +25,9 @@
output : Tensor(x_grad)
invoke : scale(out_grad, scale, bias, bias_after_scale)
- backward_api : add_grad
forward : add (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
......@@ -36,6 +35,37 @@
kernel :
func : add_grad
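# The added `int axis = -1` argument carries the broadcast axis of the
# forward elementwise op through to the grad kernel; -1 keeps the
# default trailing-dimension alignment.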
- backward_api : subtract_grad
forward : subtract (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : subtract_grad
- backward_api : multiply_grad
forward : multiply (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : multiply_grad
- backward_api : divide_grad
forward : divide (Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1)
output : Tensor(x_grad), Tensor(y_grad)
infer_meta :
func : GeneralBinaryGradInferMeta
param : [x, y]
kernel :
func : divide_grad
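# Each backward_api entry mirrors its forward op: `forward` names the op
# being differentiated, `args` lists what the grad kernel consumes
# (forward inputs, out_grad and the broadcast axis), and `infer_meta`
# reuses [x, y] so the gradients inherit the input shapes.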
- backward_api : digamma_grad
forward : digamma (Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out_grad)
......
......@@ -733,7 +733,7 @@ with redirect_stdout():
},
entry_points={
'console_scripts': [
'fleetrun = paddle.distributed.launch.__main__:launch'
'fleetrun = paddle.distributed.launch.main:launch'
]
},
classifiers=[
......