Unverified commit 91ebc460, authored by Chen Weihang, committed by GitHub

[Cherry-pick] Complex network execute support (#29905)

* [Complex] Add support for complex grad accumulated (#29889)

* add support for complex grad accumulated

* add unittest for coverage

* update test dtype

* remove useless blank line

* [Complex] Handle complex to real after type promotion (#29855)

* try to add fwd op input dtypes

* refactor base impl

* return tmp_ins after dygraph prepare data

* fix typo found in debug

* polish comment & add complex net test

* revert detail change

* fix unittest failed

* add complex kernel condition control

* fix xpu test failed & polish comment

* polish details by review comments

* Complex op test (#29753)

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* change grad elementwise_mul for complex types (#29757)

* add conj op for complex types

* add conj for complex types

* add more test case

* add conj_op test

* modify conj api and impl

* add complex type for fill_constant_op xpu

* add setConstant for complex type

* remove complex conj test file

* user define grad for test_conj_op

* add test case for static mode of conj api

* modify conj doc

* change input args name to x

* remove useless codes

* conj support real types

* add conj test case for real number

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* modify grad of mul for complex types

* fix the grads of inputs args order not match bug

* change the grad of div when complex types (#29804)

* change the grad of div when complex types

* fix the grads of inputs args order not match bug
Co-authored-by: chentianyu03 <chentianyu03@baidu.com>
Parent bea300dd
@@ -109,5 +109,30 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
}
}
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out) {
auto& pool = platform::DeviceContextPool::Instance();
auto* ctx = pool.Get(in.place());
out->Resize(in.dims());
// complex -> real
switch (src_type) {
case proto::VarType::COMPLEX64:
framework::VisitDataType(dst_type,
CastDataType<platform::complex64>(in, out, ctx));
break;
case proto::VarType::COMPLEX128:
framework::VisitDataType(
dst_type, CastDataType<platform::complex128>(in, out, ctx));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when casting complex tensor to real "
"data type.",
DataTypeToString(src_type)));
}
}
}  // namespace framework
}  // namespace paddle
@@ -33,5 +33,20 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out);
/**
* Transform complex gradient to real data type.
*
* If complex type promotion occurred in the forward op, the grad output of
* this op has a complex data type, but the forward input variable may be of
* a real type; in that case the grad input needs to be cast to the same type
* as the forward input. This casting is executed at the end of the grad op.
*
* Note: callers of this function need to ensure that dst_type is real and
* src_type is complex.
*/
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out);
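// A minimal usage sketch (assuming a CPU-resident complex64 gradient tensor
// and the declaration above); the cast keeps the real part of each element:
//
// framework::Tensor grad;
// grad.Resize({2, 3});
// grad.mutable_data<platform::complex64>(platform::CPUPlace());
// framework::Tensor real_grad;
// TransComplexToReal(proto::VarType::FP32, proto::VarType::COMPLEX64, grad,
//                    &real_grad);  // real_grad now has dtype FP32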
}  // namespace framework
}  // namespace paddle
@@ -24,6 +24,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -1110,6 +1111,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// there is inplace variable has been transferred.
TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
// See [ Why need handle complex gradient to real gradient? ]
// Only handle the case where the current kernel data type is complex
if (framework::IsComplexType(kernel_type_->data_type_)) {
HandleComplexGradToRealGrad(scope, runtime_ctx);
}
if (FLAGS_enable_unused_var_check) {
// skip op that uses mkldnn because it has different memory reuse strategy.
// use attr here because some GradMakers (like ActivationGradOpMaker) add
@@ -1255,6 +1263,73 @@ void OperatorWithKernel::TransferInplaceVarsBack(
}
}
void OperatorWithKernel::HandleComplexGradToRealGrad(
const Scope& scope, RuntimeContext* ctx) const {
for (auto& var_name_item : Outputs()) {
std::vector<Variable*>& output_vars = ctx->outputs[var_name_item.first];
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
// 1. find grad_var & check whether is complex tensor
auto var_name = var_name_item.second[i];
auto orig_var_name = GradOriginalVarName(var_name);
// only focus on gradient var
if (var_name == orig_var_name) {
continue;
}
auto* grad_var = output_vars[i];
// skip nullptr var
if (grad_var == nullptr) {
continue;
}
// don't process LoDTensorArray temporarily,
// add support if necessary for complex number calculations in the future
if (!VarIsTensor(*grad_var)) {
continue;
}
auto* grad_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(grad_var);
// skip nullptr tensor
if (grad_tensor == nullptr || !grad_tensor->IsInitialized()) {
continue;
}
// only focus on complex dtype now
auto src_type = grad_tensor->type();
if (!IsComplexType(src_type)) {
continue;
}
// 2. find forward var & check whether it needs to be cast
auto* var = scope.FindVar(orig_var_name);
// if forward var not exists, do nothing
if (var == nullptr) {
continue;
}
if (!VarIsTensor(*var)) {
continue;
}
const auto* tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE_NOT_NULL(
tensor,
platform::errors::Unavailable(
"Forward tensor is nullptr when handle complex data to real."));
// only need record type, the allocation may have been released
auto dst_type = tensor->saved_type();
// only focus on real dtype and need casting
if (IsComplexType(dst_type)) {
continue;
}
// 3. cast complex grad to real grad
VLOG(6) << "Transform " << framework::DataTypeToString(src_type)
<< " var `" << var_name << "` to "
<< framework::DataTypeToString(dst_type)
<< " real var in static graph.";
Tensor out;
TransComplexToReal(dst_type, src_type, *grad_tensor, &out);
SetTensorToVariable(*grad_var, out, grad_var);
}
}
}
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
......
@@ -545,6 +545,9 @@ class OperatorWithKernel : public OperatorBase {
void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
const platform::Place& place) const;
void HandleComplexGradToRealGrad(const Scope& scope,
RuntimeContext* ctx) const;
/* Inner assist methods */
// indicate kernel DataType by input data.
// By default all input data must be same.
......
@@ -197,6 +197,24 @@ class Tensor {
return type_;
}
/**
* [Add method get the saved type of tensor]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64); the types of the grad tensors
* should then be dout(complex64), dx(float32) and dy(complex64). Recognizing
* that dx should be float32 relies on the type of the forward tensor x.
* However, many of our ops have registered an InplaceInferer, which lets out
* reuse the tensor memory of x so as to save storage.
*
* In this case, the dim and type information recorded by x still exist,
* but because x has become an uninitialized tensor, the recorded type of x
* cannot be obtained with x.type(). The recorded type is still valid here,
* so we add saved_type(). This method SHOULD NOT be called in general
* scenarios.
*/
proto::VarType::Type saved_type() const { return type_; }
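// A minimal sketch of the intended call site (mirroring its use in
// OperatorWithKernel::HandleComplexGradToRealGrad): read the recorded type
// instead of calling type(), because the forward tensor's allocation may
// already have been reused in place.
//
// const auto* fwd_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
// auto dst_type = fwd_tensor->saved_type();  // valid even if uninitialized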
// memory size returns the holding memory size in byte.
size_t memory_size() const;
@@ -232,6 +250,7 @@ class Tensor {
void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type);
TensorInplaceVersion& InplaceVersionCounter() {
return inplace_version_counter_;
}
......
@@ -23,6 +23,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/op_base.h"
@@ -239,6 +240,7 @@ void BasicEngine::Execute() {
if (var->OverridedStopGradient() || iter->second->RefCnt() > 1) {
auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
tmp_var->SetType(var->Type());
tmp_var->SetForwardDataType(var->ForwardDataType());
var = tmp_var;
need_accu_var_list_.emplace_back(iter->second.get(), var);
VLOG(10) << "create temporary var of " << var->Name()
......
@@ -25,6 +25,8 @@
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -161,6 +163,10 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -161,6 +163,10 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
PADDLE_TENSOR_ADD(float); PADDLE_TENSOR_ADD(float);
PADDLE_TENSOR_ADD(double); PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only support complex grad tensor accumulated,
// support selected rows if needed in the future
PADDLE_TENSOR_ADD(platform::complex64);
PADDLE_TENSOR_ADD(platform::complex128);
#undef PADDLE_TENSOR_ADD
......
@@ -16,6 +16,7 @@
#include <algorithm>
#include <queue>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
@@ -326,9 +327,31 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
}
VLOG(5) << LayerDebugString(op.Type(), ins, outs);
-auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs);
-prepared_op.Run(ins, outs, attrs);
/**
* [ Why need temporary inputs here? ]
*
* PrepareData should not change the original input tensors in place.
* Suppose the user defines a tensor(int) and feeds it into an op that
* overrides GetExpectedKernelForVar and converts this tensor to float
* during execution. After the dynamic graph has executed, the user-defined
* variable would be lost: the user could no longer get the originally
* defined int tensor, because it had been converted to float. This should
* be regarded as a bug in certain usage scenarios.
*
* In static graph mode, when an op is executed, a temporary scope
* `transfer_scope` is created before PrepareData; the transformed data is
* stored in the temporary scope and discarded after the op finishes, but in
* the previous dynamic graph implementation the original input was directly
* overwritten.
*/
auto expected_kernel_key =
GetExpectedKernelKey<VarType>(ins, outs, *op_kernel, place, attrs);
auto prepared_op = PreparedOp::Prepare(*op_kernel, expected_kernel_key);
auto tmp_ins = PrepareData<VarType>(*op_kernel, ins, expected_kernel_key);
prepared_op.Run(tmp_ins, outs, attrs);
VLOG(4) << LayerDebugString(op.Type(), ins, outs);
}
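// A small sketch of the guarantee described above (names as in this function;
// assuming the expected kernel requires a dtype or place transform):
//
// auto tmp_ins = PrepareData<VarType>(*op_kernel, ins, expected_kernel_key);
// // `ins` still holds the user's original variables and dtypes; only the
// // temporary copies in `tmp_ins` carry the transformed tensors.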
......
@@ -201,6 +201,14 @@ class VarBase {
framework::proto::VarType::Type DataType() const { return var_->DataType(); }
void SetForwardDataType(framework::proto::VarType::Type data_type) {
var_->SetForwardDataType(data_type);
}
framework::proto::VarType::Type ForwardDataType() const {
return var_->ForwardDataType();
}
const platform::Place Place() const { return var_->Place(); }
void ClearGradient();
......
@@ -857,6 +857,7 @@ void PartialGradTask::RunEachOp(OpBase *op) {
auto new_grad_var = std::make_shared<VarBase>(true, grad_var->Name());
new_grad_var->SetOverridedStopGradient(false);
new_grad_var->SetForwardDataType(grad_var->ForwardDataType());
if (new_grad_var_iter->second->TotalRefCnt() > 1) {
grads_to_accumulate_.emplace_back(new_grad_var_iter->second.get(),
new_grad_var->SharedVar());
......
@@ -16,12 +16,10 @@
#include <sstream>
-#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
-DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
@@ -36,26 +34,32 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
}
template <typename VarType>
-static void PrepareData(const platform::Place& place,
-const NameVarMap<VarType>& ins,
-const framework::OperatorWithKernel& op,
-const framework::OpKernelType& expected_kernel_key) {
-for (const auto& name_pair : ins) {
-for (const auto& var_base : name_pair.second) {
-const auto* tensor = GetTensorFromVar(var_base->Var());
-if (tensor && tensor->IsInitialized()) {
-auto kernel_type_for_var = op.GetKernelTypeForVar(
-name_pair.first, *tensor, expected_kernel_key);
-if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-continue;
-} else {
-VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-<< kernel_type_for_var << " to " << expected_kernel_key;
-framework::Tensor out;
-TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-&out);
-SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-}
-}
-}
-}
-}
static void HandleComplexGradToRealGrad(const NameVarMap<VarType>& outs) {
for (auto& pair : outs) {
for (auto& var : pair.second) {
if (var == nullptr) {
continue;
}
if (var->ForwardDataType() ==
static_cast<framework::proto::VarType::Type>(-1)) {
VLOG(6) << "Var (" << var->Name()
<< ")'s forward data type is not set.";
continue;
}
if (!framework::IsComplexType(var->DataType()) ||
framework::IsComplexType(var->ForwardDataType())) {
continue;
}
const auto* tensor = GetTensorFromVar(var->Var());
if (tensor && tensor->IsInitialized()) {
VLOG(6) << "Transform " << framework::DataTypeToString(var->DataType())
<< " var `" << var->Name() << "` to "
<< framework::DataTypeToString(var->ForwardDataType())
<< " real var in dynamic graph.";
framework::Tensor out;
framework::TransComplexToReal(var->ForwardDataType(), var->DataType(),
*tensor, &out);
SetTensorToVariable(var->Var(), out, var->MutableVar());
}
}
}
}
@@ -63,18 +67,20 @@ static void PrepareData(const platform::Place& place,
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx)
-: op_(op), ctx_(ctx), func_(func), dev_ctx_(dev_ctx) {}
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(func),
dev_ctx_(dev_ctx) {}
-template <typename VarType>
-PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
-const NameVarMap<VarType>& outs,
-const framework::OperatorWithKernel& op,
-platform::Place place,
-const framework::AttributeMap& attrs) {
PreparedOp PreparedOp::Prepare(
const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-auto* dev_ctx = pool.Get(place);
auto* dev_ctx = pool.Get(expected_kernel_key.place_);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
@@ -89,62 +95,20 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
framework::RuntimeContext ctx({}, {});
-#ifdef PADDLE_WITH_MKLDNN
-// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
-// GetKernelType functions, so we need to copy the attributes there.
-// Const qualifier of Attrs had to be discarded to overwrite it.
-if (FLAGS_use_mkldnn) {
-auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
-mutable_op_attrs = attrs;
-}
-#endif
-auto expected_kernel_key =
-op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
-op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
-VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_XPU
-if (kernel_iter == kernels.end() &&
-is_xpu_place(expected_kernel_key.place_)) {
-expected_kernel_key.place_ = platform::CPUPlace();
-kernel_iter = kernels.find(expected_kernel_key);
-}
-#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
"Operator %s does not have kernel for %s.", op.Type(),
KernelTypeToString(expected_kernel_key)));
-if (!(expected_kernel_key.place_ == place)) {
-dev_ctx = pool.Get(expected_kernel_key.place_);
-place = dev_ctx->GetPlace();
-}
-PrepareData<VarType>(place, ins, op, expected_kernel_key);
-return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
-}
return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
}
-PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
-const NameVarMap<VarBase>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs) {
-return PrepareOpImpl<VarBase>(ins, outs, op, place, attrs);
-}
-PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
-const NameVarMap<VariableWrapper>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs) {
-return PrepareOpImpl<VariableWrapper>(ins, outs, op, place, attrs);
-}
template <typename VarType>
static void PreparedOpRunImpl(
const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs) {
@@ -158,19 +122,36 @@ static void PreparedOpRunImpl(
func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
attrs));
/**
* [ Why need handle complex gradient to real gradient? ]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64); the types of the grad
* tensors should then be dout(complex64), dx(float32), dy(complex64).
*
* But because dout is complex64, dx is also complex64 after the grad
* op kernel has executed. We need to recognize this situation and
* convert dx to float32; HandleComplexGradToRealGrad does exactly that.
*/
if (framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad<VarType>(outs);
}
}
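// A worked example of the comment above (with the dtypes it assumes): the
// forward computes out = x * y with x(float32) and y(complex64), so the
// promoted kernel runs in complex64. In backward, dout is complex64 and the
// grad kernel first produces dx as complex64; because x's forward data type
// is float32, HandleComplexGradToRealGrad casts dx back to float32 (keeping
// its real part), while dy stays complex64.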
void PreparedOp::Run(const NameVarMap<VarBase>& ins,
const NameVarMap<VarBase>& outs,
const framework::AttributeMap& attrs) {
-PreparedOpRunImpl<VarBase>(op_, ctx_, func_, dev_ctx_, ins, outs, attrs);
PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
outs, attrs);
}
void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
const NameVarMap<VariableWrapper>& outs,
const framework::AttributeMap& attrs) {
-PreparedOpRunImpl<VariableWrapper>(op_, ctx_, func_, dev_ctx_, ins, outs,
-attrs);
PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
ins, outs, attrs);
}
}  // namespace imperative
......
@@ -21,9 +21,12 @@
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Tensor;
@@ -39,24 +42,120 @@ namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
template <typename VarType>
static void SetForwardDataTypeOfGradVar(const std::shared_ptr<VarType>& var);
template <>
void SetForwardDataTypeOfGradVar<VariableWrapper>(
const std::shared_ptr<VariableWrapper>& var) {
if (var->HasGradVar()) {
auto grad_var = var->GetGradVar();
VLOG(6) << "Set grad var (" << grad_var->Name() << ") dtype to ("
<< framework::DataTypeToString(var->DataType()) << ").";
grad_var->SetForwardDataType(var->DataType());
}
}
template <>
void SetForwardDataTypeOfGradVar<VarBase>(const std::shared_ptr<VarBase>& var) {
if (var->HasGradVar()) {
auto& shared_var = var->SharedVar();
SetForwardDataTypeOfGradVar<VariableWrapper>(shared_var);
}
}
#ifdef PADDLE_WITH_XPU
static void ReplaceXPUKernelIfNotExists(
const framework::OperatorWithKernel& op,
framework::OpKernelType* expected_kernel_key) {
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
platform::errors::NotFound(
"There are no kernels which are registered in the %s operator.",
op.Type()));
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(*expected_kernel_key);
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key->place_)) {
expected_kernel_key->place_ = platform::CPUPlace();
}
}
#endif
template <typename VarType>
framework::OpKernelType GetExpectedKernelKey(
const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
const framework::OperatorWithKernel& op, const platform::Place& place,
const framework::AttributeMap& attrs) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
#ifdef PADDLE_WITH_XPU
ReplaceXPUKernelIfNotExists(op, &expected_kernel_key);
#endif
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
return expected_kernel_key;
}
template <typename VarType>
NameVarMap<VarType> PrepareData(
const framework::OperatorWithKernel& op, const NameVarMap<VarType>& ins,
const framework::OpKernelType& expected_kernel_key) {
NameVarMap<VarType> tmp_ins(ins);
for (auto& name_pair : tmp_ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
SetForwardDataTypeOfGradVar(var_base);
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
} else {
VLOG(3) << "Transform Variable " << var_base->Name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
framework::Tensor out;
auto tmp_var = std::make_shared<VarType>(var_base->Name());
tmp_var->SetType(var_base->Type());
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar());
var_base = tmp_var;
}
}
}
}
return tmp_ins;
}
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx);
-static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
-const NameVarMap<VarBase>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs);
-static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
-const NameVarMap<VariableWrapper>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs);
static PreparedOp Prepare(const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key);
void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
const framework::AttributeMap& attrs);
@@ -68,6 +167,7 @@ class PreparedOp {
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OpKernelType kernel_type_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
};
......
@@ -32,27 +32,6 @@ namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
-static framework::RuntimeContext PrepareRuntimeContext(
-const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
-framework::VariableValueMap inputs, outputs;
-for (auto& in_pair : ins) {
-auto& in_ctx = inputs[in_pair.first];
-in_ctx.reserve(in_pair.second.size());
-for (auto& in_var : in_pair.second) {
-in_ctx.emplace_back(in_var->MutableVar());
-}
-}
-for (auto& out_pair : outs) {
-auto& out_ctx = outputs[out_pair.first];
-out_ctx.reserve(out_pair.second.size());
-for (auto& out_var : out_pair.second) {
-out_ctx.emplace_back(out_var->MutableVar());
-}
-}
-return framework::RuntimeContext(std::move(inputs), std::move(outputs));
-}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
@@ -111,11 +90,12 @@ TEST(test_prepare_op, test_prepare_op) {
CreateVarNameMap(info, "split", outs, false);
auto op = framework::OpRegistry::CreateOp("split", var_in_map, var_out_map,
split_attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), place,
split_attr_map);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp = PreparedOp::Prepare(
-ins, outs,
dynamic_cast<framework::OperatorWithKernel&>(*op),
-place, split_attr_map));
expected_kernel_key));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
@@ -161,13 +141,15 @@ TEST(test_prepare_op, test_prepare_data) {
CreateVarNameMap(info, op_type, outs, false);
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test if it can be transformed to GPU place
-PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), gpu_place,
attr_map);
-for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), gpu_place));
@@ -208,13 +190,15 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test if it never transferred on GPU place
-PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), cpu_place,
attr_map);
-for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), cpu_place));
......
@@ -122,10 +122,6 @@ class VariableWrapper {
framework::proto::VarType::Type Type() const { return type_; }
-void SetDataType(framework::proto::VarType::Type data_type) {
-data_type_ = data_type;
-}
std::shared_ptr<VariableWrapper> GetGradVar() const {
return grad_var_.lock();
}
@@ -140,6 +136,10 @@ class VariableWrapper {
bool HasGradVar() const { return !grad_var_.expired(); }
void SetDataType(framework::proto::VarType::Type data_type) {
data_type_ = data_type;
}
framework::proto::VarType::Type DataType() const {
const framework::Tensor* tensor = nullptr;
if (var_.IsInitialized()) {
@@ -160,6 +160,14 @@ class VariableWrapper {
}
}
void SetForwardDataType(framework::proto::VarType::Type data_type) {
fwd_data_type_ = data_type;
}
framework::proto::VarType::Type ForwardDataType() const {
return fwd_data_type_;
}
const platform::Place Place() const {
const framework::Tensor* tensor = nullptr;
auto place =
@@ -306,6 +314,13 @@ class VariableWrapper {
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
// See [ Why need handle complex gradient to real gradient? ]
// Used by a grad var to get the data type of its corresponding forward var;
// if they are inconsistent, the data type of the grad var needs to be cast
// to be consistent with the forward var
framework::proto::VarType::Type fwd_data_type_{
static_cast<framework::proto::VarType::Type>(-1)};
std::weak_ptr<VariableWrapper> grad_var_;
std::weak_ptr<GradOpNode> grad_node_;
......
@@ -269,7 +269,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-<< dst_place << " by thream(" << stream << ")";
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
......
@@ -75,6 +75,45 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex64>(
const paddle::platform::complex64* x, const paddle::platform::complex64* y,
const paddle::platform::complex64* out,
const paddle::platform::complex64* dout, int64_t size,
paddle::platform::complex64* dx, paddle::platform::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex64 o = dout[col];
paddle::platform::complex64 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex128>(
const paddle::platform::complex128* x,
const paddle::platform::complex128* y,
const paddle::platform::complex128* out,
const paddle::platform::complex128* dout, int64_t size,
paddle::platform::complex128* dx, paddle::platform::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex128 o = dout[col];
paddle::platform::complex128 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
@@ -73,6 +73,27 @@ struct DivGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
};
template <>
struct DivGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <>
struct DivGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <typename T>
struct DivGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
@@ -80,6 +101,28 @@ struct DivGradDY {
}
};
template <>
struct DivGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag);
return -dout * out_div_y_conj;
}
};
template <>
struct DivGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 out_div_y_conj((out / y).real,
-(out / y).imag);
return -dout * out_div_y_conj;
}
};
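// For reference, the formulas implied by the specializations above (conjugate
// gradients, so that real-valued losses propagate correctly through complex
// division): for out = x / y,
//   dx = dout / conj(y)
//   dy = -dout * conj(out / y) = -dout * conj(x / (y * y))
// For real types conj is the identity, which reduces to the generic
// DivGradDX / DivGradDY above.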
template <typename T>
struct DivDoubleDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
......
@@ -75,6 +75,36 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex64>(
const plat::complex64* x, const plat::complex64* y,
const plat::complex64* out, const plat::complex64* dout, int64_t size,
plat::complex64* dx, plat::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex64 o = dout[col];
dx[col] = plat::complex64(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex64(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex128>(
const plat::complex128* x, const plat::complex128* y,
const plat::complex128* out, const plat::complex128* dout, int64_t size,
plat::complex128* dx, plat::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex128 o = dout[col];
dx[col] = plat::complex128(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex128(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
@@ -132,11 +132,53 @@ struct MulGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
};
template <>
struct MulGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <>
struct MulGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <typename T>
struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
};
template <>
struct MulGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
template <>
struct MulGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
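// For reference, the formulas implied by the specializations above: for
// out = x * y with complex inputs, the conjugate gradients are
//   dx = dout * conj(y)
//   dy = dout * conj(x)
// which reduce to the generic MulGradDX / MulGradDY when x and y are real.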
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
......
@@ -352,7 +352,8 @@ class ElementwiseOpDoubleGradWithoutDXDY
"ElementwiseOpDoubleGradWithoutDXDY");
input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
} else {
-input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
input_data_type =
OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY");
}
#ifdef PADDLE_WITH_MKLDNN
@@ -364,6 +365,19 @@ class ElementwiseOpDoubleGradWithoutDXDY
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (framework::IsComplexType(expected_kernel_type.data_type_)) {
// only promote inputs' types when the inputs contain a complex input
return framework::OpKernelType(tensor.type(), tensor.place(),
tensor.layout());
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
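// A worked example of the override above (dtypes assumed for illustration):
// with DDX(float32) and DDY(complex64), the promoted kernel type is
// complex64. Reporting each tensor's own type() here makes NeedTransform()
// see a dtype mismatch for DDX, so data preparation casts (promotes) DDX to
// complex64 before the kernel runs; the else branch keeps the default
// behavior, where no dtype cast is triggered.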
};
template <typename T>
......
@@ -275,6 +275,15 @@ struct CUBlas<platform::complex64> {
reinterpret_cast<cuFloatComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex64 *alpha,
const complex64 *X, const int incX, complex64 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy(
handle, n, reinterpret_cast<const cuFloatComplex *>(alpha),
reinterpret_cast<const cuFloatComplex *>(X), incX,
reinterpret_cast<cuFloatComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
@@ -362,6 +371,15 @@ struct CUBlas<platform::complex128> {
reinterpret_cast<cuDoubleComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex128 *alpha,
const complex128 *X, const int incX, complex128 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy(
handle, n, reinterpret_cast<const cuDoubleComplex *>(alpha),
reinterpret_cast<const cuDoubleComplex *>(X), incX,
reinterpret_cast<cuDoubleComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......
@@ -295,6 +295,13 @@ struct CBlas<double> {
template <>
struct CBlas<platform::complex64> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
paddle::platform::complex64 *Y, const int incY) {
platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_ccopy(args...);
@@ -415,6 +422,13 @@ struct CBlas<platform::complex64> {
template <>
struct CBlas<platform::complex128> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
paddle::platform::complex128 *Y, const int incY) {
platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_zcopy(args...);
@@ -598,11 +612,6 @@ struct CBlas<platform::complex64> {
cblas_ccopy(args...);
}
-template <typename... ARGS>
-static void VADD(ARGS... args) {
-vcAdd(args...);
-}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
@@ -641,11 +650,6 @@ struct CBlas<platform::complex128> {
cblas_zcopy(args...);
}
-template <typename... ARGS>
-static void VADD(ARGS... args) {
-vzAdd(args...);
-}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
......
@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
namespace paddle {
namespace operators {
@@ -548,6 +550,10 @@ template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex64>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex128>;
template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
......
@@ -448,6 +448,8 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex64>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex128>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
@@ -23,4 +23,6 @@ using CUDAReduceSumGradKernel =
REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<int>,
-CUDAReduceSumGradKernel<int64_t>);
CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex64>,
CUDAReduceSumGradKernel<paddle::platform::complex128>);
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <cuda.h>
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
...@@ -126,9 +128,22 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { ...@@ -126,9 +128,22 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
return ret; return ret;
} }
} }
#endif #endif
CUDA_ATOMIC_WRAPPER(Add, complex64) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex64(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
CUDA_ATOMIC_WRAPPER(Add, complex128) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex128(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
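// For reference (behavior of the two wrappers above): a complex atomic add is
// performed as two independent CudaAtomicAdd calls on the real and imaginary
// parts, relying on complex64/complex128 storing {real, imag} contiguously;
// each component is updated atomically, but the pair is not read back as a
// single atomic unit.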
// For atomicMax
USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
......
@@ -55,6 +55,8 @@ extern void *cublas_dso_handle;
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
......
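cublasCaxpy_v2 and cublasZaxpy_v2 are the single- and double-precision complex axpy routines (y = alpha * x + y); loading them presumably lets the BLAS-based accumulation path also run on complex gradients. In NumPy terms the operation is simply the following (a sketch, with alpha fixed to 1 as in plain accumulation):

import numpy as np

alpha = np.complex64(1)                      # gradient accumulation uses alpha == 1
x = (np.arange(4) + 1j * np.arange(4)).astype(np.complex64)
y = np.ones(4, dtype=np.complex64)

y += alpha * x                               # what cublasCaxpy_v2 computes in place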
...@@ -855,7 +855,7 @@ class OpTest(unittest.TestCase):
place,
no_check_set=None,
inplace_atol=None):
"""Check the inplace correctness of given op (self.op_type).
Run the op twice with the same inputs, once with inplace enabled and once disabled, and compare their outputs.
Args:
...@@ -935,7 +935,7 @@ class OpTest(unittest.TestCase):
fwd_res,
grad_op_desc,
inplace_atol=None):
"""Check the inplace correctness of given grad_op_desc.
Run the grad op twice with the same inputs, once with inplace enabled and once disabled, and compare their outputs.
It works like _check_forward_inplace, but the way to construct program and feed_map differs.
...@@ -1291,7 +1291,6 @@ class OpTest(unittest.TestCase):
def _assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
# It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which
# max_relative_error is 1e-7. According to the value of np.abs(a), we
...@@ -1544,6 +1543,10 @@ class OpTest(unittest.TestCase):
grad_outputs = []
for grad_out_value in user_defined_grad_outputs:
grad_outputs.append(paddle.to_tensor(grad_out_value))
# delete the inputs which do not need to calculate grad
for no_grad_val in no_grad_set:
del (inputs[no_grad_val])
grad_inputs = paddle.grad(
outputs=fluid.layers.utils.flatten(outputs),
inputs=fluid.layers.utils.flatten(inputs),
...@@ -1612,7 +1615,7 @@ class OpTest(unittest.TestCase):
targets = [
outputs[name] for name in outputs if name in output_names
]
inputs = [inputs[name] for name in input_to_check if name in inputs]
grad_inputs = paddle.static.gradients(targets, inputs, grad_outputs,
no_grad_set)
fetch_list = grad_inputs
......
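The op_test change above drops every variable in no_grad_set from the dygraph inputs before calling paddle.grad, so gradients are only requested for tensors that actually need them. A hedged sketch of the same idea outside the test framework (names and values are illustrative):

import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(2, 2), stop_gradient=False)
y = paddle.to_tensor(np.random.rand(2, 2), stop_gradient=False)
out = x * y

inputs = {'X': x, 'Y': y}
for name in {'X'}:                    # mirrors: del (inputs[no_grad_val])
    del inputs[name]

grads = paddle.grad(outputs=[out],
                    inputs=list(inputs.values()),
                    grad_outputs=[paddle.ones_like(out)])
print(grads[0].shape)                 # gradient w.r.t. Y only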
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
dtype,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.)):
super(Optimization_ex1, self).__init__()
self.theta0 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.theta1 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random((4, 4))
.astype(dtype) * 1j)
self.B = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random(
(4, 4)).astype(dtype) * 1j,
stop_gradient=False)
print(self.A)
def forward(self, mode=1):
jj = paddle.to_tensor(np.array([1j]).astype(np.complex64))
if mode == 1:
# run all calc in one step
loss = paddle.sum(self.A + (self.theta0 + self.theta1 * jj)) * (
paddle.sum(self.A + (self.theta0 + self.theta1 * jj)).conj())
return loss.real()
elif mode == 2:
# run in two steps
self.theta = self.theta0 + self.theta1 * jj
loss = paddle.sum(self.A + self.theta) * (
paddle.sum(self.A + self.theta).conj())
return loss.real()
elif mode == 3:
# run without param
loss = paddle.sum(self.A + self.B) * (
paddle.sum(self.A + self.B).conj())
return loss.real()
else:
raise NotImplementedError
class TestComplexGradAccumulated(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.dtypes = ['float32', 'float64']
self.theta_size = [4, 4]
def run_backward(self, device, dtype, mode):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size, dtype)
loss = myLayer(mode)
loss.backward()
def test_case_one_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 1)
def test_case_two_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 2)
def test_case_non_param(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 3)
if __name__ == '__main__':
unittest.main()
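For reference, a minimal sketch of the accumulation this test exercises (assuming the conj()/real() tensor methods introduced alongside this change): a real parameter reaches the loss through two paths, once directly and once through the conjugated factor, so its two complex gradient contributions are accumulated and then cast back to the parameter's real dtype.

import numpy as np
import paddle

theta = paddle.to_tensor(np.ones((2, 2), dtype='float32'), stop_gradient=False)
A = paddle.to_tensor((np.ones((2, 2)) + 1j * np.ones((2, 2))).astype('complex64'))

s = paddle.sum(A + theta)
loss = (s * s.conj()).real()   # theta is used via s and via s.conj()
loss.backward()
print(theta.grad)              # a single real-valued, accumulated gradient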
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.),
dtype='float32'):
super(Optimization_ex1, self).__init__()
self.theta = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.randn(4, 4) + np.random.randn(4, 4) * 1j)
def forward(self):
loss = paddle.add(self.theta, self.A)
return loss.real()
class TestComplexSimpleNet(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.iter = 10
self.learning_rate = 0.5
self.theta_size = [4, 4]
def train(self, device):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size)
optimizer = paddle.optimizer.Adam(
learning_rate=self.learning_rate, parameters=myLayer.parameters())
for itr in range(self.iter):
loss = myLayer()
loss.backward()
optimizer.step()
optimizer.clear_grad()
def test_train_success(self):
for dev in self.devices:
self.train(dev)
if __name__ == '__main__':
unittest.main()
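A similarly minimal sketch of what TestComplexSimpleNet relies on (illustrative, not part of the test): the forward pass promotes the real parameter to complex, but the gradient handed back to the optimizer is cast to the parameter's real dtype, which is what makes the plain Adam step possible.

import numpy as np
import paddle

theta = paddle.to_tensor(np.zeros((4, 4), dtype='float32'), stop_gradient=False)
A = paddle.to_tensor(np.random.randn(4, 4) + 1j * np.random.randn(4, 4))

loss = paddle.add(theta, A).real().sum()
loss.backward()
print(theta.grad)   # expected to have theta's real dtype (float32)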
...@@ -431,7 +431,8 @@ class TestAddOp(unittest.TestCase):
class TestComplexElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.dtype = np.float64
self.shape = (2, 3, 4, 5)
self.init_input_output()
self.init_grad_input_output()
...@@ -446,17 +447,15 @@ class TestComplexElementwiseAddOp(OpTest):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = self.grad_out
self.grad_y = self.grad_out
...@@ -487,5 +486,19 @@ class TestComplexElementwiseAddOp(OpTest):
user_defined_grad_outputs=[self.grad_out])
class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = np.real(self.grad_out)
self.grad_y = self.grad_out
if __name__ == '__main__':
unittest.main()
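The new TestRealComplexElementwiseAddOp feeds a real X and a complex Y; its user-defined gradients encode the complex-to-real cast added earlier in this patch: only the real part of the upstream gradient flows back to the real input. A quick NumPy restatement (a sketch, not part of the suite):

import numpy as np

shape = (2, 3, 4, 5)
grad_out = np.ones(shape) + 1J * np.ones(shape)

grad_x = np.real(grad_out)   # real input -> gradient cast to real
grad_y = grad_out            # complex input -> full complex gradient

assert grad_x.dtype == np.float64 and grad_y.dtype == np.complex128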
...@@ -261,5 +261,65 @@ class TestDivideOp(unittest.TestCase):
self.assertEqual((np_z == z_expected).all(), True)
class TestComplexElementwiseDivOp(OpTest):
def setUp(self):
self.op_type = "elementwise_div"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x / self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out / np.conj(self.y)
self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
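As a sanity check on the user-defined complex division gradients above (grad_x = grad_out / conj(y), grad_y = -grad_out * conj(x / y / y)): the two formulas are linked by grad_y = -grad_x * conj(out), since out = x / y. A short NumPy verification (a sketch, not part of the suite):

import numpy as np

rng = np.random.default_rng(0)
x = rng.random((2, 3)) + 1J * rng.random((2, 3))
y = rng.random((2, 3)) + 1J * rng.random((2, 3))
out = x / y

grad_out = np.ones_like(out)
grad_x = grad_out / np.conj(y)
grad_y = -grad_out * np.conj(x / y / y)

assert np.allclose(grad_y, -grad_x * np.conj(out))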
...@@ -13,13 +13,17 @@
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, compiler, program_guard
from paddle.fluid.op import Operator
from op_test import OpTest, skip_check_grad_ci
class ElementwiseMulOp(OpTest):
...@@ -241,5 +245,65 @@ class TestElementwiseMulOpError(unittest.TestCase):
self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2)
class TestComplexElementwiseMulOp(OpTest):
def setUp(self):
self.op_type = "elementwise_mul"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x * self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out * np.conj(self.y)
self.grad_y = self.grad_out * np.conj(self.x)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
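The complex multiply gradients above follow the conjugate rule grad_x = grad_out * conj(y), grad_y = grad_out * conj(x). A hedged finite-difference check with NumPy (illustrative only): for the real loss L = Re(sum(conj(v) * x * y)), the upstream gradient of out = x * y is exactly v, so perturbing the real part of one element of x should change L at the rate given by the real part of v * conj(y).

import numpy as np

rng = np.random.default_rng(0)
x = rng.random((2, 2)) + 1J * rng.random((2, 2))
y = rng.random((2, 2)) + 1J * rng.random((2, 2))
v = rng.random((2, 2)) + 1J * rng.random((2, 2))   # plays the role of grad_out

def loss(x):
    return np.real(np.sum(np.conj(v) * x * y))

grad_x = v * np.conj(y)                            # formula used in the test

h = 1e-6
dx = np.zeros_like(x)
dx[0, 0] = h                                       # perturb the real part only
fd = (loss(x + dx) - loss(x - dx)) / (2 * h)

assert np.isclose(fd, grad_x[0, 0].real)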
...@@ -519,7 +519,7 @@ class TestStridedSliceAPI(unittest.TestCase):
np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
self.assertTrue(x.place.is_cuda_pinned_place())
y = x[:, ::2]
self.assertTrue(x.place.is_cuda_pinned_place())
self.assertFalse(y.place.is_cuda_pinned_place())
......