Unverified commit 91ebc460, authored by Chen Weihang, committed by GitHub

[Cherry-pick] Complex network execute support (#29905)

* [Complex] Add support for complex grad accumulated (#29889)

* add support for complex grad accumulated

* add unittest for coverage

* update test dtype

* remove useless blank line

* [Complex] Handle complex to real after type promotion (#29855)

* try to add fwd op input dtypes

* refactor base impl

* return tmp_ins after dygraph prepare data

* fix typo found in debug

* polish comment & add complex net test

* revert detail change

* fix unittest failed

* add complex kernel condition control

* fix xpu test failed & polish comment

* polish details by review comments

* Complex op test (#29753)

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* change grad elementwise_mul for complex types (#29757)

* add conj op for complex types

* add conj for complex types

* add more test case

* add conj_op test

* modify conj api and impl

* add complex type for fill_constant_op xpu

* add setConstant for complex type

* remove complex conj test file

* user define grad for test_conj_op

* add test case for static mode of conj api

* modify conj doc

* change input args name to x

* remove useless codes

* conj support real types

* add conj test case for real number

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* modify grad of mul for complex types

* fix the grads of inputs args order not match bug

* change the grad of div when complex types (#29804)

* change the grad of div when complex types

* fix the grads of inputs args order not match bug
Co-authored-by: chentianyu03 <chentianyu03@baidu.com>
Parent commit: bea300dd
......@@ -109,5 +109,30 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
}
}
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out) {
auto& pool = platform::DeviceContextPool::Instance();
auto* ctx = pool.Get(in.place());
out->Resize(in.dims());
// complex -> real
switch (src_type) {
case proto::VarType::COMPLEX64:
framework::VisitDataType(dst_type,
CastDataType<platform::complex64>(in, out, ctx));
break;
case proto::VarType::COMPLEX128:
framework::VisitDataType(
dst_type, CastDataType<platform::complex128>(in, out, ctx));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when casting complex tensor to real "
"data type.",
DataTypeToString(src_type)));
}
}
} // namespace framework
} // namespace paddle
......@@ -33,5 +33,20 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out);
/**
* Transform a complex gradient to a real data type.
*
* If complex type promotion occurred in the forward op, the grad output of
* this op is a complex data type, but the input variable may be a real type;
* in this case the grad input needs to be cast to the same type as the input.
* This casting is executed at the end of the grad op.
*
* Note: callers of this function need to ensure that dst_type is real and
* src_type is complex.
*/
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out);
} // namespace framework
} // namespace paddle
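
The element-wise cast behind TransComplexToReal keeps the real part of each complex value when the destination type is real, which is what the new TestRealComplexElementwiseAddOp test in this commit expects (grad_x = np.real(grad_out)). A minimal numpy sketch of that semantics, purely as an illustration and not the framework code:

import numpy as np

grad_out = (np.ones((2, 2)) + 1j * np.ones((2, 2))).astype(np.complex64)
# complex64 -> float32, mirroring CastDataType<platform::complex64> with a real dst_type
grad_real = np.real(grad_out).astype(np.float32)
assert grad_real.dtype == np.float32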
......@@ -24,6 +24,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
......@@ -1110,6 +1111,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// there is inplace variable has been transferred.
TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
// See [ Why do we need to handle complex gradient to real gradient? ]
// Only handle the case where the current kernel data type is complex
if (framework::IsComplexType(kernel_type_->data_type_)) {
HandleComplexGradToRealGrad(scope, runtime_ctx);
}
if (FLAGS_enable_unused_var_check) {
// skip op that uses mkldnn because it has different memory reuse strategy.
// use attr here because some GradMakers (like ActivationGradOpMaker) add
......@@ -1255,6 +1263,73 @@ void OperatorWithKernel::TransferInplaceVarsBack(
}
}
void OperatorWithKernel::HandleComplexGradToRealGrad(
const Scope& scope, RuntimeContext* ctx) const {
for (auto& var_name_item : Outputs()) {
std::vector<Variable*>& output_vars = ctx->outputs[var_name_item.first];
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
// 1. find grad_var & check whether is complex tensor
auto var_name = var_name_item.second[i];
auto orig_var_name = GradOriginalVarName(var_name);
// only focus on gradient var
if (var_name == orig_var_name) {
continue;
}
auto* grad_var = output_vars[i];
// skip nullptr var
if (grad_var == nullptr) {
continue;
}
// LoDTensorArray is not processed for now; add support for it
// if complex number calculations need it in the future
if (!VarIsTensor(*grad_var)) {
continue;
}
auto* grad_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(grad_var);
// skip nullptr tensor
if (grad_tensor == nullptr || !grad_tensor->IsInitialized()) {
continue;
}
// only focus on complex dtype now
auto src_type = grad_tensor->type();
if (!IsComplexType(src_type)) {
continue;
}
// 2. find forward var & check whether need to cast
auto* var = scope.FindVar(orig_var_name);
// if forward var not exists, do nothing
if (var == nullptr) {
continue;
}
if (!VarIsTensor(*var)) {
continue;
}
const auto* tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE_NOT_NULL(
tensor,
platform::errors::Unavailable(
"Forward tensor is nullptr when handle complex data to real."));
// only need record type, the allocation may have been released
auto dst_type = tensor->saved_type();
// only focus on real dtype and need casting
if (IsComplexType(dst_type)) {
continue;
}
// 3. cast complex grad to real grad
VLOG(6) << "Transform " << framework::DataTypeToString(src_type)
<< " var `" << var_name << "` to "
<< framework::DataTypeToString(dst_type)
<< " real var in static graph.";
Tensor out;
TransComplexToReal(dst_type, src_type, *grad_tensor, &out);
SetTensorToVariable(*grad_var, out, grad_var);
}
}
}
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
......
......@@ -545,6 +545,9 @@ class OperatorWithKernel : public OperatorBase {
void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
const platform::Place& place) const;
void HandleComplexGradToRealGrad(const Scope& scope,
RuntimeContext* ctx) const;
/* Inner assist methods */
// indicate kernel DataType by input data.
// By default all input data must be same.
......
......@@ -197,6 +197,24 @@ class Tensor {
return type_;
}
/**
* [Add a method to get the saved type of the tensor]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64). The types of the grad
* tensors should then be dout(complex64), dx(float32), dy(complex64), but
* recognizing that dx should be float32 relies on the grad Op knowing the
* type of the forward tensor x. However, many of our ops register an
* InplaceInferer that lets out reuse the tensor memory of x to save storage.
*
* In this case, the dim and type information recorded by x still exist,
* but because x becomes an uninitialized tensor, the recorded type of x
* cannot be obtained with x.type(). The type is still valid here, so we
* add saved_type(). This method SHOULD NOT be called in general scenarios.
*/
proto::VarType::Type saved_type() const { return type_; }
// memory size returns the holding memory size in byte.
size_t memory_size() const;
......@@ -232,6 +250,7 @@ class Tensor {
void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type);
TensorInplaceVersion& InplaceVersionCounter() {
return inplace_version_counter_;
}
......
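
As a concrete illustration of the comment above, the toy Python class below (TinyTensor is hypothetical, not a Paddle API) records its dtype separately from its allocation, so the type survives after an inplace op has taken over the buffer, which is exactly what saved_type() relies on:

import numpy as np

class TinyTensor:
    """Toy stand-in: dtype is recorded separately from the allocation."""

    def __init__(self, data):
        self._dtype = data.dtype
        self._buffer = data

    def release_buffer(self):
        # roughly what happens to x when an InplaceInferer lets out reuse its memory
        self._buffer = None

    def type(self):
        if self._buffer is None:
            raise RuntimeError("tensor is not initialized")
        return self._dtype

    def saved_type(self):
        # still valid even though the allocation is gone
        return self._dtype

x = TinyTensor(np.zeros(4, dtype=np.float32))
x.release_buffer()
print(x.saved_type())  # float32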
......@@ -23,6 +23,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/op_base.h"
......@@ -239,6 +240,7 @@ void BasicEngine::Execute() {
if (var->OverridedStopGradient() || iter->second->RefCnt() > 1) {
auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
tmp_var->SetType(var->Type());
tmp_var->SetForwardDataType(var->ForwardDataType());
var = tmp_var;
need_accu_var_list_.emplace_back(iter->second.get(), var);
VLOG(10) << "create temporary var of " << var->Name()
......
......@@ -25,6 +25,8 @@
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -161,6 +163,10 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
PADDLE_TENSOR_ADD(float);
PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only complex grad tensor accumulation is supported here;
// add SelectedRows support if needed in the future
PADDLE_TENSOR_ADD(platform::complex64);
PADDLE_TENSOR_ADD(platform::complex128);
#undef PADDLE_TENSOR_ADD
......
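
With PADDLE_TENSOR_ADD instantiated for complex64/complex128, gradient contributions for a variable that appears in several terms are accumulated by plain element-wise complex addition; the new TestComplexGradAccumulated test exercises this path. A small numpy sketch of the accumulation semantics (illustration only):

import numpy as np

g1 = (np.random.random((4, 4)) + 1j * np.random.random((4, 4))).astype(np.complex64)
g2 = (np.random.random((4, 4)) + 1j * np.random.random((4, 4))).astype(np.complex64)
accumulated = g1 + g2  # element-wise complex addition, as TensorAdd now performs
assert accumulated.dtype == np.complex64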
......@@ -16,6 +16,7 @@
#include <algorithm>
#include <queue>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
......@@ -326,9 +327,31 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
}
VLOG(5) << LayerDebugString(op.Type(), ins, outs);
auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs);
prepared_op.Run(ins, outs, attrs);
/**
* [ Why are temporary inputs needed here? ]
*
* PrepareData should not change the original input tensor in place.
* Suppose the user defines a tensor(int) and runs an op on it, and this
* op overrides GetExpectedKernelForVar and converts the tensor to float
* during execution. After the dynamic graph has executed, the user-defined
* variable would be lost: the user could no longer get the originally
* defined int tensor, because it had been converted to float. This should
* be regarded as a bug in certain usage scenarios.
*
* In static graph mode, when an op is executed, a temporary scope
* `transfer_scope` is created before PrepareData; the transformed data is
* stored in the temporary scope and discarded after the op has executed.
* In the previous dynamic graph implementation, however, the original
* input was overwritten directly.
*/
auto expected_kernel_key =
GetExpectedKernelKey<VarType>(ins, outs, *op_kernel, place, attrs);
auto prepared_op = PreparedOp::Prepare(*op_kernel, expected_kernel_key);
auto tmp_ins = PrepareData<VarType>(*op_kernel, ins, expected_kernel_key);
prepared_op.Run(tmp_ins, outs, attrs);
VLOG(4) << LayerDebugString(op.Type(), ins, outs);
}
......
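
The copy-then-swap pattern described in the comment above (implemented by the new PrepareData in prepared_operator.h) can be sketched in a few lines of Python. needs_transform and transform below are hypothetical stand-ins for NeedTransform/TransformData; the point is that only the temporary map is modified, never the user's inputs:

import numpy as np

def needs_transform(arr, expected_dtype):
    # hypothetical stand-in for NeedTransform(kernel_type_for_var, expected_kernel_key)
    return arr.dtype != expected_dtype

def transform(arr, expected_dtype):
    # hypothetical stand-in for TransformData(); returns a new array, never mutates arr
    return arr.astype(expected_dtype)

def prepare_data(ins, expected_dtype):
    # copy the name -> vars map first, mirroring `NameVarMap<VarType> tmp_ins(ins)`
    tmp_ins = {name: list(vars_) for name, vars_ in ins.items()}
    for vars_ in tmp_ins.values():
        for i, var in enumerate(vars_):
            if needs_transform(var, expected_dtype):
                vars_[i] = transform(var, expected_dtype)  # swap in a temporary
    return tmp_ins

ins = {"X": [np.zeros(3, dtype=np.int32)]}
tmp_ins = prepare_data(ins, np.float32)
assert ins["X"][0].dtype == np.int32        # original input is untouched
assert tmp_ins["X"][0].dtype == np.float32  # temporary is what the kernel sees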
......@@ -201,6 +201,14 @@ class VarBase {
framework::proto::VarType::Type DataType() const { return var_->DataType(); }
void SetForwardDataType(framework::proto::VarType::Type data_type) {
var_->SetForwardDataType(data_type);
}
framework::proto::VarType::Type ForwardDataType() const {
return var_->ForwardDataType();
}
const platform::Place Place() const { return var_->Place(); }
void ClearGradient();
......
......@@ -857,6 +857,7 @@ void PartialGradTask::RunEachOp(OpBase *op) {
auto new_grad_var = std::make_shared<VarBase>(true, grad_var->Name());
new_grad_var->SetOverridedStopGradient(false);
new_grad_var->SetForwardDataType(grad_var->ForwardDataType());
if (new_grad_var_iter->second->TotalRefCnt() > 1) {
grads_to_accumulate_.emplace_back(new_grad_var_iter->second.get(),
new_grad_var->SharedVar());
......
......@@ -16,12 +16,10 @@
#include <sstream>
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
......@@ -36,26 +34,32 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
}
template <typename VarType>
static void PrepareData(const platform::Place& place,
const NameVarMap<VarType>& ins,
const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key) {
for (const auto& name_pair : ins) {
for (const auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
static void HandleComplexGradToRealGrad(const NameVarMap<VarType>& outs) {
for (auto& pair : outs) {
for (auto& var : pair.second) {
if (var == nullptr) {
continue;
}
if (var->ForwardDataType() ==
static_cast<framework::proto::VarType::Type>(-1)) {
VLOG(6) << "Var (" << var->Name()
<< ")'s forward data type is not set.";
continue;
} else {
VLOG(3) << "Transform Variable " << var_base->Name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
framework::Tensor out;
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
}
if (!framework::IsComplexType(var->DataType()) ||
framework::IsComplexType(var->ForwardDataType())) {
continue;
}
const auto* tensor = GetTensorFromVar(var->Var());
if (tensor && tensor->IsInitialized()) {
VLOG(6) << "Transform " << framework::DataTypeToString(var->DataType())
<< " var `" << var->Name() << "` to "
<< framework::DataTypeToString(var->ForwardDataType())
<< " real var in dynamic graph.";
framework::Tensor out;
framework::TransComplexToReal(var->ForwardDataType(), var->DataType(),
*tensor, &out);
SetTensorToVariable(var->Var(), out, var->MutableVar());
}
}
}
......@@ -63,18 +67,20 @@ static void PrepareData(const platform::Place& place,
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx)
: op_(op), ctx_(ctx), func_(func), dev_ctx_(dev_ctx) {}
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(func),
dev_ctx_(dev_ctx) {}
template <typename VarType>
PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs,
PreparedOp PreparedOp::Prepare(
const framework::OperatorWithKernel& op,
platform::Place place,
const framework::AttributeMap& attrs) {
const framework::OpKernelType& expected_kernel_key) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
auto* dev_ctx = pool.Get(expected_kernel_key.place_);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
......@@ -89,62 +95,20 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
"Operator %s does not have kernel for %s.", op.Type(),
KernelTypeToString(expected_kernel_key)));
if (!(expected_kernel_key.place_ == place)) {
dev_ctx = pool.Get(expected_kernel_key.place_);
place = dev_ctx->GetPlace();
}
PrepareData<VarType>(place, ins, op, expected_kernel_key);
return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
}
PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
const NameVarMap<VarBase>& outs,
const framework::OperatorWithKernel& op,
const platform::Place& place,
const framework::AttributeMap& attrs) {
return PrepareOpImpl<VarBase>(ins, outs, op, place, attrs);
}
PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
const NameVarMap<VariableWrapper>& outs,
const framework::OperatorWithKernel& op,
const platform::Place& place,
const framework::AttributeMap& attrs) {
return PrepareOpImpl<VariableWrapper>(ins, outs, op, place, attrs);
return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
}
template <typename VarType>
static void PreparedOpRunImpl(
const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs) {
......@@ -158,19 +122,36 @@ static void PreparedOpRunImpl(
func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
attrs));
/**
* [ Why do we need to handle complex gradient to real gradient? ]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64), so the types of the grad
* tensors should be dout(complex64), dx(float32), dy(complex64).
*
* But because dout is complex64, dx is also complex64 after the grad op
* kernel has executed; we need to recognize this situation and convert dx
* to float32. HandleComplexGradToRealGrad does exactly that.
*/
if (framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad<VarType>(outs);
}
}
void PreparedOp::Run(const NameVarMap<VarBase>& ins,
const NameVarMap<VarBase>& outs,
const framework::AttributeMap& attrs) {
PreparedOpRunImpl<VarBase>(op_, ctx_, func_, dev_ctx_, ins, outs, attrs);
PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
outs, attrs);
}
void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
const NameVarMap<VariableWrapper>& outs,
const framework::AttributeMap& attrs) {
PreparedOpRunImpl<VariableWrapper>(op_, ctx_, func_, dev_ctx_, ins, outs,
attrs);
PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
ins, outs, attrs);
}
} // namespace imperative
......
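
At the Python level, the path above means a real-typed input that takes part in complex type promotion gets its gradient cast back to its own dtype after backward. A minimal dygraph sketch built from the same APIs as the unit tests added in this commit (behavioral illustration; gradient inspection is omitted):

import numpy as np
import paddle

x = paddle.to_tensor(np.ones([4, 4], dtype=np.float32), stop_gradient=False)
y = paddle.to_tensor(np.ones([4, 4]) + 1j * np.ones([4, 4]), stop_gradient=False)
out = paddle.add(x, y)         # float32 + complex128 -> complex128 (type promotion)
loss = paddle.sum(out).real()  # real-valued scalar loss
loss.backward()                # dx is cast back to float32 by HandleComplexGradToRealGrad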
......@@ -21,9 +21,12 @@
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Tensor;
......@@ -39,24 +42,120 @@ namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
template <typename VarType>
static void SetForwardDataTypeOfGradVar(const std::shared_ptr<VarType>& var);
template <>
void SetForwardDataTypeOfGradVar<VariableWrapper>(
const std::shared_ptr<VariableWrapper>& var) {
if (var->HasGradVar()) {
auto grad_var = var->GetGradVar();
VLOG(6) << "Set grad var (" << grad_var->Name() << ") dtype to ("
<< framework::DataTypeToString(var->DataType()) << ").";
grad_var->SetForwardDataType(var->DataType());
}
}
template <>
void SetForwardDataTypeOfGradVar<VarBase>(const std::shared_ptr<VarBase>& var) {
if (var->HasGradVar()) {
auto& shared_var = var->SharedVar();
SetForwardDataTypeOfGradVar<VariableWrapper>(shared_var);
}
}
#ifdef PADDLE_WITH_XPU
static void ReplaceXPUKernelIfNotExists(
const framework::OperatorWithKernel& op,
framework::OpKernelType* expected_kernel_key) {
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
platform::errors::NotFound(
"There are no kernels which are registered in the %s operator.",
op.Type()));
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(*expected_kernel_key);
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key->place_)) {
expected_kernel_key->place_ = platform::CPUPlace();
}
}
#endif
template <typename VarType>
framework::OpKernelType GetExpectedKernelKey(
const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
const framework::OperatorWithKernel& op, const platform::Place& place,
const framework::AttributeMap& attrs) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
#ifdef PADDLE_WITH_XPU
ReplaceXPUKernelIfNotExists(op, &expected_kernel_key);
#endif
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
return expected_kernel_key;
}
template <typename VarType>
NameVarMap<VarType> PrepareData(
const framework::OperatorWithKernel& op, const NameVarMap<VarType>& ins,
const framework::OpKernelType& expected_kernel_key) {
NameVarMap<VarType> tmp_ins(ins);
for (auto& name_pair : tmp_ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
SetForwardDataTypeOfGradVar(var_base);
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
} else {
VLOG(3) << "Transform Variable " << var_base->Name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
framework::Tensor out;
auto tmp_var = std::make_shared<VarType>(var_base->Name());
tmp_var->SetType(var_base->Type());
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar());
var_base = tmp_var;
}
}
}
}
return tmp_ins;
}
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx);
static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
const NameVarMap<VarBase>& outs,
const framework::OperatorWithKernel& op,
const platform::Place& place,
const framework::AttributeMap& attrs);
static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
const NameVarMap<VariableWrapper>& outs,
const framework::OperatorWithKernel& op,
const platform::Place& place,
const framework::AttributeMap& attrs);
static PreparedOp Prepare(const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key);
void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
const framework::AttributeMap& attrs);
......@@ -68,6 +167,7 @@ class PreparedOp {
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OpKernelType kernel_type_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
};
......
......@@ -32,27 +32,6 @@ namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
static framework::RuntimeContext PrepareRuntimeContext(
const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
framework::VariableValueMap inputs, outputs;
for (auto& in_pair : ins) {
auto& in_ctx = inputs[in_pair.first];
in_ctx.reserve(in_pair.second.size());
for (auto& in_var : in_pair.second) {
in_ctx.emplace_back(in_var->MutableVar());
}
}
for (auto& out_pair : outs) {
auto& out_ctx = outputs[out_pair.first];
out_ctx.reserve(out_pair.second.size());
for (auto& out_var : out_pair.second) {
out_ctx.emplace_back(out_var->MutableVar());
}
}
return framework::RuntimeContext(std::move(inputs), std::move(outputs));
}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
......@@ -111,11 +90,12 @@ TEST(test_prepare_op, test_prepare_op) {
CreateVarNameMap(info, "split", outs, false);
auto op = framework::OpRegistry::CreateOp("split", var_in_map, var_out_map,
split_attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), place,
split_attr_map);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp = PreparedOp::Prepare(
ins, outs,
dynamic_cast<framework::OperatorWithKernel&>(*op),
place, split_attr_map));
expected_kernel_key));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
......@@ -161,13 +141,15 @@ TEST(test_prepare_op, test_prepare_data) {
CreateVarNameMap(info, op_type, outs, false);
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test if it can be transformed to GPU place
PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), gpu_place,
attr_map);
for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), gpu_place));
......@@ -208,13 +190,15 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test that the data is never transferred to a GPU place
PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), cpu_place,
attr_map);
for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), cpu_place));
......
......@@ -122,10 +122,6 @@ class VariableWrapper {
framework::proto::VarType::Type Type() const { return type_; }
void SetDataType(framework::proto::VarType::Type data_type) {
data_type_ = data_type;
}
std::shared_ptr<VariableWrapper> GetGradVar() const {
return grad_var_.lock();
}
......@@ -140,6 +136,10 @@ class VariableWrapper {
bool HasGradVar() const { return !grad_var_.expired(); }
void SetDataType(framework::proto::VarType::Type data_type) {
data_type_ = data_type;
}
framework::proto::VarType::Type DataType() const {
const framework::Tensor* tensor = nullptr;
if (var_.IsInitialized()) {
......@@ -160,6 +160,14 @@ class VariableWrapper {
}
}
void SetForwardDataType(framework::proto::VarType::Type data_type) {
fwd_data_type_ = data_type;
}
framework::proto::VarType::Type ForwardDataType() const {
return fwd_data_type_;
}
const platform::Place Place() const {
const framework::Tensor* tensor = nullptr;
auto place =
......@@ -306,6 +314,13 @@ class VariableWrapper {
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
// See [ Why do we need to handle complex gradient to real gradient? ]
// Used by a grad var to get the data type of its corresponding forward var;
// if they are inconsistent, the data type of the grad var needs to be cast
// to match the forward var
framework::proto::VarType::Type fwd_data_type_{
static_cast<framework::proto::VarType::Type>(-1)};
std::weak_ptr<VariableWrapper> grad_var_;
std::weak_ptr<GradOpNode> grad_node_;
......
......@@ -269,7 +269,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")";
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
......
......@@ -75,6 +75,45 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex64>(
const paddle::platform::complex64* x, const paddle::platform::complex64* y,
const paddle::platform::complex64* out,
const paddle::platform::complex64* dout, int64_t size,
paddle::platform::complex64* dx, paddle::platform::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex64 o = dout[col];
paddle::platform::complex64 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex128>(
const paddle::platform::complex128* x,
const paddle::platform::complex128* y,
const paddle::platform::complex128* out,
const paddle::platform::complex128* dout, int64_t size,
paddle::platform::complex128* dx, paddle::platform::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex128 o = dout[col];
paddle::platform::complex128 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
......@@ -73,6 +73,27 @@ struct DivGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
};
template <>
struct DivGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <>
struct DivGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <typename T>
struct DivGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
......@@ -80,6 +101,28 @@ struct DivGradDY {
}
};
template <>
struct DivGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag);
return -dout * out_div_y_conj;
}
};
template <>
struct DivGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 out_div_y_conj((out / y).real,
-(out / y).imag);
return -dout * out_div_y_conj;
}
};
template <typename T>
struct DivDoubleDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
......
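
The complex specializations above implement the conjugate form of the division gradient, dx = dout / conj(y) and dy = -dout * conj(out / y). A quick numpy check (illustration only) of the identity conj(out / y) == conj(x / y / y) that the new test_elementwise_div_op test uses for its user_defined_grads:

import numpy as np

rng = np.random.default_rng(0)
x = rng.random((2, 3)) + 1j * rng.random((2, 3))
y = rng.random((2, 3)) + 1j * rng.random((2, 3))
out = x / y
dout = np.ones_like(out)

dx = dout / np.conj(y)         # DivGradDX for complex types
dy = -dout * np.conj(out / y)  # DivGradDY for complex types
assert np.allclose(dy, -dout * np.conj(x / y / y))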
......@@ -75,6 +75,36 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex64>(
const plat::complex64* x, const plat::complex64* y,
const plat::complex64* out, const plat::complex64* dout, int64_t size,
plat::complex64* dx, plat::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex64 o = dout[col];
dx[col] = plat::complex64(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex64(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex128>(
const plat::complex128* x, const plat::complex128* y,
const plat::complex128* out, const plat::complex128* dout, int64_t size,
plat::complex128* dx, plat::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex128 o = dout[col];
dx[col] = plat::complex128(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex128(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
......@@ -132,11 +132,53 @@ struct MulGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
};
template <>
struct MulGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <>
struct MulGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <typename T>
struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
};
template <>
struct MulGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
template <>
struct MulGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
......
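
Likewise, the complex multiplication gradient multiplies dout by the conjugate of the other operand, dx = dout * conj(y) and dy = dout * conj(x), matching the user_defined_grads in the new test_elementwise_mul_op test. A one-line numpy restatement (illustration only):

import numpy as np

x = np.array([1.0 + 2.0j, 3.0 - 1.0j])
y = np.array([2.0 - 1.0j, 0.5 + 4.0j])
dout = np.ones_like(x)
dx, dy = dout * np.conj(y), dout * np.conj(x)  # MulGradDX / MulGradDY for complex types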
......@@ -352,7 +352,8 @@ class ElementwiseOpDoubleGradWithoutDXDY
"ElementwiseOpDoubleGradWithoutDXDY");
input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
} else {
input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
input_data_type =
OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY");
}
#ifdef PADDLE_WITH_MKLDNN
......@@ -364,6 +365,19 @@ class ElementwiseOpDoubleGradWithoutDXDY
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (framework::IsComplexType(expected_kernel_type.data_type_)) {
// only promote the inputs' types when they contain a complex input
return framework::OpKernelType(tensor.type(), tensor.place(),
tensor.layout());
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
};
template <typename T>
......
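
The GetKernelTypeForVar override above keeps each input's own dtype when the kernel runs in a complex type, so a real input is not force-cast before the kernel and the promotion happens inside it; otherwise the usual cast to the kernel dtype applies. A sketch of that decision rule in Python (kernel_type_for_var is a hypothetical helper, illustration only):

import numpy as np

def kernel_type_for_var(var_dtype, expected_dtype):
    # keep the variable's own dtype when the kernel dtype is complex,
    # mirroring the override above; otherwise cast to the kernel dtype
    if np.issubdtype(expected_dtype, np.complexfloating):
        return var_dtype
    return expected_dtype

assert kernel_type_for_var(np.float32, np.complex64) is np.float32  # no forced cast
assert kernel_type_for_var(np.float32, np.float64) is np.float64    # normal transform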
......@@ -275,6 +275,15 @@ struct CUBlas<platform::complex64> {
reinterpret_cast<cuFloatComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex64 *alpha,
const complex64 *X, const int incX, complex64 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy(
handle, n, reinterpret_cast<const cuFloatComplex *>(alpha),
reinterpret_cast<const cuFloatComplex *>(X), incX,
reinterpret_cast<cuFloatComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......@@ -362,6 +371,15 @@ struct CUBlas<platform::complex128> {
reinterpret_cast<cuDoubleComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex128 *alpha,
const complex128 *X, const int incX, complex128 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy(
handle, n, reinterpret_cast<const cuDoubleComplex *>(alpha),
reinterpret_cast<const cuDoubleComplex *>(X), incX,
reinterpret_cast<cuDoubleComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......
......@@ -295,6 +295,13 @@ struct CBlas<double> {
template <>
struct CBlas<platform::complex64> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
paddle::platform::complex64 *Y, const int incY) {
platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_ccopy(args...);
......@@ -415,6 +422,13 @@ struct CBlas<platform::complex64> {
template <>
struct CBlas<platform::complex128> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
paddle::platform::complex128 *Y, const int incY) {
platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_zcopy(args...);
......@@ -598,11 +612,6 @@ struct CBlas<platform::complex64> {
cblas_ccopy(args...);
}
template <typename... ARGS>
static void VADD(ARGS... args) {
vcAdd(args...);
}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
......@@ -641,11 +650,6 @@ struct CBlas<platform::complex128> {
cblas_zcopy(args...);
}
template <typename... ARGS>
static void VADD(ARGS... args) {
vzAdd(args...);
}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
......
......@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
namespace paddle {
namespace operators {
......@@ -548,6 +550,10 @@ template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex64>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex128>;
template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
......
......@@ -448,6 +448,8 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex64>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex128>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
......@@ -23,4 +23,6 @@ using CUDAReduceSumGradKernel =
REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<int>,
CUDAReduceSumGradKernel<int64_t>);
CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex64>,
CUDAReduceSumGradKernel<paddle::platform::complex128>);
......@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <cuda.h>
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
......@@ -126,9 +128,22 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
return ret;
}
}
#endif
CUDA_ATOMIC_WRAPPER(Add, complex64) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex64(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
CUDA_ATOMIC_WRAPPER(Add, complex128) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex128(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
// For atomicMax
USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
......
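
The two wrappers above assemble a complex atomic add from two real atomic adds, relying on a complex value being laid out as its real part followed immediately by its imaginary part (`float *imag = real + 1`). The same layout assumption can be seen with a numpy view (illustration only):

import numpy as np

z = np.zeros(1, dtype=np.complex64)
view = z.view(np.float32)  # [real, imag] stored contiguously
view[0] += 1.5             # what CudaAtomicAdd(real, val.real) contributes
view[1] += 2.5             # what CudaAtomicAdd(imag, val.imag) contributes
assert z[0] == 1.5 + 2.5j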
......@@ -55,6 +55,8 @@ extern void *cublas_dso_handle;
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
......
......@@ -855,7 +855,7 @@ class OpTest(unittest.TestCase):
place,
no_check_set=None,
inplace_atol=None):
"""Chech the inplace correctness of given op (self.op_type).
"""Check the inplace correctness of given op (self.op_type).
Run the op twice with same inputs, one enable inplace and another disable, compare their outputs.
Args:
......@@ -935,7 +935,7 @@ class OpTest(unittest.TestCase):
fwd_res,
grad_op_desc,
inplace_atol=None):
"""Chech the inplace correctness of given grad_op_desc.
"""Check the inplace correctness of given grad_op_desc.
Run the grad op twice with same inputs, one enable inplace and another disable, compare their outputs.
It works like _check_forward_inplace, but the way to construct program and feed_map differs.
......@@ -1291,7 +1291,6 @@ class OpTest(unittest.TestCase):
def _assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
# It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which
# max_relative_error is 1e-7. According to the value of np.abs(a), we
......@@ -1544,6 +1543,10 @@ class OpTest(unittest.TestCase):
grad_outputs = []
for grad_out_value in user_defined_grad_outputs:
grad_outputs.append(paddle.to_tensor(grad_out_value))
# delete the inputs that do not need gradient calculation
for no_grad_val in no_grad_set:
del (inputs[no_grad_val])
grad_inputs = paddle.grad(
outputs=fluid.layers.utils.flatten(outputs),
inputs=fluid.layers.utils.flatten(inputs),
......@@ -1612,7 +1615,7 @@ class OpTest(unittest.TestCase):
targets = [
outputs[name] for name in outputs if name in output_names
]
inputs = [inputs[name] for name in inputs if name in input_to_check]
inputs = [inputs[name] for name in input_to_check if name in inputs]
grad_inputs = paddle.static.gradients(targets, inputs, grad_outputs,
no_grad_set)
fetch_list = grad_inputs
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
dtype,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.)):
super(Optimization_ex1, self).__init__()
self.theta0 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.theta1 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random((4, 4))
.astype(dtype) * 1j)
self.B = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random(
(4, 4)).astype(dtype) * 1j,
stop_gradient=False)
print(self.A)
def forward(self, mode=1):
jj = paddle.to_tensor(np.array([1j]).astype(np.complex64))
if mode == 1:
# run all calc in one step
loss = paddle.sum(self.A + (self.theta0 + self.theta1 * jj)) * (
paddle.sum(self.A + (self.theta0 + self.theta1 * jj)).conj())
return loss.real()
elif mode == 2:
# run in two step
self.theta = self.theta0 + self.theta1 * jj
loss = paddle.sum(self.A + self.theta) * (
paddle.sum(self.A + self.theta).conj())
return loss.real()
elif mode == 3:
# run without param
loss = paddle.sum(self.A + self.B) * (
paddle.sum(self.A + self.B).conj())
return loss.real()
else:
raise NotImplementedError
class TestComplexGradAccumulated(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.dtypes = ['float32', 'float64']
self.theta_size = [4, 4]
def run_backward(self, device, dtype, mode):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size, dtype)
loss = myLayer(mode)
loss.backward()
def test_case_one_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 1)
def test_case_two_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 2)
def test_case_non_param(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 3)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.),
dtype='float32'):
super(Optimization_ex1, self).__init__()
self.theta = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.randn(4, 4) + np.random.randn(4, 4) * 1j)
def forward(self):
loss = paddle.add(self.theta, self.A)
return loss.real()
class TestComplexSimpleNet(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.iter = 10
self.learning_rate = 0.5
self.theta_size = [4, 4]
def train(self, device):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size)
optimizer = paddle.optimizer.Adam(
learning_rate=self.learning_rate, parameters=myLayer.parameters())
for itr in range(self.iter):
loss = myLayer()
loss.backward()
optimizer.step()
optimizer.clear_grad()
def test_train_success(self):
for dev in self.devices:
self.train(dev)
if __name__ == '__main__':
unittest.main()
......@@ -431,7 +431,8 @@ class TestAddOp(unittest.TestCase):
class TestComplexElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.init_base_dtype()
self.dtype = np.float64
self.shape = (2, 3, 4, 5)
self.init_input_output()
self.init_grad_input_output()
......@@ -446,17 +447,15 @@ class TestComplexElementwiseAddOp(OpTest):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.x = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = self.grad_out
self.grad_y = self.grad_out
......@@ -487,5 +486,19 @@ class TestComplexElementwiseAddOp(OpTest):
user_defined_grad_outputs=[self.grad_out])
class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = np.real(self.grad_out)
self.grad_y = self.grad_out
if __name__ == '__main__':
unittest.main()
......@@ -261,5 +261,65 @@ class TestDivideOp(unittest.TestCase):
self.assertEqual((np_z == z_expected).all(), True)
class TestComplexElementwiseDivOp(OpTest):
def setUp(self):
self.op_type = "elementwise_div"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x / self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out / np.conj(self.y)
self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -13,13 +13,17 @@
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest, skip_check_grad_ci
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, compiler, program_guard
from paddle.fluid.op import Operator
import paddle.fluid as fluid
from paddle.fluid import compiler, Program, program_guard
from op_test import OpTest, skip_check_grad_ci
class ElementwiseMulOp(OpTest):
......@@ -241,5 +245,65 @@ class TestElementwiseMulOpError(unittest.TestCase):
self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2)
class TestComplexElementwiseMulOp(OpTest):
def setUp(self):
self.op_type = "elementwise_mul"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x * self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out * np.conj(self.y)
self.grad_y = self.grad_out * np.conj(self.x)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
......@@ -519,7 +519,7 @@ class TestStridedSliceAPI(unittest.TestCase):
np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
self.assertTrue(x.place.is_cuda_pinned_place())
y = x[:, ::2]
self.assertFalse(x.place.is_cuda_pinned_place())
self.assertTrue(x.place.is_cuda_pinned_place())
self.assertFalse(y.place.is_cuda_pinned_place())
......