Unverified commit 91ebc460, authored by Chen Weihang, committed by GitHub

[Cherry-pick] Complex network execute support (#29905)

* [Complex] Add support for complex grad accumulated (#29889)

* add support for complex grad accumulated

* add unittest for coverage

* update test dtype

* remove useless blank line

* [Complex] Handle complex to real after type promotion (#29855)

* try to add fwd op input dtypes

* refactor base impl

* return tmp_ins after dygraph prepare data

* fix typo found in debug

* polish comment & add complex net test

* revert detail change

* fix unittest failed

* add complex kernel condition control

* fix xpu test failed & polish comment

* polish details by review comments

* Complex op test (#29753)

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* change grad elementwise_mul for complex types (#29757)

* add conj op for complex types

* add conj for complex types

* add more test case

* add conj_op test

* modify conj api and impl

* add complex type for fill_constant_op xpu

* add setConstant for complex type

* remove complex conj test file

* user define grad for test_conj_op

* add test case for static mode of conj api

* modify conj doc

* change input args name to x

* remove useless codes

* conj support real types

* add conj test case for real number

* delete no need to calculate inputs in dygraph op_test

* delete no need to calculate inputs in dygraph op_test

* modify grad of mul for complex types

* fix the grads of inputs args order not match bug

* change the grad of div when complex types (#29804)

* change the grad of div when complex types

* fix the grads of inputs args order not match bug
Co-authored-by: chentianyu03 <chentianyu03@baidu.com>
Parent bea300dd
@@ -109,5 +109,30 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
}
}
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out) {
auto& pool = platform::DeviceContextPool::Instance();
auto* ctx = pool.Get(in.place());
out->Resize(in.dims());
// complex -> real
switch (src_type) {
case proto::VarType::COMPLEX64:
framework::VisitDataType(dst_type,
CastDataType<platform::complex64>(in, out, ctx));
break;
case proto::VarType::COMPLEX128:
framework::VisitDataType(
dst_type, CastDataType<platform::complex128>(in, out, ctx));
break;
default:
PADDLE_THROW(platform::errors::Unimplemented(
"Data type (%s) is not supported when casting complex tensor to real "
"data type.",
DataTypeToString(src_type)));
}
}
}  // namespace framework
}  // namespace paddle
@@ -33,5 +33,20 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
const OpKernelType& expected_kernel_type, const Tensor& in,
Tensor* out);
/**
* Transform complex gradient to real data type.
*
* If complex type promotion occurred in the forward op, the grad output of
* this op has a complex data type, but the forward input variable may be of
* a real type; in that case the grad input needs to be cast to the same type
* as the forward input. This casting is executed at the end of the grad op.
*
* Note: callers of this function need to ensure that dst_type is real and
* src_type is complex.
*/
void TransComplexToReal(const proto::VarType::Type& dst_type,
const proto::VarType::Type& src_type, const Tensor& in,
Tensor* out);
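// A minimal usage sketch (assuming a CPU-resident complex64 gradient tensor
// and the declaration above); the cast keeps the real part of each element:
//
// framework::Tensor grad;
// grad.Resize({2, 3});
// grad.mutable_data<platform::complex64>(platform::CPUPlace());
// framework::Tensor real_grad;
// TransComplexToReal(proto::VarType::FP32, proto::VarType::COMPLEX64, grad,
//                    &real_grad);  // real_grad now has dtype FP32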
}  // namespace framework
}  // namespace paddle
@@ -24,6 +24,7 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/lod_tensor.h"
@@ -1110,6 +1111,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
// there is inplace variable has been transferred.
TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
}
// See [ Why need handle complex gradient to real gradient? ]
// Only handle the case where the current kernel data type is complex
if (framework::IsComplexType(kernel_type_->data_type_)) {
HandleComplexGradToRealGrad(scope, runtime_ctx);
}
if (FLAGS_enable_unused_var_check) {
// skip op that uses mkldnn because it has different memory reuse strategy.
// use attr here because some GradMakers (like ActivationGradOpMaker) add
@@ -1255,6 +1263,73 @@ void OperatorWithKernel::TransferInplaceVarsBack(
}
}
void OperatorWithKernel::HandleComplexGradToRealGrad(
const Scope& scope, RuntimeContext* ctx) const {
for (auto& var_name_item : Outputs()) {
std::vector<Variable*>& output_vars = ctx->outputs[var_name_item.first];
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
// 1. find grad_var & check whether is complex tensor
auto var_name = var_name_item.second[i];
auto orig_var_name = GradOriginalVarName(var_name);
// only focus on gradient var
if (var_name == orig_var_name) {
continue;
}
auto* grad_var = output_vars[i];
// skip nullptr var
if (grad_var == nullptr) {
continue;
}
// don't process LoDTensorArray temporarily,
// add support if necessary for complex number calculations in the future
if (!VarIsTensor(*grad_var)) {
continue;
}
auto* grad_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(grad_var);
// skip nullptr tensor
if (grad_tensor == nullptr || !grad_tensor->IsInitialized()) {
continue;
}
// only focus on complex dtype now
auto src_type = grad_tensor->type();
if (!IsComplexType(src_type)) {
continue;
}
// 2. find forward var & check whether it needs to be cast
auto* var = scope.FindVar(orig_var_name);
// if forward var not exists, do nothing
if (var == nullptr) {
continue;
}
if (!VarIsTensor(*var)) {
continue;
}
const auto* tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
PADDLE_ENFORCE_NOT_NULL(
tensor,
platform::errors::Unavailable(
"Forward tensor is nullptr when handle complex data to real."));
// only need record type, the allocation may have been released
auto dst_type = tensor->saved_type();
// only focus on real dtype and need casting
if (IsComplexType(dst_type)) {
continue;
}
// 3. cast complex grad to real grad
VLOG(6) << "Transform " << framework::DataTypeToString(src_type)
<< " var `" << var_name << "` to "
<< framework::DataTypeToString(dst_type)
<< " real var in static graph.";
Tensor out;
TransComplexToReal(dst_type, src_type, *grad_tensor, &out);
SetTensorToVariable(*grad_var, out, grad_var);
}
}
}
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
......
@@ -545,6 +545,9 @@ class OperatorWithKernel : public OperatorBase {
void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
const platform::Place& place) const;
void HandleComplexGradToRealGrad(const Scope& scope,
RuntimeContext* ctx) const;
/* Inner assist methods */
// indicate kernel DataType by input data.
// By default all input data must be same.
......
@@ -197,6 +197,24 @@ class Tensor {
return type_;
}
/**
* [Add method get the saved type of tensor]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64); the types of the grad tensors
* should then be dout(complex64), dx(float32) and dy(complex64). Recognizing
* that dx should be float32 relies on the type of the forward tensor x.
* However, many of our ops have registered an InplaceInferer, which lets out
* reuse the tensor memory of x so as to save storage.
*
* In this case, the dim and type information recorded by x still exist,
* but because x has become an uninitialized tensor, the recorded type of x
* cannot be obtained with x.type(). The recorded type is still valid here,
* so we add saved_type(). This method SHOULD NOT be called in general
* scenarios.
*/
proto::VarType::Type saved_type() const { return type_; }
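// A minimal sketch of the intended call site (mirroring its use in
// OperatorWithKernel::HandleComplexGradToRealGrad): read the recorded type
// instead of calling type(), because the forward tensor's allocation may
// already have been reused in place.
//
// const auto* fwd_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
// auto dst_type = fwd_tensor->saved_type();  // valid even if uninitialized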
// memory size returns the holding memory size in byte.
size_t memory_size() const;
@@ -232,6 +250,7 @@ class Tensor {
void ResetHolderWithType(std::shared_ptr<memory::Allocation> holder,
const proto::VarType::Type type);
TensorInplaceVersion& InplaceVersionCounter() {
return inplace_version_counter_;
}
......
@@ -23,6 +23,7 @@
#include <unordered_set>
#include <utility>
#include <vector>
#include "paddle/fluid/imperative/gradient_accumulator.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/op_base.h"
@@ -239,6 +240,7 @@ void BasicEngine::Execute() {
if (var->OverridedStopGradient() || iter->second->RefCnt() > 1) {
auto tmp_var = std::make_shared<VariableWrapper>(var->Name());
tmp_var->SetType(var->Type());
tmp_var->SetForwardDataType(var->ForwardDataType());
var = tmp_var;
need_accu_var_list_.emplace_back(iter->second.get(), var);
VLOG(10) << "create temporary var of " << var->Name()
......
@@ -25,6 +25,8 @@
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
...@@ -161,6 +163,10 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { ...@@ -161,6 +163,10 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) {
PADDLE_TENSOR_ADD(float); PADDLE_TENSOR_ADD(float);
PADDLE_TENSOR_ADD(double); PADDLE_TENSOR_ADD(double);
// NOTE(chenweihang): only support complex grad tensor accumulated,
// support selected rows if needed in the future
PADDLE_TENSOR_ADD(platform::complex64);
PADDLE_TENSOR_ADD(platform::complex128);
#undef PADDLE_TENSOR_ADD
......
@@ -16,6 +16,7 @@
#include <algorithm>
#include <queue>
#include <utility>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
@@ -326,9 +327,31 @@ static void OpBaseRunImpl(const framework::OperatorBase& op,
}
VLOG(5) << LayerDebugString(op.Type(), ins, outs);
-auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs);
-prepared_op.Run(ins, outs, attrs);
/**
* [ Why need temporary inputs here? ]
*
* PrepareData should not change the original input tensors in place.
* Suppose the user defines a tensor(int) and feeds it into an op that
* overrides GetExpectedKernelForVar and converts this tensor to float
* during execution. After the dynamic graph has executed, the user-defined
* variable would be lost: the user could no longer get the originally
* defined int tensor, because it had been converted to float. This should
* be regarded as a bug in certain usage scenarios.
*
* In static graph mode, when an op is executed, a temporary scope
* `transfer_scope` is created before PrepareData; the transformed data is
* stored in the temporary scope and discarded after the op finishes, but in
* the previous dynamic graph implementation the original input was directly
* overwritten.
*/
auto expected_kernel_key =
GetExpectedKernelKey<VarType>(ins, outs, *op_kernel, place, attrs);
auto prepared_op = PreparedOp::Prepare(*op_kernel, expected_kernel_key);
auto tmp_ins = PrepareData<VarType>(*op_kernel, ins, expected_kernel_key);
prepared_op.Run(tmp_ins, outs, attrs);
VLOG(4) << LayerDebugString(op.Type(), ins, outs);
}
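// A small sketch of the guarantee described above (names as in this function;
// assuming the expected kernel requires a dtype or place transform):
//
// auto tmp_ins = PrepareData<VarType>(*op_kernel, ins, expected_kernel_key);
// // `ins` still holds the user's original variables and dtypes; only the
// // temporary copies in `tmp_ins` carry the transformed tensors.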
......
@@ -201,6 +201,14 @@ class VarBase {
framework::proto::VarType::Type DataType() const { return var_->DataType(); }
void SetForwardDataType(framework::proto::VarType::Type data_type) {
var_->SetForwardDataType(data_type);
}
framework::proto::VarType::Type ForwardDataType() const {
return var_->ForwardDataType();
}
const platform::Place Place() const { return var_->Place(); }
void ClearGradient();
......
@@ -857,6 +857,7 @@ void PartialGradTask::RunEachOp(OpBase *op) {
auto new_grad_var = std::make_shared<VarBase>(true, grad_var->Name());
new_grad_var->SetOverridedStopGradient(false);
new_grad_var->SetForwardDataType(grad_var->ForwardDataType());
if (new_grad_var_iter->second->TotalRefCnt() > 1) {
grads_to_accumulate_.emplace_back(new_grad_var_iter->second.get(),
new_grad_var->SharedVar());
......
@@ -16,12 +16,10 @@
#include <sstream>
-#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/imperative/infer_shape_context.h"
#include "paddle/fluid/imperative/infer_var_type_context.h"
-DECLARE_bool(use_mkldnn);
namespace paddle {
namespace imperative {
@@ -36,26 +34,32 @@ const framework::Tensor* GetTensorFromVar(const framework::Variable& var) {
}
template <typename VarType>
-static void PrepareData(const platform::Place& place,
-const NameVarMap<VarType>& ins,
-const framework::OperatorWithKernel& op,
-const framework::OpKernelType& expected_kernel_key) {
-for (const auto& name_pair : ins) {
-for (const auto& var_base : name_pair.second) {
-const auto* tensor = GetTensorFromVar(var_base->Var());
-if (tensor && tensor->IsInitialized()) {
-auto kernel_type_for_var = op.GetKernelTypeForVar(
-name_pair.first, *tensor, expected_kernel_key);
-if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
-continue;
-} else {
-VLOG(3) << "Transform Variable " << var_base->Name() << " from "
-<< kernel_type_for_var << " to " << expected_kernel_key;
-framework::Tensor out;
-TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
-&out);
-SetTensorToVariable(var_base->Var(), out, var_base->MutableVar());
-}
-}
-}
-}
-}
static void HandleComplexGradToRealGrad(const NameVarMap<VarType>& outs) {
for (auto& pair : outs) {
for (auto& var : pair.second) {
if (var == nullptr) {
continue;
}
if (var->ForwardDataType() ==
static_cast<framework::proto::VarType::Type>(-1)) {
VLOG(6) << "Var (" << var->Name()
<< ")'s forward data type is not set.";
continue;
}
if (!framework::IsComplexType(var->DataType()) ||
framework::IsComplexType(var->ForwardDataType())) {
continue;
}
const auto* tensor = GetTensorFromVar(var->Var());
if (tensor && tensor->IsInitialized()) {
VLOG(6) << "Transform " << framework::DataTypeToString(var->DataType())
<< " var `" << var->Name() << "` to "
<< framework::DataTypeToString(var->ForwardDataType())
<< " real var in dynamic graph.";
framework::Tensor out;
framework::TransComplexToReal(var->ForwardDataType(), var->DataType(),
*tensor, &out);
SetTensorToVariable(var->Var(), out, var->MutableVar());
}
}
}
}
@@ -63,18 +67,20 @@ static void PrepareData(const platform::Place& place,
PreparedOp::PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx)
-: op_(op), ctx_(ctx), func_(func), dev_ctx_(dev_ctx) {}
: op_(op),
ctx_(ctx),
kernel_type_(kernel_type),
func_(func),
dev_ctx_(dev_ctx) {}
-template <typename VarType>
-PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
-const NameVarMap<VarType>& outs,
-const framework::OperatorWithKernel& op,
-platform::Place place,
-const framework::AttributeMap& attrs) {
PreparedOp PreparedOp::Prepare(
const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-auto* dev_ctx = pool.Get(place);
auto* dev_ctx = pool.Get(expected_kernel_key.place_);
// check if op[type] has kernel registered.
auto& all_op_kernels = op.AllOpKernels();
@@ -89,62 +95,20 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second;
framework::RuntimeContext ctx({}, {});
-#ifdef PADDLE_WITH_MKLDNN
-// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
-// GetKernelType functions, so we need to copy the attributes there.
-// Const qualifier of Attrs had to be discarded to overwrite it.
-if (FLAGS_use_mkldnn) {
-auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
-mutable_op_attrs = attrs;
-}
-#endif
-auto expected_kernel_key =
-op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
-op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
-VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_XPU
-if (kernel_iter == kernels.end() &&
-is_xpu_place(expected_kernel_key.place_)) {
-expected_kernel_key.place_ = platform::CPUPlace();
-kernel_iter = kernels.find(expected_kernel_key);
-}
-#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
"Operator %s does not have kernel for %s.", op.Type(),
KernelTypeToString(expected_kernel_key)));
-if (!(expected_kernel_key.place_ == place)) {
-dev_ctx = pool.Get(expected_kernel_key.place_);
-place = dev_ctx->GetPlace();
-}
-PrepareData<VarType>(place, ins, op, expected_kernel_key);
-return PreparedOp(op, ctx, kernel_iter->second, dev_ctx);
-}
return PreparedOp(op, ctx, expected_kernel_key, kernel_iter->second, dev_ctx);
}
-PreparedOp PreparedOp::Prepare(const NameVarMap<VarBase>& ins,
-const NameVarMap<VarBase>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs) {
-return PrepareOpImpl<VarBase>(ins, outs, op, place, attrs);
-}
-PreparedOp PreparedOp::Prepare(const NameVarMap<VariableWrapper>& ins,
-const NameVarMap<VariableWrapper>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs) {
-return PrepareOpImpl<VariableWrapper>(ins, outs, op, place, attrs);
-}
template <typename VarType>
static void PreparedOpRunImpl(
const framework::OperatorBase& op, const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx, const NameVarMap<VarType>& ins,
const NameVarMap<VarType>& outs, const framework::AttributeMap& attrs) {
@@ -158,19 +122,36 @@ static void PreparedOpRunImpl(
func(DygraphExecutionContext<VarType>(op, scope, *dev_ctx, ctx, ins, outs,
attrs));
/**
* [ Why need handle complex gradient to real gradient? ]
*
* After the introduction of complex number calculations, Ops that support
* complex number calculations generally support type promotion, such as
* x(float32) + y(complex64) = out(complex64); the types of the grad
* tensors should then be dout(complex64), dx(float32), dy(complex64).
*
* But because dout is complex64, dx is also complex64 after the grad
* op kernel has executed. We need to recognize this situation and
* convert dx to float32; HandleComplexGradToRealGrad does exactly that.
*/
if (framework::IsComplexType(kernel_type.data_type_)) {
HandleComplexGradToRealGrad<VarType>(outs);
}
}
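// A worked example of the comment above (with the dtypes it assumes): the
// forward computes out = x * y with x(float32) and y(complex64), so the
// promoted kernel runs in complex64. In backward, dout is complex64 and the
// grad kernel first produces dx as complex64; because x's forward data type
// is float32, HandleComplexGradToRealGrad casts dx back to float32 (keeping
// its real part), while dy stays complex64.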
void PreparedOp::Run(const NameVarMap<VarBase>& ins,
const NameVarMap<VarBase>& outs,
const framework::AttributeMap& attrs) {
-PreparedOpRunImpl<VarBase>(op_, ctx_, func_, dev_ctx_, ins, outs, attrs);
PreparedOpRunImpl<VarBase>(op_, ctx_, kernel_type_, func_, dev_ctx_, ins,
outs, attrs);
}
void PreparedOp::Run(const NameVarMap<VariableWrapper>& ins,
const NameVarMap<VariableWrapper>& outs,
const framework::AttributeMap& attrs) {
-PreparedOpRunImpl<VariableWrapper>(op_, ctx_, func_, dev_ctx_, ins, outs,
-attrs);
PreparedOpRunImpl<VariableWrapper>(op_, ctx_, kernel_type_, func_, dev_ctx_,
ins, outs, attrs);
}
}  // namespace imperative
......
@@ -21,9 +21,12 @@
#include "paddle/fluid/framework/data_transform.h"
#include "paddle/fluid/framework/op_kernel_type.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/imperative/execution_context.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/type_defs.h"
DECLARE_bool(use_mkldnn);
namespace paddle {
namespace framework {
class Tensor;
@@ -39,24 +42,120 @@ namespace imperative {
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
template <typename VarType>
static void SetForwardDataTypeOfGradVar(const std::shared_ptr<VarType>& var);
template <>
void SetForwardDataTypeOfGradVar<VariableWrapper>(
const std::shared_ptr<VariableWrapper>& var) {
if (var->HasGradVar()) {
auto grad_var = var->GetGradVar();
VLOG(6) << "Set grad var (" << grad_var->Name() << ") dtype to ("
<< framework::DataTypeToString(var->DataType()) << ").";
grad_var->SetForwardDataType(var->DataType());
}
}
template <>
void SetForwardDataTypeOfGradVar<VarBase>(const std::shared_ptr<VarBase>& var) {
if (var->HasGradVar()) {
auto& shared_var = var->SharedVar();
SetForwardDataTypeOfGradVar<VariableWrapper>(shared_var);
}
}
#ifdef PADDLE_WITH_XPU
static void ReplaceXPUKernelIfNotExists(
const framework::OperatorWithKernel& op,
framework::OpKernelType* expected_kernel_key) {
auto& all_op_kernels = op.AllOpKernels();
auto kernels_iter = all_op_kernels.find(op.Type());
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
platform::errors::NotFound(
"There are no kernels which are registered in the %s operator.",
op.Type()));
auto& kernels = kernels_iter->second;
auto kernel_iter = kernels.find(*expected_kernel_key);
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key->place_)) {
expected_kernel_key->place_ = platform::CPUPlace();
}
}
#endif
template <typename VarType>
framework::OpKernelType GetExpectedKernelKey(
const NameVarMap<VarType>& ins, const NameVarMap<VarType>& outs,
const framework::OperatorWithKernel& op, const platform::Place& place,
const framework::AttributeMap& attrs) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
// GetKernelType functions, so we need to copy the attributes there.
// Const qualifier of Attrs had to be discarded to overwrite it.
if (FLAGS_use_mkldnn) {
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
}
#endif
auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
#ifdef PADDLE_WITH_XPU
ReplaceXPUKernelIfNotExists(op, &expected_kernel_key);
#endif
VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
return expected_kernel_key;
}
template <typename VarType>
NameVarMap<VarType> PrepareData(
const framework::OperatorWithKernel& op, const NameVarMap<VarType>& ins,
const framework::OpKernelType& expected_kernel_key) {
NameVarMap<VarType> tmp_ins(ins);
for (auto& name_pair : tmp_ins) {
for (auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var());
SetForwardDataTypeOfGradVar(var_base);
if (tensor && tensor->IsInitialized()) {
auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
continue;
} else {
VLOG(3) << "Transform Variable " << var_base->Name() << " from "
<< kernel_type_for_var << " to " << expected_kernel_key;
framework::Tensor out;
auto tmp_var = std::make_shared<VarType>(var_base->Name());
tmp_var->SetType(var_base->Type());
TransformData(expected_kernel_key, kernel_type_for_var, *tensor,
&out);
SetTensorToVariable(var_base->Var(), out, tmp_var->MutableVar());
var_base = tmp_var;
}
}
}
}
return tmp_ins;
}
class PreparedOp {
public:
PreparedOp(const framework::OperatorBase& op,
const framework::RuntimeContext& ctx,
const framework::OpKernelType& kernel_type,
const framework::OperatorWithKernel::OpKernelFunc& func,
platform::DeviceContext* dev_ctx);
-static PreparedOp Prepare(const NameVarMap<VarBase>& ins,
-const NameVarMap<VarBase>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs);
-static PreparedOp Prepare(const NameVarMap<VariableWrapper>& ins,
-const NameVarMap<VariableWrapper>& outs,
-const framework::OperatorWithKernel& op,
-const platform::Place& place,
-const framework::AttributeMap& attrs);
static PreparedOp Prepare(const framework::OperatorWithKernel& op,
const framework::OpKernelType& expected_kernel_key);
void Run(const NameVarMap<VarBase>& in, const NameVarMap<VarBase>& out,
const framework::AttributeMap& attrs);
@@ -68,6 +167,7 @@ class PreparedOp {
private:
const framework::OperatorBase& op_;
const framework::RuntimeContext& ctx_;
framework::OpKernelType kernel_type_;
framework::OperatorWithKernel::OpKernelFunc func_;
platform::DeviceContext* dev_ctx_;
};
......
@@ -32,27 +32,6 @@ namespace framework = paddle::framework;
namespace paddle {
namespace imperative {
-static framework::RuntimeContext PrepareRuntimeContext(
-const NameVarBaseMap& ins, const NameVarBaseMap& outs) {
-framework::VariableValueMap inputs, outputs;
-for (auto& in_pair : ins) {
-auto& in_ctx = inputs[in_pair.first];
-in_ctx.reserve(in_pair.second.size());
-for (auto& in_var : in_pair.second) {
-in_ctx.emplace_back(in_var->MutableVar());
-}
-}
-for (auto& out_pair : outs) {
-auto& out_ctx = outputs[out_pair.first];
-out_ctx.reserve(out_pair.second.size());
-for (auto& out_var : out_pair.second) {
-out_ctx.emplace_back(out_var->MutableVar());
-}
-}
-return framework::RuntimeContext(std::move(inputs), std::move(outputs));
-}
static framework::VariableNameMap CreateVarNameMap(
const framework::OpInfo& op_info, const std::string& op_type,
const NameVarBaseMap& varbase_map, bool is_input) {
@@ -111,11 +90,12 @@ TEST(test_prepare_op, test_prepare_op) {
CreateVarNameMap(info, "split", outs, false);
auto op = framework::OpRegistry::CreateOp("split", var_in_map, var_out_map,
split_attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), place,
split_attr_map);
ASSERT_NO_FATAL_FAILURE(PreparedOp preparedOp = PreparedOp::Prepare(
-ins, outs,
dynamic_cast<framework::OperatorWithKernel&>(*op),
-place, split_attr_map));
expected_kernel_key));
}
const framework::Tensor* GetTensorFromVar(const framework::Variable& var);
@@ -161,13 +141,15 @@ TEST(test_prepare_op, test_prepare_data) {
CreateVarNameMap(info, op_type, outs, false);
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test if it can be transformed to GPU place
-PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), gpu_place,
attr_map);
-for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), gpu_place));
@@ -208,13 +190,15 @@ void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
auto op = framework::OpRegistry::CreateOp(op_type, var_in_map, var_out_map,
attr_map);
-framework::RuntimeContext ctx = PrepareRuntimeContext(ins, outs);
// test if it never transferred on GPU place
-PreparedOp prepared_op = PreparedOp::Prepare(
auto expected_kernel_key = GetExpectedKernelKey<imperative::VarBase>(
ins, outs, dynamic_cast<framework::OperatorWithKernel&>(*op), cpu_place,
attr_map);
-for (const auto& name_pair : ins) {
imperative::NameVarBaseMap tmp_ins = PrepareData<imperative::VarBase>(
dynamic_cast<framework::OperatorWithKernel&>(*op), ins,
expected_kernel_key);
for (const auto& name_pair : tmp_ins) {
for (const auto& vb : name_pair.second) {
ASSERT_TRUE(platform::is_same_place(
vb->Var().Get<framework::LoDTensor>().place(), cpu_place));
......
@@ -122,10 +122,6 @@ class VariableWrapper {
framework::proto::VarType::Type Type() const { return type_; }
-void SetDataType(framework::proto::VarType::Type data_type) {
-data_type_ = data_type;
-}
std::shared_ptr<VariableWrapper> GetGradVar() const {
return grad_var_.lock();
}
@@ -140,6 +136,10 @@ class VariableWrapper {
bool HasGradVar() const { return !grad_var_.expired(); }
void SetDataType(framework::proto::VarType::Type data_type) {
data_type_ = data_type;
}
framework::proto::VarType::Type DataType() const {
const framework::Tensor* tensor = nullptr;
if (var_.IsInitialized()) {
@@ -160,6 +160,14 @@ class VariableWrapper {
}
}
void SetForwardDataType(framework::proto::VarType::Type data_type) {
fwd_data_type_ = data_type;
}
framework::proto::VarType::Type ForwardDataType() const {
return fwd_data_type_;
}
const platform::Place Place() const {
const framework::Tensor* tensor = nullptr;
auto place =
@@ -306,6 +314,13 @@ class VariableWrapper {
framework::proto::VarType::Type type_{framework::proto::VarType::LOD_TENSOR};
framework::proto::VarType::Type data_type_{framework::proto::VarType::FP32};
// See [ Why need handle complex gradient to real gradient? ]
// Used by a grad var to get the data type of its corresponding forward var;
// if they are inconsistent, the data type of the grad var needs to be cast
// to be consistent with the forward var
framework::proto::VarType::Type fwd_data_type_{
static_cast<framework::proto::VarType::Type>(-1)};
std::weak_ptr<VariableWrapper> grad_var_;
std::weak_ptr<GradOpNode> grad_node_;
......
@@ -269,7 +269,7 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
-<< dst_place << " by thream(" << stream << ")";
<< dst_place << " by stream(" << stream << ")";
if (dst_place == src_place) {
platform::SetDeviceId(src_place.device);
if (stream) {
......
@@ -75,6 +75,45 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex64>(
const paddle::platform::complex64* x, const paddle::platform::complex64* y,
const paddle::platform::complex64* out,
const paddle::platform::complex64* dout, int64_t size,
paddle::platform::complex64* dx, paddle::platform::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex64 o = dout[col];
paddle::platform::complex64 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex64 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex128>(
const paddle::platform::complex128* x,
const paddle::platform::complex128* y,
const paddle::platform::complex128* out,
const paddle::platform::complex128* dout, int64_t size,
paddle::platform::complex128* dx, paddle::platform::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
paddle::platform::complex128 o = dout[col];
paddle::platform::complex128 y_conj(y[col].real, -y[col].imag);
paddle::platform::complex128 out_div_y_conj((out[col] / y[col]).real,
-(out[col] / y[col]).imag);
dx[col] = o / y_conj;
dy[col] = -o * out_div_y_conj;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
@@ -73,6 +73,27 @@ struct DivGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
};
template <>
struct DivGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <>
struct DivGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout / y_conj;
}
};
template <typename T>
struct DivGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
@@ -80,6 +101,28 @@ struct DivGradDY {
}
};
template <>
struct DivGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 out_div_y_conj((out / y).real, -(out / y).imag);
return -dout * out_div_y_conj;
}
};
template <>
struct DivGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 out_div_y_conj((out / y).real,
-(out / y).imag);
return -dout * out_div_y_conj;
}
};
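// For reference, the formulas implied by the specializations above (conjugate
// gradients, so that real-valued losses propagate correctly through complex
// division): for out = x / y,
//   dx = dout / conj(y)
//   dy = -dout * conj(out / y) = -dout * conj(x / (y * y))
// For real types conj is the identity, which reduces to the generic
// DivGradDX / DivGradDY above.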
template <typename T>
struct DivDoubleDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
......
@@ -75,6 +75,36 @@ static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex64>(
const plat::complex64* x, const plat::complex64* y,
const plat::complex64* out, const plat::complex64* dout, int64_t size,
plat::complex64* dx, plat::complex64* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex64 o = dout[col];
dx[col] = plat::complex64(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex64(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex128>(
const plat::complex128* x, const plat::complex128* y,
const plat::complex128* out, const plat::complex128* dout, int64_t size,
plat::complex128* dx, plat::complex128* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
while (col < size) {
plat::complex128 o = dout[col];
dx[col] = plat::complex128(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex128(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
......
@@ -132,11 +132,53 @@ struct MulGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
};
template <>
struct MulGradDX<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <>
struct MulGradDX<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 y_conj(y.real, -y.imag);
return dout * y_conj;
}
};
template <typename T>
struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
};
template <>
struct MulGradDY<paddle::platform::complex64> {
HOSTDEVICE paddle::platform::complex64 operator()(
paddle::platform::complex64 x, paddle::platform::complex64 y,
paddle::platform::complex64 out, paddle::platform::complex64 dout) const {
paddle::platform::complex64 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
template <>
struct MulGradDY<paddle::platform::complex128> {
HOSTDEVICE paddle::platform::complex128 operator()(
paddle::platform::complex128 x, paddle::platform::complex128 y,
paddle::platform::complex128 out,
paddle::platform::complex128 dout) const {
paddle::platform::complex128 x_conj(x.real, -x.imag);
return dout * x_conj;
}
};
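// For reference, the formulas implied by the specializations above: for
// out = x * y with complex inputs, the conjugate gradients are
//   dx = dout * conj(y)
//   dy = dout * conj(x)
// which reduce to the generic MulGradDX / MulGradDY when x and y are real.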
template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
......
@@ -352,7 +352,8 @@ class ElementwiseOpDoubleGradWithoutDXDY
"ElementwiseOpDoubleGradWithoutDXDY");
input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
} else {
-input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "DDX");
input_data_type =
OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "DDX", "DDY");
}
#ifdef PADDLE_WITH_MKLDNN
@@ -364,6 +365,19 @@ class ElementwiseOpDoubleGradWithoutDXDY
#endif
return framework::OpKernelType(input_data_type, ctx.GetPlace());
}
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const {
if (framework::IsComplexType(expected_kernel_type.data_type_)) {
// only promote inputs' types when the inputs contain a complex input
return framework::OpKernelType(tensor.type(), tensor.place(),
tensor.layout());
} else {
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}
}
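// A worked example of the override above (dtypes assumed for illustration):
// with DDX(float32) and DDY(complex64), the promoted kernel type is
// complex64. Reporting each tensor's own type() here makes NeedTransform()
// see a dtype mismatch for DDX, so data preparation casts (promotes) DDX to
// complex64 before the kernel runs; the else branch keeps the default
// behavior, where no dtype cast is triggered.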
};
template <typename T>
......
@@ -275,6 +275,15 @@ struct CUBlas<platform::complex64> {
reinterpret_cast<cuFloatComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex64 *alpha,
const complex64 *X, const int incX, complex64 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasCaxpy(
handle, n, reinterpret_cast<const cuFloatComplex *>(alpha),
reinterpret_cast<const cuFloatComplex *>(X), incX,
reinterpret_cast<cuFloatComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
@@ -362,6 +371,15 @@ struct CUBlas<platform::complex128> {
reinterpret_cast<cuDoubleComplex *>(C), ldc));
}
static void AXPY(cublasHandle_t handle, int n, const complex128 *alpha,
const complex128 *X, const int incX, complex128 *Y,
const int incY) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cublasZaxpy(
handle, n, reinterpret_cast<const cuDoubleComplex *>(alpha),
reinterpret_cast<const cuDoubleComplex *>(X), incX,
reinterpret_cast<cuDoubleComplex *>(Y), incY));
}
static void GEMM_STRIDED_BATCH(cublasHandle_t handle,
cublasOperation_t transa,
cublasOperation_t transb, int m, int n, int k,
......
@@ -295,6 +295,13 @@ struct CBlas<double> {
template <>
struct CBlas<platform::complex64> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
paddle::platform::complex64 *Y, const int incY) {
platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_ccopy(args...);
@@ -415,6 +422,13 @@ struct CBlas<platform::complex64> {
template <>
struct CBlas<platform::complex128> {
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
paddle::platform::complex128 *Y, const int incY) {
platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY);
}
template <typename... ARGS>
static void VCOPY(ARGS... args) {
platform::dynload::cblas_zcopy(args...);
@@ -598,11 +612,6 @@ struct CBlas<platform::complex64> {
cblas_ccopy(args...);
}
-template <typename... ARGS>
-static void VADD(ARGS... args) {
-vcAdd(args...);
-}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex64 alpha,
const paddle::platform::complex64 *X, const int incX,
@@ -641,11 +650,6 @@ struct CBlas<platform::complex128> {
cblas_zcopy(args...);
}
-template <typename... ARGS>
-static void VADD(ARGS... args) {
-vzAdd(args...);
-}
template <typename... ARGS>
static void AXPY(int n, const paddle::platform::complex128 alpha,
const paddle::platform::complex128 *X, const int incX,
......
@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
namespace paddle {
namespace operators {
@@ -548,6 +550,10 @@ template struct MergeAdd<platform::CPUDeviceContext, int>;
template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
template struct MergeAdd<platform::CPUDeviceContext, float>;
template struct MergeAdd<platform::CPUDeviceContext, double>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex64>;
template struct MergeAdd<platform::CPUDeviceContext,
paddle::platform::complex128>;
template struct MergeAverage<platform::CPUDeviceContext, int>;
template struct MergeAverage<platform::CPUDeviceContext, int64_t>;
......
@@ -448,6 +448,8 @@ template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex64>;
template struct MergeAdd<platform::CUDADeviceContext, platform::complex128>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
@@ -23,4 +23,6 @@ using CUDAReduceSumGradKernel =
REGISTER_OP_CUDA_KERNEL(reduce_sum_grad, CUDAReduceSumGradKernel<float>,
CUDAReduceSumGradKernel<double>,
CUDAReduceSumGradKernel<int>,
-CUDAReduceSumGradKernel<int64_t>);
CUDAReduceSumGradKernel<int64_t>,
CUDAReduceSumGradKernel<paddle::platform::complex64>,
CUDAReduceSumGradKernel<paddle::platform::complex128>);
@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <cuda.h>
#include <stdio.h>
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
...@@ -126,9 +128,22 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { ...@@ -126,9 +128,22 @@ CUDA_ATOMIC_WRAPPER(Add, float16) {
return ret; return ret;
} }
} }
#endif #endif
CUDA_ATOMIC_WRAPPER(Add, complex64) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex64(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
CUDA_ATOMIC_WRAPPER(Add, complex128) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex128(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}
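// For reference (behavior of the two wrappers above): a complex atomic add is
// performed as two independent CudaAtomicAdd calls on the real and imaginary
// parts, relying on complex64/complex128 storing {real, imag} contiguously;
// each component is updated atomically, but the pair is not read back as a
// single atomic unit.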
// For atomicMax
USE_CUDA_ATOMIC(Max, int);
USE_CUDA_ATOMIC(Max, unsigned int);
......
@@ -55,6 +55,8 @@ extern void *cublas_dso_handle;
#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
__macro(cublasSaxpy_v2); \
__macro(cublasDaxpy_v2); \
__macro(cublasCaxpy_v2); \
__macro(cublasZaxpy_v2); \
__macro(cublasSscal_v2); \
__macro(cublasDscal_v2); \
__macro(cublasScopy_v2); \
......
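cublasCaxpy_v2 and cublasZaxpy_v2 are the single- and double-precision complex axpy routines (y = alpha * x + y); loading them presumably lets the BLAS-based accumulation path also run on complex gradients. In NumPy terms the operation is simply the following (a sketch, with alpha fixed to 1 as in plain accumulation):

import numpy as np

alpha = np.complex64(1)                      # gradient accumulation uses alpha == 1
x = (np.arange(4) + 1j * np.arange(4)).astype(np.complex64)
y = np.ones(4, dtype=np.complex64)

y += alpha * x                               # what cublasCaxpy_v2 computes in place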
...@@ -855,7 +855,7 @@ class OpTest(unittest.TestCase):
place,
no_check_set=None,
inplace_atol=None):
"""Check the inplace correctness of given op (self.op_type).
Run the op twice with the same inputs, once with inplace enabled and once disabled, and compare their outputs.
Args:
...@@ -935,7 +935,7 @@ class OpTest(unittest.TestCase):
fwd_res,
grad_op_desc,
inplace_atol=None):
"""Check the inplace correctness of given grad_op_desc.
Run the grad op twice with the same inputs, once with inplace enabled and once disabled, and compare their outputs.
It works like _check_forward_inplace, but the way to construct program and feed_map differs.
...@@ -1291,7 +1291,6 @@ class OpTest(unittest.TestCase):
def _assert_is_close(self, numeric_grads, analytic_grads, names,
max_relative_error, msg_prefix):
for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
# It asserts np.abs(a - b) / np.abs(a) < max_relative_error, in which
# max_relative_error is 1e-7. According to the value of np.abs(a), we
...@@ -1544,6 +1543,10 @@ class OpTest(unittest.TestCase):
grad_outputs = []
for grad_out_value in user_defined_grad_outputs:
grad_outputs.append(paddle.to_tensor(grad_out_value))
# delete the inputs which do not need to calculate grad
for no_grad_val in no_grad_set:
del (inputs[no_grad_val])
grad_inputs = paddle.grad(
outputs=fluid.layers.utils.flatten(outputs),
inputs=fluid.layers.utils.flatten(inputs),
...@@ -1612,7 +1615,7 @@ class OpTest(unittest.TestCase):
targets = [
outputs[name] for name in outputs if name in output_names
]
inputs = [inputs[name] for name in input_to_check if name in inputs]
grad_inputs = paddle.static.gradients(targets, inputs, grad_outputs,
no_grad_set)
fetch_list = grad_inputs
......
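The op_test change above drops every variable in no_grad_set from the dygraph inputs before calling paddle.grad, so gradients are only requested for tensors that actually need them. A hedged sketch of the same idea outside the test framework (names and values are illustrative):

import numpy as np
import paddle

x = paddle.to_tensor(np.random.rand(2, 2), stop_gradient=False)
y = paddle.to_tensor(np.random.rand(2, 2), stop_gradient=False)
out = x * y

inputs = {'X': x, 'Y': y}
for name in {'X'}:                    # mirrors: del (inputs[no_grad_val])
    del inputs[name]

grads = paddle.grad(outputs=[out],
                    inputs=list(inputs.values()),
                    grad_outputs=[paddle.ones_like(out)])
print(grads[0].shape)                 # gradient w.r.t. Y only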
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
dtype,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.)):
super(Optimization_ex1, self).__init__()
self.theta0 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.theta1 = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random((4, 4))
.astype(dtype) * 1j)
self.B = paddle.to_tensor(
np.random.random((4, 4)).astype(dtype) + np.random.random(
(4, 4)).astype(dtype) * 1j,
stop_gradient=False)
print(self.A)
def forward(self, mode=1):
jj = paddle.to_tensor(np.array([1j]).astype(np.complex64))
if mode == 1:
# run all calc in one step
loss = paddle.sum(self.A + (self.theta0 + self.theta1 * jj)) * (
paddle.sum(self.A + (self.theta0 + self.theta1 * jj)).conj())
return loss.real()
elif mode == 2:
# run in two steps
self.theta = self.theta0 + self.theta1 * jj
loss = paddle.sum(self.A + self.theta) * (
paddle.sum(self.A + self.theta).conj())
return loss.real()
elif mode == 3:
# run without param
loss = paddle.sum(self.A + self.B) * (
paddle.sum(self.A + self.B).conj())
return loss.real()
else:
raise NotImplementedError
class TestComplexGradAccumulated(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.dtypes = ['float32', 'float64']
self.theta_size = [4, 4]
def run_backward(self, device, dtype, mode):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size, dtype)
loss = myLayer(mode)
loss.backward()
def test_case_one_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 1)
def test_case_two_step(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 2)
def test_case_non_param(self):
for dev in self.devices:
for dtype in self.dtypes:
self.run_backward(dev, dtype, 3)
if __name__ == '__main__':
unittest.main()
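For reference, a minimal sketch of the accumulation this test exercises (assuming the conj()/real() tensor methods introduced alongside this change): a real parameter reaches the loss through two paths, once directly and once through the conjugated factor, so its two complex gradient contributions are accumulated and then cast back to the parameter's real dtype.

import numpy as np
import paddle

theta = paddle.to_tensor(np.ones((2, 2), dtype='float32'), stop_gradient=False)
A = paddle.to_tensor((np.ones((2, 2)) + 1j * np.ones((2, 2))).astype('complex64'))

s = paddle.sum(A + theta)
loss = (s * s.conj()).real()   # theta is used via s and via s.conj()
loss.backward()
print(theta.grad)              # a single real-valued, accumulated gradient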
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid.core as core
class Optimization_ex1(paddle.nn.Layer):
def __init__(self,
shape,
param_attr=paddle.nn.initializer.Uniform(
low=-5., high=5.),
dtype='float32'):
super(Optimization_ex1, self).__init__()
self.theta = self.create_parameter(
shape=shape, attr=param_attr, dtype=dtype, is_bias=False)
self.A = paddle.to_tensor(
np.random.randn(4, 4) + np.random.randn(4, 4) * 1j)
def forward(self):
loss = paddle.add(self.theta, self.A)
return loss.real()
class TestComplexSimpleNet(unittest.TestCase):
def setUp(self):
self.devices = ['cpu']
if core.is_compiled_with_cuda():
self.devices.append('gpu')
self.iter = 10
self.learning_rate = 0.5
self.theta_size = [4, 4]
def train(self, device):
paddle.set_device(device)
myLayer = Optimization_ex1(self.theta_size)
optimizer = paddle.optimizer.Adam(
learning_rate=self.learning_rate, parameters=myLayer.parameters())
for itr in range(self.iter):
loss = myLayer()
loss.backward()
optimizer.step()
optimizer.clear_grad()
def test_train_success(self):
for dev in self.devices:
self.train(dev)
if __name__ == '__main__':
unittest.main()
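A similarly minimal sketch of what TestComplexSimpleNet relies on (illustrative, not part of the test): the forward pass promotes the real parameter to complex, but the gradient handed back to the optimizer is cast to the parameter's real dtype, which is what makes the plain Adam step possible.

import numpy as np
import paddle

theta = paddle.to_tensor(np.zeros((4, 4), dtype='float32'), stop_gradient=False)
A = paddle.to_tensor(np.random.randn(4, 4) + 1j * np.random.randn(4, 4))

loss = paddle.add(theta, A).real().sum()
loss.backward()
print(theta.grad)   # expected to have theta's real dtype (float32)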
...@@ -431,7 +431,8 @@ class TestAddOp(unittest.TestCase):
class TestComplexElementwiseAddOp(OpTest):
def setUp(self):
self.op_type = "elementwise_add"
self.dtype = np.float64
self.shape = (2, 3, 4, 5)
self.init_input_output()
self.init_grad_input_output()
...@@ -446,17 +447,15 @@ class TestComplexElementwiseAddOp(OpTest):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = self.grad_out
self.grad_y = self.grad_out
...@@ -487,5 +486,19 @@ class TestComplexElementwiseAddOp(OpTest):
user_defined_grad_outputs=[self.grad_out])
class TestRealComplexElementwiseAddOp(TestComplexElementwiseAddOp):
def init_input_output(self):
self.x = np.random.random(self.shape).astype(self.dtype)
self.y = np.random.random(self.shape).astype(
self.dtype) + 1J * np.random.random(self.shape).astype(self.dtype)
self.out = self.x + self.y
def init_grad_input_output(self):
self.grad_out = np.ones(self.shape, self.dtype) + 1J * np.ones(
self.shape, self.dtype)
self.grad_x = np.real(self.grad_out)
self.grad_y = self.grad_out
if __name__ == '__main__':
unittest.main()
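The new TestRealComplexElementwiseAddOp feeds a real X and a complex Y; its user-defined gradients encode the complex-to-real cast added earlier in this patch: only the real part of the upstream gradient flows back to the real input. A quick NumPy restatement (a sketch, not part of the suite):

import numpy as np

shape = (2, 3, 4, 5)
grad_out = np.ones(shape) + 1J * np.ones(shape)

grad_x = np.real(grad_out)   # real input -> gradient cast to real
grad_y = grad_out            # complex input -> full complex gradient

assert grad_x.dtype == np.float64 and grad_y.dtype == np.complex128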
...@@ -261,5 +261,65 @@ class TestDivideOp(unittest.TestCase):
self.assertEqual((np_z == z_expected).all(), True)
class TestComplexElementwiseDivOp(OpTest):
def setUp(self):
self.op_type = "elementwise_div"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x / self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out / np.conj(self.y)
self.grad_y = -self.grad_out * np.conj(self.x / self.y / self.y)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
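As a sanity check on the user-defined complex division gradients above (grad_x = grad_out / conj(y), grad_y = -grad_out * conj(x / y / y)): the two formulas are linked by grad_y = -grad_x * conj(out), since out = x / y. A short NumPy verification (a sketch, not part of the suite):

import numpy as np

rng = np.random.default_rng(0)
x = rng.random((2, 3)) + 1J * rng.random((2, 3))
y = rng.random((2, 3)) + 1J * rng.random((2, 3))
out = x / y

grad_out = np.ones_like(out)
grad_x = grad_out / np.conj(y)
grad_y = -grad_out * np.conj(x / y / y)

assert np.allclose(grad_y, -grad_x * np.conj(out))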
...@@ -13,13 +13,17 @@
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid import Program, compiler, program_guard
from paddle.fluid.op import Operator
from op_test import OpTest, skip_check_grad_ci
class ElementwiseMulOp(OpTest):
...@@ -241,5 +245,65 @@ class TestElementwiseMulOpError(unittest.TestCase):
self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2)
class TestComplexElementwiseMulOp(OpTest):
def setUp(self):
self.op_type = "elementwise_mul"
self.init_base_dtype()
self.init_input_output()
self.init_grad_input_output()
self.inputs = {
'X': OpTest.np_dtype_to_fluid_dtype(self.x),
'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
}
self.attrs = {'axis': -1, 'use_mkldnn': False}
self.outputs = {'Out': self.out}
def init_base_dtype(self):
self.dtype = np.float64
def init_input_output(self):
self.x = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.y = np.random.random(
(2, 3, 4, 5)).astype(self.dtype) + 1J * np.random.random(
(2, 3, 4, 5)).astype(self.dtype)
self.out = self.x * self.y
def init_grad_input_output(self):
self.grad_out = np.ones((2, 3, 4, 5), self.dtype) + 1J * np.ones(
(2, 3, 4, 5), self.dtype)
self.grad_x = self.grad_out * np.conj(self.y)
self.grad_y = self.grad_out * np.conj(self.x)
def test_check_output(self):
self.check_output()
def test_check_grad_normal(self):
self.check_grad(
['X', 'Y'],
'Out',
user_defined_grads=[self.grad_x, self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_x(self):
self.check_grad(
['Y'],
'Out',
no_grad_set=set("X"),
user_defined_grads=[self.grad_y],
user_defined_grad_outputs=[self.grad_out])
def test_check_grad_ingore_y(self):
self.check_grad(
['X'],
'Out',
no_grad_set=set('Y'),
user_defined_grads=[self.grad_x],
user_defined_grad_outputs=[self.grad_out])
if __name__ == '__main__':
paddle.enable_static()
unittest.main()
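The complex multiply gradients above follow the conjugate rule grad_x = grad_out * conj(y), grad_y = grad_out * conj(x). A hedged finite-difference check with NumPy (illustrative only): for the real loss L = Re(sum(conj(v) * x * y)), the upstream gradient of out = x * y is exactly v, so perturbing the real part of one element of x should change L at the rate given by the real part of v * conj(y).

import numpy as np

rng = np.random.default_rng(0)
x = rng.random((2, 2)) + 1J * rng.random((2, 2))
y = rng.random((2, 2)) + 1J * rng.random((2, 2))
v = rng.random((2, 2)) + 1J * rng.random((2, 2))   # plays the role of grad_out

def loss(x):
    return np.real(np.sum(np.conj(v) * x * y))

grad_x = v * np.conj(y)                            # formula used in the test

h = 1e-6
dx = np.zeros_like(x)
dx[0, 0] = h                                       # perturb the real part only
fd = (loss(x + dx) - loss(x - dx)) / (2 * h)

assert np.isclose(fd, grad_x[0, 0].real)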
...@@ -519,7 +519,7 @@ class TestStridedSliceAPI(unittest.TestCase):
np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
self.assertTrue(x.place.is_cuda_pinned_place())
y = x[:, ::2]
self.assertTrue(x.place.is_cuda_pinned_place())
self.assertFalse(y.place.is_cuda_pinned_place())
......