diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 3db1f5c3084712edfb05a945b0c5fba5bd07b475..ec8d76a391c1eb608dea4be6d402a1b65814da19 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/complex128.h"
-#include "paddle/fluid/platform/complex64.h"
-#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 
+struct complex128;
+struct complex64;
+struct float16;
+struct bfloat16;
+
 enum DataType {
   FLOAT32,
   FLOAT64,
diff --git a/paddle/fluid/extension/include/op_meta_info.h b/paddle/fluid/extension/include/op_meta_info.h
index a670e345ba06953879a7abc44309c8569e5c8eef..d02954dc61eb8d648ae02dc1056265e3164f85a6 100644
--- a/paddle/fluid/extension/include/op_meta_info.h
+++ b/paddle/fluid/extension/include/op_meta_info.h
@@ -293,6 +293,11 @@ class OpMetaInfoBuilder {
 // Call after PD_BUILD_OP(...)
 void RegisterAllCustomOperator();
 
+// Use this API to load a compiled custom operator's dynamic library and
+// register the custom
+// operators defined in it.
+void LoadCustomOperatorLib(const std::string& dso_name);
+
 /////////////////////// Op register Macro /////////////////////////
 
 #define PD_BUILD_OP(op_name)                                      \
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 1140efe5c190679be34f4846d58bf974b0b9d681..a5ce0d1a5858b0422e6187bf2ca0e7198b87ed57 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -25,13 +25,13 @@ class CustomTensorUtils;
 }  // namespace framework
 class Tensor {
  public:
-  /// \brief Construct a Tensor on None Place for CustomOp.
+  /// \brief Construct a Tensor on the target Place for CustomOp.
   /// Generally it's only used for user to create Tensor.
   explicit Tensor(const PlaceType& place);
 
   /// \brief Reset the shape of the tensor.
   /// Generally it's only used for the input tensor.
   /// Reshape must be called before calling
-  /// mutable_data() or copy_from_cpu()
+  /// mutable_data() or copy_to(const PlaceType& place)
   /// \param shape The shape to set.
   void reshape(const std::vector<int64_t>& shape);
@@ -59,11 +59,11 @@ class Tensor {
 
   /// \brief Copy the host memory to tensor data.
   /// It's usually used to set the input tensor data.
-  /// \param PlaceType of target place, from which
-  /// the tensor will copy.
+  /// \param PlaceType of target place, to which
+  /// the tensor will copy.
   template <typename T>
-  Tensor copy_to(const PlaceType& place);
+  Tensor copy_to(const PlaceType& place) const;
 
   /// \brief Return the shape of the Tensor.
   std::vector<int64_t> shape() const;
@@ -84,7 +84,7 @@ class Tensor {
   const PlaceType& place() const;
 
   /// \brief Cast datatype from one to another
-  Tensor cast(const DataType& target_type);
+  Tensor cast(const DataType& target_type) const;
 
  private:
   friend class framework::CustomTensorUtils;
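With copy_to() and cast() const-qualified in tensor.h, user kernels can call them on a const Tensor& input without a const_cast. The snippet below is a minimal user-side sketch, not part of this patch; the umbrella header name "paddle/extension.h" and the helper function are assumptions used only for illustration.

// Minimal sketch (assumed header and helper name, not from this patch).
#include "paddle/extension.h"

// A const reference is enough now that cast() and copy_to() are const.
paddle::Tensor ToFloat32OnCpu(const paddle::Tensor& x) {
  // Cast to float32 first, then materialize the result on the CPU place.
  auto x_fp32 = x.cast(paddle::DataType::FLOAT32);
  return x_fp32.copy_to<float>(paddle::PlaceType::kCPU);
}

Before this change, the same helper would have required a non-const reference (or a const_cast), since both member functions were non-const.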
diff --git a/paddle/fluid/extension/src/op_meta_info.cc b/paddle/fluid/extension/src/op_meta_info.cc
index 0238dd7a7eca716ee720a3d3b4c37c0590ed9060..f31723e5ac83675884f950c1c4e8917c220bc474 100644
--- a/paddle/fluid/extension/src/op_meta_info.cc
+++ b/paddle/fluid/extension/src/op_meta_info.cc
@@ -109,6 +109,9 @@ void RegisterAllCustomOperator() {
   framework::RegisterOperatorWithMetaInfoMap(op_meta_info_map);
 }
 
+void LoadCustomOperatorLib(const std::string& dso_name) {
+  paddle::framework::LoadOpMetaInfoAndRegisterOp(dso_name);
+}
 }  // namespace paddle
 
 extern "C" {
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 12f701a131e2cb8acf1921342beaa4b7f99da346..34ca57d75bf03bfe9f45ada86b913f369062f6e0 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -17,7 +17,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex128.h"
+#include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/transform.h"
 
 namespace paddle {
@@ -174,7 +178,7 @@ DataType Tensor::type() const {
 }
 
 template <typename T>
-Tensor Tensor::copy_to(const PlaceType &target_place) {
+Tensor Tensor::copy_to(const PlaceType &target_place) const {
   GET_CASTED_TENSOR;
   PADDLE_ENFORCE_GE(tensor->numel(), 0,
                     platform::errors::PreconditionNotMet(
@@ -208,21 +212,21 @@ Tensor Tensor::copy_to(const PlaceType &target_place) {
 }
 
 template Tensor Tensor::copy_to(
-    const PlaceType &target_place);
+    const PlaceType &target_place) const;
 template Tensor Tensor::copy_to(
-    const PlaceType &target_place);
+    const PlaceType &target_place) const;
 template Tensor Tensor::copy_to(
-    const PlaceType &target_place);
+    const PlaceType &target_place) const;
 template Tensor Tensor::copy_to(
-    const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
-template Tensor Tensor::copy_to(const PlaceType &target_place);
+    const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
+template Tensor Tensor::copy_to(const PlaceType &target_place) const;
 
 template float *Tensor::data<float>() const;
 template double *Tensor::data<double>() const;
@@ -295,7 +299,7 @@ const PlaceType &Tensor::place() const {
   return place_;
 }
 
-Tensor Tensor::cast(const DataType &target_type) {
+Tensor Tensor::cast(const DataType &target_type) const {
   GET_CASTED_TENSOR;
   Tensor rlt = Tensor(place());
   rlt.reshape(this->shape());
@@ -342,7 +346,14 @@ Tensor Tensor::cast(const DataType &target_type) {
       framework::VisitDataType(
           dst_type, CastDataType(*tensor, rlt_tensor_, ctx));
       break;
-      // TODO(JiabinYang): Support Complex later
+    case framework::proto::VarType::COMPLEX64:
+      framework::VisitDataType(dst_type, CastDataType<platform::complex64>(
+                                             *tensor, rlt_tensor_, ctx));
+      break;
+    case framework::proto::VarType::COMPLEX128:
+      framework::VisitDataType(dst_type, CastDataType<platform::complex128>(
+                                             *tensor, rlt_tensor_, ctx));
+      break;
     default:
       PADDLE_THROW(platform::errors::Unimplemented(
          "Data type (%s) is not supported when casting data type.",
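The new LoadCustomOperatorLib() pairs with RegisterAllCustomOperator(): instead of registering operators already linked into the process, it opens a compiled dynamic library and registers the operators it finds there via LoadOpMetaInfoAndRegisterOp(). A minimal usage sketch follows; the header name and the .so path are placeholders, not part of this patch.

// Minimal sketch: load a pre-built custom-op library at runtime.
// "paddle/extension.h" and "libmy_custom_ops.so" are assumed placeholders.
#include <string>
#include "paddle/extension.h"

void LoadMyCustomOps() {
  // Opens the dynamic library and registers every custom operator it exposes.
  const std::string dso_name = "libmy_custom_ops.so";  // placeholder path
  paddle::LoadCustomOperatorLib(dso_name);
}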
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 643ee8270a0c5314b6061914389763dd96cd9e33..33b662454286f2210a989b87d6f4cb0129732520 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -25,7 +25,7 @@ paddle::Tensor InitCPUTensorForTest() {
   t1.reshape(tensor_shape);
   auto* p_data_ptr = t1.mutable_data<T>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t1.size(); i++) {
-    p_data_ptr[i] = 5;
+    p_data_ptr[i] = T(5);
   }
   return t1;
 }
@@ -36,7 +36,7 @@ void TestCopyTensor() {
   auto t1_cpu_cp = t1.template copy_to<T>(paddle::PlaceType::kCPU);
   CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
-    CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
+    CHECK_EQ(t1_cpu_cp.template data<T>()[i], T(5));
   }
 #ifdef PADDLE_WITH_CUDA
   VLOG(2) << "Do GPU copy test";
@@ -48,7 +48,7 @@ void TestCopyTensor() {
       t1_gpu_cp.template copy_to<T>(paddle::PlaceType::kCPU);
   CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
-    CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], 5);
+    CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], T(5));
   }
 #endif
 }
@@ -99,16 +99,15 @@ void GroupTestCopy() {
   TestCopyTensor();
   VLOG(2) << "Double cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor();
-  // TODO(JiabinYang): Support these test later
-  // VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
-  // TestCopyTensor();
-  // VLOG(2) << "BF16 cpu-cpu-gpu-gpu-cpu";
-  // TestCopyTensor();
-  // VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
-  // TestCopyTensor();
-  // VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu";
-  // TestCopyTensor();
-  // VLOG(2) << "int cpu-cpu-gpu-gpu-cpu";
+  VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor();
+  VLOG(2) << "BF16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor();
+  VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor();
+  VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor();
+  VLOG(2) << "int cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor();
   VLOG(2) << "int64 cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor();
@@ -139,6 +138,10 @@ void GroupTestCast() {
   TestCast(paddle::DataType::FLOAT32);
   VLOG(2) << "float cast";
   TestCast(paddle::DataType::FLOAT32);
+  VLOG(2) << "complex64 cast";
+  TestCast(paddle::DataType::FLOAT32);
+  VLOG(2) << "complex128 cast";
+  TestCast(paddle::DataType::FLOAT32);
 }
 
 void GroupTestDtype() {