diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 46059100b3802a7f17b83bada6578636f2369d13..f29546c5210d9c6c54a16df015c21add27ccd95c 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -192,13 +192,13 @@ add_subdirectory(profiler)
 
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 if(WITH_GPU)
-  nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler)
+  nv_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce dynload_cuda new_profiler stats)
   nv_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 elseif(WITH_ROCM)
-  hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler)
+  hip_library(profiler SRCS profiler.cc profiler.cu DEPS os_info device_tracer gpu_info enforce new_profiler stats)
   hip_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info gpu_info place)
 else()
-  cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler)
+  cc_library(profiler SRCS profiler.cc DEPS os_info device_tracer enforce new_profiler stats)
   cc_library(device_memory_aligment SRCS device_memory_aligment.cc DEPS cpu_info place)
 endif()
 
diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt
index 724b1ba556d4b999f8be501b8d212cbf0742e894..d43e327393f25b57f544cae7056eab37a5b01db3 100644
--- a/paddle/phi/CMakeLists.txt
+++ b/paddle/phi/CMakeLists.txt
@@ -23,7 +23,7 @@ add_subdirectory(tools)
 add_subdirectory(tests)
 
 # make an unity target for compile deps
-set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor)
+set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor string_tensor api_scalar)
 get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS)
 set(PHI_DEPS ${PHI_DEPS} ${phi_kernels})
 
diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index 9cc5d620280bcdf2187b2e0e6c80f257f00f6466..e10ae8254a79e7beb2737328885f38053a9bb961 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -164,7 +164,7 @@ cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_conte
 cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
 cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
 cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform)
-cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform)
+cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy)
 
 cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform api_custom_impl)
 cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform phi_function_api api_custom_impl global_utils)
@@ -173,3 +173,5 @@ cc_library(sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw p
 cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform phi_function_api sparse_api)
 cc_library(strings_api SRCS ${strings_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils)
 cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api api_gen_utils kernel_dispatch infermeta sparse_api strings_api)
+cc_library(tensor_copy SRCS tensor_copy.cc DEPS phi_tensor_raw copy_kernel kernel_dispatch api_gen_utils)
+cc_library(api_scalar SRCS scalar.cc DEPS tensor_copy)
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index 2b80094a39e31646139e7a5312f933cf9a55cf6a..033ec569de811c935c7b43eb4feff8e300a9120f 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/api_gen_utils.h"
 #include "paddle/phi/api/lib/data_transform.h"
 #include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/api/lib/tensor_copy.h"
 #include "paddle/phi/api/lib/utils/storage.h"
 #include "paddle/phi/common/type_traits.h"
 #include "paddle/phi/core/compat/convert_utils.h"
@@ -424,35 +425,8 @@ std::vector<std::vector<Tensor>> conv2d_grad_impl(
 }
 
 Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
-  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
-  kernel_key_set.backend_set =
-      kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
-  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
-  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
-      "copy", kernel_key);
-
-  VLOG(6) << "copy API kernel key: " << kernel_key;
-  VLOG(6) << "copy API kernel: " << kernel;
-
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-
-  auto dense_x = TensorToDenseTensor(x);
-
   Tensor out;
-  auto kernel_out = SetKernelOutput(kernel_key.backend(), &out);
-  phi::MetaTensor meta_out(kernel_out);
-  phi::UnchangedInferMeta(*dense_x, &meta_out);
-
-  using kernel_signature = void (*)(const platform::DeviceContext&,
-                                    const phi::DenseTensor&,
-                                    phi::Place,
-                                    bool,
-                                    phi::DenseTensor*);
-
-  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-
-  (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
-
+  copy(x, place, blocking, &out);
   return out;
 }
 
diff --git a/paddle/phi/api/lib/scalar.cc b/paddle/phi/api/lib/scalar.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c31338de09f1e2864a8920a2297de9446b8bce29
--- /dev/null
+++ b/paddle/phi/api/lib/scalar.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/common/scalar.h"
+
+#include "paddle/phi/api/lib/tensor_copy.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/enforce.h"
+
+namespace paddle {
+namespace experimental {
+
+// Builds a Scalar from a one-element paddle::experimental::Tensor.
+// A CPU tensor is read directly; a GPU tensor is first copied (blocking) to
+// the CPU; a tensor on any other place is rejected as unimplemented.
+template <>
+ScalarBase<Tensor>::ScalarBase(const Tensor& tensor_in)
+    : dtype_(tensor_in.dtype()) {  // NOLINT
+  PADDLE_ENFORCE_EQ(tensor_in.numel(),
+                    1,
+                    phi::errors::InvalidArgument(
+                        "The Scalar only supports Tensor with 1 element, but "
+                        "now Tensor has `%d` elements",
+                        tensor_in.numel()));
+  switch (tensor_in.place().GetType()) {
+    case phi::AllocationType::CPU:
+      GetDataFromTensor(tensor_in);
+      break;
+    case phi::AllocationType::GPU: {
+      // The value is not host-resident: stage it in a CPU tensor first.
+      Tensor host_tensor;
+      copy(tensor_in, phi::CPUPlace(), /*blocking=*/true, &host_tensor);
+      GetDataFromTensor(host_tensor);
+      break;
+    }
+    default:
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "Now, it is not supported to construct Scalar using tensor that its "
+          "Place is (%s)",
+          tensor_in.place()));
+  }
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..57e3c28d8cb1f9f5db19170084d93b1ca922508e
--- /dev/null
+++ b/paddle/phi/api/lib/tensor_copy.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/api/lib/tensor_copy.h"
+#include "paddle/phi/api/lib/api_gen_utils.h"
+#include "paddle/phi/api/lib/kernel_dispatch.h"
+#include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/meta_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace paddle {
+namespace experimental {
+
+// Copies `src` to `place` and stores the result in `*dst`.
+//
+// The "copy" kernel is selected from the backend(s) of `src` together with
+// the backend of `place`; `blocking` is forwarded to that kernel unchanged.
+// `dst` must not be null.
+void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) {
+  // Fail with a clear error instead of dereferencing a null output below.
+  PADDLE_ENFORCE_NOT_NULL(
+      dst,
+      phi::errors::InvalidArgument(
+          "The output Tensor of copy should not be nullptr."));
+
+  // Kernel selection takes into account both the input tensor and the
+  // destination place.
+  auto kernel_key_set = ParseKernelKeyByInputArgs(src);
+  kernel_key_set.backend_set =
+      kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place));
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      "copy", kernel_key);
+
+  VLOG(6) << "copy API kernel key: " << kernel_key;
+  VLOG(6) << "copy API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+
+  auto dense_x = TensorToDenseTensor(src);
+
+  // Infer the output meta from the input (UnchangedInferMeta) before launch.
+  auto kernel_out = SetKernelOutput(kernel_key.backend(), dst);
+  phi::MetaTensor meta_out(kernel_out);
+  phi::UnchangedInferMeta(*dense_x, &meta_out);
+
+  using kernel_signature = void (*)(const platform::DeviceContext&,
+                                    const phi::DenseTensor&,
+                                    phi::Place,
+                                    bool,
+                                    phi::DenseTensor*);
+
+  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+  (*kernel_fn)(*dev_ctx, *dense_x, place, blocking, kernel_out);
+}
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h
new file mode 100644
index 0000000000000000000000000000000000000000..3ce45853319ecf24b21a1305288bdd441f1c1e1c
--- /dev/null
+++ b/paddle/phi/api/lib/tensor_copy.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/phi/api/include/tensor.h"
+
+namespace paddle {
+namespace experimental {
+
+// Copies the Tensor `src` to `place` and writes the result into `*dst`.
+// `blocking` is passed through to the underlying "copy" kernel.
+void copy(const Tensor& src, Place place, bool blocking, Tensor* dst);
+
+}  // namespace experimental
+}  // namespace paddle
diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt
index 94a16da2b7720b5af08e72096d9d465a93ea3f9e..de97e7516f61938068a12d781dcc950e51863740 100644
--- a/paddle/phi/api/lib/utils/CMakeLists.txt
+++ b/paddle/phi/api/lib/utils/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS
-tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits scalar string_tensor)
+tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits string_tensor scalar)
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index 9bf692703860f15601ad601970ea1f5b1316442b..aa839eab587cbe74f6f49be4edf9679c001e7b01 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(phi_place SRCS place.cc)
-cc_library(scalar SRCS scalar.cc DEPS phi_enforce)
+cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor)
diff --git a/paddle/phi/common/scalar.cc b/paddle/phi/common/scalar.cc
index 5cd55c1e88bed6f805a72cff92024d9dc219a1a2..41f1c9541823dbecc83ddc73540cae7255dc0199 100644
--- a/paddle/phi/common/scalar.cc
+++ b/paddle/phi/common/scalar.cc
@@ -14,21 +14,32 @@ limitations under the License. */
 
 #include "paddle/phi/common/scalar.h"
 
+#include "paddle/phi/common/place.h"
 #include "paddle/phi/core/enforce.h"
 
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/place.h"
 namespace paddle {
 namespace experimental {
 
-// NOTE(xiongkun): why we put definition here?
-// test_custom_op can't include enforce.h, because enforce.h includes gflags.
-// so we decouple the include dependence of enforce.h by link.
-void ThrowTensorConvertError(int num) {
-  PADDLE_ENFORCE_EQ(num,
+// The Tensor must contain exactly one element (numel() == 1)
+template <>
+ScalarBase<phi::DenseTensor>::ScalarBase(const phi::DenseTensor& tensor_in)
+    : dtype_(tensor_in.dtype()) {  // NOLINT
+  PADDLE_ENFORCE_EQ(tensor_in.numel(),
                     1,
                     phi::errors::InvalidArgument(
                         "The Scalar only supports Tensor with 1 element, but "
                         "now Tensor has `%d` elements",
-                        num));
+                        tensor_in.numel()));
+  auto cpu_place = phi::CPUPlace();
+  if (!paddle::platform::is_same_place(tensor_in.place(), cpu_place)) {
+    phi::DenseTensor tensor;
+    framework::TensorCopySync(tensor_in, cpu_place, &tensor);
+    GetDataFromTensor(tensor);
+  } else {
+    GetDataFromTensor(tensor_in);
+  }
 }
 
 }  // namespace experimental
diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h
index 5134f4eb72639650a0bde34f2abbb0e05ced13c7..c28f6185a556a6075c199f0d74aeb10721c17d22 100644
--- a/paddle/phi/common/scalar.h
+++ b/paddle/phi/common/scalar.h
@@ -23,8 +23,6 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-void ThrowTensorConvertError(int);
-
 template <typename T>
 class ScalarBase {
  public:
@@ -105,50 +103,7 @@ class ScalarBase {
   }
 
   // The Tensor must have one dim
-  ScalarBase(const T& tensor) : dtype_(tensor.dtype()) {  // NOLINT
-    is_from_tensor_ = true;
-    ThrowTensorConvertError(tensor.numel());
-    switch (dtype_) {
-      case DataType::FLOAT32:
-        data_.f32 = tensor.template data<float>()[0];
-        break;
-      case DataType::FLOAT64:
-        data_.f64 = tensor.template data<double>()[0];
-        break;
-      case DataType::FLOAT16:
-        data_.f16 = tensor.template data<float16>()[0];
-        break;
-      case DataType::BFLOAT16:
-        data_.bf16 = tensor.template data<bfloat16>()[0];
-        break;
-      case DataType::INT32:
-        data_.i32 = tensor.template data<int32_t>()[0];
-        break;
-      case DataType::INT64:
-        data_.i64 = tensor.template data<int64_t>()[0];
-        break;
-      case DataType::INT16:
-        data_.i16 = tensor.template data<int16_t>()[0];
-        break;
-      case DataType::INT8:
-        data_.i8 = tensor.template data<int8_t>()[0];
-        break;
-      case DataType::UINT8:
-        data_.ui8 = tensor.template data<uint8_t>()[0];
-        break;
-      case DataType::BOOL:
-        data_.b = tensor.template data<bool>()[0];
-        break;
-      case DataType::COMPLEX64:
-        data_.c64 = tensor.template data<complex64>()[0];
-        break;
-      case DataType::COMPLEX128:
-        data_.c128 = tensor.template data<complex128>()[0];
-        break;
-      default:
-        PD_THROW("Invalid tensor data type `", dtype_, "`.");
-    }
-  }
+  ScalarBase(const T& tensor_in);  // NOLINT
 
   template <typename OtherT>
   ScalarBase(const ScalarBase<OtherT>& other) {
@@ -200,6 +155,55 @@ class ScalarBase {
  private:
   template <typename T1, typename T2>
   friend void CopyScalar(const ScalarBase<T1>& src, ScalarBase<T2>* dst);
+  // Reads element 0 of `tensor` into the data_ member that corresponds to
+  // dtype_ and marks this scalar as tensor-sourced (is_from_tensor_ = true).
+  // An unsupported dtype_ is reported via PD_THROW.
+  // NOTE: the element is dereferenced on the host; the constructors in this
+  // change only pass CPU-resident tensors (copying to the CPU first when
+  // needed).
+  void GetDataFromTensor(const T& tensor) {
+    is_from_tensor_ = true;
+    switch (dtype_) {
+      case DataType::FLOAT32:
+        data_.f32 = tensor.template data<float>()[0];
+        break;
+      case DataType::FLOAT64:
+        data_.f64 = tensor.template data<double>()[0];
+        break;
+      case DataType::FLOAT16:
+        data_.f16 = tensor.template data<float16>()[0];
+        break;
+      case DataType::BFLOAT16:
+        data_.bf16 = tensor.template data<bfloat16>()[0];
+        break;
+      case DataType::INT32:
+        data_.i32 = tensor.template data<int32_t>()[0];
+        break;
+      case DataType::INT64:
+        data_.i64 = tensor.template data<int64_t>()[0];
+        break;
+      case DataType::INT16:
+        data_.i16 = tensor.template data<int16_t>()[0];
+        break;
+      case DataType::INT8:
+        data_.i8 = tensor.template data<int8_t>()[0];
+        break;
+      case DataType::UINT8:
+        data_.ui8 = tensor.template data<uint8_t>()[0];
+        break;
+      case DataType::BOOL:
+        data_.b = tensor.template data<bool>()[0];
+        break;
+      case DataType::COMPLEX64:
+        data_.c64 = tensor.template data<complex64>()[0];
+        break;
+      case DataType::COMPLEX128:
+        data_.c128 = tensor.template data<complex128>()[0];
+        break;
+      default:
+        PD_THROW("Invalid tensor data type `", dtype_, "`.");
+    }
+  }
 
  private:
   bool is_from_tensor_{false};
diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt
index b42b4388c2ce159a6aca61a27898d12d41f77d7a..23574e98fbf170470d766fd47d3ee9cb42ca151b 100644
--- a/paddle/phi/core/CMakeLists.txt
+++ b/paddle/phi/core/CMakeLists.txt
@@ -23,7 +23,7 @@ cc_library(string_tensor SRCS string_tensor.cc DEPS convert_utils tensor_meta te
 
 cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor)
 cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor)
-cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy)
+cc_library(selected_rows SRCS selected_rows_impl.cc selected_rows.cc DEPS tensor_base dense_tensor phi_enforce ddim memcpy)
 cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows)
 
 cc_library(custom_kernel SRCS custom_kernel.cc DEPS kernel_factory)
diff --git a/paddle/phi/core/selected_rows.cc b/paddle/phi/core/selected_rows.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dcf9c4182157a871fbe1cf40a276d9581de99882
--- /dev/null
+++ b/paddle/phi/core/selected_rows.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/selected_rows.h"
+
+namespace phi {
+
+// Creates a SelectedRows backed by a new SelectedRowsImpl constructed from
+// `rows` and `height`.
+SelectedRows::SelectedRows(const std::vector<int64_t>& rows,
+                           const int64_t& height)
+    : impl_(std::make_shared<phi::SelectedRowsImpl>(rows, height)) {}
+
+// Creates a SelectedRows backed by a default-constructed SelectedRowsImpl.
+SelectedRows::SelectedRows()
+    : impl_(std::make_shared<phi::SelectedRowsImpl>()) {}
+
+}  // namespace phi
diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h
index 7ee475b4d5d9e03d0931587f2a607f5f4950a426..a71c0471cc431c8e988f00062352aaf8dfcaec3e 100644
--- a/paddle/phi/core/selected_rows.h
+++ b/paddle/phi/core/selected_rows.h
@@ -42,10 +42,9 @@ class SelectedRows : public TensorBase,
    *
    */
  public:
-  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
-      : impl_(std::make_shared<phi::SelectedRowsImpl>(rows, height)) {}
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height);
 
-  SelectedRows() : impl_(std::make_shared<phi::SelectedRowsImpl>()) {}
+  SelectedRows();
 
   const DenseTensor& value() const { return impl_->value(); }
 
diff --git a/paddle/phi/core/utils/type_registry.h b/paddle/phi/core/utils/type_registry.h
index 8d9f9167242c88f423d38a0b91db113150774893..f27c3db2275c379f5a1ce7d95c0684ba68a8c1a9 100644
--- a/paddle/phi/core/utils/type_registry.h
+++ b/paddle/phi/core/utils/type_registry.h
@@ -51,7 +51,7 @@ TypeInfo<BaseT> TypeRegistry<BaseT>::RegisterType(const std::string& type) {
   std::lock_guard<std::mutex> guard(mutex_);
   assert(name_to_id_.find(type) == name_to_id_.end());
   assert(names_.size() < std::numeric_limits<int8_t>::max());
-  int8_t id = names_.size();
+  int8_t id = static_cast<int8_t>(names_.size());
   names_.emplace_back(type);
   name_to_id_[type] = id;
   return TypeInfo<BaseT>(id);
diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt
index 94378aceff58cef2656a42a84b390ae3e7493183..dd4b7e62ec52f48f07b329d2e600c84463e94b45 100644
--- a/paddle/phi/tests/api/CMakeLists.txt
+++ b/paddle/phi/tests/api/CMakeLists.txt
@@ -11,14 +11,14 @@ cc_test(test_mean_api SRCS test_mean_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_dot_api SRCS test_dot_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_empty_api SRCS test_empty_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_fill_api SRCS test_fill_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar)
 cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_cast_api SRCS test_cast_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_to_api SRCS test_to_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_slice_api SRCS test_slice_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_sum_api SRCS test_sum_api.cc DEPS ${COMMON_API_TEST_DEPS})
-cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS})
+cc_test(test_scale_api SRCS test_scale_api.cc DEPS ${COMMON_API_TEST_DEPS} api_scalar)
 cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_conj_api SRCS test_conj_api.cc DEPS ${COMMON_API_TEST_DEPS})
 cc_test(test_concat_api SRCS test_concat_api.cc DEPS ${COMMON_API_TEST_DEPS})
diff --git a/paddle/phi/tests/api/test_fill_api.cc b/paddle/phi/tests/api/test_fill_api.cc
index bf57574d39093cc81a47f8385097627eb98648af..523fa895d147e0042c8815e7eb8ed2f7c643c70f 100644
--- a/paddle/phi/tests/api/test_fill_api.cc
+++ b/paddle/phi/tests/api/test_fill_api.cc
@@ -22,6 +22,7 @@ limitations under the License. */
 #include "paddle/phi/core/kernel_registry.h"
 
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/phi/tests/api/test_scale_api.cc b/paddle/phi/tests/api/test_scale_api.cc
index a40ecc8485e4a96e96f40d513a153df4cc2eb0c3..5f1e118946675f568d04006e7c1adec6038c8130 100644
--- a/paddle/phi/tests/api/test_scale_api.cc
+++ b/paddle/phi/tests/api/test_scale_api.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(scale, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(scale_sr, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT);
 
 namespace paddle {
 namespace tests {
diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt
index 710ea3c06647205171289812be6fa7f18a8fb8d0..ca6d20045d1714d8ed7c00648d3626254099b414 100644
--- a/paddle/phi/tests/common/CMakeLists.txt
+++ b/paddle/phi/tests/common/CMakeLists.txt
@@ -2,3 +2,9 @@ cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest)
 cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest)
 cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest)
 cc_test(phi_test_place SRCS test_place.cc DEPS phi_place)
+# test_scalar.cu is only registered for device (CUDA or ROCm) builds.
+if(WITH_GPU)
+  nv_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar)
+elseif(WITH_ROCM)
+  hip_test(phi_test_scalar SRCS test_scalar.cu DEPS scalar api_scalar)
+endif()
diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6b0caa175dc04e2018f785677b1b00e43e151a2c
--- /dev/null
+++ b/paddle/phi/tests/common/test_scalar.cu
@@ -0,0 +1,206 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <map>  // NOLINT
+#include <memory>  // NOLINT
+#include "gtest/gtest.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/common/scalar.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT);
+
+namespace phi {
+namespace tests {
+
+using DDim = phi::DDim;
+using float16 = phi::dtype::float16;
+using complex64 = ::phi::dtype::complex<float>;
+using complex128 = ::phi::dtype::complex<double>;
+
+__global__ void FillTensor(float* data) { data[0] = 1; }
+
+TEST(Scalar, ConstructFromDenseTensor1) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::FLOAT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
+  dense_x_data[0] = 1;
+  phi::Scalar scalar_test(dense_x);
+  ASSERT_NEAR(1, scalar_test.to<float16>(), 1e-6);
+}
+
+TEST(Scalar, ConstructFromDenseTensor2) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::INT16, phi::make_ddim({1}), phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
+  dense_x_data[0] = 1;
+  phi::Scalar scalar_test(dense_x);
+  ASSERT_EQ(1, scalar_test.to<int16_t>());
+}
+
+TEST(Scalar, ConstructFromDenseTensor3) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::INT8, phi::make_ddim({1}), phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
+  dense_x_data[0] = 1;
+  phi::Scalar scalar_test(dense_x);
+  ASSERT_EQ(1, scalar_test.to<int8_t>());
+}
+
+TEST(Scalar, ConstructFromDenseTensor4) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::BOOL, phi::make_ddim({1}), phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
+  dense_x_data[0] = true;
+  phi::Scalar scalar_test(dense_x);
+  ASSERT_EQ(true, scalar_test.to<bool>());
+}
+
+TEST(Scalar, ConstructFromDenseTensor5) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(alloc.get(),
+                           phi::DenseTensorMeta(phi::DataType::COMPLEX64,
+                                                phi::make_ddim({1}),
+                                                phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
+  dense_x_data[0] = 1;
+  phi::Scalar scalar_test(dense_x);
+  complex64 expected_value(1, 0);
+  EXPECT_TRUE(expected_value == scalar_test.to<complex64>());
+}
+
+TEST(Scalar, ConstructFromDenseTensor6) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::CPUPlace());
+  phi::DenseTensor dense_x(alloc.get(),
+                           phi::DenseTensorMeta(phi::DataType::COMPLEX128,
+                                                phi::make_ddim({1}),
+                                                phi::DataLayout::NCHW));
+  phi::CPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::CPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
+  dense_x_data[0] = 1;
+  phi::Scalar scalar_test(dense_x);
+  complex128 expected_value(1, 0);
+  EXPECT_TRUE(expected_value == scalar_test.to<complex128>());
+}
+
+TEST(Scalar, ConstructFromDenseTensor7) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::GPUPlace());
+  phi::DenseTensor dense_x(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
+  phi::GPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::GPUPlace())
+                           .get());
+  dev_ctx.Init();
+
+  auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
+  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
+  dev_ctx.Wait();
+  phi::Scalar scalar_test(dense_x);
+  ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
+}
+
+TEST(Scalar, ConstructFromTensor) {
+  // 1. create tensor
+  const auto alloc =
+      std::make_unique<paddle::experimental::DefaultAllocator>(phi::GPUPlace());
+  auto dense_x = std::make_shared<phi::DenseTensor>(
+      alloc.get(),
+      phi::DenseTensorMeta(
+          phi::DataType::FLOAT32, phi::make_ddim({1}), phi::DataLayout::NCHW));
+
+  phi::GPUContext dev_ctx;
+  dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
+                           .GetAllocator(phi::GPUPlace())
+                           .get());
+  dev_ctx.Init();
+  auto* dense_x_data = dev_ctx.Alloc<float>(dense_x.get());
+  FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
+  dev_ctx.Wait();
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Scalar scalar_test(x);
+  ASSERT_NEAR(1, scalar_test.to<float>(), 1e-6);
+}
+
+}  // namespace tests
+}  // namespace phi
diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt
index 824d1884578155a8b7a1eeb54f17fc4481b886a0..7d2fd90e6bb7b7191fd250ca813d9728aa6c4d24 100644
--- a/paddle/phi/tests/core/CMakeLists.txt
+++ b/paddle/phi/tests/core/CMakeLists.txt
@@ -1,4 +1,4 @@
-cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel)
+cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS custom_kernel scalar)
 cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor)
 cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc)
 cc_test(test_type_info SRCS test_type_info.cc)