diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
index ab8c28c33e78ccf2dc156b636d6a032d628809ef..7769c5371baba048634fcef5ca8d68794ce77c31 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py
@@ -45,7 +45,7 @@ yaml_types_mapping = {
     'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \
     'float' : 'float', 'double' : 'double', 'bool' : 'bool', \
     'str' : 'std::string', \
-    'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
+    'Place' : 'paddle::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \
     'int64_t[]' : 'std::vector<int64_t>', 'int[]' : 'std::vector<int>',
     'Tensor' : 'Tensor',
     'Tensor[]' : 'std::vector<Tensor>',
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
index 5a361ef39b79a16dfbe4f4db044e038921ae7ca5..7ca5fc833ea8d2eef43d5c785724036332331036 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py
@@ -46,7 +46,7 @@ atype_to_parsing_function = {
     "std::vector<std::string>": "CastPyArg2Strings",
     "paddle::experimental::Scalar": "CastPyArg2Scalar",
     "paddle::experimental::IntArray": "CastPyArg2IntArray",
-    "paddle::experimental::Place": "CastPyArg2Place",
+    "paddle::Place": "CastPyArg2Place",
     "paddle::experimental::DataType": "CastPyArg2DataType",
 }
diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc
index 124e5883324a88aba6ab7a38a11fcb3135eb6de1..4033e2d424fa39c790f010f1f7737f2a38620160 100644
--- a/paddle/fluid/pybind/eager_utils.cc
+++ b/paddle/fluid/pybind/eager_utils.cc
@@ -1194,15 +1194,13 @@ std::vector<paddle::framework::Scope*> GetScopePtrListFromArgs(
   return result;
 }
 
-paddle::experimental::Place CastPyArg2Place(PyObject* obj,
-                                            const std::string& op_type,
-                                            ssize_t arg_pos) {
+paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type,
+                              ssize_t arg_pos) {
   return CastPyArg2Place(obj, arg_pos);
 }
 
-paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
-                                                  const std::string& op_type,
-                                                  ssize_t arg_pos) {
+paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type,
+                                    ssize_t arg_pos) {
   if (obj == Py_None) {
     return paddle::experimental::DataType::UNDEFINED;
   }
diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h
index f1fab6db6ead185316efa36ffa774299d82eb901..c4ddb34763228a286ed09ccd3455aaec16dc871d 100644
--- a/paddle/fluid/pybind/eager_utils.h
+++ b/paddle/fluid/pybind/eager_utils.h
@@ -171,13 +171,11 @@ paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj,
                                                   const std::string& op_type,
                                                   ssize_t arg_pos);
 
-paddle::experimental::Place CastPyArg2Place(PyObject* obj,
-                                            const std::string& op_type,
-                                            ssize_t arg_pos);
+paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type,
+                              ssize_t arg_pos);
 
-paddle::experimental::DataType CastPyArg2DataType(PyObject* obj,
-                                                  const std::string& op_type,
-                                                  ssize_t arg_pos);
+paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type,
+                                    ssize_t arg_pos);
 
 paddle::optional<const paddle::experimental::Tensor&> GetOptionalTensorFromArgs(
     const std::string& op_type, const std::string& arg_name, PyObject* args,
diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc
index a7b89d7a4dca9348278803a47e1cf3665bb2a53d..fb81092ffee94ca39865dd6ef4a2a26fee129647 100644
--- a/paddle/phi/api/lib/tensor.cc
+++ b/paddle/phi/api/lib/tensor.cc
@@ -37,24 +37,6 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-namespace detail {
-static Place GetCorrectPlaceByPlaceType(const Place &place_type) {
-  auto alloc_type = place_type.GetType();
-  switch (alloc_type) {
-    case AllocationType::CPU:
-      return place_type;
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    case AllocationType::GPU:
-      return phi::Place(AllocationType::GPU,
-                        phi::backends::gpu::GetCurrentDeviceId());
-#endif
-    default:
-      PADDLE_THROW(phi::errors::Unavailable(
-          "The PlaceType is a legacy design, only supports CPU and GPU, "
-          "and will not support other place types in the future."));
-  }
-}
-}  // namespace detail
 
 /////// Tensor Methods ////////
@@ -76,7 +58,7 @@ Tensor::Tensor(const Place &place) {
          "Reason: A legal tensor cannot be constructed only based on "
          "the `place`, and datatype, shape, layout, etc. is also "
          "required.";
-  DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place));
+  DefaultAllocator alloc(place);
   impl_ = std::move(std::make_shared<phi::DenseTensor>(
       &alloc,
       std::move(phi::DenseTensorMeta(
@@ -92,7 +74,7 @@ Tensor::Tensor(const Place &place, const std::vector<int64_t> &shape) {
          "Reason: A legal tensor cannot be constructed only based on "
          "the `place` and `shape`, and datatype, layout, etc. is also "
         "required.";
-  DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place));
+  DefaultAllocator alloc(place);
   impl_ = std::move(std::make_shared<phi::DenseTensor>(
       &alloc,
       std::move(phi::DenseTensorMeta(phi::DataType::FLOAT32,
diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt
index aa839eab587cbe74f6f49be4edf9679c001e7b01..98f55a4f7213d47b7a045b597e7789e6d5e6adad 100644
--- a/paddle/phi/common/CMakeLists.txt
+++ b/paddle/phi/common/CMakeLists.txt
@@ -1,2 +1,18 @@
-cc_library(phi_place SRCS place.cc)
-cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor)
+if(WITH_GPU)
+  nv_library(
+    phi_place
+    SRCS place.cc
+    DEPS phi_gpu_info)
+elseif(WITH_ROCM)
+  hip_library(
+    phi_place
+    SRCS place.cc
+    DEPS phi_gpu_info)
+else()
+  cc_library(phi_place SRCS place.cc)
+endif()
+
+cc_library(
+  scalar
+  SRCS scalar.cc
+  DEPS phi_enforce tensor)
diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc
index 667d0a32b93da3f6b82a21f93c14927cb7db81d0..1a67f1a192d0bdfe76efb1a01e74261008c2001e 100644
--- a/paddle/phi/common/place.cc
+++ b/paddle/phi/common/place.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 
 #include "glog/logging.h"
 #include "paddle/phi/api/ext/exception.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
 
 namespace phi {
@@ -110,14 +111,32 @@ uint32_t Place::Hash::operator()(const Place &place) const {
   return hash_value;
 }
 
+namespace detail {
+static int8_t GetCorrectDeviceIdByPlaceType(
+    const paddle::PlaceType &place_type) {
+  switch (place_type) {
+    case paddle::PlaceType::kCPU:
+      return 0;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    case paddle::PlaceType::kGPU:
+      return phi::backends::gpu::GetCurrentDeviceId();
+#endif
+    default:
+      PD_THROW(
+          "The PlaceType is a legacy design, only supports CPU and GPU, "
+          "and will not support other place types in the future.");
+  }
+}
+}  // namespace detail
+
 Place::Place(paddle::PlaceType type)
-    : device(0),
+    : device(detail::GetCorrectDeviceIdByPlaceType(type)),
       alloc_type_(static_cast<AllocationType>(type)),
       device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) {
   LOG_FIRST_N(WARNING, 1)
       << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version "
          "2.3, and will be removed in version 2.4! Please use "
-         "`paddle::CPUPlace()/GPUPlace()` to represent the place type.";
+         "`paddle::CPUPlace()/DefaultGPUPlace()` to represent the place type.";
 }
 
 }  // namespace phi
@@ -140,4 +159,13 @@ bool operator==(PlaceType place_type, const Place &place) {
   return static_cast<AllocationType>(place_type) == place.GetType();
 }
 
+GPUPlace DefaultGPUPlace() {
+  return GPUPlace(
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+      phi::backends::gpu::GetCurrentDeviceId());
+#else
+      0);
+#endif
+}
+
 }  // namespace paddle
diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h
index ed9fb7876425d36281ef1ab1234442cb9fa0c5f8..cbc1faf94f07c84c7701c6b6d6b3d6d5532a1791 100644
--- a/paddle/phi/common/place.h
+++ b/paddle/phi/common/place.h
@@ -213,9 +213,6 @@ std::ostream& operator<<(std::ostream&, const Place&);
 namespace paddle {
 namespace experimental {
 using AllocationType = phi::AllocationType;
-using Place = phi::Place;
-using CPUPlace = phi::CPUPlace;
-using GPUPlace = phi::GPUPlace;
 using GPUPinnedPlace = phi::GPUPinnedPlace;
 using XPUPlace = phi::XPUPlace;
 using NPUPlace = phi::NPUPlace;
@@ -259,4 +256,6 @@ enum class PlaceType {
 PADDLE_API bool operator==(const Place& place, PlaceType place_type);
 PADDLE_API bool operator==(PlaceType place_type, const Place& place);
 
+PADDLE_API GPUPlace DefaultGPUPlace();
+
 }  // namespace paddle
diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc
index a2bd1f2cad9fcfa2e29b6fd433a3065a3bcb4c10..21d5eef4098c0dc5f0c0e5f95ac25a6d7c9f0655 100644
--- a/paddle/phi/tests/api/test_data_transform.cc
+++ b/paddle/phi/tests/api/test_data_transform.cc
@@ -37,13 +37,11 @@ namespace tests {
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, data_transform_same_place) {
   // 1. create tensor
-  auto x = paddle::experimental::full({3, 3},
-                                      1.0,
-                                      experimental::DataType::COMPLEX128,
-                                      experimental::CPUPlace());
+  auto x =
+      paddle::experimental::full({3, 3}, 1.0, DataType::COMPLEX128, CPUPlace());
 
-  auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
+  auto y =
+      paddle::experimental::full({3, 3}, 2.0, DataType::FLOAT32, CPUPlace());
 
   std::vector<phi::dtype::complex<double>> sum(9, 6.0);
@@ -75,10 +73,10 @@ TEST(API, data_transform_same_place) {
 
 TEST(Tensor, data_transform_diff_place) {
   // 1. create tensor
   auto x = paddle::experimental::full(
-      {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace());
+      {3, 3}, 1.0, experimental::DataType::FLOAT64, CPUPlace());
 
   auto y = paddle::experimental::full(
-      {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace());
+      {3, 3}, 2.0, experimental::DataType::FLOAT64, GPUPlace());
 
   std::vector<double> sum(9, 6.0);
@@ -93,10 +91,9 @@ TEST(Tensor, data_transform_diff_place) {
   ASSERT_EQ(out.dtype(), phi::DataType::FLOAT64);
   ASSERT_EQ(out.layout(), phi::DataLayout::NCHW);
   ASSERT_EQ(out.initialized(), true);
-  ASSERT_EQ(out.impl()->place(),
-            phi::TransToPhiPlace(experimental::Backend::GPU));
+  ASSERT_EQ(out.impl()->place(), phi::TransToPhiPlace(phi::Backend::GPU));
 
-  auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true);
+  auto ref_out = experimental::copy_to(out, CPUPlace(), true);
 
   auto dense_out = std::dynamic_pointer_cast<phi::DenseTensor>(ref_out.impl());
   for (size_t i = 0; i < 9; i++) {
diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc
index ca4a264e511bd574b7f4640bab84862df3f676b0..e2870a780aeae5d9bb8dd84e1bad017385302448 100644
--- a/paddle/phi/tests/api/test_scale_benchmark.cc
+++ b/paddle/phi/tests/api/test_scale_benchmark.cc
@@ -30,7 +30,7 @@ namespace tests {
 
 TEST(API, scale) {
   auto x = experimental::full(
-      {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace());
+      {3, 4}, 1.0, experimental::DataType::FLOAT32, CPUPlace());
 
   const size_t cycles = 300;
   phi::tests::Timer timer;
diff --git a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc
index 6b0edcc7ab1489599552a251e387da573a8e844a..9286ae7ca0091e5cba7ed447b81a95f2664125af 100644
--- a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc
+++ b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc
@@ -22,8 +22,7 @@ std::vector<paddle::Tensor> ContextPoolTest(const paddle::Tensor& x) {
   // 1. test cpu context
-  paddle::experimental::Place cpu_place(
-      paddle::experimental::AllocationType::CPU);
+  paddle::Place cpu_place(paddle::experimental::AllocationType::CPU);
   auto* cpu_ctx = paddle::experimental::DeviceContextPool::Instance()
                       .Get(cpu_place);
@@ -34,8 +33,7 @@ std::vector<paddle::Tensor> ContextPoolTest(const paddle::Tensor& x) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   // 2. test gpu context
-  paddle::experimental::Place gpu_place(
-      paddle::experimental::AllocationType::GPU);
+  paddle::Place gpu_place(paddle::experimental::AllocationType::GPU);
   auto* gpu_ctx = paddle::experimental::DeviceContextPool::Instance()
                       .Get(gpu_place);
diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
index 66cc36c300e9d6f24652ab5907a71e3a2fbc03ff..80f76e2df54fea69b63f3fc822c6fcafba882e91 100644
--- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc
@@ -75,7 +75,7 @@ std::vector<paddle::Tensor> ConcatForwardDynamicAxis(
   auto out_shape = ComputeOutShape(in_shapes, axis);
 
   // create output
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape);
+  auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace());
 
   // calc
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
@@ -106,7 +106,7 @@ std::vector<paddle::Tensor> ConcatBackwardDynamicAxis(
   // create outputs
   std::vector<paddle::Tensor> grad_inputs;
   for (auto& t : inputs) {
-    auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape());
+    auto grad = paddle::empty(t.shape(), t.dtype(), t.place());
     grad_inputs.emplace_back(grad);
   }
 
@@ -161,7 +161,7 @@ std::vector<paddle::Tensor> ConcatForwardStaticAxis(
   auto out_shape = ComputeOutShape(in_shapes, final_axis);
 
   // create output
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape);
+  auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace());
 
   // calc
   PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(
@@ -190,7 +190,7 @@ std::vector<paddle::Tensor> ConcatBackwardStaticAxis(
   // create outputs
   std::vector<paddle::Tensor> grad_inputs;
   for (auto& t : inputs) {
-    auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape());
+    auto grad = paddle::empty(t.shape(), t.dtype(), t.place());
     grad_inputs.emplace_back(grad);
   }
diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
index b9c10f479e0a39eb8e33ffceb30e8eb9cc8efa9e..56938552420e7334294b80d65390230df46b4ac3 100644
--- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc
@@ -71,7 +71,7 @@ void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) {
 
 std::vector<paddle::Tensor> ConjFunction(const paddle::Tensor& x) {
   CHECK_INPUT(x);
 
-  paddle::Tensor out(x.place(), x.shape());
+  paddle::Tensor out = paddle::empty(x.shape(), x.dtype(), x.place());
 
   PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(
       x.type(), "ConjCPUKernel", ([&] {
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
index 121a855a18f842136bd709c066eaa9ddfa413e62..f1860635ed5f4c9b1f15e2834201d77e5c30f60d 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc
@@ -17,8 +17,7 @@
 
 #include "paddle/extension.h"
 
-#define CHECK_CPU_INPUT(x) \
-  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
+#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
 
 template <typename data_t>
 void relu_cpu_forward_kernel(const data_t* x_data,
@@ -26,7 +25,7 @@ void relu_cpu_forward_kernel(const data_t* x_data,
                              int64_t x_numel) {
   PD_CHECK(x_data != nullptr, "x_data is nullptr.");
   PD_CHECK(out_data != nullptr, "out_data is nullptr.");
-  for (int i = 0; i < x_numel; ++i) {
+  for (int64_t i = 0; i < x_numel; ++i) {
     out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
   }
 }
@@ -36,7 +35,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
                               const data_t* out_data,
                               data_t* grad_x_data,
                               int64_t out_numel) {
-  for (int i = 0; i < out_numel; ++i) {
+  for (int64_t i = 0; i < out_numel; ++i) {
     grad_x_data[i] =
         grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
   }
@@ -54,12 +53,12 @@ void relu_cpu_double_backward_kernel(const data_t* out_data,
 }
 
 std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+  auto out = paddle::empty_like(x);
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
+            x.data<data_t>(), out.data<data_t>(), x.numel());
       }));
 
   return {out};
@@ -68,13 +67,13 @@ std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
                                               const paddle::Tensor& out,
                                               const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+  auto grad_x = paddle::empty_like(x);
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
                                    grad_out.data<data_t>(),
                                    out.data<data_t>(),
-                                   grad_x.mutable_data<data_t>(x.place()),
+                                   grad_x.data<data_t>(),
                                    out.size());
                              }));
@@ -85,7 +84,7 @@ std::vector<paddle::Tensor> relu_cpu_double_backward(
     const paddle::Tensor& out, const paddle::Tensor& ddx) {
   CHECK_CPU_INPUT(out);
   CHECK_CPU_INPUT(ddx);
-  auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] {
                                relu_cpu_double_backward_kernel<data_t>(
@@ -108,9 +107,9 @@ std::vector<paddle::Tensor> relu_cuda_double_backward(
     const paddle::Tensor& out, const paddle::Tensor& ddx);
 
 std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
-  if (x.place() == paddle::PlaceType::kCPU) {
+  if (x.is_cpu()) {
     return relu_cpu_forward(x);
-  } else if (x.place() == paddle::PlaceType::kGPU) {
+  } else if (x.is_gpu()) {
     return relu_cuda_forward(x);
   } else {
     PD_THROW("Not implemented.");
@@ -120,10 +119,9 @@ std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
                                          const paddle::Tensor& out,
                                          const paddle::Tensor& grad_out) {
-  // TODO(chenweihang): Check Input
-  if (x.place() == paddle::PlaceType::kCPU) {
+  if (x.is_cpu()) {
     return relu_cpu_backward(x, out, grad_out);
-  } else if (x.place() == paddle::PlaceType::kGPU) {
+  } else if (x.is_gpu()) {
     return relu_cuda_backward(x, out, grad_out);
   } else {
     PD_THROW("Not implemented.");
@@ -165,7 +163,7 @@ PD_BUILD_DOUBLE_GRAD_OP(custom_relu)
 
 std::vector<paddle::Tensor> relu_cpu_backward_without_x(
     const paddle::Tensor& out, const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
+  auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
@@ -214,7 +212,7 @@ void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
-            x.data<data_t>(), out->mutable_data<data_t>(x.place()), x.size());
+            x.data<data_t>(), out->mutable_data<data_t>(x.place()), x.numel());
       }));
 }
diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
index 364a2216b9e8eaa6447123ce8e55736c0d902a62..e791ea8cb7600eb78b54b80f8af6265261b1bc66 100644
--- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
+++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu
@@ -14,15 +14,14 @@
 
 #include "paddle/extension.h"
 
-#define CHECK_GPU_INPUT(x) \
-  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
+#define CHECK_GPU_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.")
 
 template <typename data_t>
 __global__ void relu_cuda_forward_kernel(const data_t* x,
                                          data_t* y,
-                                         const int num) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+                                         int64_t num) {
+  int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) {
     y[i] = x[i] > static_cast<data_t>(0.) ? x[i] : static_cast<data_t>(0.);
   }
 }
@@ -31,9 +30,9 @@ template <typename data_t>
 __global__ void relu_cuda_backward_kernel(const data_t* dy,
                                           const data_t* y,
                                           data_t* dx,
-                                          const int num) {
-  int gid = blockIdx.x * blockDim.x + threadIdx.x;
-  for (int i = gid; i < num; i += blockDim.x * gridDim.x) {
+                                          int64_t num) {
+  int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) {
     dx[i] = dy[i] * (y[i] > static_cast<data_t>(0.) ? static_cast<data_t>(1.)
                                                     : static_cast<data_t>(0.));
   }
@@ -54,15 +53,17 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data,
 
 std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   CHECK_GPU_INPUT(x);
-  auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
+  auto out = paddle::empty_like(x);
 
-  int numel = x.size();
-  int block = 512;
-  int grid = (numel + block - 1) / block;
+  PD_CHECK(x.place() == paddle::DefaultGPUPlace());
+
+  int64_t numel = x.numel();
+  int64_t block = 512;
+  int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       x.type(), "relu_cuda_forward_kernel", ([&] {
         relu_cuda_forward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), numel);
+            x.data<data_t>(), out.data<data_t>(), numel);
       }));
 
   return {out};
@@ -74,11 +75,13 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
   CHECK_GPU_INPUT(x);
   CHECK_GPU_INPUT(out);
   CHECK_GPU_INPUT(grad_out);
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
+  auto grad_x = paddle::empty_like(x);
 
-  int numel = out.size();
-  int block = 512;
-  int grid = (numel + block - 1) / block;
+  PD_CHECK(x.place() == paddle::DefaultGPUPlace());
+
+  int64_t numel = out.numel();
+  int64_t block = 512;
+  int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_backward_kernel", ([&] {
         relu_cuda_backward_kernel<data_t><<<grid, block, 0, x.stream()>>>(
@@ -95,19 +98,19 @@ std::vector<paddle::Tensor> relu_cuda_double_backward(
     const paddle::Tensor& out, const paddle::Tensor& ddx) {
   CHECK_GPU_INPUT(out);
   CHECK_GPU_INPUT(ddx);
-  auto ddout = paddle::Tensor(paddle::PlaceType::kGPU, out.shape());
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
 
-  int64_t numel = out.size();
+  int64_t numel = out.numel();
   int64_t block = 512;
   int64_t grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
       out.type(), "relu_cuda_double_backward_kernel", ([&] {
-        relu_cuda_double_backward_kernel<
-            data_t><<<grid, block, 0, out.stream()>>>(
-            out.data<data_t>(),
-            ddx.data<data_t>(),
-            ddout.mutable_data<data_t>(out.place()),
-            numel);
+        relu_cuda_double_backward_kernel<data_t>
+            <<<grid, block, 0, out.stream()>>>(
+                out.data<data_t>(),
+                ddx.data<data_t>(),
+                ddout.mutable_data<data_t>(out.place()),
+                numel);
       }));
 
   std::cout << "Debug info: run relu gpu double backward success."
             << std::endl;
@@ -117,9 +120,9 @@ std::vector<paddle::Tensor> relu_cuda_double_backward(
 
 std::vector<paddle::Tensor> relu_cuda_backward_without_x(
     const paddle::Tensor& out, const paddle::Tensor& grad_out) {
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape());
+  auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place());
 
-  int numel = out.size();
+  int numel = out.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_AND_HALF_TYPES(
@@ -135,7 +138,7 @@ std::vector<paddle::Tensor> relu_cuda_backward_without_x(
 }
 
 void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) {
-  int numel = x.size();
+  int numel = x.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   out->reshape(x.shape());
@@ -150,7 +153,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x,
                             const paddle::Tensor& out,
                             const paddle::Tensor& grad_out,
                             paddle::Tensor* grad_x) {
-  int numel = out.size();
+  int numel = out.numel();
   int block = 512;
   int grid = (numel + block - 1) / block;
   grad_x->reshape(x.shape());
diff --git a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc
index f96297d69bd5b70fb84f1baeaf149294d4ec6368..399eb5b6366d779969b19b971baff8e4b763fecd 100644
--- a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc
+++ b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc
@@ -68,7 +68,7 @@ void tanh_cpu_double_backward_kernel(const data_t* out_data,
 
 std::vector<paddle::Tensor> TanhForward(const paddle::Tensor& x) {
   CHECK_CPU_INPUT(x);
-  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
+  auto out = paddle::empty(x.shape(), x.dtype(), x.place());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.dtype(), "tanh_cpu_forward", ([&] {
@@ -82,7 +82,7 @@ std::vector<paddle::Tensor> TanhForward(const paddle::Tensor& x) {
 std::vector<paddle::Tensor> TanhBackward(const paddle::Tensor& out,
                                          const paddle::Tensor& grad_out) {
   CHECK_CPU_INPUT(out);
-  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
+  auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place());
 
   PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_backward", ([&] {
                                tanh_cpu_backward_kernel<data_t>(
@@ -101,8 +101,8 @@ std::vector<paddle::Tensor> TanhDoubleBackward(const paddle::Tensor& out,
   CHECK_CPU_INPUT(out);
   CHECK_CPU_INPUT(ddx);
   CHECK_CPU_INPUT(dout);
-  auto dout_new = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
-  auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
+  auto dout_new = paddle::empty(out.shape(), out.dtype(), out.place());
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
 
   PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_double_backward", ([&] {
                               tanh_cpu_double_backward_kernel<data_t>(
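Note (not part of the patch; the diff above is truncated mid-hunk): every custom-op change in this PR follows the same migration pattern — the deprecated paddle::PlaceType tensor constructor and place comparisons are replaced by the paddle::empty/paddle::empty_like factory functions, place checks move to Tensor::is_cpu()/is_gpu(), and element counts switch to int64_t via Tensor::numel(). A minimal sketch of the new-style pattern follows; my_identity_forward is a hypothetical illustrative op, not a function introduced by this PR, and only uses calls that appear in the diff.

// Illustrative sketch only -- my_identity_forward is hypothetical, not part of this PR.
#include <vector>
#include "paddle/extension.h"

std::vector<paddle::Tensor> my_identity_forward(const paddle::Tensor& x) {
  // was: PD_CHECK(x.place() == paddle::PlaceType::kCPU, ...)
  PD_CHECK(x.is_cpu(), "x must be a CPU Tensor.");

  // was: auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
  auto out = paddle::empty(x.shape(), x.dtype(), x.place());

  PD_DISPATCH_FLOATING_TYPES(x.type(), "my_identity_forward", ([&] {
                               const data_t* x_data = x.data<data_t>();
                               // was: out.mutable_data<data_t>(x.place())
                               data_t* out_data = out.data<data_t>();
                               // was: for (int i = 0; i < x.size(); ++i)
                               for (int64_t i = 0; i < x.numel(); ++i) {
                                 out_data[i] = x_data[i];
                               }
                             }));
  return {out};
}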