diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc
index c24f924313fb90d33b17f727260578271f67ae88..4e9c84ef4c9503dc81d1258202ac4b37f867cfcf 100644
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -75,14 +75,14 @@ class ScaleOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext &ctx) const override {
     if (ctx.InputVar("X")->IsType<framework::LoDTensor>() ||
         ctx.InputVar("X")->IsType<framework::Tensor>()) {
+      std::string scale_attr;
       if (ctx.HasInput("ScaleTensor")) {
-        return framework::KernelSignature("scale.host", {"X", "ScaleTensor"},
-                                          {"bias", "bias_after_scale"},
-                                          {"Out"});
+        scale_attr = "ScaleTensor";
       } else {
-        return framework::KernelSignature(
-            "scale", {"X"}, {"scale", "bias", "bias_after_scale"}, {"Out"});
+        scale_attr = "scale";
       }
+      return framework::KernelSignature(
+          "scale", {"X"}, {scale_attr, "bias", "bias_after_scale"}, {"Out"});
     }
     // TODO(chenweihang): support other cases after selected rows added
     return framework::KernelSignature("scale.unregistered", {}, {}, {});
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
index 149500c546dfd3db4f1623ac9bbc565d9d71b6eb..700af6d2d591162ecc2ec5fbfd00d9f39d128dc8 100644
--- a/paddle/pten/api/include/math.h
+++ b/paddle/pten/api/include/math.h
@@ -15,16 +15,11 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/pten/api/include/tensor.h"
+#include "paddle/pten/common/scalar.h"
 
 namespace paddle {
 namespace experimental {
 
-// TODO(chenweihang): add scale API
-// TODO(chenweihang): move mean API into stat.h/cc
-PD_DLL_DECL Tensor mean(const Tensor& x,
-                        const std::vector<int64_t>& axis,
-                        bool keep_dim);
-
 PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y);
 
 PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y);
@@ -33,10 +28,21 @@ PD_DLL_DECL Tensor divide(const Tensor& x, const Tensor& y);
 
 PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y);
 
+// TODO(chenweihang): move mean API into stat.h/cc
+PD_DLL_DECL Tensor mean(const Tensor& x,
+                        const std::vector<int64_t>& axis,
+                        bool keep_dim);
+
 PD_DLL_DECL Tensor sum(const Tensor& x,
                        const std::vector<int64_t>& axis,
                        DataType dtype,
                        bool keep_dim);
 
+// TODO(chenweihang): Follow-up discussion on the handling of `act` argument
+PD_DLL_DECL Tensor scale(const Tensor& x,
+                         const Scalar& scale,
+                         float bias,
+                         bool bias_after_scale);
+
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
index bd2567ddb15063fb4aade7f4a8ef0956a2a2a410..a97d78b5a9d6f07fafeba39fab8a80dc4f6e4a3d 100644
--- a/paddle/pten/api/lib/math.cc
+++ b/paddle/pten/api/lib/math.cc
@@ -274,6 +274,45 @@ PD_DLL_DECL Tensor multiply(const Tensor& x, const Tensor& y) {
   return out;
 }
 
+PD_DLL_DECL Tensor scale(const Tensor& x,
+                         const Scalar& scale,
+                         float bias,
+                         bool bias_after_scale) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "scale", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  kernel_context.EmplaceBackAttr(pten::Scalar(scale));
+  kernel_context.EmplaceBackAttr(bias);
+  kernel_context.EmplaceBackAttr(bias_after_scale);
+
+  // 4. InferMeta
+  auto out_meta = UnchangedInferMeta(dense_x->meta());
+
+  // 5. Prepare outputs
+  Tensor out;
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
+
 }  // namespace experimental
 }  // namespace paddle
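With the six steps above in place, callers no longer choose between two kernels: the `Scalar` argument accepts either an immediate value or a 1-element tensor. A minimal usage sketch, mirroring the new test_scale_api.cc added at the end of this diff (`full` comes from paddle/pten/api/include/creation.h; the function name here is illustrative):

#include "paddle/pten/api/include/creation.h"
#include "paddle/pten/api/include/math.h"

void ScaleUsageSketch() {
  auto x = paddle::experimental::full({3, 4}, 1.0, pten::DataType::FLOAT32);
  // scale given as an immediate float value...
  auto out1 = paddle::experimental::scale(x, 2.0, /*bias=*/1.0,
                                          /*bias_after_scale=*/true);
  // ...or as a 1-element tensor; the Scalar parameter absorbs both forms,
  // so both calls dispatch to the same "scale" kernel.
  auto s = paddle::experimental::full({1}, 2.0, pten::DataType::FLOAT32);
  auto out2 = paddle::experimental::scale(x, s, 1.0, true);
}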
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index c6528d85c27cc03bdd8dee82ae83469cda64701d..c2b9f75bda0449d00adf1962db425bf493886e1c 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -78,7 +78,7 @@ DenseTensor Sum(const ContextT& dev_ctx,
 template <typename T, typename ContextT>
 DenseTensor Scale(const ContextT& dev_ctx,
                   const DenseTensor& x,
-                  float scale,
+                  const Scalar& scale,
                   float bias,
                   bool bias_after_scale) {
   auto out_meta = UnchangedInferMeta(x.meta());
@@ -90,21 +90,6 @@ DenseTensor Scale(const ContextT& dev_ctx,
   return dense_out;
 }
 
-template <typename T, typename ContextT>
-DenseTensor Scale(const ContextT& dev_ctx,
-                  const DenseTensor& x,
-                  const DenseTensor& scale,
-                  float bias,
-                  bool bias_after_scale) {
-  auto out_meta = UnchangedInferMeta(x.meta());
-  const auto allocator =
-      std::make_shared<paddle::experimental::DefaultAllocator>(
-          dev_ctx.GetPlace());
-  pten::DenseTensor dense_out(allocator, out_meta);
-  ScaleHost<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
-  return dense_out;
-}
-
 template <typename T, typename ContextT>
 DenseTensor Add(const ContextT& dev_ctx,
                 const DenseTensor& x,
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
index 634b5231da266d56eb3ecbc1ebcb9c9e61ab9333..05ca7a3ae52446eb0024ece502a4ed33479d1ac1 100644
--- a/paddle/pten/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -50,28 +50,12 @@ void Mean(const CPUContext& dev_ctx,
 
 template <typename T>
 void Scale(const CPUContext& dev_ctx,
            const DenseTensor& x,
-           float scale,
+           const Scalar& scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  eigen::Scale<CPUContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
-
-// TODO(chenweihang): now the ScaleTensor's dtype are same as x, so we cannot
-// register its dtype def
-template <typename T>
-void ScaleHost(const CPUContext& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& scale,
-               float bias,
-               bool bias_after_scale,
-               DenseTensor* out) {
-  eigen::Scale<CPUContext, T>(dev_ctx,
-                              x,
-                              static_cast<float>(*scale.data<T>()),
-                              bias,
-                              bias_after_scale,
-                              out);
+  eigen::Scale<CPUContext, T>(
+      dev_ctx, x, scale.to<float>(), bias, bias_after_scale, out);
 }
 
 template <typename T>
@@ -145,20 +129,7 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.host",
-                   CPU,
-                   ANY,
-                   pten::ScaleHost,
-                   float,
-                   double,
-                   paddle::platform::bfloat16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {
-  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
-}
+
 PT_REGISTER_KERNEL("elementwise_add",
                    CPU,
                    ANY,
diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
index c06d40e57799fb2e7b4825229eadbb2a07271752..31532f38f6e49f754944e69c06dedd0fc4baaaca 100644
--- a/paddle/pten/kernels/cpu/math.h
+++ b/paddle/pten/kernels/cpu/math.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
 #include "paddle/pten/core/kernel_registry.h"
 
@@ -40,19 +41,11 @@ void Mean(const CPUContext& dev_ctx,
 template <typename T>
 void Scale(const CPUContext& dev_ctx,
            const DenseTensor& x,
-           float scale,
+           const Scalar& scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out);
 
-template <typename T>
-void ScaleHost(const CPUContext& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& scale,
-               float bias,
-               bool bias_after_scale,
-               DenseTensor* out);
-
 template <typename T>
 void ElementwiseAdd(const CPUContext& dev_ctx,
                     const DenseTensor& x,
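For context on the `bias` and `bias_after_scale` attributes that the kernel keeps: the scale op computes `Out = scale * X + bias` when `bias_after_scale` is true, and `Out = scale * (X + bias)` otherwise. A scalar reference version of that formula, as a hedged sketch (the real elementwise work is done by `eigen::Scale`; this helper is only illustrative):

// Reference semantics for a single element of the scale kernel.
template <typename T>
T ScaleOneElement(T x, float scale, float bias, bool bias_after_scale) {
  // bias_after_scale == true:  out = scale * x + bias
  // bias_after_scale == false: out = scale * (x + bias)
  return bias_after_scale ? static_cast<T>(scale * x + bias)
                          : static_cast<T>(scale * (x + bias));
}

This is also why the new test at the end of the diff expects every element to be 3.0: with x = 1, scale = 2, bias = 1 and bias_after_scale = true, each element becomes 2 * 1 + 1 = 3.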
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index bc5582926a40068d5cf8fe8a9b30117569276b3a..8d6abc92855305efabad347c81d09b0a67fbc011 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -79,30 +79,12 @@ void Mean(const CUDAContext& dev_ctx,
 
 template <typename T>
 void Scale(const CUDAContext& dev_ctx,
            const DenseTensor& x,
-           float scale,
+           const Scalar& scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out) {
-  eigen::Scale<CUDAContext, T>(dev_ctx, x, scale, bias, bias_after_scale, out);
-}
-
-template <typename T>
-void ScaleHost(const CUDAContext& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& scale,
-               float bias,
-               bool bias_after_scale,
-               DenseTensor* out) {
-  PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(scale.place()),
-                    false,
-                    paddle::platform::errors::InvalidArgument(
-                        "Scale argument isn't a host tensor."));
-  eigen::Scale<CUDAContext, T>(dev_ctx,
-                               x,
-                               static_cast<float>(*scale.data<T>()),
-                               bias,
-                               bias_after_scale,
-                               out);
+  eigen::Scale<CUDAContext, T>(
+      dev_ctx, x, scale.to<float>(), bias, bias_after_scale, out);
 }
 
 // Create the definition of ElementwiseAdd
@@ -150,20 +132,6 @@ PT_REGISTER_KERNEL("scale",
                    int16_t,
                    int,
                    int64_t) {}
-PT_REGISTER_KERNEL("scale.host",
-                   CUDA,
-                   ANY,
-                   pten::ScaleHost,
-                   float,
-                   double,
-                   float16,
-                   uint8_t,
-                   int8_t,
-                   int16_t,
-                   int,
-                   int64_t) {
-  kernel->InputAt(1).SetBackend(pten::Backend::CPU);
-}
 PT_REGISTER_KERNEL("elementwise_add",
                    CUDA,
                    ANY,
diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h
index dcee649d7d82d59e363b05c512cad40432cf1ed5..0ac55f1f8795070eb3cd1163b98de0b555ab9e46 100644
--- a/paddle/pten/kernels/cuda/math.h
+++ b/paddle/pten/kernels/cuda/math.h
@@ -17,6 +17,7 @@ limitations under the License. */
 // CUDA and HIP use same api
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
+#include "paddle/pten/common/scalar.h"
 #include "paddle/pten/core/dense_tensor.h"
 
 // See Note [ Why still include the fluid headers? ]
@@ -42,19 +43,11 @@ void Mean(const CUDAContext& dev_ctx,
 template <typename T>
 void Scale(const CUDAContext& dev_ctx,
            const DenseTensor& x,
-           float scale,
+           const Scalar& scale,
            float bias,
            bool bias_after_scale,
            DenseTensor* out);
 
-template <typename T>
-void ScaleHost(const CUDAContext& dev_ctx,
-               const DenseTensor& x,
-               const DenseTensor& scale,
-               float bias,
-               bool bias_after_scale,
-               DenseTensor* out);
-
 template <typename T>
 void ElementwiseAdd(const CUDAContext& dev_ctx,
                     const DenseTensor& x,
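Why `ScaleHost` can be deleted on both backends: the `Scalar` captures the scale value on the host at the API boundary, so the kernel only ever receives a plain value via `scale.to<float>()` and never needs a host-placed `DenseTensor` input. That is what makes both the removed `is_gpu_place` guard and the removed `kernel->InputAt(1).SetBackend(pten::Backend::CPU)` registration hook unnecessary. A toy sketch of the idea only; this is not the actual paddle/pten/common/scalar.h implementation, and every name below is made up:

// Toy stand-in for pten::Scalar: one numeric value, captured either from an
// immediate float or read once, on the host, from a 1-element tensor buffer.
class ToyScalar {
 public:
  /* implicit */ ToyScalar(float v) : value_(v) {}  // from a float attribute
  explicit ToyScalar(const float* host_data)        // from a 1-element tensor
      : value_(*host_data) {}                       // host-side read
  template <typename T>
  T to() const {                                    // what kernels call
    return static_cast<T>(value_);
  }

 private:
  float value_;
};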
diff --git a/paddle/pten/tests/api/CMakeLists.txt b/paddle/pten/tests/api/CMakeLists.txt
index 207d8f35b4c453e0963434e052fff9f704bc73a8..c670d094810198bb2108971e9f6d1ee8340579d7 100644
--- a/paddle/pten/tests/api/CMakeLists.txt
+++ b/paddle/pten/tests/api/CMakeLists.txt
@@ -20,3 +20,4 @@ cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten
 cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils)
 cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils)
 cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_utils)
+cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils)
diff --git a/paddle/pten/tests/api/test_scale_api.cc b/paddle/pten/tests/api/test_scale_api.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2c0cd5cc71d8ee4c6aa918d983d86b10a99ec669
--- /dev/null
+++ b/paddle/pten/tests/api/test_scale_api.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/api/include/creation.h"
+#include "paddle/pten/api/include/math.h"
+
+#include "paddle/pten/api/lib/utils/allocator.h"
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace paddle {
+namespace tests {
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+void CheckScaleResult(Tensor* out) {
+  ASSERT_EQ(out->dims().size(), 2);
+  ASSERT_EQ(out->dims()[0], 3);
+  ASSERT_EQ(out->dims()[1], 4);
+  ASSERT_EQ(out->numel(), 12);
+  ASSERT_EQ(out->is_cpu(), true);
+  ASSERT_EQ(out->type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out->layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out->initialized(), true);
+  for (int64_t i = 0; i < out->numel(); ++i) {
+    ASSERT_EQ(out->mutable_data<float>()[i], 3.0);
+  }
+}
+
+TEST(API, scale) {
+  // 1. check `scale` is float value
+  auto x = experimental::full({3, 4}, 1.0, pten::DataType::FLOAT32);
+  auto out1 = experimental::scale(x, 2.0, 1.0, true);
+  CheckScaleResult(&out1);
+
+  // 2. check `scale` is Tensor with shape [1]
+  auto scale = experimental::full({1}, 2.0, pten::DataType::FLOAT32);
+  auto out2 = experimental::scale(x, scale, 1.0, true);
+  CheckScaleResult(&out2);
+}
+
+}  // namespace tests
+}  // namespace paddle
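One possible follow-up, not part of this PR: the new test only exercises `bias_after_scale = true`. A hypothetical extra case in the same style would pin down the other branch; with x = 1, scale = 2 and bias = 1 it should produce 2 * (1 + 1) = 4 per element:

// Hypothetical addition to test_scale_api.cc, same conventions as above.
TEST(API, scale_bias_before_scale) {
  auto x = paddle::experimental::full({3, 4}, 1.0, pten::DataType::FLOAT32);
  auto out = paddle::experimental::scale(x, 2.0, 1.0,
                                         /*bias_after_scale=*/false);
  for (int64_t i = 0; i < out.numel(); ++i) {
    ASSERT_EQ(out.mutable_data<float>()[i], 4.0);
  }
}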