diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc
index f51d776d7195c5fcb8b013dedffb3d7f730a7c6e..a2af131cb505e4d0cb499f304382949c92c8ad99 100644
--- a/paddle/fluid/operators/optimizers/sgd_op.cc
+++ b/paddle/fluid/operators/optimizers/sgd_op.cc
@@ -126,7 +126,7 @@ $$param\_out = param - learning\_rate * grad$$
 
 namespace ops = paddle::operators;
 DECLARE_INFER_SHAPE_FUNCTOR(sgd, SGDInferShapeFunctor,
-                            PD_INFER_META(phi::SGDInferMeta));
+                            PD_INFER_META(phi::SgdInferMeta));
 REGISTER_OPERATOR(
     sgd, ops::SGDOp, ops::SGDOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc
index d7f148fff818b3c9ddf49ad90821ea812d8705c5..2b80094a39e31646139e7a5312f933cf9a55cf6a 100644
--- a/paddle/phi/api/lib/api_custom_impl.cc
+++ b/paddle/phi/api/lib/api_custom_impl.cc
@@ -656,6 +656,176 @@ std::tuple<Tensor, Tensor, Tensor> momentum_impl(
   return api_output;
 }
 
+std::tuple<Tensor, Tensor> sgd_impl(
+    const Tensor& param,
+    const Tensor& learning_rate,
+    const Tensor& grad,
+    paddle::optional<const Tensor&> master_param,
+    bool multi_precision) {
+  DataType kernel_data_type = ParseDataType(param);
+  auto kernel_key_set = ParseKernelKeyByInputArgs(param, learning_rate, grad);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+  VLOG(6) << "sgd API kernel key: [" << kernel_key.backend() << ", "
+          << kernel_key.layout() << ", " << kernel_data_type << "]";
+
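+  // Select the kernel variant from the storage types of param and grad:
+  // dense/dense -> "sgd", dense/SelectedRows -> "sgd_dense_param_sparse_grad",
+  // SelectedRows param -> "sgd_sparse_param_sparse_grad".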
+  const auto& param_tensor = param.impl();
+  std::string kernel_name = "sgd";
+  if (phi::DenseTensor::classof(param_tensor.get())) {
+    if (!phi::DenseTensor::classof(grad.impl().get())) {
+      kernel_name = "sgd_dense_param_sparse_grad";
+    }
+  } else {
+    kernel_name = "sgd_sparse_param_sparse_grad";
+  }
+  const auto& kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      kernel_name,
+      {kernel_key.backend(), kernel_key.layout(), kernel_data_type});
+  VLOG(6) << kernel_name << " API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+
+  auto in_learning_rate =
+      PrepareData(learning_rate, kernel.InputAt(1), {false, true, true, true});
+
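+  // The update is performed in place: param_out aliases param and, when given,
+  // master_param_out aliases master_param.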
+  std::tuple<Tensor, Tensor> out;
+  std::get<0>(out) = param;
+  if (master_param) {
+    std::get<1>(out) = *master_param;
+  }
+  phi::MetaTensor meta_out_0(std::get<0>(out).impl().get());
+  phi::MetaTensor meta_out_1(master_param ? std::get<1>(out).impl().get()
+                                          : nullptr);
+
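+  // Dense parameter: the gradient may be a DenseTensor or SelectedRows.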
+  if (phi::DenseTensor::classof(param_tensor.get())) {
+    auto in_param = PrepareData(param, kernel.InputAt(0), {});
+    auto in_master_param = PrepareData(master_param, kernel.InputAt(3), {});
+
+    paddle::optional<const phi::DenseTensor&> in_master_param_opt =
+        master_param
+            ? paddle::make_optional<const phi::DenseTensor&>(*in_master_param)
+            : paddle::none;
+    auto master_param_meta = MakeMetaTensor(in_master_param_opt);
+    paddle::optional<const phi::MetaTensor&> master_param_meta_opt =
+        master_param
+            ? paddle::make_optional<const phi::MetaTensor&>(*master_param_meta)
+            : paddle::none;
+
+    phi::DenseTensor* kernel_out_0 =
+        SetKernelOutput(kernel_key.backend(), &std::get<0>(out));
+    phi::DenseTensor* kernel_out_1 =
+        master_param
+            ? static_cast<phi::DenseTensor*>(std::get<1>(out).impl().get())
+            : nullptr;
+
+    if (phi::DenseTensor::classof(grad.impl().get())) {
+      auto in_grad = PrepareData(grad, kernel.InputAt(2), {});
+      SgdInferMeta(MakeMetaTensor(*in_param),
+                   MakeMetaTensor(*in_learning_rate),
+                   MakeMetaTensor(*in_grad),
+                   master_param_meta_opt,
+                   multi_precision,
+                   &meta_out_0,
+                   &meta_out_1);
+
+      using kernel_signature =
+          void (*)(const platform::DeviceContext&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   paddle::optional<const phi::DenseTensor&>,
+                   bool,
+                   phi::DenseTensor*,
+                   phi::DenseTensor*);
+
+      auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+      (*kernel_fn)(*dev_ctx,
+                   *in_param,
+                   *in_learning_rate,
+                   *in_grad,
+                   in_master_param_opt,
+                   multi_precision,
+                   kernel_out_0,
+                   kernel_out_1);
+    } else {
+      auto in_grad = TensorToSelectedRows(grad);
+      SgdInferMeta(MakeMetaTensor(*in_param),
+                   MakeMetaTensor(*in_learning_rate),
+                   MakeMetaTensor(*in_grad),
+                   master_param_meta_opt,
+                   multi_precision,
+                   &meta_out_0,
+                   &meta_out_1);
+
+      using kernel_signature =
+          void (*)(const platform::DeviceContext&,
+                   const phi::DenseTensor&,
+                   const phi::DenseTensor&,
+                   const phi::SelectedRows&,
+                   paddle::optional<const phi::DenseTensor&>,
+                   bool,
+                   phi::DenseTensor*,
+                   phi::DenseTensor*);
+      auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+      (*kernel_fn)(*dev_ctx,
+                   *in_param,
+                   *in_learning_rate,
+                   *in_grad,
+                   in_master_param_opt,
+                   multi_precision,
+                   kernel_out_0,
+                   kernel_out_1);
+    }
+  } else {
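+    // SelectedRows parameter: grad and master_param are SelectedRows as well.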
+    auto in_param = TensorToSelectedRows(param);
+    auto in_grad = TensorToSelectedRows(grad);
+    auto in_master_param = TensorToSelectedRows(master_param);
+    auto in_master_param_opt =
+        master_param
+            ? paddle::make_optional<const phi::SelectedRows&>(*in_master_param)
+            : paddle::none;
+    auto master_param_meta = MakeMetaTensor(in_master_param_opt);
+    paddle::optional<const phi::MetaTensor&> master_param_meta_opt =
+        master_param
+            ? paddle::make_optional<const phi::MetaTensor&>(*master_param_meta)
+            : paddle::none;
+
+    phi::SelectedRows* kernel_out_0 =
+        SetSelectedRowsKernelOutput(kernel_key.backend(), &std::get<0>(out));
+    phi::SelectedRows* kernel_out_1 =
+        master_param
+            ? static_cast<phi::SelectedRows*>(std::get<1>(out).impl().get())
+            : nullptr;
+
+    SgdInferMeta(MakeMetaTensor(*in_param),
+                 MakeMetaTensor(*in_learning_rate),
+                 MakeMetaTensor(*in_grad),
+                 master_param_meta_opt,
+                 multi_precision,
+                 &meta_out_0,
+                 &meta_out_1);
+
+    using kernel_signature =
+        void (*)(const platform::DeviceContext&,
+                 const phi::SelectedRows&,
+                 const phi::DenseTensor&,
+                 const phi::SelectedRows&,
+                 paddle::optional<const phi::SelectedRows&>,
+                 bool,
+                 phi::SelectedRows*,
+                 phi::SelectedRows*);
+    auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+    (*kernel_fn)(*dev_ctx,
+                 *in_param,
+                 *in_learning_rate,
+                 *in_grad,
+                 in_master_param_opt,
+                 multi_precision,
+                 kernel_out_0,
+                 kernel_out_1);
+  }
+  return out;
+}
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 // TODO(chenweihang):  the original sum grad op can support higher-level
diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h
index 5d46ed691816b71418ac62ec1726d590f342c0de..4ddc3e5f4e0d2edda8864960b79dc8eb22de48ff 100644
--- a/paddle/phi/api/lib/api_custom_impl.h
+++ b/paddle/phi/api/lib/api_custom_impl.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <tuple>
 #include <vector>
 
 #include "paddle/phi/api/include/tensor.h"
@@ -107,6 +108,13 @@ std::tuple<Tensor, Tensor, Tensor> momentum_impl(
     bool multi_precision,
     float rescale_grad);
 
+std::tuple<Tensor, Tensor> sgd_impl(
+    const Tensor& param,
+    const Tensor& learning_rate,
+    const Tensor& grad,
+    paddle::optional<const Tensor&> master_param,
+    bool multi_precision);
+
 ////////////////// Backward(grad) api impls //////////////////////
 
 std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc
index f9db152956923a66c1e88af4841d59e834bcb726..e0c910ba3d66c9c8e61265d12e60da4dd252d035 100644
--- a/paddle/phi/api/lib/api_gen_utils.cc
+++ b/paddle/phi/api/lib/api_gen_utils.cc
@@ -20,13 +20,13 @@ namespace experimental {
 /* ------------------ for input ----------------------- */
 
 std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(const Tensor& tensor) {
-  return std::dynamic_pointer_cast<phi::DenseTensor>(tensor.impl());
+  return std::static_pointer_cast<phi::DenseTensor>(tensor.impl());
 }
 
 std::shared_ptr<phi::DenseTensor> TensorToDenseTensor(
-    const paddle::optional<Tensor>& tensor) {
+    const paddle::optional<const Tensor&>& tensor) {
   if (tensor) {
-    return std::dynamic_pointer_cast<phi::DenseTensor>(tensor->impl());
+    return std::static_pointer_cast<phi::DenseTensor>(tensor->impl());
   }
   return nullptr;
 }
@@ -45,13 +45,13 @@ std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
 }
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor) {
-  return std::dynamic_pointer_cast<phi::SelectedRows>(tensor.impl());
+  return std::static_pointer_cast<phi::SelectedRows>(tensor.impl());
 }
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
-    const paddle::optional<Tensor>& tensor) {
+    const paddle::optional<const Tensor&>& tensor) {
   if (tensor) {
-    return std::dynamic_pointer_cast<phi::SelectedRows>(tensor->impl());
+    return std::static_pointer_cast<phi::SelectedRows>(tensor->impl());
   }
   return nullptr;
 }
@@ -66,6 +66,14 @@ phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) {
   return phi::MetaTensor(tensor);
 }
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::DenseTensor&>& tensor) {
+  if (tensor) {
+    return {phi::MetaTensor(*tensor)};
+  }
+  return {paddle::none};
+}
+
 std::vector<phi::MetaTensor> MakeMetaTensor(
     const std::vector<const phi::DenseTensor*>& tensors) {
   std::vector<phi::MetaTensor> meta_tensors;
@@ -90,6 +98,14 @@ phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) {
   return phi::MetaTensor(tensor);
 }
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::SelectedRows&>& tensor) {
+  if (tensor) {
+    return {phi::MetaTensor(*tensor)};
+  }
+  return {paddle::none};
+}
+
 phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) {
   return phi::MetaTensor(tensor);
 }
diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h
index 035dfc5204720714346a260fc60db1362e542a85..47b80bb3fc290dbba2abade53a1866a557c174a6 100644
--- a/paddle/phi/api/lib/api_gen_utils.h
+++ b/paddle/phi/api/lib/api_gen_utils.h
@@ -42,7 +42,7 @@ std::unique_ptr<std::vector<phi::DenseTensor>> TensorToDenseTensor(
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(const Tensor& tensor);
 
 std::shared_ptr<phi::SelectedRows> TensorToSelectedRows(
-    const paddle::optional<Tensor>& tensor);
+    const paddle::optional<const Tensor&>& tensor);
 
 std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
 
@@ -50,6 +50,9 @@ std::shared_ptr<phi::StringTensor> TensorToStringTensor(const Tensor& tensor);
 
 phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor);
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::DenseTensor&>& tensor);
+
 std::vector<phi::MetaTensor> MakeMetaTensor(
     const std::vector<const phi::DenseTensor*>& tensors);
 
@@ -58,6 +61,9 @@ std::vector<phi::MetaTensor> MakeMetaTensor(
 
 phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor);
 
+paddle::optional<phi::MetaTensor> MakeMetaTensor(
+    const paddle::optional<const phi::SelectedRows&>& tensor);
+
 phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor);
 
 /* ------------------ for output ----------------------- */
diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc
index f2acfe5a9962be97fd385f322e5136986ad78a28..5fecd3740e9307182479e51b77556902dc2354ed 100644
--- a/paddle/phi/infermeta/multiary.cc
+++ b/paddle/phi/infermeta/multiary.cc
@@ -1887,7 +1887,7 @@ void RnnInferMeta(const MetaTensor& x,
   }
 }
 
-void SGDInferMeta(const MetaTensor& param,
+void SgdInferMeta(const MetaTensor& param,
                   const MetaTensor& learning_rate,
                   const MetaTensor& grad,
                   paddle::optional<const MetaTensor&> master_param,
diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h
index c037641d082b759760035a6d33ccf2ecece1193e..9137b574ac09d19d3eed0f92d82509bf600795fa 100644
--- a/paddle/phi/infermeta/multiary.h
+++ b/paddle/phi/infermeta/multiary.h
@@ -292,7 +292,7 @@ void RnnInferMeta(const MetaTensor& x,
                   std::vector<MetaTensor*> state,
                   MetaTensor* reserve);
 
-void SGDInferMeta(const MetaTensor& param,
+void SgdInferMeta(const MetaTensor& param,
                   const MetaTensor& learning_rate,
                   const MetaTensor& grad,
                   paddle::optional<const MetaTensor&> master_param,
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 8242d8e3392ec02ebb0f335b099d24eebd9fff06..95db9d39c1ec495df238175288c0d26e2f476bfb 100755
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -44,6 +44,7 @@ from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
 import warnings
 from paddle import _C_ops
+from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 
 __all__ = [
     'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
@@ -1370,7 +1371,11 @@ class SGDOptimizer(Optimizer):
                          if find_master else None)
 
         lr = self._create_param_lr(param_and_grad)
-        if framework._non_static_mode():
+        if in_dygraph_mode():
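+            # New eager mode: call the final-state sgd API generated from the
+            # sgd entry in python/paddle/utils/code_gen/api.yaml.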
+            _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1],
+                                   master_weight, find_master)
+            return None
+        if _in_legacy_dygraph():
             _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight,
                        param_and_grad[0], master_weight)
             return None
diff --git a/python/paddle/fluid/tests/unittests/test_sgd_op.py b/python/paddle/fluid/tests/unittests/test_sgd_op.py
index 817150a21f5e56e53d41d485d316802ec6983d8a..ad03fa30009e7a0ebcdc175388127008b517563a 100644
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
@@ -21,6 +21,7 @@ import paddle.fluid.core as core
 from paddle.fluid.op import Operator
 from op_test import OpTest
 import paddle
+from paddle.fluid.framework import _test_eager_guard
 
 paddle.enable_static()
 
@@ -291,6 +292,11 @@ class TestSGDV2(unittest.TestCase):
         adam.step()
         adam.clear_gradients()
 
+    def test_eager(self):
+        with _test_eager_guard():
+            self.test_sgd_dygraph()
+            self.test_sgd_group_dygraph()
+
 
 class TestSGDMultiPrecision2_0(unittest.TestCase):
     def dygraph_sgd_mp(self, mp):
diff --git a/python/paddle/optimizer/sgd.py b/python/paddle/optimizer/sgd.py
index fdee57bb1253e0683434faabd17fae6632ad5b72..46dd0b73a5eb8b1f4b63043f489cb84af7040bd7 100644
--- a/python/paddle/optimizer/sgd.py
+++ b/python/paddle/optimizer/sgd.py
@@ -22,6 +22,7 @@ import warnings
 from ..fluid.layer_helper import LayerHelper
 from ..fluid import unique_name
 from ..fluid import layers
+from ..fluid.framework import _in_legacy_dygraph, in_dygraph_mode
 
 __all__ = []
 
@@ -144,7 +145,11 @@ class SGD(Optimizer):
                          if find_master else None)
 
         lr = self._create_param_lr(param_and_grad)
-        if framework._non_static_mode():
+        if in_dygraph_mode():
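+            # New eager mode: final_state_sgd is the generated API backed by
+            # sgd_impl (see paddle/phi/api/lib/api_custom_impl.cc).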
+            _C_ops.final_state_sgd(param_and_grad[0], lr, param_and_grad[1],
+                                   master_weight, find_master)
+            return None
+        if _in_legacy_dygraph():
             _C_ops.sgd(param_and_grad[0], lr, param_and_grad[1], master_weight,
                        param_and_grad[0], master_weight)
             return None
diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml
index 329882317ee2bc2b61df83ae178e3f0f69a0d649..b4abe5b303b8e6425ebbbe17931e8a6d1db7da16 100644
--- a/python/paddle/utils/code_gen/api.yaml
+++ b/python/paddle/utils/code_gen/api.yaml
@@ -1794,6 +1794,12 @@
     func : selu
   backward : selu_grad
 
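+# sgd is routed through the hand-written sgd_impl in
+# paddle/phi/api/lib/api_custom_impl.cc, which picks the dense or
+# SelectedRows kernel variant at runtime.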
+- api : sgd
+  args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision)
+  output : Tensor(param_out), Tensor(master_param_out)
+  invoke : sgd_impl(param, learning_rate, grad, master_param, multi_precision)
+  optional : master_param
+
 - api : shape
   args : (Tensor input)
   output : Tensor