diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index e26a1c7893d6b47fb8126dba646f7f428e973195..2acb96d1b4f5903ff6c57b10e7621c8adaf73171 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -61,7 +61,7 @@ param_out = param - learning_rate * grad;
 
 template <typename T>
 struct SparseSGDFunctor<platform::CPUPlace, T> {
-  void operator()(const platform::DeviceContext& ctx,
+  void operator()(const platform::DeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output) {
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index 5c28314141cebe606ea4d32cd0dd0deee9406d14..106f9b746ba6614d8fa68b677c47ec04ed26fb81 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -34,15 +34,15 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
   for (int index = tid; index < row_numel; index += block_size) {
     // Since index in rows of SelectedRows can be duplicate, we have to use
     // Atomic Operation to avoid concurrent write error.
-    paddle::platform::CudaAtomicSub(tensor_out + index,
-                                    learning_rate[0] * selected_rows[index]);
+    paddle::platform::CudaAtomicAdd(
+        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
   }
 }
 }  // namespace
 
 template <typename T>
 struct SparseSGDFunctor<platform::GPUPlace, T> {
-  void operator()(const platform::DeviceContext& ctx,
+  void operator()(const platform::DeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output) {
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index 8c28d5e66b3f17be97d80bde653dfd5f8f7e77b5..78b595fc6c63d775b627f23cafa9458f1dadd4e5 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -22,7 +22,7 @@ namespace operators {
 
 template <typename Place, typename T>
 struct SparseSGDFunctor {
-  void operator()(const platform::DeviceContext& ctx,
+  void operator()(const platform::DeviceContext& context,
                   const framework::SelectedRows& input,
                   const framework::Tensor& learning_rate,
                   framework::Tensor* output);
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 65e265b6142c922f8c2904ece465cbbb661d4ad5..80854fb0c5013fe0de857fba93749f9f3574b2ab 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -153,7 +153,15 @@ PYBIND11_PLUGIN(core) {
            py::return_value_policy::reference)
       .def("set_height", &SelectedRows::set_height)
       .def("height", &SelectedRows::height)
-      .def("set_rows", &SelectedRows::set_rows)
+      .def("set_rows",
+           [](SelectedRows &self, std::vector<int64_t> rows) {
+#ifndef PADDLE_WITH_CUDA
+             self.set_rows(rows);
+#else
+             Vector<int64_t> new_rows(rows);
+             self.set_rows(new_rows);
+#endif
+           })
       .def("rows", [](SelectedRows &self) {
 #ifndef PADDLE_WITH_CUDA
         return self.rows();
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index c7d6a3b345f6d5f88a7c263a3b1169a6e34e1174..01262bba4d43adaed179baef88ccab6e69b0884b 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -20,11 +20,10 @@ class TestSGDOp(OpTest):
 
 
 class TestSparseSGDOp(unittest.TestCase):
-    def test_sparse_sgd(self):
+    def check_with_place(self, place):
         scope = core.Scope()
 
         # create and initialize Grad Variable
-        place = core.CPUPlace()
         height = 10
         rows = [0, 4, 7]
         row_numel = 12
@@ -35,6 +34,7 @@ class TestSparseSGDOp(unittest.TestCase):
         np_array = np.ones((len(rows), row_numel)).astype("float32")
         np_array[0, 0] = 2.0
         np_array[2, 8] = 4.0
+
         grad_tensor = grad_selected_rows.get_tensor()
         grad_tensor.set(np_array, place)
 
@@ -76,6 +76,13 @@ class TestSparseSGDOp(unittest.TestCase):
         # rows[2] = 7, 5.0 - 2.0 * 4.0
         self.assertAlmostEqual(-3.0, result_array[rows[2], 8])
 
+    def test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.GPUPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
 
 if __name__ == "__main__":
     unittest.main()
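
Note on the CudaAtomicSub to CudaAtomicAdd change above: CUDA's built-in atomicSub only has integer overloads, while atomicAdd does have a float overload, which is presumably why the kernel now adds a negated update instead of subtracting. The sketch below is a minimal standalone illustration of that pattern, not the Paddle kernel; the kernel name, launch configuration, and test values are invented for illustration.

// sparse_sub_demo.cu: atomic "subtract" expressed as atomicAdd of a negated value.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void ScaledSubKernel(const float* grad, const float* lr, float* out,
                                int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // There is no float overload of atomicSub, so subtract by atomically
    // adding the negated value, as the patched kernel does.
    atomicAdd(out + i, -1.0f * lr[0] * grad[i]);
  }
}

int main() {
  const int n = 4;
  float h_grad[n] = {1.f, 2.f, 3.f, 4.f};
  float h_lr[1] = {0.1f};
  float h_out[n] = {10.f, 10.f, 10.f, 10.f};

  float *d_grad, *d_lr, *d_out;
  cudaMalloc(&d_grad, n * sizeof(float));
  cudaMalloc(&d_lr, sizeof(float));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_grad, h_grad, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_lr, h_lr, sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_out, h_out, n * sizeof(float), cudaMemcpyHostToDevice);

  ScaledSubKernel<<<1, 32>>>(d_grad, d_lr, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);

  // Expected: 9.9 9.8 9.7 9.6, i.e. out[i] - lr * grad[i].
  for (int i = 0; i < n; ++i) printf("%f ", h_out[i]);
  printf("\n");

  cudaFree(d_grad);
  cudaFree(d_lr);
  cudaFree(d_out);
  return 0;
}

Compile with nvcc sparse_sub_demo.cu and run; the point is only that atomicAdd with a negative operand reproduces the removed subtraction while staying safe under concurrent updates to the same element, which is the duplicate-row case the kernel comment describes.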