Commit 00e8791f authored by dzhwinter

fix compile in cpu error. test=develop

Parent d239cf2e
paddle/fluid/operators/momentum_op.cc
@@ -45,12 +45,15 @@ class MomentumOp : public framework::OperatorWithKernel {
                    "Output(VelocityOut) of Momentum should not be null.");
 
     auto param_dim = ctx->GetInputDim("Param");
+    if (ctx->GetInputsVarType("Grad")[0] ==
+        framework::proto::VarType::LOD_TENSOR) {
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Grad"),
         "Param and Grad input of MomentumOp should have the same dimension.");
     PADDLE_ENFORCE_EQ(
         param_dim, ctx->GetInputDim("Velocity"),
         "Param and Velocity of MomentumOp should have the same dimension.");
+    }
 
     PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
                       "Learning_rate should be a scalar");
......
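Note: the shape check above now fires only when Grad is a dense LOD_TENSOR. A SelectedRows gradient materializes only the rows actually touched in the step, so its value shape is (len(rows), row_numel) rather than the parameter's (height, row_numel), and the old unconditional check would reject valid sparse inputs. A minimal numpy sketch of the shape mismatch (illustrative values chosen here, not taken from the patch):

import numpy as np

# Dense parameter vs. a SelectedRows-style gradient that stores only
# the touched rows.
height, row_numel = 10, 12
rows = [0, 4, 7]
param = np.zeros((height, row_numel), dtype="float32")
sparse_grad_value = np.ones((len(rows), row_numel), dtype="float32")

# The two shapes differ, so an unconditional dim-equality check fails:
assert param.shape == (10, 12)
assert sparse_grad_value.shape == (3, 12)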
paddle/fluid/operators/momentum_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/algorithm.h"
@@ -303,28 +304,30 @@ class MomentumOpKernel : public framework::OpKernel<T> {
       auto* merged_grad = const_cast<framework::Scope&>(ctx.scope())
                               .Var()
                               ->GetMutable<framework::SelectedRows>();
       math::scatter::MergeAdd<DeviceContext, T> merge_func;
       merge_func(ctx.template device_context<DeviceContext>(), *grad,
                  merged_grad);
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param->numel());
       const int64_t* rows = nullptr;
+#ifdef PADDLE_WITH_CUDA
       if (platform::is_gpu_place(ctx.GetPlace())) {
         rows = merged_grad->rows().CUDAData(ctx.GetPlace());
       } else {
+#endif
         rows = merged_grad->rows().data();
+#ifdef PADDLE_WITH_CUDA
       }
+#endif
       int64_t row_numel =
           merged_grad->value().numel() / merged_grad->rows().size();
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          param->numel());
       if (use_nesterov) {
         SparseMomentumFunctor<T, UseNesterov> functor(
             param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows,
+            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
             static_cast<int64_t>(merged_grad->rows().size()),
-            static_cast<int64_t>(merged_grad->height()),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
@@ -332,9 +335,8 @@ class MomentumOpKernel : public framework::OpKernel<T> {
       } else {
         SparseMomentumFunctor<T, NoNesterov> functor(
             param->data<T>(), merged_grad->value().data<T>(),
-            velocity->data<T>(), learning_rate->data<T>(), mu, rows,
+            velocity->data<T>(), learning_rate->data<T>(), mu, rows, row_numel,
             static_cast<int64_t>(merged_grad->rows().size()),
-            static_cast<int64_t>(merged_grad->height()),
             param_out->mutable_data<T>(ctx.GetPlace()),
             velocity_out->mutable_data<T>(ctx.GetPlace()));
         for_range(functor);
......
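Note: the rows pointer is now chosen under #ifdef PADDLE_WITH_CUDA because CUDAData() only exists in CUDA builds; compiling the branch out is what fixes the CPU-only build. The functor also takes row_numel (elements per row) in place of the SelectedRows height: it iterates over every element of Param, splits the flat index into a (row, col) pair via row_numel, and treats rows absent from the merged gradient as zero. A numpy sketch of these per-row semantics (a reference model assuming a dense Velocity, with the name sparse_momentum_ref invented here, not the kernel itself):

import numpy as np

def sparse_momentum_ref(param, velocity, lr, mu, rows, grad_value,
                        use_nesterov):
    # Scatter the merged sparse rows into a dense gradient; rows that
    # do not appear in `rows` keep a zero gradient.
    grad = np.zeros_like(param)
    for i, r in enumerate(rows):
        grad[r] = grad_value[i]
    # Same update for every row; untouched rows see grad == 0, so their
    # velocity is simply scaled by mu.
    velocity_out = mu * velocity + grad
    if use_nesterov:
        param_out = param - (grad + velocity_out * mu) * lr
    else:
        param_out = param - lr * velocity_out
    return param_out, velocity_out

Because every parameter row is updated (untouched rows still have their velocity scaled by mu), Velocity and VelocityOut must be dense tensors, which is exactly what the test change below switches to.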
python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -121,22 +121,13 @@ class TestSparseMomentumOp(unittest.TestCase):
         grad_tensor = grad_selected_rows.get_tensor()
         grad_tensor.set(grad_np_array, place)
 
-        velocity_selected_rows = scope.var('Velocity').get_selected_rows()
-        velocity_selected_rows.set_height(height)
-        velocity_selected_rows.set_rows(rows)
-        velocity_np_array = np.ones((len(rows), row_numel)).astype("float32")
-        velocity_np_array[0, 0] = 2.0
-        velocity_np_array[2, 8] = 2.0
-        velocity_tensor = velocity_selected_rows.get_tensor()
-        velocity_tensor.set(velocity_np_array, place)
-        velocity_out_selected_rows = scope.var('VelocityOut').get_selected_rows(
-        )
-        velocity_out_selected_rows.set_height(height)
-        velocity_out_selected_rows.set_rows(rows)
-        velocity_out_np_array = np.full((len(rows), row_numel),
-                                        0.0).astype("float32")
-        velocity_out_tensor = velocity_out_selected_rows.get_tensor()
-        velocity_out_tensor.set(velocity_out_np_array, place)
+        velocity = scope.var('Velocity').get_tensor()
+        velocity_np_array = np.ones((height, row_numel)).astype("float32")
+        velocity.set(velocity_np_array, place)
+        velocity_out = scope.var('VelocityOut').get_tensor()
+        velocity_out_np_array = np.full((height, row_numel),
+                                        0.0).astype("float32")
+        velocity_out.set(velocity_out_np_array, place)
 
         # create and initialize LeraningRate Variable
         lr = scope.var('LearningRate').get_tensor()
@@ -158,19 +149,22 @@ class TestSparseMomentumOp(unittest.TestCase):
 
         # get and compare result
         param_out_np_array = np.array(param_out)
-        velocity_out_np_array = np.array(velocity_out_tensor)
+        velocity_out_np_array = np.array(velocity_out)
 
         # TODO(dzh): add a more suitable general numpy interface
        # for sparse update.
-        _velocity_out = mu * velocity_np_array + grad_np_array
-        _param = param_array[rows]
+        _grad_np_array = np.full((height, row_numel), 0.0).astype("float32")
+        for i in range(len(rows)):
+            _grad_np_array[rows[i]] = grad_np_array[i]
+        _velocity_out = mu * velocity_np_array + _grad_np_array
+        _param = param_array
         if use_nesterov:
-            _param_out = _param - grad_np_array * lr_array - \
-                _velocity_out * mu * lr_array
+            _param_out = _param - (_grad_np_array + _velocity_out * mu
+                                   ) * lr_array
         else:
-            _param_out = _param - lr * _velocity_out
+            _param_out = _param - lr_array * _velocity_out
         self.assertTrue((_velocity_out == velocity_out_np_array).all())
-        self.assertTrue((_param_out == param_out_np_array[rows]).all())
+        self.assertTrue((_param_out == param_out_np_array).all())
 
     def init_kernel(self):
         pass
......
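Note: the test rewrite follows directly from the kernel change. Velocity and VelocityOut become dense (height, row_numel) tensors, the sparse gradient is scattered into a dense _grad_np_array before computing the numpy reference, and the asserts compare the full dense outputs instead of only the touched rows. A quick check against the sparse_momentum_ref sketch above (hypothetical values chosen here, not taken from the test):

p = np.ones((10, 12), dtype="float32")
v = np.ones((10, 12), dtype="float32")
gv = np.full((3, 12), 0.5, dtype="float32")
p_out, v_out = sparse_momentum_ref(p, v, lr=0.01, mu=0.9,
                                   rows=[0, 4, 7], grad_value=gv,
                                   use_nesterov=False)
# A row not in `rows` gets zero gradient, but its velocity is still
# scaled by mu and its parameter still moves:
assert np.allclose(v_out[1], 0.9)
assert np.allclose(p_out[1], 1.0 - 0.01 * 0.9)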