diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 776af6ceddd5928d07b120443cb9f7ca613d9aa6..25778e834a2a99a141b21d884d0dca8f0ff5ac6a 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -295,6 +295,8 @@ def _create_loss_op_desc_(loss):
             core.op_proto_and_checker_maker.kOpRoleAttrName():
             int(core.op_proto_and_checker_maker.OpRole.Backward) |
             int(core.op_proto_and_checker_maker.OpRole.Loss),
+            core.op_proto_and_checker_maker.kOpDeviceAttrName():
+            loss.op.attr(core.op_proto_and_checker_maker.kOpDeviceAttrName())
         })
     return op_desc
 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 2b7275b180f7ca1ff13736ff55b581d23f9c278b..ff6a42a4235bcf9864ecba9baa84132110c55d81 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -484,8 +484,7 @@ class Optimizer(object):
                     if param_name in input_arg_names:
                         self._param_device_map[param_name] = op.attr(
                             device_attr_name)
-                    else:
-                        self._param_device_map[param_name] = None
+                        break
 
     def _get_device_for_param(self, param_name):
         device = None
diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
index 3051ed3995a81c94ba14690bc55b71074e746d75..1384a22b252cbb1c348c1e1e839e7ae2889ef79d 100644
--- a/python/paddle/fluid/tests/unittests/test_device_guard.py
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -144,6 +144,27 @@ class TestDeviceGuard(unittest.TestCase):
         for op in all_ops:
             self.assertEqual(op.desc.attr(device_attr_name), "gpu")
 
+    def test_loss_op_desc(self):
+        """Every op has a device attr; fill_constant ops are on "gpu"."""
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data1 = fluid.layers.data(name="data_1", shape=[2], dtype="float32")
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            fc1 = fluid.layers.fc(input=data1, size=10)
+            with fluid.device_guard("gpu"):
+                out = fluid.layers.softmax_with_cross_entropy(
+                    logits=fc1, label=label)
+                loss = fluid.layers.mean(out)
+                fluid.optimizer.SGDOptimizer(0.1).minimize(loss)
+
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in main_program.global_block().ops:
+            self.assertTrue(op.desc.has_attr(device_attr_name))
+            # Match on op type; a desc object never compares equal to a str.
+            if op.type == 'fill_constant':
+                self.assertEqual(op.desc.attr(device_attr_name), "gpu")
+
 
 if __name__ == '__main__':
     unittest.main()