diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index b502ef7a7c61b2114248bab9ab2bbb2d1a90dd16..37f66a9692a8e6636dbe40008e0fa935000ff7e9 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -86,7 +86,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
   AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
                                     "Callstack for Op Creatation.")
       .SetDefault({});
-
+  AddAttr<std::string>(OpDeviceAttrName(), "Device type of this operator.")
+      .SetDefault("");
   Validate();
 }
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 5f3ce60e1d94ea4078cf0b709df362bad317f621..912e82f60ef5df81d163e1d8b937eb026e51478a 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -48,6 +48,7 @@ class OpProtoAndCheckerMaker {
   static const char *OpRoleVarAttrName() { return "op_role_var"; }
   static const char *OpNamescopeAttrName() { return "op_namescope"; }
   static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
+  static const char *OpDeviceAttrName() { return "op_device"; }
 
   void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d64f3bf8bb71dccdfa8ef82a8b9f00384c534712..57bae3545c4a9d5a7aff1649f52669892e9ba7d5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1056,6 +1056,22 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   auto expected_kernel_key = this->GetExpectedKernelType(
       ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+  if (HasAttr("op_device")) {
+    if (Attr<std::string>("op_device") == "cpu") {
+      expected_kernel_key.place_ = platform::CPUPlace();
+    } else if (Attr<std::string>("op_device") == "gpu") {
+      // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
+      // will be executed and a warning will be given at the same time.
+      if (SupportGPU()) {
+        expected_kernel_key.place_ = dev_ctx->GetPlace();
+      } else {
+        expected_kernel_key.place_ = platform::CPUPlace();
+        LOG_FIRST_N(WARNING, 1)
+            << "Op(" << type_
+            << ") has no CUDA implementation. It will be assigned to CPUPlace.";
+      }
+    }
+  }
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
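
The operator.cc hunk above is the runtime half of the feature: when an operator carries a non-empty `op_device` attribute, `ChooseKernel` overrides the expected kernel place, falling back to `CPUPlace` (with a one-time warning) when a GPU placement is requested but no CUDA kernel exists. A minimal sketch of how that attribute can be inspected from the Python side, assuming a build that includes this patch (illustrative only, not part of the diff):

    # Illustrative sketch, not part of the patch: ops created under
    # device_guard("cpu") carry op_device == "cpu", which ChooseKernel()
    # reads at runtime to pin them to CPUPlace.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        with fluid.device_guard("cpu"):
            x = fluid.layers.fill_constant(
                shape=[1], value=1.0, dtype='float32')

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        print(op.type, op.desc.attr(device_attr_name))  # expected: "cpu"
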
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 5a889900ffbced143a9251df48bb6f229e2e7928..c24a2b4a088c5c4d1169b866d918fa37e9e37622 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -57,6 +57,8 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpCreationCallstackAttrName",
       framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpDeviceAttrName", framework::OpProtoAndCheckerMaker::OpDeviceAttrName);
 #if defined(PADDLE_WITH_DGC)
   auto dgc = m->def_submodule("dgc");
   dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 7a2f201690848e6a2becb7d4edb59fd753e9cb9b..776af6ceddd5928d07b120443cb9f7ca613d9aa6 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -876,6 +876,12 @@ def _append_backward_ops_(block,
         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
             op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
 
+        # Set device for grad_op according to forward Op
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        op_device = op.desc.attr(device_attr_name)
+        for op_desc in grad_op_desc:
+            op_desc._set_attr(device_attr_name, op_device)
+
         # If input_grad_names_set is not None, extend grad_op_descs only when
         # any input grad in outputs of previous grad ops.
         # But this strategy is not suited for while op for some control flow,
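
The backward.py hunk makes every generated grad op inherit the `op_device` attribute of its forward op, so a forward op pinned to the CPU keeps its gradient computation there as well. An illustrative sketch, assuming a build with this patch (`fc` and `reduce_mean` are chosen only for the example):

    # Illustrative sketch, not part of the patch: grad ops inherit the
    # op_device of their forward ops after append_backward.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[4], dtype='float32')
        with fluid.device_guard("cpu"):
            y = fluid.layers.fc(input=x, size=2)
        loss = fluid.layers.reduce_mean(y)
        fluid.backward.append_backward(loss)

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        # The ops created under the guard and their *_grad ops are expected
        # to report "cpu" here.
        print(op.type, op.desc.attr(device_attr_name))
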
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 38408d2b9a9d42ed23291696391f6ac3f6595a4b..55db8e10d923abe98cd667c5fe162bcb9e902b2b 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -51,6 +51,7 @@ __all__ = [
     'Variable',
     'load_op_library',
     'require_version',
+    'device_guard',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -61,6 +62,7 @@ CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
 _dygraph_tracer_ = None
 _dygraph_current_expected_place_ = None
+_current_device = None
 
 
 def require_version(min_version, max_version=None):
@@ -1696,7 +1698,8 @@ class OpProtoHolder(object):
             core.op_proto_and_checker_maker.kOpRoleAttrName(),
             core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
             core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
-            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
+            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName(),
+            core.op_proto_and_checker_maker.kOpDeviceAttrName()
         }
@@ -1804,6 +1807,24 @@ class Operator(object):
             namescope_var_name = op_maker.kOpNameScopeAttrName()
             op_attrs[namescope_var_name] = _full_name_scope()
 
+            # set device for op with kernels, give warning for op without kernels
+            # when force_cpu and device_guard are used at the same time, a warning will be given.
+            # TODO(zhangting2020): when force_cpu is removed, clear warning below.
+            if _current_device is not None:
+                if self._has_kernel(type):
+                    op_device = op_maker.kOpDeviceAttrName()
+                    op_attrs[op_device] = _current_device
+                else:
+                    warnings.warn(
+                        "The Op(%s) does not support setting the device." % type)
+            if 'force_cpu' in op_attrs:
+                if (type == 'less_than' and op_attrs['force_cpu'] is not None
+                    ) or op_attrs['force_cpu'] != False:
+                    warnings.warn(
+                        "The Attr(force_cpu) of Op(%s) will be deprecated in the future, "
+                        "please use 'device_guard' instead. 'device_guard' has higher priority when they are "
+                        "used at the same time." % type)
+
             def find_name(var_list, name):
                 for var_name in var_list:
                     if var_list[var_name] is not None and var_name == name:
@@ -5056,3 +5077,62 @@ def load_op_library(lib_filename):
     """
     core.load_op_library(lib_filename)
     OpProtoHolder.instance().update_op_proto()
+
+
+def switch_device(device):
+    global _current_device
+    pre_device = _current_device
+    _current_device = device
+    return pre_device
+
+
+@signature_safe_contextmanager
+def device_guard(device=None):
+    """
+    **Notes**:
+        **The API only supports static mode.**
+
+    A context manager that specifies the device on which the OP will be placed.
+
+    Args:
+        device(str|None): Specify the device to use in the context. It should be
+            'cpu' or 'gpu'. When it is set to 'cpu' or 'gpu', all OPs created in
+            the context will be placed on CPUPlace or CUDAPlace. When 'gpu' is set
+            and the program runs on single-card, the device index will be the same
+            as the device on which the executor runs. Default: None, OPs in this
+            context will be automatically assigned devices.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            support_gpu = fluid.is_compiled_with_cuda()
+            place = fluid.CPUPlace()
+            if support_gpu:
+                place = fluid.CUDAPlace(0)
+
+            # if GPU is supported, the three OPs below will be automatically assigned to CUDAPlace(0)
+            data1 = fluid.layers.fill_constant(shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+            data2 = fluid.layers.fill_constant(shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+            shape = fluid.layers.shape(data2)
+
+            with fluid.device_guard("cpu"):
+                # Ops created here will be placed on CPUPlace
+                shape = fluid.layers.slice(shape, axes=[0], starts=[0], ends=[4])
+            with fluid.device_guard('gpu'):
+                # if GPU is supported, OPs created here will be placed on CUDAPlace(0), otherwise on CPUPlace
+                out = fluid.layers.crop_tensor(data1, shape=shape)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            result = exe.run(fetch_list=[out])
+    """
+
+    if device not in ['cpu', 'gpu', '', None]:
+        raise ValueError(
+            "The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
+            "when there is no need to specify device. But received %s" % device)
+    pre_device = switch_device(device)
+    yield
+    switch_device(pre_device)
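
`switch_device` returns the previously active device and `device_guard` restores it on exit, so guards nest as expected. A small sketch (not part of the patch):

    # Illustrative sketch, not part of the patch: the inner guard only
    # affects ops created inside it; the outer device is restored on exit.
    import paddle.fluid as fluid

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        with fluid.device_guard("gpu"):
            a = fluid.layers.fill_constant(
                shape=[1], value=1.0, dtype='float32')  # op_device == "gpu"
            with fluid.device_guard("cpu"):
                b = fluid.layers.fill_constant(
                    shape=[1], value=2.0, dtype='float32')  # op_device == "cpu"
            c = fluid.layers.elementwise_add(a, b)  # back to "gpu" here
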
But received %s" % device) + pre_device = switch_device(device) + yield + switch_device(pre_device) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0c49982ded8cdc74418ecc44e49c11d016e4b731..b4764bdd7de07a83b7345b41f4563b5c0f50e0c4 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1135,6 +1135,9 @@ def save_inference_model(dirname, # remind user to set auc_states to zeros if the program contains auc op all_ops = main_program.global_block().ops for op in all_ops: + # clear device of Op + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() + op._set_attr(device_attr_name, "") if op.type == 'auc': warnings.warn( "please ensure that you have set the auc states to zeros before saving inference model" diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ca498bc6a47e5b3a6516eacc2164af7e261da523..90dee2b11eb44c03f156fe6b3c0fe37507e36cf3 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -18,7 +18,7 @@ import numpy as np from collections import defaultdict from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard from . import framework from . import layers @@ -108,6 +108,7 @@ class Optimizer(object): self.helper = None self._opti_name_list = [] self._accumulators_holder = {} + self._param_device_map = dict() @framework.dygraph_only def state_dict(self): @@ -405,7 +406,7 @@ class Optimizer(object): fill_value=0.0, shape=None, type=None, - force_cpu=False): + device=None): """Utility function to add an accumulator for a parameter Args: @@ -438,10 +439,11 @@ class Optimizer(object): type=param.type if type is None else type, shape=shape, belong_to_optimizer=True) - self.helper.set_variable_initializer( - var, - initializer=Constant( - value=float(fill_value), force_cpu=force_cpu)) + if device is None: + device = self._get_device_for_param(param.name) + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) if framework.in_dygraph_mode(): if len(self._accumulators_holder) > 0: @@ -470,6 +472,27 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] + def _update_param_device_map(self, parameters_and_grads, target_block): + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True: + param_name = param_and_grad[0].name + ops = target_block.ops + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + device_attr_name) + else: + self._param_device_map[param_name] = None + + def _get_device_for_param(self, param_name): + device = None + if param_name in self._param_device_map: + device = self._param_device_map[param_name] + return device + def _create_optimization_pass(self, parameters_and_grads): """Add optimization operators to update gradients to variables. 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index ca498bc6a47e5b3a6516eacc2164af7e261da523..90dee2b11eb44c03f156fe6b3c0fe37507e36cf3 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -18,7 +18,7 @@ import numpy as np
 from collections import defaultdict
 
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
-from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
 
 from . import framework
 from . import layers
@@ -108,6 +108,7 @@ class Optimizer(object):
         self.helper = None
         self._opti_name_list = []
         self._accumulators_holder = {}
+        self._param_device_map = dict()
 
     @framework.dygraph_only
     def state_dict(self):
@@ -405,7 +406,7 @@ class Optimizer(object):
                          fill_value=0.0,
                          shape=None,
                          type=None,
-                         force_cpu=False):
+                         device=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -438,10 +439,11 @@ class Optimizer(object):
             type=param.type if type is None else type,
             shape=shape,
             belong_to_optimizer=True)
-        self.helper.set_variable_initializer(
-            var,
-            initializer=Constant(
-                value=float(fill_value), force_cpu=force_cpu))
+        if device is None:
+            device = self._get_device_for_param(param.name)
+        with device_guard(device):
+            self.helper.set_variable_initializer(
+                var, initializer=Constant(value=float(fill_value)))
 
         if framework.in_dygraph_mode():
             if len(self._accumulators_holder) > 0:
@@ -470,6 +472,27 @@ class Optimizer(object):
                     format(name, param.name))
         return self._accumulators[name][param.name]
 
+    def _update_param_device_map(self, parameters_and_grads, target_block):
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[0].trainable is True:
+                param_name = param_and_grad[0].name
+                ops = target_block.ops
+                device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
+                )
+                for op in ops:
+                    input_arg_names = op.input_arg_names
+                    if param_name in input_arg_names:
+                        self._param_device_map[param_name] = op.attr(
+                            device_attr_name)
+                    else:
+                        self._param_device_map[param_name] = None
+
+    def _get_device_for_param(self, param_name):
+        device = None
+        if param_name in self._param_device_map:
+            device = self._param_device_map[param_name]
+        return device
+
     def _create_optimization_pass(self, parameters_and_grads):
         """Add optimization operators to update gradients to variables.
@@ -505,6 +528,7 @@ class Optimizer(object):
             start = len(target_block.ops)
             self.helper = LayerHelper(self.__class__.__name__)
+            self._update_param_device_map(parameters_and_grads, target_block)
             self._create_accumulators(
                 target_block,
                 [p[0] for p in parameters_and_grads if p[0].trainable])
@@ -523,7 +547,11 @@ class Optimizer(object):
                 with param_and_grad[0].block.program._optimized_guard(
                         param_and_grad), name_scope("optimizer"):
                     if param_and_grad[0].trainable is True:
-                        self._append_optimize_op(target_block, param_and_grad)
+                        device = self._get_device_for_param(
+                            param_and_grad[0].name)
+                        with device_guard(device):
+                            optimize_op = self._append_optimize_op(
+                                target_block, param_and_grad)
 
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
@@ -1793,14 +1821,14 @@ class AdamOptimizer(Optimizer):
                 fill_value=0.9 if isinstance(self._beta1, Variable) \
                     else self._beta1,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
             self._add_accumulator(
                 name=self._beta2_pow_acc_str,
                 param=p,
                 fill_value=0.999 if isinstance(self._beta2, Variable) \
                     else self._beta2,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
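
On the optimizer side, `_update_param_device_map` records, for each trainable parameter, the `op_device` of ops that consume it, and `_get_device_for_param` feeds that device into `device_guard` both when accumulators are created and when the update op is appended. A hedged sketch of the intended effect, assuming a build with this patch (the `fc` layer, Adam, and the default parameter names are used only for illustration):

    # Illustrative sketch, not part of the patch: the Adam update op for a
    # parameter created under device_guard("cpu") is expected to carry "cpu",
    # because _update_param_device_map picks the device up from the ops that
    # take the parameter as input.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[4], dtype='float32')
        with fluid.device_guard("cpu"):
            y = fluid.layers.fc(input=x, size=2)
        loss = fluid.layers.reduce_mean(y)
        fluid.optimizer.AdamOptimizer(learning_rate=0.01).minimize(loss)

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        if op.type == 'adam':
            print(op.input('Param'), op.desc.attr(device_attr_name))
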
diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
new file mode 100644
index 0000000000000000000000000000000000000000..3051ed3995a81c94ba14690bc55b71074e746d75
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import warnings
+
+
+def execute(main_program, startup_program):
+    if core.is_compiled_with_cuda():
+        place = core.CUDAPlace(0)
+    else:
+        place = core.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_program)
+    exe.run(main_program)
+
+
+class TestDeviceGuard(unittest.TestCase):
+    def test_device_guard(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data1 = fluid.layers.fill_constant(
+                shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+            data2 = fluid.layers.fill_constant(
+                shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+            shape = fluid.layers.shape(data2)
+            with fluid.device_guard("cpu"):
+                shape = fluid.layers.slice(
+                    shape, axes=[0], starts=[0], ends=[4])
+            with fluid.device_guard("gpu"):
+                out = fluid.layers.crop_tensor(data1, shape=shape)
+        # check if the device attr is set correctly
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            if op.type == 'slice':
+                self.assertEqual(op.desc.attr(device_attr_name), "cpu")
+            if op.type == 'crop_tensor':
+                self.assertEqual(op.desc.attr(device_attr_name), "gpu")
+
+        execute(main_program, startup_program)
+
+    def test_cpu_only_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.fill_constant(
+                shape=[2, 255, 13, 13], value=0.3, dtype='float32')
+            gt_box = fluid.layers.fill_constant(
+                shape=[2, 6, 4], value=0.5, dtype='float32')
+            gt_label = fluid.layers.fill_constant(
+                shape=[2, 6], value=1.0, dtype='int32')
+            gt_score = fluid.layers.fill_constant(
+                shape=[2, 6], value=0.5, dtype='float32')
+            anchors = [
+                10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156,
+                198, 373, 326
+            ]
+            anchor_mask = [0, 1, 2]
+            with fluid.device_guard("gpu"):
+                # yolov3_loss only has cpu kernel, so its cpu kernel will be executed
+                loss = fluid.layers.yolov3_loss(
+                    x=x,
+                    gt_box=gt_box,
+                    gt_label=gt_label,
+                    gt_score=gt_score,
+                    anchors=anchors,
+                    anchor_mask=anchor_mask,
+                    class_num=80,
+                    ignore_thresh=0.7,
+                    downsample_ratio=32)
+
+        execute(main_program, startup_program)
+
+    def test_without_kernel_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+            loop_len = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=10)
+            cond = fluid.layers.less_than(x=i, y=loop_len)
+
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                with fluid.device_guard("cpu"):
+                    while_op = fluid.layers.While(cond=cond)
+                    with while_op.block():
+                        i = fluid.layers.increment(x=i, value=1, in_place=True)
+                        fluid.layers.less_than(x=i, y=loop_len, cond=cond)
+
+            assert len(w) == 1
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            if op.type == 'while':
+                self.assertEqual(op.desc.attr(device_attr_name), "")
+
+        execute(main_program, startup_program)
+
+    def test_error(self):
+        def device_attr():
+            with fluid.device_guard("cpu1"):
+                out = fluid.layers.fill_constant(
+                    shape=[1], value=0.2, dtype='float32')
+
+        self.assertRaises(ValueError, device_attr)
+
+    def test_warning(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                with fluid.device_guard("gpu"):
+                    x = fluid.layers.fill_constant(
+                        shape=[1], value=3.0, dtype='float32', force_cpu=True)
+                    y = fluid.layers.fill_constant(
+                        shape=[1], value=4.0, dtype='float32')
+                    result = fluid.layers.less_than(x=x, y=y, force_cpu=False)
+
+            assert len(w) == 2
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            self.assertEqual(op.desc.attr(device_attr_name), "gpu")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 5932112c3c4d186e46705d6765f09dd158e81e36..0a7afe8caf22de8631b2dfb056b922fda174b0e1 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -70,7 +70,7 @@ class TestOperator(unittest.TestCase):
             set([
                 "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
                 "use_mkldnn", "scale_x", "scale_y", "scale_out",
-                "force_fp32_output", "op_namescope", "op_callstack"
+                "force_fp32_output", "op_namescope", "op_callstack", "op_device"
             ]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)