Unverified commit 29de0d97, authored by lilong12, committed by GitHub

add support for specifying a device index in device_guard (#24555)

* add support for specifying a device index in device_guard
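For context, a minimal usage sketch of the new behavior, modeled on the unit test added further down; the program setup, shapes, and layer choices are illustrative, not part of the patch:

    import paddle.fluid as fluid

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        data = fluid.layers.fill_constant(
            shape=[1, 3, 8, 8], value=0.5, dtype='float32')
        # New in this change: a device index such as "gpu:1" is accepted;
        # plain "cpu" / "gpu" keep their previous behavior.
        with fluid.device_guard("gpu:1"):
            out = fluid.layers.crop_tensor(data, shape=[1, 3, 5, 5])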
Parent 3016a4ac
@@ -1050,7 +1050,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   if (HasAttr("op_device")) {
     if (Attr<std::string>("op_device") == "cpu") {
       expected_kernel_key.place_ = platform::CPUPlace();
-    } else if (Attr<std::string>("op_device") == "gpu") {
+    } else if (Attr<std::string>("op_device").find("gpu") !=
+               std::string::npos) {
+      auto device = Attr<std::string>("op_device");
+      size_t pos = device.find(':');
+      if (pos != std::string::npos) {
+        device = device.substr(0, pos);
+        LOG_FIRST_N(WARNING, 1)
+            << "Device index is only supported under pipeline parallelism, "
+            << "so it will be ignored.";
+      }
       // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
       // will be executed and a warning will be given at the same time.
       if (SupportGPU()) {
......
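In short, when the op_device attribute carries an index (e.g. "gpu:1"), kernel selection strips the ":1" suffix and logs a one-time warning, because the index is only honored under pipeline parallelism. A rough Python sketch of that normalization, with an illustrative function name that is not part of the patch:

    def _strip_device_index(op_device):
        # "gpu:1" -> "gpu"; the index is ignored outside pipeline parallelism,
        # mirroring the LOG_FIRST_N warning in ChooseKernel above.
        pos = op_device.find(':')
        return op_device[:pos] if pos != -1 else op_device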
@@ -5455,10 +5455,17 @@ def device_guard(device=None):
             result = exe.run(fetch_list=[out])
     """
+    index = None
+    if device and ':' in device:
+        device, index = device.split(':')
+        if device == 'cpu':
+            raise ValueError("Should not set device id for cpu.")
     if device not in ['cpu', 'gpu', '', None]:
         raise ValueError(
             "The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
             "when there is no need to specify device. But received %s" % device)
+    if index:
+        device = ":".join([device, index])
     pre_device = switch_device(device)
     try:
         yield
......
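On the Python side, device_guard now splits a "device:index" string, rejects an index on "cpu", validates the base device name, and re-attaches the index before switching devices. A short sketch of how the new checks behave (illustrative calls; the error is raised when the context is entered):

    import paddle.fluid as fluid

    # Accepted: ops created inside get op_device = "gpu:1".
    with fluid.device_guard("gpu:1"):
        ok = fluid.layers.fill_constant(shape=[1], value=1.0, dtype='float32')

    # Rejected: an index makes no sense for the CPU place.
    try:
        with fluid.device_guard("cpu:1"):
            bad = fluid.layers.fill_constant(shape=[1], value=1.0, dtype='float32')
    except ValueError as e:
        print(e)  # Should not set device id for cpu.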
@@ -59,6 +59,31 @@ class TestDeviceGuard(unittest.TestCase):
         execute(main_program, startup_program)
 
+    def test_device_guard_with_id(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data1 = fluid.layers.fill_constant(
+                shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+            data2 = fluid.layers.fill_constant(
+                shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+            shape = fluid.layers.shape(data2)
+            with fluid.device_guard("cpu"):
+                shape = fluid.layers.slice(
+                    shape, axes=[0], starts=[0], ends=[4])
+            with fluid.device_guard("gpu:1"):
+                out = fluid.layers.crop_tensor(data1, shape=shape)
+        # check if the device attr is set correctly
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            if op.type == 'slice':
+                self.assertEqual(op.desc.attr(device_attr_name), "cpu")
+            if op.type == 'crop_tensor':
+                self.assertEqual(op.desc.attr(device_attr_name), "gpu:1")
+
+        execute(main_program, startup_program)
+
     def test_cpu_only_op(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
@@ -123,7 +148,13 @@ class TestDeviceGuard(unittest.TestCase):
                 out = fluid.layers.fill_constant(
                     shape=[1], value=0.2, dtype='float32')
 
+        def device_attr2():
+            with fluid.device_guard("cpu:1"):
+                out = fluid.layers.fill_constant(
+                    shape=[1], value=0.2, dtype='float32')
+
         self.assertRaises(ValueError, device_attr)
+        self.assertRaises(ValueError, device_attr2)
 
     def test_warning(self):
         main_program = fluid.Program()
......