diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
index b502ef7a7c61b2114248bab9ab2bbb2d1a90dd16..37f66a9692a8e6636dbe40008e0fa935000ff7e9 100644
--- a/paddle/fluid/framework/op_proto_maker.cc
+++ b/paddle/fluid/framework/op_proto_maker.cc
@@ -86,7 +86,8 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
   AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(),
                                     "Callstack for Op Creatation.")
       .SetDefault({});
-
+  AddAttr<std::string>(OpDeviceAttrName(), "Device type of this operator.")
+      .SetDefault("");
   Validate();
 }
diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
index 5f3ce60e1d94ea4078cf0b709df362bad317f621..912e82f60ef5df81d163e1d8b937eb026e51478a 100644
--- a/paddle/fluid/framework/op_proto_maker.h
+++ b/paddle/fluid/framework/op_proto_maker.h
@@ -48,6 +48,7 @@ class OpProtoAndCheckerMaker {
   static const char *OpRoleVarAttrName() { return "op_role_var"; }
   static const char *OpNamescopeAttrName() { return "op_namescope"; }
   static const char *OpCreationCallstackAttrName() { return "op_callstack"; }
+  static const char *OpDeviceAttrName() { return "op_device"; }
 
   void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d64f3bf8bb71dccdfa8ef82a8b9f00384c534712..57bae3545c4a9d5a7aff1649f52669892e9ba7d5 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1056,6 +1056,22 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
   auto expected_kernel_key = this->GetExpectedKernelType(
       ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+  if (HasAttr("op_device")) {
+    if (Attr<std::string>("op_device") == "cpu") {
+      expected_kernel_key.place_ = platform::CPUPlace();
+    } else if (Attr<std::string>("op_device") == "gpu") {
+      // when the Op that only has CPUKernel is assigned to GPU, the CPUKernel
+      // will be executed and a warning will be given at the same time.
+      if (SupportGPU()) {
+        expected_kernel_key.place_ = dev_ctx->GetPlace();
+      } else {
+        expected_kernel_key.place_ = platform::CPUPlace();
+        LOG_FIRST_N(WARNING, 1)
+            << "Op(" << type_
+            << ") has no CUDA implementation. It will be assigned to CPUPlace.";
+      }
+    }
+  }
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
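
The operator.cc hunk above is the runtime half of the feature: when an operator carries a non-empty `op_device` attribute, `ChooseKernel` overrides the expected kernel place, falling back to `CPUPlace` (with a one-time warning) when a GPU placement is requested but no CUDA kernel exists. A minimal sketch of how that attribute can be inspected from the Python side, assuming a build that includes this patch (illustrative only, not part of the diff):

    # Illustrative sketch, not part of the patch: ops created under
    # device_guard("cpu") carry op_device == "cpu", which ChooseKernel()
    # reads at runtime to pin them to CPUPlace.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        with fluid.device_guard("cpu"):
            x = fluid.layers.fill_constant(
                shape=[1], value=1.0, dtype='float32')

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        print(op.type, op.desc.attr(device_attr_name))  # expected: "cpu"
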
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index 5a889900ffbced143a9251df48bb6f229e2e7928..c24a2b4a088c5c4d1169b866d918fa37e9e37622 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -57,6 +57,8 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpCreationCallstackAttrName",
       framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
+  op_proto_and_checker_maker.def(
+      "kOpDeviceAttrName", framework::OpProtoAndCheckerMaker::OpDeviceAttrName);
 #if defined(PADDLE_WITH_DGC)
   auto dgc = m->def_submodule("dgc");
   dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py
index 7a2f201690848e6a2becb7d4edb59fd753e9cb9b..776af6ceddd5928d07b120443cb9f7ca613d9aa6 100644
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -876,6 +876,12 @@ def _append_backward_ops_(block,
         grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
             op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
 
+        # Set device for grad_op according to forward Op
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        op_device = op.desc.attr(device_attr_name)
+        for op_desc in grad_op_desc:
+            op_desc._set_attr(device_attr_name, op_device)
+
         # If input_grad_names_set is not None, extend grad_op_descs only when
         # any input grad in outputs of previous grad ops.
         # But this strategy is not suited for while op for some control flow,
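
The backward.py hunk makes every generated grad op inherit the `op_device` attribute of its forward op, so a forward op pinned to the CPU keeps its gradient computation there as well. An illustrative sketch, assuming a build with this patch (`fc` and `reduce_mean` are chosen only for the example):

    # Illustrative sketch, not part of the patch: grad ops inherit the
    # op_device of their forward ops after append_backward.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[4], dtype='float32')
        with fluid.device_guard("cpu"):
            y = fluid.layers.fc(input=x, size=2)
        loss = fluid.layers.reduce_mean(y)
        fluid.backward.append_backward(loss)

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        # The ops created under the guard and their *_grad ops are expected
        # to report "cpu" here.
        print(op.type, op.desc.attr(device_attr_name))
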
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 38408d2b9a9d42ed23291696391f6ac3f6595a4b..55db8e10d923abe98cd667c5fe162bcb9e902b2b 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -51,6 +51,7 @@ __all__ = [
     'Variable',
     'load_op_library',
     'require_version',
+    'device_guard',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -61,6 +62,7 @@ CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName()
 
 _dygraph_tracer_ = None
 _dygraph_current_expected_place_ = None
+_current_device = None
 
 
 def require_version(min_version, max_version=None):
@@ -1696,7 +1698,8 @@ class OpProtoHolder(object):
             core.op_proto_and_checker_maker.kOpRoleAttrName(),
             core.op_proto_and_checker_maker.kOpRoleVarAttrName(),
             core.op_proto_and_checker_maker.kOpNameScopeAttrName(),
-            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName()
+            core.op_proto_and_checker_maker.kOpCreationCallstackAttrName(),
+            core.op_proto_and_checker_maker.kOpDeviceAttrName()
         }
@@ -1804,6 +1807,24 @@ class Operator(object):
             namescope_var_name = op_maker.kOpNameScopeAttrName()
             op_attrs[namescope_var_name] = _full_name_scope()
 
+            # set device for op with kernels, give warning for op without kernels
+            # when force_cpu and device_guard are used at the same time, a warning will be given.
+            # TODO(zhangting2020): when force_cpu is removed, clear warning below.
+            if _current_device is not None:
+                if self._has_kernel(type):
+                    op_device = op_maker.kOpDeviceAttrName()
+                    op_attrs[op_device] = _current_device
+                else:
+                    warnings.warn(
+                        "The Op(%s) does not support setting the device." % type)
+            if 'force_cpu' in op_attrs:
+                if (type == 'less_than' and op_attrs['force_cpu'] is not None
+                    ) or op_attrs['force_cpu'] != False:
+                    warnings.warn(
+                        "The Attr(force_cpu) of Op(%s) will be deprecated in the future, "
+                        "please use 'device_guard' instead. 'device_guard' has higher priority when they are "
+                        "used at the same time." % type)
+
             def find_name(var_list, name):
                 for var_name in var_list:
                     if var_list[var_name] is not None and var_name == name:
@@ -5056,3 +5077,62 @@ def load_op_library(lib_filename):
     """
     core.load_op_library(lib_filename)
     OpProtoHolder.instance().update_op_proto()
+
+
+def switch_device(device):
+    global _current_device
+    pre_device = _current_device
+    _current_device = device
+    return pre_device
+
+
+@signature_safe_contextmanager
+def device_guard(device=None):
+    """
+    **Notes**:
+        **The API only supports static mode.**
+
+    A context manager that specifies the device on which the OP will be placed.
+
+    Args:
+        device(str|None): Specify the device to use in the context. It should be
+            'cpu' or 'gpu'. When it is set to 'cpu' or 'gpu', all OPs created in
+            the context will be placed on CPUPlace or CUDAPlace. When 'gpu' is set
+            and the program runs on single-card, the device index will be the same
+            as the device on which the executor runs. Default: None, OPs in this
+            context will be automatically assigned devices.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+
+            support_gpu = fluid.is_compiled_with_cuda()
+            place = fluid.CPUPlace()
+            if support_gpu:
+                place = fluid.CUDAPlace(0)
+
+            # if GPU is supported, the three OPs below will be automatically assigned to CUDAPlace(0)
+            data1 = fluid.layers.fill_constant(shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+            data2 = fluid.layers.fill_constant(shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+            shape = fluid.layers.shape(data2)
+
+            with fluid.device_guard("cpu"):
+                # Ops created here will be placed on CPUPlace
+                shape = fluid.layers.slice(shape, axes=[0], starts=[0], ends=[4])
+            with fluid.device_guard('gpu'):
+                # if GPU is supported, OPs created here will be placed on CUDAPlace(0), otherwise on CPUPlace
+                out = fluid.layers.crop_tensor(data1, shape=shape)
+
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            result = exe.run(fetch_list=[out])
+    """
+
+    if device not in ['cpu', 'gpu', '', None]:
+        raise ValueError(
+            "The Attr(device) should be 'cpu' or 'gpu', and it can also be empty string or None "
+            "when there is no need to specify device. But received %s" % device)
+    pre_device = switch_device(device)
+    yield
+    switch_device(pre_device)
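
`switch_device` returns the previously active device and `device_guard` restores it on exit, so guards nest as expected. A small sketch (not part of the patch):

    # Illustrative sketch, not part of the patch: the inner guard only
    # affects ops created inside it; the outer device is restored on exit.
    import paddle.fluid as fluid

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        with fluid.device_guard("gpu"):
            a = fluid.layers.fill_constant(
                shape=[1], value=1.0, dtype='float32')  # op_device == "gpu"
            with fluid.device_guard("cpu"):
                b = fluid.layers.fill_constant(
                    shape=[1], value=2.0, dtype='float32')  # op_device == "cpu"
            c = fluid.layers.elementwise_add(a, b)  # back to "gpu" here
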
But received %s" % device) + pre_device = switch_device(device) + yield + switch_device(pre_device) diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0c49982ded8cdc74418ecc44e49c11d016e4b731..b4764bdd7de07a83b7345b41f4563b5c0f50e0c4 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1135,6 +1135,9 @@ def save_inference_model(dirname, # remind user to set auc_states to zeros if the program contains auc op all_ops = main_program.global_block().ops for op in all_ops: + # clear device of Op + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() + op._set_attr(device_attr_name, "") if op.type == 'auc': warnings.warn( "please ensure that you have set the auc states to zeros before saving inference model" diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ca498bc6a47e5b3a6516eacc2164af7e261da523..90dee2b11eb44c03f156fe6b3c0fe37507e36cf3 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -18,7 +18,7 @@ import numpy as np from collections import defaultdict from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard from . import framework from . import layers @@ -108,6 +108,7 @@ class Optimizer(object): self.helper = None self._opti_name_list = [] self._accumulators_holder = {} + self._param_device_map = dict() @framework.dygraph_only def state_dict(self): @@ -405,7 +406,7 @@ class Optimizer(object): fill_value=0.0, shape=None, type=None, - force_cpu=False): + device=None): """Utility function to add an accumulator for a parameter Args: @@ -438,10 +439,11 @@ class Optimizer(object): type=param.type if type is None else type, shape=shape, belong_to_optimizer=True) - self.helper.set_variable_initializer( - var, - initializer=Constant( - value=float(fill_value), force_cpu=force_cpu)) + if device is None: + device = self._get_device_for_param(param.name) + with device_guard(device): + self.helper.set_variable_initializer( + var, initializer=Constant(value=float(fill_value))) if framework.in_dygraph_mode(): if len(self._accumulators_holder) > 0: @@ -470,6 +472,27 @@ class Optimizer(object): format(name, param.name)) return self._accumulators[name][param.name] + def _update_param_device_map(self, parameters_and_grads, target_block): + for param_and_grad in parameters_and_grads: + if param_and_grad[0].trainable is True: + param_name = param_and_grad[0].name + ops = target_block.ops + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + ) + for op in ops: + input_arg_names = op.input_arg_names + if param_name in input_arg_names: + self._param_device_map[param_name] = op.attr( + device_attr_name) + else: + self._param_device_map[param_name] = None + + def _get_device_for_param(self, param_name): + device = None + if param_name in self._param_device_map: + device = self._param_device_map[param_name] + return device + def _create_optimization_pass(self, parameters_and_grads): """Add optimization operators to update gradients to variables. 
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index ca498bc6a47e5b3a6516eacc2164af7e261da523..90dee2b11eb44c03f156fe6b3c0fe37507e36cf3 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -18,7 +18,7 @@ import numpy as np
 from collections import defaultdict
 
 from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table
-from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program
+from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard
 
 from . import framework
 from . import layers
@@ -108,6 +108,7 @@ class Optimizer(object):
         self.helper = None
         self._opti_name_list = []
         self._accumulators_holder = {}
+        self._param_device_map = dict()
 
     @framework.dygraph_only
     def state_dict(self):
@@ -405,7 +406,7 @@ class Optimizer(object):
                          fill_value=0.0,
                          shape=None,
                          type=None,
-                         force_cpu=False):
+                         device=None):
         """Utility function to add an accumulator for a parameter
 
         Args:
@@ -438,10 +439,11 @@ class Optimizer(object):
             type=param.type if type is None else type,
             shape=shape,
             belong_to_optimizer=True)
-        self.helper.set_variable_initializer(
-            var,
-            initializer=Constant(
-                value=float(fill_value), force_cpu=force_cpu))
+        if device is None:
+            device = self._get_device_for_param(param.name)
+        with device_guard(device):
+            self.helper.set_variable_initializer(
+                var, initializer=Constant(value=float(fill_value)))
 
         if framework.in_dygraph_mode():
             if len(self._accumulators_holder) > 0:
@@ -470,6 +472,27 @@ class Optimizer(object):
                     format(name, param.name))
         return self._accumulators[name][param.name]
 
+    def _update_param_device_map(self, parameters_and_grads, target_block):
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[0].trainable is True:
+                param_name = param_and_grad[0].name
+                ops = target_block.ops
+                device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName(
+                )
+                for op in ops:
+                    input_arg_names = op.input_arg_names
+                    if param_name in input_arg_names:
+                        self._param_device_map[param_name] = op.attr(
+                            device_attr_name)
+                    else:
+                        self._param_device_map[param_name] = None
+
+    def _get_device_for_param(self, param_name):
+        device = None
+        if param_name in self._param_device_map:
+            device = self._param_device_map[param_name]
+        return device
+
     def _create_optimization_pass(self, parameters_and_grads):
         """Add optimization operators to update gradients to variables.
@@ -505,6 +528,7 @@ class Optimizer(object):
             start = len(target_block.ops)
             self.helper = LayerHelper(self.__class__.__name__)
+            self._update_param_device_map(parameters_and_grads, target_block)
             self._create_accumulators(
                 target_block,
                 [p[0] for p in parameters_and_grads if p[0].trainable])
@@ -523,7 +547,11 @@ class Optimizer(object):
                 with param_and_grad[0].block.program._optimized_guard(
                         param_and_grad), name_scope("optimizer"):
                     if param_and_grad[0].trainable is True:
-                        self._append_optimize_op(target_block, param_and_grad)
+                        device = self._get_device_for_param(
+                            param_and_grad[0].name)
+                        with device_guard(device):
+                            optimize_op = self._append_optimize_op(
+                                target_block, param_and_grad)
 
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
@@ -1793,14 +1821,14 @@ class AdamOptimizer(Optimizer):
                 fill_value=0.9 if isinstance(self._beta1, Variable) \
                     else self._beta1,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
             self._add_accumulator(
                 name=self._beta2_pow_acc_str,
                 param=p,
                 fill_value=0.999 if isinstance(self._beta2, Variable) \
                     else self._beta2,
                 shape=[1],
-                type=core.VarDesc.VarType.LOD_TENSOR, force_cpu=True)
+                type=core.VarDesc.VarType.LOD_TENSOR, device='cpu')
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
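
On the optimizer side, `_update_param_device_map` records, for each trainable parameter, the `op_device` of ops that consume it, and `_get_device_for_param` feeds that device into `device_guard` both when accumulators are created and when the update op is appended. A hedged sketch of the intended effect, assuming a build with this patch (the `fc` layer, Adam, and the default parameter names are used only for illustration):

    # Illustrative sketch, not part of the patch: the Adam update op for a
    # parameter created under device_guard("cpu") is expected to carry "cpu",
    # because _update_param_device_map picks the device up from the ops that
    # take the parameter as input.
    import paddle.fluid as fluid
    import paddle.fluid.core as core

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[4], dtype='float32')
        with fluid.device_guard("cpu"):
            y = fluid.layers.fc(input=x, size=2)
        loss = fluid.layers.reduce_mean(y)
        fluid.optimizer.AdamOptimizer(learning_rate=0.01).minimize(loss)

    device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
    for op in main_program.global_block().ops:
        if op.type == 'adam':
            print(op.input('Param'), op.desc.attr(device_attr_name))
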
diff --git a/python/paddle/fluid/tests/unittests/test_device_guard.py b/python/paddle/fluid/tests/unittests/test_device_guard.py
new file mode 100644
index 0000000000000000000000000000000000000000..3051ed3995a81c94ba14690bc55b71074e746d75
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_device_guard.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+from op_test import OpTest
+
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+import warnings
+
+
+def execute(main_program, startup_program):
+    if core.is_compiled_with_cuda():
+        place = core.CUDAPlace(0)
+    else:
+        place = core.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_program)
+    exe.run(main_program)
+
+
+class TestDeviceGuard(unittest.TestCase):
+    def test_device_guard(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            data1 = fluid.layers.fill_constant(
+                shape=[1, 3, 8, 8], value=0.5, dtype='float32')
+            data2 = fluid.layers.fill_constant(
+                shape=[1, 3, 5, 5], value=0.5, dtype='float32')
+            shape = fluid.layers.shape(data2)
+            with fluid.device_guard("cpu"):
+                shape = fluid.layers.slice(
+                    shape, axes=[0], starts=[0], ends=[4])
+            with fluid.device_guard("gpu"):
+                out = fluid.layers.crop_tensor(data1, shape=shape)
+        # check if the device attr is set correctly
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            if op.type == 'slice':
+                self.assertEqual(op.desc.attr(device_attr_name), "cpu")
+            if op.type == 'crop_tensor':
+                self.assertEqual(op.desc.attr(device_attr_name), "gpu")
+
+        execute(main_program, startup_program)
+
+    def test_cpu_only_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            x = fluid.layers.fill_constant(
+                shape=[2, 255, 13, 13], value=0.3, dtype='float32')
+            gt_box = fluid.layers.fill_constant(
+                shape=[2, 6, 4], value=0.5, dtype='float32')
+            gt_label = fluid.layers.fill_constant(
+                shape=[2, 6], value=1.0, dtype='int32')
+            gt_score = fluid.layers.fill_constant(
+                shape=[2, 6], value=0.5, dtype='float32')
+            anchors = [
+                10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156,
+                198, 373, 326
+            ]
+            anchor_mask = [0, 1, 2]
+            with fluid.device_guard("gpu"):
+                # yolov3_loss only has cpu kernel, so its cpu kernel will be executed
+                loss = fluid.layers.yolov3_loss(
+                    x=x,
+                    gt_box=gt_box,
+                    gt_label=gt_label,
+                    gt_score=gt_score,
+                    anchors=anchors,
+                    anchor_mask=anchor_mask,
+                    class_num=80,
+                    ignore_thresh=0.7,
+                    downsample_ratio=32)
+
+        execute(main_program, startup_program)
+
+    def test_without_kernel_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+            loop_len = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=10)
+            cond = fluid.layers.less_than(x=i, y=loop_len)
+
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                with fluid.device_guard("cpu"):
+                    while_op = fluid.layers.While(cond=cond)
+                    with while_op.block():
+                        i = fluid.layers.increment(x=i, value=1, in_place=True)
+                        fluid.layers.less_than(x=i, y=loop_len, cond=cond)
+
+            assert len(w) == 1
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            if op.type == 'while':
+                self.assertEqual(op.desc.attr(device_attr_name), "")
+
+        execute(main_program, startup_program)
+
+    def test_error(self):
+        def device_attr():
+            with fluid.device_guard("cpu1"):
+                out = fluid.layers.fill_constant(
+                    shape=[1], value=0.2, dtype='float32')
+
+        self.assertRaises(ValueError, device_attr)
+
+    def test_warning(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                with fluid.device_guard("gpu"):
+                    x = fluid.layers.fill_constant(
+                        shape=[1], value=3.0, dtype='float32', force_cpu=True)
+                    y = fluid.layers.fill_constant(
+                        shape=[1], value=4.0, dtype='float32')
+                    result = fluid.layers.less_than(x=x, y=y, force_cpu=False)
+
+            assert len(w) == 2
+        all_ops = main_program.global_block().ops
+        device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName()
+        for op in all_ops:
+            self.assertEqual(op.desc.attr(device_attr_name), "gpu")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py
index 5932112c3c4d186e46705d6765f09dd158e81e36..0a7afe8caf22de8631b2dfb056b922fda174b0e1 100644
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
@@ -70,7 +70,7 @@ class TestOperator(unittest.TestCase):
             set([
                 "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var",
                 "use_mkldnn", "scale_x", "scale_y", "scale_out",
-                "force_fp32_output", "op_namescope", "op_callstack"
+                "force_fp32_output", "op_namescope", "op_callstack", "op_device"
             ]))
         self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
         self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)