From ac493f2c720da0e0e11e3fe8eeaa09e550fd474d Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Thu, 11 Mar 2021 18:57:21 +0800 Subject: [PATCH 001/486] Update comments for API `RandomResizedCrop` (#31539) * update comments --- python/paddle/utils/download.py | 38 ------------------- python/paddle/vision/transforms/transforms.py | 3 +- 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/python/paddle/utils/download.py b/python/paddle/utils/download.py index 3af9a83f6a2..b7d7d0b5adb 100644 --- a/python/paddle/utils/download.py +++ b/python/paddle/utils/download.py @@ -61,44 +61,6 @@ WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights") DOWNLOAD_RETRY_LIMIT = 3 -nlp_models = OrderedDict(( - ('RoBERTa-zh-base', - 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz' - ), - ('RoBERTa-zh-large', - 'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz' - ), - ('ERNIE-v2-en-base', - 'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'), - ('ERNIE-v2-en-large', - 'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz'), - ('XLNet-cased-base', - 'https://xlnet.bj.bcebos.com/xlnet_cased_L-12_H-768_A-12.tgz'), - ('XLNet-cased-large', - 'https://xlnet.bj.bcebos.com/xlnet_cased_L-24_H-1024_A-16.tgz'), - ('ERNIE-v1-zh-base', - 'https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz'), - ('ERNIE-v1-zh-base-max-len-512', - 'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz'), - ('BERT-en-uncased-large-whole-word-masking', - 'https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-cased-large-whole-word-masking', - 'https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-uncased-base', - 'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz'), - ('BERT-en-uncased-large', - 'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz'), - ('BERT-en-cased-base', - 'https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz'), - ('BERT-en-cased-large', - 'https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz'), - ('BERT-multilingual-uncased-base', - 'https://bert-models.bj.bcebos.com/multilingual_L-12_H-768_A-12.tar.gz'), - ('BERT-multilingual-cased-base', - 'https://bert-models.bj.bcebos.com/multi_cased_L-12_H-768_A-12.tar.gz'), - ('BERT-zh-base', - 'https://bert-models.bj.bcebos.com/chinese_L-12_H-768_A-12.tar.gz'), )) - def is_url(path): """ diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index a244d447829..7d3d5f525c2 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -407,7 +407,8 @@ class RandomResizedCrop(BaseTransform): Args: size (int|list|tuple): Target size of output image, with (height, width) shape. - scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0) + scale (list|tuple): Scale range of the cropped image before resizing, relatively to the origin + image. Default: (0.08, 1.0) ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. 
when use pil backend, support method are as following: -- GitLab From 49c3d2a97b914f13ce779f92ef75469c508e84e6 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Thu, 11 Mar 2021 19:33:36 +0800 Subject: [PATCH 002/486] modified show_ut_retry_result (#31528) --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 22ba30c5c8d..f1142dbbbba 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1251,7 +1251,7 @@ set +x echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" echo "The following unittest will be re-run:" - echo "${failed_test_lists_ult}" + echo "${retry_unittests}" for line in ${retry_unittests[@]} ; do @@ -1340,7 +1340,7 @@ function show_ut_retry_result() { echo "Summary Failed Tests... " echo "========================================" echo "The following tests FAILED: " - echo "${retry_unittests_record}" | grep -E "$failed_ut_re" + echo "${retry_unittests_record}" | sort -u | grep -E "$failed_ut_re" exit 8; fi fi -- GitLab From def27bc801219e2c9b742b12b940e0758b5e842d Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 11 Mar 2021 20:38:28 +0800 Subject: [PATCH 003/486] [Dy2stat]Fix bug with static_convert_var_shape in locals scope (#31556) * Fix bug with static_convert_var_shape * replace dot with dash --- .../dygraph_to_static/convert_operators.py | 12 +++---- .../tensor_shape_transformer.py | 30 ++++++++++++---- .../test_convert_operators.py | 35 +++++++++++++------ .../dygraph_to_static/test_tensor_shape.py | 22 ++++++++++++ 4 files changed, 76 insertions(+), 23 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 403e77cb5cc..4126e942259 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -302,19 +302,19 @@ def convert_var_shape_simple(x): return x.shape -def eval_if_exist_else_none(name, local_symbol_table): +def eval_if_exist_else_none(name, global_symbol_table): """ Args: name([str]): Expression passed into `eval`. - local_symbol_table(dict): Specified from `locals()`. DO NOT use `globals()`, - it has a higher priority and will hide away variables - from `locals()`. + local_symbol_table(dict): Specified from `globals()`. DO NOT use `locals()`, + because all STATIC_CONVERT_VAR_SHAPE_SUFFIX vars is + declared with keyword `global`. Returns: - Return the variable if found in local_symbol_table else None. + Return the variable if found in global_symbol_table else None. 
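    A minimal usage sketch (illustrative only; `foo` and `x_shape` are made-up
    names, not part of the API):

        def foo():
            global x_shape          # declared global, so visible via globals()
            x_shape = [2, 3, 4]

        foo()
        eval_if_exist_else_none('x_shape', globals())          # -> [2, 3, 4]
        eval_if_exist_else_none('undefined_name', globals())   # -> None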
""" try: - return eval(name, local_symbol_table) + return eval(name, global_symbol_table) except: return None diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index ffa1d65e628..eb53d7ec9be 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -59,7 +59,7 @@ def create_convert_shape_node(var_shape_node, def create_choose_shape_node(attr_shape_name, api_shape_name, slice_node=None): - eval_exist_func = "paddle.jit.dy2static.eval_if_exist_else_none('{}', locals())".format( + eval_exist_func = "paddle.jit.dy2static.eval_if_exist_else_none('{}', globals())".format( api_shape_name) args = [attr_shape_name, eval_exist_func] @@ -293,6 +293,10 @@ class TensorShapeTransformer(gast.NodeTransformer): return False def _update_name_to_var_shape(self, node): + def replace_dot(name): + # replace all '.' into '_' + return name.replace('.', '_') + assert isinstance(node, gast.Assign) target_node = node.targets[0] value_node = node.value @@ -307,7 +311,8 @@ class TensorShapeTransformer(gast.NodeTransformer): if value_node.id in self.name_to_var_shape: # TODO(zhhsplendid): is context a problem for the result node of gast.parse? static_shape_var_name = unique_name.generate( - target_id + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + replace_dot(target_id) + + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -328,7 +333,8 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Attribute): if self._is_var_shape(value_node): # eg: x.shape static_shape_var_name = unique_name.generate( - target_id + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + replace_dot(target_id) + + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value @@ -341,6 +347,12 @@ class TensorShapeTransformer(gast.NodeTransformer): ast_to_source_code(static_shape_value_node).strip(), idx) sub_node = gast.parse(sub_node_str).body[0].value + # Note(Aurelius84): Becuase static_shape_var_name is used in + # eval_if_exist_else_none() as plain string, so it will not + # be pasred as argument in convert_loop/ifelse. We delcare it + # as global var because it has unique name. 
+ update_static_shape_var_node.append( + gast.Global(names=[static_shape_var_name])) update_static_shape_var_node.append( gast.Assign( @@ -354,7 +366,8 @@ class TensorShapeTransformer(gast.NodeTransformer): if isinstance(value_node, gast.Name): if value_node.id in self.name_to_var_shape: static_shape_var_name = unique_name.generate( - target_id + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + replace_dot(target_id) + + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse( static_shape_var_name).body[0].value static_shape_value_name = self.name_to_var_shape[ @@ -370,17 +383,20 @@ class TensorShapeTransformer(gast.NodeTransformer): self.name_to_var_shape[target_id] = static_shape_var_name elif self._is_var_shape(value_node): # eg: x.shape or x.shape[0] static_shape_var_name = unique_name.generate( - target_id + STATIC_CONVERT_VAR_SHAPE_SUFFIX) + replace_dot(target_id) + STATIC_CONVERT_VAR_SHAPE_SUFFIX) static_shape_var_node = gast.parse(static_shape_var_name).body[ 0].value static_shape_value_node = copy.deepcopy(value_node) # x.shape becomes convert_var_shape_simple(x) static_shape_value_node = ShapeAttributeTransformer().visit( static_shape_value_node) + # Declare static_shape_var_name as global var update_static_shape_var_node = [ + gast.Global(names=[static_shape_var_name]) + ] + update_static_shape_var_node.append( gast.Assign( targets=[static_shape_var_node], - value=static_shape_value_node) - ] + value=static_shape_value_node)) self.name_to_var_shape[target_id] = static_shape_var_name return update_static_shape_var_node diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py index 7a9bad1236f..54dcc152fd6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py @@ -191,29 +191,44 @@ class TestChooseShapeAttrOrApi(unittest.TestCase): class TestEvaIfExistElseNone(unittest.TestCase): - def test_locals(self): + def test_globals(self): + global x_shape x_shape = [1, 2, 3] - self.assertEqual(eval_if_exist_else_none('x_shape', locals()), x_shape) + self.assertEqual(eval_if_exist_else_none('x_shape', locals()), None) + self.assertEqual(eval_if_exist_else_none('x_shape', globals()), x_shape) - def test_globals(self): + del x_shape + + def test_enclosing_scope(self): + global x_shape x_shape = [1, 2, 3] def foo(): - x_shape = [2, 3, 4] + y_shape = [2, 3, 4] + self.assertEqual( + eval_if_exist_else_none('x_shape', globals()), [1, 2, 3]) self.assertEqual( - eval_if_exist_else_none('x_shape', locals()), [2, 3, 4]) + eval_if_exist_else_none('y_shape', locals()), [2, 3, 4]) foo() + del x_shape - def test_invisible_of_func(self): + def test_global_in_func(self): x_shape = [1, 2, 3] def foo(): - x_shape = [2, 3, 4] - return x_shape + global y_shape + y_shape = [2, 3, 4] - self.assertEqual( - eval_if_exist_else_none('x_shape', locals()), [1, 2, 3]) + self.assertEqual( + eval_if_exist_else_none('y_shape', globals()), [2, 3, 4]) + self.assertEqual(eval_if_exist_else_none('x_shape', locals()), None) + self.assertEqual( + eval_if_exist_else_none('x_shape', globals()), None) + + del y_shape + + foo() def test_none(self): def foo(): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py index be571aaf2b7..70749c2e244 100644 --- 
a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_shape.py @@ -541,5 +541,27 @@ class TestChangeShapeAfterAssign(TestTensorShapeBasic): self.expected_slice_op_num = 2 +def dyfunc_with_static_convert_var_shape(x): + # Note: this will create `batch_size__static_convert_var_shape_suffix_0` firstly. + batch_size = x.shape[0] + if len(x.shape) < 1: + res = x + else: + # Test for correctly to find `batch_size__static_convert_var_shape_suffix_0` in + # deeply nested scope. + res = fluid.layers.fill_constant( + value=8, shape=[batch_size], dtype="int32") + + return res + + +class TestFindStatiConvertVarShapeSuffixVar(unittest.TestCase): + def test(self): + x_spec = paddle.static.InputSpec(shape=[None, 10]) + func = paddle.jit.to_static(dyfunc_with_if_2, input_spec=[x_spec]) + # Call this function to trigger program translation. + func.concrete_program + + if __name__ == '__main__': unittest.main() -- GitLab From 95cceb2dd7b32a62b83d4264154f8a0290018f03 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 12 Mar 2021 10:14:02 +0800 Subject: [PATCH 004/486] [CustomOp] Support duplicable op input and output (#31535) * support duplicable op inout * add costom concat op test --- .../extension/include/ext_op_meta_info.h | 169 +++++++++++---- paddle/fluid/framework/custom_operator.cc | 201 ++++++++++++++---- .../fluid/tests/custom_op/CMakeLists.txt | 3 + .../fluid/tests/custom_op/concat_and_split.h | 84 ++++++++ .../fluid/tests/custom_op/custom_concat_op.cc | 145 +++++++++++++ .../tests/custom_op/test_custom_concat.py | 148 +++++++++++++ .../custom_op/test_custom_relu_op_jit.py | 1 - .../utils/cpp_extension/extension_utils.py | 13 +- 8 files changed, 670 insertions(+), 94 deletions(-) create mode 100644 python/paddle/fluid/tests/custom_op/concat_and_split.h create mode 100644 python/paddle/fluid/tests/custom_op/custom_concat_op.cc create mode 100644 python/paddle/fluid/tests/custom_op/test_custom_concat.py diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index a3b9a4c4910..5b8d5a0bf5a 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -56,32 +56,48 @@ using Tensor = paddle::Tensor; ///////////////// Util Define and Function //////////////// -inline std::string Grad(const std::string& var_name) { +constexpr char kGradTensorSuffix[] = "@GRAD"; +constexpr char kTensorVectorSuffix[] = "@VECTOR"; + +// Used for Construct Grad Tensor name +inline std::string Grad(const std::string& t_name) { + std::string result; + result.reserve(t_name.size() + 5U); + result += t_name; + result += kGradTensorSuffix; + return result; +} + +// Used for Construct std::vector name +inline std::string Vec(const std::string& t_name) { std::string result; - result.reserve(var_name.size() + 5U); - result += var_name; - result += "@GRAD"; + result.reserve(t_name.size() + 7U); + result += t_name; + result += kTensorVectorSuffix; return result; } ////////////////////// Kernel Function (PD_KERNEL) //////////////////////// // Record Op kernel core function -using KernelFunc = std::vector (*)(std::vector inputs, - std::vector attrs); +using KernelFunc = std::vector (*)( + std::vector inputs, std::vector> vec_inputs, + std::vector attrs); #define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ template \ struct ComputeCallHelper { \ - template \ + template \ static Return Compute(std::vector inputs, \ + 
std::vector> vec_inputs, \ std::vector attrs, \ const PreviousArgs&... pargs) { \ try { \ attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return ComputeCallHelper::template Compute( \ - inputs, attrs, pargs..., arg); \ + return ComputeCallHelper::template Compute< \ + in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ + pargs..., arg); \ } catch (boost::bad_any_cast&) { \ PD_THROW( \ "Attribute cast error in custom operator. Expected " #attr_type \ @@ -99,9 +115,10 @@ struct KernelFuncImpl; template struct KernelFuncImpl { static Return Compute(std::vector inputs, + std::vector> vec_inputs, std::vector attrs) { - return ComputeCallHelper>::template Compute<0, 0>( - inputs, attrs); + return ComputeCallHelper>::template Compute<0, 0, 0>( + inputs, vec_inputs, attrs); } private: @@ -111,15 +128,32 @@ struct KernelFuncImpl { // for Tensor input template struct ComputeCallHelper { - template + template static Return Compute(std::vector inputs, + std::vector> vec_inputs, std::vector attrs, const PreviousArgs&... pargs) { - static_assert(attr_idx == 0, - "Input tensor should appear before attributes."); const Tensor& arg = inputs[in_idx]; - return ComputeCallHelper::template Compute( - inputs, attrs, pargs..., arg); + return ComputeCallHelper::template Compute( + inputs, vec_inputs, attrs, pargs..., arg); + } + }; + + // for std::vector input + template + struct ComputeCallHelper&, Tail...> { + template + static Return Compute(std::vector inputs, + std::vector> vec_inputs, + std::vector attrs, + const PreviousArgs&... pargs) { + const std::vector& arg = vec_inputs[vec_in_idx]; + return ComputeCallHelper::template Compute< + in_idx, vec_in_idx + 1, attr_idx>(inputs, vec_inputs, attrs, pargs..., + arg); } }; @@ -140,8 +174,9 @@ struct KernelFuncImpl { // end: base template template struct ComputeCallHelper> { - template + template static Return Compute(std::vector inputs, + std::vector> vec_inputs, std::vector attrs, const Args&... args) { return impl_fn(args...); } @@ -155,40 +190,62 @@ struct KernelFuncImpl { // Record Op infershape core function using InferShapeFunc = std::vector> (*)( - std::vector> input_shapes); + std::vector> input_shapes, + std::vector>> vec_input_shapes); template struct InferShapeFuncImpl; template struct InferShapeFuncImpl { - static Return InferShape(std::vector> input_shapes) { - return InferShapeCallHelper>::template InferShape<0>( - input_shapes); + static Return InferShape( + std::vector> input_shapes, + std::vector>> vec_input_shapes) { + return InferShapeCallHelper>::template InferShape<0, + 0>( + input_shapes, vec_input_shapes); } private: template struct InferShapeCallHelper; - // only one type input: std::vector template struct InferShapeCallHelper, Tail...> { - template - static Return InferShape(std::vector> input_shapes, - const PreviousArgs&... pargs) { + template + static Return InferShape( + std::vector> input_shapes, + std::vector>> vec_input_shapes, + const PreviousArgs&... pargs) { std::vector arg = input_shapes[in_idx]; - return InferShapeCallHelper::template InferShape( - input_shapes, pargs..., arg); + return InferShapeCallHelper::template InferShape( + input_shapes, vec_input_shapes, pargs..., arg); + } + }; + + template + struct InferShapeCallHelper>, Tail...> { + template + static Return InferShape( + std::vector> input_shapes, + std::vector>> vec_input_shapes, + const PreviousArgs&... 
pargs) { + std::vector> arg = vec_input_shapes[vec_in_idx]; + return InferShapeCallHelper::template InferShape( + input_shapes, vec_input_shapes, pargs..., arg); } }; // end: base template template struct InferShapeCallHelper> { - template - static Return InferShape(std::vector> input_shapes, - const Args&... args) { + template + static Return InferShape( + std::vector> input_shapes, + std::vector>> vec_input_shapes, + const Args&... args) { return impl_fn(args...); } }; @@ -200,41 +257,63 @@ struct InferShapeFuncImpl { /////////////// InferDataType Function (PD_INFER_DTYPE) /////////////// // Record Op Infer dtype core function -using InferDtypeFunc = - std::vector (*)(std::vector input_dtypes); +using InferDtypeFunc = std::vector (*)( + std::vector input_dtypes, + std::vector> vec_input_dtypes); template struct InferDtypeFuncImpl; template struct InferDtypeFuncImpl { - static Return InferDtype(std::vector input_dtypes) { - return InferDtypeCallHelper>::template InferDtype<0>( - input_dtypes); + static Return InferDtype( + std::vector input_dtypes, + std::vector> vec_input_dtypes) { + return InferDtypeCallHelper>::template InferDtype<0, + 0>( + input_dtypes, vec_input_dtypes); } private: template struct InferDtypeCallHelper; - // Only one type input now: DataType template struct InferDtypeCallHelper { - template - static Return InferDtype(std::vector input_dtypes, - const PreviousArgs&... pargs) { + template + static Return InferDtype( + std::vector input_dtypes, + std::vector> vec_input_dtypes, + const PreviousArgs&... pargs) { DataType arg = input_dtypes[in_idx]; - return InferDtypeCallHelper::template InferDtype( - input_dtypes, pargs..., arg); + return InferDtypeCallHelper::template InferDtype( + input_dtypes, vec_input_dtypes, pargs..., arg); + } + }; + + template + struct InferDtypeCallHelper, Tail...> { + template + static Return InferDtype( + std::vector input_dtypes, + std::vector> vec_input_dtypes, + const PreviousArgs&... pargs) { + std::vector arg = vec_input_dtypes[vec_in_idx]; + return InferDtypeCallHelper::template InferDtype( + input_dtypes, vec_input_dtypes, pargs..., arg); } }; // end: base template template struct InferDtypeCallHelper> { - template - static Return InferDtype(std::vector input_dtypes, - const Args&... args) { + template + static Return InferDtype( + std::vector input_dtypes, + std::vector> vec_input_dtypes, + const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 66e28bb83ce..0baacd46213 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -27,7 +27,6 @@ limitations under the License. 
*/ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/c/c_api.h" #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" @@ -63,6 +62,11 @@ inline bool IsGradVar(const std::string& var_name) { return var_name.rfind(suffix) != std::string::npos; } +inline bool IsDuplicableVar(const std::string& var_name) { + std::string suffix = kTensorVectorSuffix; + return var_name.rfind(suffix) != std::string::npos; +} + inline std::string NoGrad(const std::string& var_name) { std::string suffix = kGradVarSuffix; return var_name.substr(0, var_name.size() - kGradVarSuffixSize); @@ -103,19 +107,47 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, const std::vector& attrs) { VLOG(1) << "Custom Operator: Start run KernelFunc."; std::vector custom_ins; + std::vector> custom_vec_ins; for (auto& in_name : inputs) { VLOG(1) << "Custom Operator: input name - " << in_name; - auto* x = ctx.Input(in_name); - PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( - "Input tensor (%s) is nullptr.", in_name)); - PADDLE_ENFORCE_EQ(x->IsInitialized(), true, - platform::errors::InvalidArgument( - "Input tensor (%s) is not initialized.")); - auto custom_in = paddle::Tensor( - CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); - CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); - CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); - custom_ins.emplace_back(custom_in); + if (detail::IsDuplicableVar(in_name)) { + // return const std::vector + auto vec_x = ctx.MultiInput(in_name); + PADDLE_ENFORCE_NE(vec_x.empty(), true, + platform::errors::NotFound( + "Input vector (%s) is empty.", in_name)); + std::vector custom_vec_in; + for (size_t i = 0; i < vec_x.size(); ++i) { + auto* x = vec_x[i]; + PADDLE_ENFORCE_NOT_NULL( + x, platform::errors::NotFound( + "The %d-th tensor in input vector (%s) is nullptr.", + i, in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + platform::errors::InvalidArgument( + "The %d-th tensor in input vector (%s) " + "is not initialized.", + i, in_name)); + auto custom_t = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_t); + CustomTensorUtils::SetTensorCurrentStream(&custom_t, ctx.GetPlace()); + custom_vec_in.emplace_back(custom_t); + } + custom_vec_ins.emplace_back(custom_vec_in); + } else { + auto* x = ctx.Input(in_name); + PADDLE_ENFORCE_NOT_NULL(x, platform::errors::NotFound( + "Input tensor (%s) is nullptr.", in_name)); + PADDLE_ENFORCE_EQ(x->IsInitialized(), true, + platform::errors::InvalidArgument( + "Input tensor (%s) is not initialized.", in_name)); + auto custom_in = paddle::Tensor( + CustomTensorUtils::ConvertInnerPlaceToEnumPlace(x->place())); + CustomTensorUtils::ShareDataFrom(static_cast(x), custom_in); + CustomTensorUtils::SetTensorCurrentStream(&custom_in, ctx.GetPlace()); + custom_ins.emplace_back(custom_in); + } } std::vector custom_attrs; @@ -153,14 +185,34 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - VLOG(1) << "Run ComputeFunc."; + VLOG(1) << "Custom Operator: Run ComputeFunc."; try { - auto outs = func(custom_ins, custom_attrs); + auto outs = func(custom_ins, custom_vec_ins, custom_attrs); VLOG(1) << "Custom Operator: Share outputs into ExecutionContext."; for (size_t i = 0; i < outputs.size(); ++i) { - auto* 
true_out = ctx.Output(outputs[i]); - CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + auto out_name = outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + PADDLE_ENFORCE(i == 0UL && outputs.size() == 1UL, + platform::errors::PreconditionNotMet( + "If custom operator's outputs contains `paddle::Vec(" + ")` type, " + "it only can hold one output.")); + auto vec_true_outs = ctx.MultiOutput(out_name); + PADDLE_ENFORCE_EQ( + vec_true_outs.size(), outs.size(), + platform::errors::InvalidArgument( + "The number of element in custom operator outputs is wrong, " + "expected contains %d Tensors, but actually contains %d " + "Tensors.", + vec_true_outs.size(), outs.size())); + for (size_t j = 0; j < vec_true_outs.size(); ++j) { + CustomTensorUtils::ShareDataTo(outs.at(j), vec_true_outs.at(j)); + } + } else { + auto* true_out = ctx.Output(out_name); + CustomTensorUtils::ShareDataTo(outs.at(i), true_out); + } } } catch (platform::EnforceNotMet& exception) { throw std::move(exception); @@ -221,10 +273,20 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { void Make() override { for (auto& in_name : inputs_) { - AddInput(in_name, "The input " + in_name + "of Custom operator."); + if (detail::IsDuplicableVar(in_name)) { + AddInput(in_name, "The input " + in_name + "of Custom operator.") + .AsDuplicable(); + } else { + AddInput(in_name, "The input " + in_name + "of Custom operator."); + } } for (auto& out_name : outputs_) { - AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + if (detail::IsDuplicableVar(out_name)) { + AddOutput(out_name, "The output " + out_name + "of Custom Operator.") + .AsDuplicable(); + } else { + AddOutput(out_name, "The output " + out_name + "of Custom Operator."); + } } for (auto& attr : attrs_) { auto attr_name_and_type = detail::ParseAttrStr(attr); @@ -331,7 +393,13 @@ class CustomGradOpMaker : public SingleGradOpMaker { } for (auto& out_name : outputs_) { VLOG(1) << "Custom Operator: GradOpDescMaker - output: " << out_name; - grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + if (detail::IsDuplicableVar(out_name)) { + grad_op->SetOutput(out_name, + this->InputGrad(detail::NoGrad(out_name), + /*drop_empty_grad=*/false)); + } else { + grad_op->SetOutput(out_name, this->InputGrad(detail::NoGrad(out_name))); + } } grad_op->SetAttrMap(this->Attrs()); } @@ -493,9 +561,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. " + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); PADDLE_ENFORCE_EQ( @@ -503,9 +571,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferShapeFn. At this time, " - "the input shape will be directly set to the output shape.\n" + "and only one output without setting the InferShapeFn. 
" + "At this time, the input shape will be directly set to " + "the output shape.\n" "Please set the InferShapeFn of custom " "operator by .SetInferShapeFn(PD_INFER_SHAPE(...))")); @@ -516,21 +584,46 @@ void RegisterOperatorWithMetaInfo( info.infer_shape_ = [op_inputs, op_outputs, infer_shape_func](InferShapeContext* ctx) { std::vector> input_shapes; + std::vector>> vec_input_shapes; VLOG(1) << "Custom Operator: InferShape - get input ddim."; for (auto& in_name : op_inputs) { - OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); - auto ddim = ctx->GetInputDim(in_name); - input_shapes.emplace_back(framework::vectorize(ddim)); + if (detail::IsDuplicableVar(in_name)) { + OP_INOUT_CHECK(ctx->HasInputs(in_name), "Input", in_name, "Custom"); + auto vec_ddim = ctx->GetInputsDim(in_name); + std::vector> vec_shape; + vec_shape.reserve(vec_ddim.size()); + std::transform(vec_ddim.begin(), vec_ddim.end(), + std::back_inserter(vec_shape), + [&](const DDim& ddim) -> std::vector { + return framework::vectorize(ddim); + }); + vec_input_shapes.emplace_back(vec_shape); + } else { + OP_INOUT_CHECK(ctx->HasInput(in_name), "Input", in_name, "Custom"); + auto ddim = ctx->GetInputDim(in_name); + input_shapes.emplace_back(framework::vectorize(ddim)); + } } VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = infer_shape_func(input_shapes); + auto output_shapes = infer_shape_func(input_shapes, vec_input_shapes); VLOG(1) << "Custom Operator: InferShape - set output ddim."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDim(op_outputs[i], - framework::make_ddim(output_shapes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + std::vector vec_ddim; + vec_ddim.reserve(output_shapes.size()); + std::transform(output_shapes.begin(), output_shapes.end(), + std::back_inserter(vec_ddim), + [&](const std::vector& shape) -> DDim { + return framework::make_ddim(shape); + }); + ctx->SetOutputsDim(out_name, vec_ddim); + } else { + ctx->SetOutputDim(out_name, framework::make_ddim(output_shapes[i])); + } } }; } @@ -544,9 +637,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple inputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. " + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); PADDLE_ENFORCE_EQ( @@ -554,9 +647,9 @@ void RegisterOperatorWithMetaInfo( platform::errors::Unavailable( "Your custom operator contains multiple outputs. " "We only allow a custom operator that contains only one input " - "and " - "only one output without setting the InferDtypeFn. At this time, " - "the input dtype will be directly set to the output dtype.\n" + "and only one output without setting the InferDtypeFn. 
" + "At this time, the input dtype will be directly set to " + "the output dtype.\n" "Please set the InferDtypeFn of custom " "operator by .SetInferDtypeFn(PD_INFER_DTYPE(...))")); @@ -568,22 +661,42 @@ void RegisterOperatorWithMetaInfo( info.infer_var_type_ = [op_inputs, op_outputs, infer_dtype_func](InferVarTypeContext* ctx) { std::vector input_dtypes; + std::vector> vec_input_dtypes; VLOG(1) << "Custom Operator: InferDtype - get input dtype."; for (auto& in_name : op_inputs) { - auto dtype = ctx->GetInputDataType(in_name); - input_dtypes.emplace_back( - CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + if (detail::IsDuplicableVar(in_name)) { + std::vector vec_custom_dtype; + for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { + auto dtype = ctx->GetInputDataType(in_name, i); + vec_custom_dtype.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } + vec_input_dtypes.emplace_back(vec_custom_dtype); + } else { + auto dtype = ctx->GetInputDataType(in_name); + input_dtypes.emplace_back( + CustomTensorUtils::ConvertInnerDTypeToEnumDType(dtype)); + } } VLOG(1) << "Custom Operator: InferDtype - infer output dtype."; - auto output_dtypes = infer_dtype_func(input_dtypes); + auto output_dtypes = infer_dtype_func(input_dtypes, vec_input_dtypes); VLOG(1) << "Custom Operator: InferDtype - set output dtype."; for (size_t i = 0; i < op_outputs.size(); ++i) { - ctx->SetOutputDataType( - op_outputs[i], - CustomTensorUtils::ConvertEnumDTypeToInnerDType(output_dtypes[i])); + auto out_name = op_outputs[i]; + if (detail::IsDuplicableVar(out_name)) { + for (size_t j = 0; j < output_dtypes.size(); ++j) { + auto dtype = CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i]); + ctx->SetOutputDataType(out_name, dtype, j); + } + } else { + ctx->SetOutputDataType( + out_name, CustomTensorUtils::ConvertEnumDTypeToInnerDType( + output_dtypes[i])); + } } }; } diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index f57d22d8710..620bff11a28 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -23,6 +23,9 @@ set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 120) py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py) set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120) +py_test(test_custom_concat SRCS test_custom_concat.py) +set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120) + py_test(test_check_abi SRCS test_check_abi.py) cc_test(test_check_error SRCS test_check_error.cc DEPS gtest) diff --git a/python/paddle/fluid/tests/custom_op/concat_and_split.h b/python/paddle/fluid/tests/custom_op/concat_and_split.h new file mode 100644 index 00000000000..9f24cc43699 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/concat_and_split.h @@ -0,0 +1,84 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/extension.h" + +int64_t GetRows(std::vector shape, int64_t axis) { + int64_t rows = 1; + for (int64_t i = 0; i < axis; ++i) { + rows *= shape[i]; + } + return rows; +} + +std::vector GetCols(const std::vector& ins, + int64_t rows, + int64_t* cols) { + std::vector cols_vec(ins.size()); + for (size_t i = 0; i < ins.size(); ++i) { + int64_t t_cols = ins[i].size() / rows; + *cols += t_cols; + cols_vec[i] = t_cols; + } + return cols_vec; +} + +template +void ConcatCpuKernel(const std::vector& ins, + paddle::Tensor* out, + int64_t axis) { + size_t num = ins.size(); + int64_t out_rows = GetRows(ins[0].shape(), axis); + int64_t out_cols = 0; + auto ins_cols = GetCols(ins, out_rows, &out_cols); + + auto* out_data = out->mutable_data(); + int64_t col_idx = 0; + for (size_t i = 0; i < num; ++i) { + int64_t col_len = ins_cols[i]; + auto* in_data = ins[i].data(); + for (int j = 0; j < out_rows; ++j) { + std::memcpy(out_data + j * out_cols + col_idx, + in_data + j * col_len, + sizeof(data_t) * col_len); + } + col_idx += col_len; + } +} + +template +void SplitCpuKernel(const paddle::Tensor& in, + const std::vector& ref_ins, + std::vector* outs, + int64_t axis) { + size_t num = outs->size(); + int64_t in_rows = GetRows(ref_ins[0].shape(), axis); + int64_t in_cols = 0; + auto out_cols = GetCols(ref_ins, in_rows, &in_cols); + + for (size_t i = 0; i < in_rows; ++i) { + auto* in_data = in.data() + i * in_cols; + int64_t col_idx = 0; + for (size_t j = 0; j < num; ++j) { + int64_t col_len = out_cols[j]; + auto* out_data = outs->at(j).mutable_data() + i * col_len; + std::memcpy(out_data, in_data + col_idx, sizeof(data_t) * col_len); + col_idx += col_len; + } + } +} diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc new file mode 100644 index 00000000000..4ea39303991 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "concat_and_split.h" // NOLINT +#include "paddle/extension.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +int64_t ComputeAxis(int64_t axis, int64_t rank) { + PD_CHECK(axis >= -rank && axis < rank, + "The axis is excepted to be in range of [", + -rank, + ", ", + rank, + "]."); + if (axis < 0) { + axis = axis + rank; + } + return axis > 0 ? 
axis : 0; +} + +std::vector ComputeOutShape( + std::vector> in_shapes, int64_t axis) { + size_t n = in_shapes.size(); + auto out_shape = in_shapes[0]; + size_t zero_dim_size = out_shape.size(); + for (size_t i = 1; i < n; ++i) { + PD_CHECK(in_shapes[i].size() == out_shape.size(), + "Input dimension must be same."); + for (size_t j = 0; j < zero_dim_size; ++j) { + if (j == axis) { + out_shape[axis] += in_shapes[i][j]; + } else { + PD_CHECK(in_shapes[0][j] == in_shapes[i][j], + "The ", + j, + "-th dimension of input must be same."); + } + } + } + return out_shape; +} + +std::vector ConcatForwardDynamicAxis( + const std::vector& inputs, const paddle::Tensor& axis_t) { + // check inputs + PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat."); + for (auto& t : inputs) { + CHECK_INPUT(t); + } + CHECK_INPUT(axis_t); + + // compute output shape + int64_t rank = static_cast(inputs[0].shape().size()); + int64_t axis = axis_t.data()[0]; + axis = ComputeAxis(axis, rank); + std::vector> in_shapes; + for (auto& t : inputs) { + in_shapes.emplace_back(t.shape()); + } + auto out_shape = ComputeOutShape(in_shapes, axis); + + // create output + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(out_shape); + + // calc + PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( + inputs[0].type(), "ConcatCpuKernel", ([&] { + ConcatCpuKernel(inputs, &out, axis); + })); + + return {out}; +} + +std::vector ConcatBackwardDynamicAxis( + const std::vector& inputs, + const paddle::Tensor& grad_out, + const paddle::Tensor& axis_t) { + // check input + PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat."); + for (auto& t : inputs) { + CHECK_INPUT(t); + } + CHECK_INPUT(axis_t); + CHECK_INPUT(grad_out); + + // compate axis + int64_t rank = static_cast(inputs[0].shape().size()); + int64_t axis = axis_t.data()[0]; + axis = ComputeAxis(axis, rank); + + // create outputs + std::vector grad_inputs; + for (auto& t : inputs) { + auto grad = paddle::Tensor(paddle::PlaceType::kCPU); + grad.reshape(t.shape()); + grad_inputs.emplace_back(grad); + } + + // calc + PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( + grad_out.type(), "SplitCpuKernel", ([&] { + SplitCpuKernel(grad_out, inputs, &grad_inputs, axis); + })); + + return grad_inputs; +} + +std::vector> ConcatInferShapeDynamicAxis( + std::vector> input_shapes, + std::vector axis_shape) { + return {std::vector(input_shapes[0].size(), -1)}; +} + +std::vector ConcatInferDtypeDynamicAxis( + std::vector input_dtypes, paddle::DataType axis_dtype) { + return {input_dtypes[0]}; +} + +PD_BUILD_OP(custom_concat) + .Inputs({paddle::Vec("X"), "Axis"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ConcatForwardDynamicAxis)) + .SetInferShapeFn(PD_INFER_SHAPE(ConcatInferShapeDynamicAxis)) + .SetInferDtypeFn(PD_INFER_DTYPE(ConcatInferDtypeDynamicAxis)); + +PD_BUILD_GRAD_OP(custom_concat) + .Inputs({paddle::Vec("X"), paddle::Grad("Out"), "Axis"}) + .Outputs({paddle::Grad(paddle::Vec("X"))}) + .SetKernelFn(PD_KERNEL(ConcatBackwardDynamicAxis)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py new file mode 100644 index 00000000000..4086224cd7b --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -0,0 +1,148 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
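# Illustrative summary (assumed from the suffix helpers in ext_op_meta_info.h and
# the API generation change in extension_utils.py, both part of this patch set):
#   paddle::Vec("X")                registers the duplicable slot "X@VECTOR"
#   paddle::Grad("Out")             registers the grad slot       "Out@GRAD"
#   paddle::Grad(paddle::Vec("X"))  registers                     "X@VECTOR@GRAD"
# The generated Python API keeps only the part of each name before '@' (lower
# cased), so custom_concat ends up with the signature custom_concat(x, axis) and
# the tests below pass a plain list of Tensors for x.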
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +import paddle.static as static +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd +from utils import paddle_includes, extra_cc_args, extra_nvcc_args + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. +file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( + get_build_directory()) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +if os.name == 'nt': + test_include = "..\\python\\paddle\\fluid\\tests\\custom_op" +else: + test_include = "../python/paddle/fluid/tests/custom_op" +paddle_includes.append(test_include) + +custom_ops = load( + name='custom_concat_jit', + sources=['custom_concat_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags + verbose=True) + + +def concat_dynamic(func, device, dtype, np_inputs, axis_v): + paddle.set_device(device) + inputs = [ + paddle.to_tensor( + x, dtype=dtype, place=device, stop_gradient=False) + for x in np_inputs + ] + axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) + out = func(inputs, axis) + out.stop_gradient = False + out.backward() + grad_inputs = [x.grad for x in inputs] + return out.numpy(), grad_inputs + + +def concat_static(func, device, dtype, np_inputs, axis_v): + paddle.enable_static() + paddle.set_device(device) + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x1 = static.data(name="x1", shape=[2, 3], dtype=dtype) + x2 = static.data(name="x2", shape=[2, 3], dtype=dtype) + axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) + x1.stop_gradient = False + x2.stop_gradient = False + out = func([x1, x2], axis) + # mean only support float, so here use sum + sum_out = paddle.sum(out) + static.append_backward(sum_out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + + out_v, x1_grad_v, x2_grad_v = exe.run( + static.default_main_program(), + feed={ + "x1": np_inputs[0].astype(dtype), + "x2": np_inputs[1].astype(dtype), + "axis": axis + }, + fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"]) + paddle.disable_static() + return out_v, x1_grad_v, x2_grad_v + + +class TestCustomConcatDynamicAxisJit(unittest.TestCase): + def setUp(self): + self.dtypes = ['float32', 'float64', 'int32', 'int64'] + self.devices = ['cpu'] + self.np_inputs = [ + np.array([[1, 2, 3], [4, 5, 6]]), + np.array([[11, 12, 13], [14, 15, 16]]) + ] + self.axises = [0, 1] + + def test_dynamic(self): + for device in self.devices: + for dtype in self.dtypes: + for axis in self.axises: + out, grad_inputs = concat_dynamic(custom_ops.custom_concat, + device, dtype, + self.np_inputs, axis) + pd_out, pd_grad_inputs = concat_dynamic( + paddle.concat, device, dtype, self.np_inputs, axis) + + self.assertTrue( + np.array_equal(out, pd_out), + 
"custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): + self.assertTrue( + np.array_equal(x_grad, pd_x_grad), + "custom op x grad: {},\n paddle api x grad: {}". + format(x_grad, pd_x_grad)) + + def test_static(self): + for device in self.devices: + for dtype in self.dtypes: + for axis in self.axises: + out, x1_grad, x2_grad = concat_static( + custom_ops.custom_concat, device, dtype, self.np_inputs, + axis) + pd_out, pd_x1_grad, pd_x2_grad = concat_static( + paddle.concat, device, dtype, self.np_inputs, axis) + + self.assertTrue( + np.array_equal(out, pd_out), + "custom op out: {},\n paddle api out: {}".format( + out, pd_out)) + self.assertTrue( + np.array_equal(x1_grad, pd_x1_grad), + "custom op x1_grad: {},\n paddle api x1_grad: {}". + format(x1_grad, pd_x1_grad)) + self.assertTrue( + np.array_equal(x2_grad, pd_x2_grad), + "custom op x2_grad: {},\n paddle api x2_grad: {}". + format(x2_grad, pd_x2_grad)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 34cf38aacfa..1a96fc5f0ae 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -13,7 +13,6 @@ # limitations under the License. import os -import subprocess import unittest import paddle import numpy as np diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index fff92d85c8f..b68100fe521 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -781,13 +781,18 @@ def _get_api_inputs_str(op_name): in_names, out_names, attr_names = parse_op_info(op_name) # e.g: x, y, z param_names = in_names + attr_names - params_str = ','.join([p.lower() for p in param_names]) + # NOTE(chenweihang): we add suffix `@VECTOR` for std::vector input, + # but the string contains `@` cannot used as argument name, so we split + # input name by `@`, and only use first substr as argument + params_str = ','.join([p.split("@")[0].lower() for p in param_names]) # e.g: {'X': x, 'Y': y, 'Z': z} - ins_str = "{%s}" % ','.join( - ["'{}' : {}".format(in_name, in_name.lower()) for in_name in in_names]) + ins_str = "{%s}" % ','.join([ + "'{}' : {}".format(in_name, in_name.split("@")[0].lower()) + for in_name in in_names + ]) # e.g: {'num': n} attrs_str = "{%s}" % ",".join([ - "'{}' : {}".format(attr_name, attr_name.lower()) + "'{}' : {}".format(attr_name, attr_name.split("@")[0].lower()) for attr_name in attr_names ]) # e.g: ['Out', 'Index'] -- GitLab From f302bb4f8bfe9bd5c2b5fbb944e79601ac88bf72 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 12 Mar 2021 10:59:41 +0800 Subject: [PATCH 005/486] help timeout ut debug (#31500) * To help timeout_ut debug * To help timeout_ut debug * added show information --- paddle/scripts/paddle_build.sh | 3 +++ tools/timeout_debug_help.sh | 27 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tools/timeout_debug_help.sh diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f1142dbbbba..3b20a403b71 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1238,6 +1238,9 @@ set +x exec_retry_threshold=10 is_retry_execuate=0 if [ -n "$failed_test_lists" ];then + if [ ${TIMEOUT_DEBUG_HELP:-OFF} 
== "ON" ];then + bash $PADDLE_ROOT/tools/timeout_debug_help.sh "$failed_test_lists" # cat logs for tiemout uts which killed by ctest + fi read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) need_retry_ut_arr=(${need_retry_ut_str}) need_retry_ut_count=${#need_retry_ut_arr[@]} diff --git a/tools/timeout_debug_help.sh b/tools/timeout_debug_help.sh new file mode 100644 index 00000000000..45de2db87e8 --- /dev/null +++ b/tools/timeout_debug_help.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set +e +failed_uts=$1 +need_debug_ut_re='test_dist_fleet' +cat_log_judge=$(echo "${failed_uts}" | grep 'Timeout' | grep -oEi "$need_debug_ut_re" ) +if [[ "$cat_log_judge" != "" ]];then + echo "==============================================" + echo "show timeout ut logs" + echo "==============================================" + cat /tmp/tr0_err.log /tmp/tr1_err.log /tmp/ps0_err.log /tmp/ps1_err.log + cat /tmp/heter0_err.log /tmp/heter1_err.log +fi +set -e -- GitLab From 3d5aa9d10a70b7e68b3cded9b2720f662c952016 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 12 Mar 2021 13:55:14 +0800 Subject: [PATCH 006/486] [ROCM] fix conv2d and conv3d op, test=develop (#31553) --- paddle/fluid/operators/conv_cudnn_op.cu | 215 ++++++++-------- paddle/fluid/operators/conv_miopen_helper.h | 231 ++++++++---------- .../operators/conv_transpose_cudnn_op.cu | 40 ++- paddle/fluid/platform/miopen_desc.h | 25 +- .../fluid/tests/unittests/test_conv2d_op.py | 15 +- .../fluid/tests/unittests/test_conv3d_op.py | 14 ++ .../unittests/test_sync_batch_norm_op.py | 7 +- 7 files changed, 298 insertions(+), 249 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 110bb69a140..39e9d37ddc6 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -249,6 +249,7 @@ class CUDNNConvOpKernel : public framework::OpKernel { args.handle = handle; #ifdef PADDLE_WITH_HIP + // MIOPEN need to set groups in cdesc in miopen_desc.h args.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), groups); #else @@ -264,6 +265,10 @@ class CUDNNConvOpKernel : public framework::OpKernel { platform::dynload::cudnnSetConvolutionGroupCount(args.cdesc.desc(), groups)); groups = 1; +#endif +#ifdef PADDLE_WITH_HIP + // MIOPEN do not set groups in wdesc after set groups in cdesc + groups = 1; #endif args.idesc.set(transformed_input, layout_format); args.wdesc.set(transformed_filter_channel, layout_format, groups); @@ -292,12 +297,14 @@ class CUDNNConvOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP miopenConvFwdAlgorithm_t algo{}; using search = SearchAlgorithm; + workspace_size = search::GetWorkspaceSize(args); + algo = search::Find(args, exhaustive_search, false, workspace_size, ctx); #else cudnnConvolutionFwdAlgo_t algo{}; using search = SearchAlgorithm; -#endif algo = 
search::Find(args, exhaustive_search, false, ctx); workspace_size = search::GetWorkspaceSize(args, algo); +#endif #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION_MIN(7, 0, 1) // when groups > 1, SearchAlgorithm find algo is CUDNN_CONVOLUTION_\ @@ -652,13 +659,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = search1::Find(args1, exhaustive_search, deterministic, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -673,13 +684,17 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = search2::Find(args2, exhaustive_search, deterministic, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = search2::Find(args2, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); +#endif } // ------------------- cudnn conv backward data --------------------- @@ -688,23 +703,22 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { - // When beta is 0, it is unnecessary to reset input_grad. - // When beta is 1, the output cannot be reset since addt strategy used. - for (int i = 0; i < groups; i++) { +// When beta is 0, it is unnecessary to reset input_grad. +// When beta is 1, the output cannot be reset since addt strategy used. #ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), - output_grad_data + i * group_offset_out, - args1.wdesc.desc(), filter_data + i * group_offset_filter, - args1.cdesc.desc(), data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data + i * group_offset_in, - cudnn_workspace_ptr, workspace_size)); - }, - workspace_size); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -717,9 +731,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { transformed_input_grad_data + i * group_offset_in)); }, workspace_size); -#endif } - +#endif if (!is_sys_pad) { std::vector starts(transformed_input_channel.dims().size(), 0); std::vector axes(transformed_input_channel.dims().size(), 0); @@ -751,23 +764,20 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { ScalingParamType beta_filter = 0.0f; // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { - // Because beta is zero, it is unnecessary to reset filter_grad. 
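      // Reading aid (summary of the HIP/CUDA split applied in this file, not new
      // behaviour): under PADDLE_WITH_HIP the group count is folded into
      // args.cdesc (see the MIOPEN comments in the forward kernel), so a single
      // miopenConvolution* call covers all groups and no group_offset_* pointers
      // are needed; MIOpen also queries the workspace size before Find(), since
      // Find() needs the workspace. The cuDNN path keeps the per-group loop with
      // group_offset_in / group_offset_filter / group_offset_out and picks the
      // algorithm first, then queries the workspace for that algorithm.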
- for (int i = 0; i < groups; i++) { +// Because beta is zero, it is unnecessary to reset filter_grad. #ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args2.odesc.desc(), - output_grad_data + i * group_offset_out, - args2.idesc.desc(), input_data + i * group_offset_in, - args2.cdesc.desc(), filter_algo, &beta, - args2.wdesc.desc(), - filter_grad_data + i * group_offset_filter, - cudnn_workspace_ptr, workspace_size)); - }, - workspace_size); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args2.odesc.desc(), output_grad_data, + args2.idesc.desc(), input_data, args2.cdesc.desc(), + filter_algo, &beta, args2.wdesc.desc(), filter_grad_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* cudnn_workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -780,8 +790,8 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { filter_grad_data + i * group_offset_filter)); }, workspace_size); -#endif } +#endif if (compute_format == DataLayout::kNHWC) { TransToChannelFirst( @@ -1080,32 +1090,37 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + fwd_algo1 = search1::Find(args1, exhaustive_search, false, + workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif fwd_algo1 = search1::Find(args1, exhaustive_search, false, ctx); workspace_size = search1::GetWorkspaceSize(args1, fwd_algo1); +#endif } if (ddW) { ddw = ddW->data(); args2.handle = handle; args2.idesc.set(transformed_X, iwo_group); - args2.wdesc.set(*ddW, layout, iwo_group); - args2.odesc.set(transformed_ddO_channel, iwo_group); args2.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + fwd_algo2 = search2::Find(args2, exhaustive_search, false, + workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif fwd_algo2 = search2::Find(args2, exhaustive_search, false, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, fwd_algo2)); +#endif } } @@ -1114,21 +1129,23 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { args3.handle = handle; args3.idesc.set(transformed_ddX, iwo_group); args3.wdesc.set(*dW, layout, iwo_group); - args3.odesc.set(transformed_dO_channel, iwo_group); - args3.cdesc.set(dtype, padding_common, strides, dilations, platform::AllowTF32Cudnn(), c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = search3::Find(args3, exhaustive_search, deterministic, + workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -1143,13 +1160,17 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, 
search4::GetWorkspaceSize(args4)); + data_algo = search4::Find(args4, exhaustive_search, deterministic, + workspace_size, ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, exhaustive_search, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; @@ -1176,21 +1197,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { if (ddO) { if (ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args1.idesc.desc(), - ddx + i * group_offset_in, args1.wdesc.desc(), - w + i * group_offset_filter, args1.cdesc.desc(), - fwd_algo1, &beta, args1.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args1.idesc.desc(), ddx, + args1.wdesc.desc(), w, args1.cdesc.desc(), fwd_algo1, + &beta, args1.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1203,26 +1222,24 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (ddW) { - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - // MIOPEN ONLY support beta to be 0.0f - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionForward( - handle, &alpha, args2.idesc.desc(), - x + i * group_offset_in, args2.wdesc.desc(), - ddw + i * group_offset_filter, args2.cdesc.desc(), - fwd_algo2, &beta, args2.odesc.desc(), - transformed_ddy_channel + i * group_offset_out, - workspace_ptr, workspace_size)); - }, - workspace_size); + // MIOPEN ONLY support beta to be 0.0f + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForward( + handle, &alpha, args2.idesc.desc(), x, args2.wdesc.desc(), + ddw, args2.cdesc.desc(), fwd_algo2, &beta, + args2.odesc.desc(), transformed_ddy_channel, + workspace_ptr, workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1235,8 +1252,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_ddy_channel + i * group_offset_out)); }, workspace_size); -#endif } +#endif } if (channel_last) { TransToChannelLast( @@ -1246,21 +1263,19 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { T* transformed_dy_channel = transformed_dO_channel.data(); if (dW && ddX) { ddx = transformed_ddX.data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardWeights( - handle, &alpha, args3.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args3.idesc.desc(), ddx + i * group_offset_in, - args3.cdesc.desc(), filter_algo, &beta, - args3.wdesc.desc(), dw + i * group_offset_filter, - workspace_ptr, 
workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardWeights( + handle, &alpha, args3.odesc.desc(), transformed_dy_channel, + args3.idesc.desc(), ddx, args3.cdesc.desc(), filter_algo, + &beta, args3.wdesc.desc(), dw, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1273,27 +1288,25 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { dw + i * group_offset_filter)); }, workspace_size); -#endif } +#endif } if (dX && ddW) { ddw = ddW->data(); - for (int i = 0; i < groups; i++) { #ifdef PADDLE_WITH_HIP - wkspace_handle.RunFunc( - [&](void* workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args4.odesc.desc(), - transformed_dy_channel + i * group_offset_out, - args4.wdesc.desc(), ddw + i * group_offset_filter, - args4.cdesc.desc(), data_algo, &beta, args4.idesc.desc(), - transformed_dx + i * group_offset_in, workspace_ptr, - workspace_size)); - }, - workspace_size); + wkspace_handle.RunFunc( + [&](void* workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args4.odesc.desc(), transformed_dy_channel, + args4.wdesc.desc(), ddw, args4.cdesc.desc(), data_algo, + &beta, args4.idesc.desc(), transformed_dx, workspace_ptr, + workspace_size)); + }, + workspace_size); #else + for (int i = 0; i < groups; i++) { wkspace_handle.RunFunc( [&](void* workspace_ptr) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -1306,8 +1319,8 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { transformed_dx + i * group_offset_in)); }, workspace_size); -#endif } +#endif if (!is_sys_pad) { // reverse padded input diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 44ead95a355..3ab27e1ec4f 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -127,57 +127,52 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - bool has_got_workspace_size = true; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionForwardAlgorithm( - args.handle, args.idesc.desc(), args.x->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - 
args.odesc.desc(), const_cast(args.o->data()), - kNUM_CUDNN_FWD_ALGS, &returned_algo_count, perf_stat.data(), - cudnn_workspace_ptr, workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "FwdAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.fwd_algo; - } - return perf_stat[0].fwd_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + args.handle, args.idesc.desc(), args.x->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.odesc.desc(), const_cast(args.o->data()), + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + if (!exhaustive_search && !deterministic) { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; + } else { + auto& temp = ctx.cuda_device_context(); + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetForward()); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" + << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" + << args.s << ", args.p" << args.p << ", args.d" << args.d; + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + return find_result.fwd_algo; + }); + } VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( @@ -194,58 +189,51 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.wdesc.desc(), args.w->data(), args.cdesc.desc(), - args.idesc.desc(), const_cast(args.x->data()), - kNUM_CUDNN_BWD_DATA_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, workspace_size_limit, - false)); - }; - 
workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdDataAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_data_algo; - } - - return perf_stat[0].bwd_data_algo; - }); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardDataAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.wdesc.desc(), args.w->data(), args.cdesc.desc(), + args.idesc.desc(), const_cast(args.x->data()), + kNUM_CUDNN_BWD_DATA_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + if (!exhaustive_search && !deterministic) { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; + } else { + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardData()); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" + << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" + << args.s << ", args.p" << args.p << ", args.d" << args.d; + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + return find_result.bwd_data_algo; + }); + } VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardDataGetWorkSpaceSize( @@ -262,56 +250,51 @@ struct SearchAlgorithm { template static algo_t Find(const ConvArgs& args, bool exhaustive_search, - bool deterministic, + bool deterministic, size_t workspace_size, const framework::ExecutionContext& ctx) { - auto dtype = platform::CudnnDataType::type; - size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; - size_t workspace_size = 0; - bool has_got_workspace_size = true; algo_t algo; auto& dev_ctx = ctx.template device_context(); auto workspace_handle = dev_ctx.cudnn_workspace_handle(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - int returned_algo_count; - std::array perf_stat; - auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload:: - miopenFindConvolutionBackwardWeightsAlgorithm( - args.handle, args.odesc.desc(), args.o->data(), - args.idesc.desc(), args.x->data(), args.cdesc.desc(), - args.wdesc.desc(), const_cast(args.w->data()), - kNUM_CUDNN_BWD_FILTER_ALGS, &returned_algo_count, - perf_stat.data(), cudnn_workspace_ptr, - workspace_size_limit, false)); - }; - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size_limit); - - VLOG(3) << "BwdFilterAlgo Perf result: (algo: stat, time, memory)"; - for (int i = 0; 
i < returned_algo_count; ++i) { - const auto& stat = perf_stat[i]; - VLOG(3) << stat.bwd_weights_algo; - } - return perf_stat[0].bwd_weights_algo; - }); + + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionBackwardWeightsAlgorithm( + args.handle, args.odesc.desc(), args.o->data(), + args.idesc.desc(), args.x->data(), args.cdesc.desc(), + args.wdesc.desc(), const_cast(args.w->data()), + kNUM_CUDNN_BWD_FILTER_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + + if (!exhaustive_search && !deterministic) { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; + } else { + AlgorithmsCache& algo_cache = + *(framework::ConvSearchCache::Instance().GetBackwardFilter()); + + auto x_dims = framework::vectorize(args.x->dims()); + auto w_dims = framework::vectorize(args.w->dims()); + + VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" + << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" + << args.s << ", args.p" << args.p << ", args.d" << args.d; + + algo = algo_cache.GetAlgorithm( + x_dims, w_dims, args.s, args.p, args.d, 0, + static_cast(args.cudnn_dtype), [&]() { + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + return find_result.bwd_weights_algo; + }); + } VLOG(3) << "choose algo " << algo; return algo; } - static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + static size_t GetWorkspaceSize(const ConvArgs& args) { size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::miopenConvolutionBackwardWeightsGetWorkSpaceSize( diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 376cefe5025..5781dd18b7b 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -244,13 +244,14 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { #ifdef PADDLE_WITH_HIP using search = SearchAlgorithm; + workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args)); + algo = search::Find(args, false, deterministic, workspace_size, ctx); #else using search = SearchAlgorithm; -#endif - algo = search::Find(args, false, deterministic, ctx); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, algo)); +#endif // ------------------- cudnn conv transpose forward --------------------- int input_offset = @@ -504,12 +505,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search1::GetWorkspaceSize(args1)); + data_algo = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif data_algo = search1::Find(args1, false, deterministic, ctx); workspace_size = std::max(workspace_size, search1::GetWorkspaceSize(args1, data_algo)); +#endif } if (filter_grad) { @@ -522,12 +527,16 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { platform::AllowTF32Cudnn(), c_groups); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + filter_algo = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif filter_algo = 
search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, filter_algo)); +#endif } // ------------------- cudnn conv backward data --------------------- @@ -942,11 +951,14 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args1.cdesc.set(dtype, padding_common, strides, dilations, c_group); #ifdef PADDLE_WITH_HIP using search1 = SearchAlgorithm; + workspace_size = search1::GetWorkspaceSize(args1); + bwd_algo1 = + search1::Find(args1, false, deterministic, workspace_size, ctx); #else using search1 = SearchAlgorithm; -#endif bwd_algo1 = search1::Find(args1, false, deterministic, ctx); workspace_size = search1::GetWorkspaceSize(args1, bwd_algo1); +#endif } if (ddW) { @@ -958,12 +970,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args2.cdesc.set(dtype, padding_common, strides, dilations, c_group); #ifdef PADDLE_WITH_HIP using search2 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search2::GetWorkspaceSize(args2)); + bwd_algo2 = + search2::Find(args2, false, deterministic, workspace_size, ctx); #else using search2 = SearchAlgorithm; -#endif bwd_algo2 = search2::Find(args2, false, deterministic, ctx); workspace_size = std::max(workspace_size, search2::GetWorkspaceSize(args2, bwd_algo2)); +#endif } } @@ -978,12 +994,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args3.cdesc.set(dtype, padding_common, strides, dilations, c_group); #ifdef PADDLE_WITH_HIP using search3 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search3::GetWorkspaceSize(args3)); + filter_algo = + search3::Find(args3, false, deterministic, workspace_size, ctx); #else using search3 = SearchAlgorithm; -#endif filter_algo = search3::Find(args3, false, deterministic, ctx); workspace_size = std::max(workspace_size, search3::GetWorkspaceSize(args3, filter_algo)); +#endif } if (ddW && dX) { @@ -996,12 +1016,16 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { args4.cdesc.set(dtype, padding_common, strides, dilations, c_group); #ifdef PADDLE_WITH_HIP using search4 = SearchAlgorithm; + workspace_size = + std::max(workspace_size, search4::GetWorkspaceSize(args4)); + data_algo = + search4::Find(args4, false, deterministic, workspace_size, ctx); #else using search4 = SearchAlgorithm; -#endif data_algo = search4::Find(args4, false, deterministic, ctx); workspace_size = std::max(workspace_size, search4::GetWorkspaceSize(args4, data_algo)); +#endif } int i_n, i_c, i_d, i_h, i_w; diff --git a/paddle/fluid/platform/miopen_desc.h b/paddle/fluid/platform/miopen_desc.h index 7de713559ae..c82e61ceb12 100644 --- a/paddle/fluid/platform/miopen_desc.h +++ b/paddle/fluid/platform/miopen_desc.h @@ -199,19 +199,24 @@ class FilterDescriptor { void set(const Tensor& tensor, const miopenTensorFormat_t format, const int groups = 1) { - auto dims = framework::vectorize(tensor.dims()); - std::vector transformed_dims; PADDLE_ENFORCE_EQ(format, MIOPEN_TENSOR_NCHW, platform::errors::InvalidArgument( "format should ONLY be NCHW in MIOPEN.")); - transformed_dims = dims; - // if (groups > 1) { - // transformed_dims[1] = transformed_dims[1] / groups; - // } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet4dTensorDescriptor( - (miopenTensorDescriptor_t)desc_.get(), ToCudnnDataType(tensor.type()), - transformed_dims[0], transformed_dims[1], transformed_dims[2], - transformed_dims[3])); + auto dims = framework::vectorize(tensor.dims()); + std::vector 
strides(dims.size()); + strides[dims.size() - 1] = 1; + for (int i = dims.size() - 2; i >= 0; i--) { + strides[i] = dims[i + 1] * strides[i + 1]; + } + std::vector dims_with_group(dims.begin(), dims.end()); + if (groups > 1) { + dims_with_group[1] = dims_with_group[1] / groups; + } + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetTensorDescriptor( + (miopenTensorDescriptor_t)(desc_.get()), ToCudnnDataType(tensor.type()), + static_cast(dims_with_group.size()), + const_cast(dims_with_group.data()), + const_cast(strides.data()))); } private: diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 9992efee1b3..29c35d28d4d 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -128,6 +128,8 @@ def create_test_cudnn_class(parent): class TestCUDNNCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 cls_name = "{0}_{1}".format(parent.__name__, "CUDNN") TestCUDNNCase.__name__ = cls_name @@ -185,6 +187,8 @@ def create_test_cudnn_channel_last_class(parent): class TestCudnnChannelLastCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_data_format(self): self.data_format = "NHWC" @@ -264,6 +268,8 @@ def create_test_cudnn_padding_SAME_class(parent): class TestCUDNNPaddingSMAECase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_paddings(self): self.pad = [1, 1] @@ -280,6 +286,8 @@ def create_test_cudnn_padding_VALID_class(parent): class TestCUDNNPaddingVALIDCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_paddings(self): self.pad = [1, 1] @@ -299,8 +307,7 @@ class TestConv2DOp(OpTest): self.use_mkldnn = False self.fuse_relu_before_depthwise_conv = False self.data_format = "AnyLayout" - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 - self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 + self.dtype = np.float64 self.init_kernel_type() self.init_group() self.init_dilation() @@ -693,6 +700,7 @@ class TestCUDNNExhaustiveSearch(TestConv2DOp): def init_kernel_type(self): self.use_cudnn = True self.exhaustive_search = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 class TestConv2DOpError(unittest.TestCase): @@ -734,8 +742,7 @@ class TestConv2DOp_v2(OpTest): self.use_cuda = False self.use_mkldnn = False self.fuse_relu_before_depthwise_conv = False - # explicilty use float32 for ROCm, as MIOpen does not yet support float64 - self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 + self.dtype = np.float64 self.init_kernel_type() self.init_group() self.init_dilation() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 1636019a625..59d1f3216e1 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -135,6 +135,8 @@ def create_test_cudnn_class(parent): class TestCUDNNCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 cls_name = "{0}_{1}".format(parent.__name__, 
"CUDNN") TestCUDNNCase.__name__ = cls_name @@ -169,6 +171,8 @@ def create_test_cudnn_padding_SAME_class(parent): class TestCUDNNPaddingSMAECase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_paddings(self): self.pad = [1, 1, 1] @@ -185,6 +189,8 @@ def create_test_cudnn_padding_VALID_class(parent): class TestCUDNNPaddingVALIDCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_paddings(self): self.pad = [1, 1, 1] @@ -215,6 +221,8 @@ def create_test_cudnn_channel_last_class(parent): class TestCudnnChannelLastCase(parent): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm( + ) else np.float64 def init_data_format(self): self.data_format = "NDHWC" @@ -410,6 +418,7 @@ class TestWithDilation(TestConv3DOp): class TestCUDNN(TestConv3DOp): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -431,6 +440,7 @@ class TestFP16CUDNN(TestConv3DOp): class TestWithGroup1CUDNN(TestWithGroup1): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -452,6 +462,7 @@ class TestFP16WithGroup1CUDNN(TestWithGroup1): class TestWithGroup2CUDNN(TestWithGroup2): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -473,6 +484,7 @@ class TestFP16WithGroup2CUDNN(TestWithGroup2): class TestWith1x1CUDNN(TestWith1x1): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -494,6 +506,7 @@ class TestFP16With1x1CUDNN(TestWith1x1): class TestWithInput1x1Filter1x1CUDNN(TestWithInput1x1Filter1x1): def init_kernel_type(self): self.use_cudnn = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -514,6 +527,7 @@ class TestCUDNNExhaustiveSearch(TestCUDNN): def init_kernel_type(self): self.use_cudnn = True self.exhaustive_search = True + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 # ---- test asymmetric padding ---- diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 4649323b5b3..13aa7d3d37d 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -50,7 +50,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase): def setUp(self): """Setup.""" #self.dtype = np.float32 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.N = 8 self.C = 16 self.H = 32 @@ -92,7 +92,10 @@ class TestSyncBatchNormOpTraining(unittest.TestCase): moving_variance_name='bn_moving_variance', data_layout=layout, is_test=only_forward) - bn = fluid.layers.cast(bn, 'float64') + if core.is_compiled_with_rocm(): + bn = fluid.layers.cast(bn, 'float32') + else: + bn = fluid.layers.cast(bn, 'float64') sigmoid = fluid.layers.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) if not sync_bn: -- 
GitLab From 99dcd66508b5d45dc57b49b2891419178263d4d5 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Fri, 12 Mar 2021 14:22:05 +0800 Subject: [PATCH 007/486] try to fix imperative orc unitest error; test=develop (#31568) --- .../test_imperative_ocr_attention_model.py | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index f256e97e837..973c5598579 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -29,19 +29,19 @@ class Config(object): config for training ''' # encoder rnn hidden_size - encoder_size = 16 + encoder_size = 8 # decoder size for decoder stage - decoder_size = 16 + decoder_size = 8 # size for word embedding - word_vector_dim = 16 + word_vector_dim = 8 # max length for label padding - max_length = 5 + max_length = 3 # optimizer setting LR = 1.0 learning_rate_decay = None # batch size to train - batch_size = 8 + batch_size = 2 # class number to classify num_classes = 64 @@ -55,7 +55,7 @@ class Config(object): TRAIN_LIST_FILE_NAME = "train.list" # data shape for input image - DATA_SHAPE = [1, 48, 384] + DATA_SHAPE = [1, 16, 64] class ConvBNPool(fluid.dygraph.Layer): @@ -124,13 +124,13 @@ class OCRConv(fluid.dygraph.Layer): def __init__(self, is_test=False, use_cudnn=True): super(OCRConv, self).__init__() self.conv_bn_pool_1 = ConvBNPool( - 2, [16, 16], [1, 16], is_test=is_test, use_cudnn=use_cudnn) + 2, [8, 8], [1, 8], is_test=is_test, use_cudnn=use_cudnn) self.conv_bn_pool_2 = ConvBNPool( - 2, [32, 32], [16, 32], is_test=is_test, use_cudnn=use_cudnn) + 2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn) self.conv_bn_pool_3 = ConvBNPool( - 2, [64, 64], [32, 64], is_test=is_test, use_cudnn=use_cudnn) + 2, [8, 8], [8, 8], is_test=is_test, use_cudnn=use_cudnn) self.conv_bn_pool_4 = ConvBNPool( - 2, [128, 128], [64, 128], + 2, [16, 16], [8, 16], is_test=is_test, pool=False, use_cudnn=use_cudnn) @@ -212,9 +212,9 @@ class EncoderNet(fluid.dygraph.Layer): self.ocr_convs = OCRConv(is_test=is_test, use_cudnn=use_cudnn) self.fc_1_layer = Linear( - 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) + 32, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) self.fc_2_layer = Linear( - 768, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) + 32, rnn_hidden_size * 3, param_attr=para_attr, bias_attr=False) self.gru_forward_layer = DynamicGRU( size=rnn_hidden_size, h_0=h_0, @@ -241,10 +241,9 @@ class EncoderNet(fluid.dygraph.Layer): transpose_conv_features = fluid.layers.transpose( conv_features, perm=[0, 3, 1, 2]) - sliced_feature = fluid.layers.reshape( transpose_conv_features, [ - -1, 48, transpose_conv_features.shape[2] * + -1, 8, transpose_conv_features.shape[2] * transpose_conv_features.shape[3] ], inplace=False) @@ -376,9 +375,9 @@ class TestDygraphOCRAttention(unittest.TestCase): seed = 90 epoch_num = 1 if core.is_compiled_with_cuda(): - batch_num = 6 + batch_num = 3 else: - batch_num = 4 + batch_num = 2 np.random.seed = seed image_np = np.random.randn(Config.batch_size, Config.DATA_SHAPE[0], Config.DATA_SHAPE[1], @@ -536,8 +535,9 @@ class TestDygraphOCRAttention(unittest.TestCase): self.assertTrue(np.array_equal(value, dy_param_init_value[key])) for key, value in six.iteritems(static_param_value): - 
self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.allclose(value, dy_param_value[key], rtol=1e-05)) if __name__ == '__main__': + paddle.enable_static() unittest.main() -- GitLab From da9dda5c9b6b2d43e5e81d53baef9d9abaa7f1ce Mon Sep 17 00:00:00 2001 From: whs Date: Fri, 12 Mar 2021 14:54:49 +0800 Subject: [PATCH 008/486] Make CreateProgramDesc more robust (#31543) --- .../imperative/jit/program_desc_tracer.cc | 26 ++++++++++++++++--- .../imperative/jit/program_desc_tracer.h | 2 +- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.cc b/paddle/fluid/imperative/jit/program_desc_tracer.cc index 53750f7bf02..1a44f50275e 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.cc +++ b/paddle/fluid/imperative/jit/program_desc_tracer.cc @@ -69,6 +69,7 @@ UniqueBlockVarGenerator::UniqueBlockVarGenerator( std::string UniqueBlockVarGenerator::NameOf(const std::weak_ptr &var, const std::string &prefix) { + VLOG(3) << "Finding: " << var.lock()->Name(); auto all_vars_iter = all_vars_.find(var); PADDLE_ENFORCE_EQ(all_vars_iter != all_vars_.end(), true, platform::errors::NotFound( @@ -111,6 +112,15 @@ void UniqueBlockVarGenerator::InsertNewVarInBlock( } } +bool ProgramDescTracer::ContainVar(const std::weak_ptr &var) const { + auto vars_iter = vars_.find(var); + bool ret = (vars_iter != vars_.end()); + if (!ret) { + VLOG(5) << "Can't found variable: " << var.lock()->Name(); + } + return ret; +} + void ProgramDescTracer::InsertOp(const std::string &type, const NameVarBaseMap &inputs, const NameVarBaseMap &outputs, @@ -147,12 +157,16 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector feed_var_names; for (auto &feed_var : feed_vars) { - feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + if (ContainVar(feed_var)) { + feed_var_names.emplace_back(generator.NameOf(feed_var, feed_prefix)); + } } std::vector fetch_var_names; for (auto &fetch_var : fetch_vars) { - fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + if (ContainVar(fetch_var)) { + fetch_var_names.emplace_back(generator.NameOf(fetch_var, fetch_prefix)); + } } for (auto &op : ops_) { @@ -164,7 +178,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetInput(pair.first, std::move(names)); @@ -174,7 +190,9 @@ TracedProgramTuple ProgramDescTracer::CreateProgramDesc( std::vector names; names.reserve(pair.second.size()); for (auto &var : pair.second) { - names.emplace_back(generator.NameOf(var, tmp_prefix)); + if (ContainVar(var)) { + names.emplace_back(generator.NameOf(var, tmp_prefix)); + } } op_desc->SetOutput(pair.first, std::move(names)); diff --git a/paddle/fluid/imperative/jit/program_desc_tracer.h b/paddle/fluid/imperative/jit/program_desc_tracer.h index 8e2e59a49ed..b231efb0e53 100644 --- a/paddle/fluid/imperative/jit/program_desc_tracer.h +++ b/paddle/fluid/imperative/jit/program_desc_tracer.h @@ -66,7 +66,7 @@ class ProgramDescTracer { const std::string &feed_prefix, const std::vector> &fetch_vars, const std::string &fetch_prefix, const std::string &tmp_prefix) const; - + bool ContainVar(const std::weak_ptr &var) const; void Reset(); private: -- GitLab From ef0dd3efed254c96b8dba012867c101cffb5889a Mon Sep 17 00:00:00 2001 From: guofei 
<52460041+gfwm2013@users.noreply.github.com> Date: Fri, 12 Mar 2021 16:46:17 +0800 Subject: [PATCH 009/486] Support loading parameters from checkpoint to save quantized model (#31419) * Support loading parameters from checkpoint to save quantized model * Fix the unittest test_moving_average_abs_max_scale_op * Add unittest of save_quantized_model from checkpoint * Add comments to explain the function --- .../slim/quantization/imperative/qat.py | 246 +++++++++++------- .../slim/quantization/imperative/quant_nn.py | 74 +++--- .../slim/tests/test_imperative_out_scale.py | 113 +++++++- 3 files changed, 308 insertions(+), 125 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index c5ee9ea6751..afe8a3de667 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -17,11 +17,15 @@ import logging import numpy as np import sys import os +import warnings + import paddle -from paddle.fluid import dygraph, core, framework +from paddle.fluid import dygraph, core, framework, unique_name from paddle.fluid.executor import Executor +from paddle.fluid.param_attr import ParamAttr +from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D, BatchNorm1D, BatchNorm2D, BatchNorm3D +from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D, BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm from paddle.fluid.dygraph.nn import BatchNorm, Pool2D from paddle.fluid.io import load_inference_model, save_inference_model from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6, Tanh, Softmax, PReLU, Swish @@ -331,10 +335,73 @@ class ImperativeCalcOutScale(object): self._out_scale_layer_type_list = ( BatchNorm, BatchNorm1D, BatchNorm2D, BatchNorm3D, Conv2D, LeakyReLU, Linear, PReLU, Pool2D, MaxPool1D, MaxPool2D, ReLU, ReLU6, Sigmoid, - Softmax, Tanh, Swish) + Softmax, SyncBatchNorm, Tanh, Swish) self._register_hook_handle_list = [] self._out_scale_dict = collections.OrderedDict() + # Determine whether layer supports calculation out_scale + def _is_matched_layer(self, layer): + if not isinstance(layer, self._out_scale_layer_type_list): + if 'quantized_' not in layer.full_name(): + return False + return True + + # When inferenc model is saved, the logic in hook would not be executed + # in program translation, so that some parameters can not created in + # __init__, which would cause the model to fail to save. Therefore, the + # parameters creation in the hook is advanced to be exected outside the hook. 
+ def _add_new_parameters(self, layer, name=None): + dtype = layer._dtype if layer._dtype is not None else "float32" + if dtype not in ["float32", "float64"]: + return + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + layer._quant_out_scale = layer.create_parameter( + shape=[1], attr=scale_attr, dtype=dtype) + layer._quant_out_scale.stop_gradient = True + + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + layer._quant_out_state = layer.create_parameter( + shape=[1], attr=state_attr, dtype=dtype) + layer._quant_out_state.stop_gradient = True + + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + layer._quant_out_accum = layer.create_parameter( + shape=[1], attr=accum_attr, dtype=dtype) + layer._quant_out_accum.stop_gradient = True + + # Judge whether the op in program matches the Layer in dynamic model + def _is_op_matched(self, layer_name, op, block): + output_var_names = quantization_pass._get_op_output_var_names(op) + for output_var_name in output_var_names: + output_var_tensor = block.var(output_var_name) + if output_var_tensor.dtype not in [ + core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32 + ]: + return False + + # Because the naming styles of static and dynamic graph are different, + # in order to avoid mistakes, we unify the name here. + op_type = output_var_names[0].split(".")[0] + op_type = op_type.rsplit("_", 1)[0] + if op_type == 'depthwise_conv2d': + op_type = 'conv2d' + if 'prelu' in op_type: + op_type = op_type.replace('prelu', 'p_re_lu') + if 'relu' in op_type: + op_type = op_type.replace('relu', 're_lu') + return op_type in layer_name + def calc_out_scale(self, model): """ Insert the `moving_average_abs_max_scale` op to calculate output scale of Specific layers in model. 
@@ -348,12 +415,11 @@ class ImperativeCalcOutScale(object): assert isinstance( model, dygraph.Layer), "model must be the instance of dygraph.Layer" for _, layer in model.named_sublayers(): - if not isinstance(layer, self._out_scale_layer_type_list): - if 'quantized_' not in layer.full_name(): - continue - forward_post_hook_handle = layer.register_forward_post_hook( - self._forward_post_hook) - self._register_hook_handle_list.append(forward_post_hook_handle) + if self._is_matched_layer(layer): + self._add_new_parameters(layer) + forward_post_hook_handle = layer.register_forward_post_hook( + self._forward_post_hook) + self._register_hook_handle_list.append(forward_post_hook_handle) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -380,14 +446,26 @@ class ImperativeCalcOutScale(object): assert isinstance( layer, dygraph.Layer), "model must be the instance of dygraph.Layer" + self._layer = layer is_dynamic_mode = False with dygraph.guard(): - layer.eval() - for handle in self._register_hook_handle_list: - handle.remove() - for key in self._out_scale_dict: - self._out_scale_dict[key] = float(self._out_scale_dict[key] - .numpy()) + self._layer.eval() + if self._register_hook_handle_list is not None: + for handle in self._register_hook_handle_list: + handle.remove() + if self._out_scale_dict: + for key in self._out_scale_dict: + self._out_scale_dict[key] = float(self._out_scale_dict[key] + .numpy()) + else: + for _, sub_layer in self._layer.named_sublayers(): + if self._is_matched_layer(sub_layer): + layer_name = sub_layer.full_name() + if hasattr(sub_layer, "layer_name"): + layer_name = sub_layer.layer_name + if hasattr(sub_layer, "_quant_out_scale"): + self._out_scale_dict[layer_name] = float( + sub_layer._quant_out_scale) if paddle.in_dynamic_mode(): is_dynamic_mode = True @@ -413,74 +491,68 @@ class ImperativeCalcOutScale(object): model_filename=model_filename, params_filename=params_filename)) - # Traverse all ops in the program and find out the op matching - # the Layer in the dynamic graph. - layer_var_dict = collections.OrderedDict() - ops_list = [key for key, _ in self._out_scale_dict.items()] + check_behind_op = False op_count = 0 - conv_count = 0 - - for block in inference_program.blocks: - for op in block.ops: - if op.type in _op_real_in_out_name: - if op.type in ["batch_norm", "pool2d"]: - if op.type == "pool2d" and op.attr( - "pooling_type") != "max": - continue - op_count = self.op_match(op, ops_list, op_count) - if op_count >= len(ops_list): - continue - op._set_attr('out_threshold', - self._out_scale_dict[ops_list[op_count]]) - op_count += 1 - else: - output_var_names = quantization_pass._get_op_output_var_names( - op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in [ - core.VarDesc.VarType.FP64, - core.VarDesc.VarType.FP32 - ]: - continue - # Because the Layer in dygraph may correspond to multiple ops - # in static program after being saved. To ensure correctness, - # the outscale collected for output of dygraph Layer can only - # be set to the last op in the corresponding ops in static program. - # - # We can judge the execution order of the ops which corresponding - # to dygraph Layer by the name of output. And use dict to save - # the corresponding relationship between the dygraph Layer and the - # static graph op that needs to set the outscale attribute. - if '.' 
not in output_var_name: + ops_list = [key for key, _ in self._out_scale_dict.items()] + if len(ops_list) == 0: + warnings.warn( + "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " + "so the generated inference model would not contain the out_threshold." + ) + else: + # Because the Layer in dygraph may correspond to multiple ops + # in static program after being saved. To ensure correctness, + # the outscale collected for output of dygraph Layer can only + # be set to the last op in the corresponding ops in static program. + # + # We can judge the execution order of the ops which corresponding + # to dygraph Layer by check_behind_op + forward_op = None + for block in inference_program.blocks: + for op in block.ops: + if op.type in _op_real_in_out_name: + if op_count > len(ops_list): + warnings.warn( + "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" + ) + break + if check_behind_op: + check_behind_op = False + if op.type == "elementwise_add": + if self._is_op_matched(ops_list[op_count], op, + block): + op._set_attr("out_threshold", + self._out_scale_dict[ops_list[ + op_count]]) + op_count += 1 + forward_op = None continue - dynamic_layer_name, var_name_suffix = output_var_name.split( - ".") - if dynamic_layer_name in layer_var_dict: - if layer_var_dict[dynamic_layer_name][ - 0] < var_name_suffix: - layer_var_dict[dynamic_layer_name] = [ - var_name_suffix, op - ] else: - layer_var_dict[dynamic_layer_name] = [ - var_name_suffix, op - ] - - # Because the naming styles of static and dynamic graph are different, - # in order to avoid mistakes, we unify the name here. - for (layer_name, var_name_op_list) in layer_var_dict.items(): - if 'prelu' in layer_name: - layer_name = layer_name.replace('prelu', 'p_re_lu') - if 'relu' in layer_name: - layer_name = layer_name.replace('relu', 're_lu') - if 'conv2d' in layer_name: - layer_name = 'conv2d_' + str(conv_count) - conv_count = conv_count + 1 - if layer_name not in self._out_scale_dict: - continue - var_name_op_list[1]._set_attr('out_threshold', - self._out_scale_dict[layer_name]) + if forward_op is None: + raise ValueError( + "forward_op should not be None") + if self._is_op_matched(ops_list[op_count], + forward_op, block): + forward_op._set_attr( + "out_threshold", self._out_scale_dict[ + ops_list[op_count]]) + op_count += 1 + forward_op = None + + if op.type in ["conv2d", "depthwise_conv2d", "matmul"]: + check_behind_op = True + forward_op = op + continue + if op_count >= len(ops_list): + warnings.warn( + "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" + ) + break + if self._is_op_matched(ops_list[op_count], op, block): + op._set_attr( + "out_threshold", + self._out_scale_dict[ops_list[op_count]]) + op_count += 1 # Save the processed program. 
save_inference_model( @@ -495,14 +567,6 @@ class ImperativeCalcOutScale(object): if is_dynamic_mode: paddle.disable_static() - def op_match(self, op, ops_list, op_count): - while op_count < len(ops_list) and op.type not in ops_list[op_count]: - op_count += 1 - while op_count < len(ops_list) and op.type is "pool2d" and op.attr( - "pooling_type") != "max": - op_count += 1 - return op_count - def _forward_post_hook(self, layer, input, output): assert isinstance( output, (core.VarBase, framework.Variable) @@ -512,9 +576,9 @@ class ImperativeCalcOutScale(object): ]: return if not hasattr(layer, "_out_scale"): - layer._out_scale = quant_nn.MovingAverageAbsMaxScale( - output.name, self._moving_rate, output.dtype) - scale_out = layer._out_scale(output) + self._out_scale = quant_nn.MovingAverageAbsMaxScale( + layer, output.name, self._moving_rate, output.dtype) + scale_out = self._out_scale(output) if hasattr(layer, 'layer_name'): layer_name = layer.layer_name else: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 0469de7aef2..0b052d5dd0d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -503,7 +503,7 @@ class QuantizedNoweightLayer(layers.Layer): class MovingAverageAbsMaxScale(layers.Layer): - def __init__(self, name=None, moving_rate=0.9, dtype='float32'): + def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'): r""" MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer. Its computational formula is described as below: @@ -514,33 +514,48 @@ class MovingAverageAbsMaxScale(layers.Layer): super(MovingAverageAbsMaxScale, self).__init__() self._moving_rate = moving_rate self._dtype = dtype + self._layer = layer - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=name, initializer=Constant(1), trainable=False) - self._scale = self.create_parameter( - shape=[1], attr=scale_attr, dtype=self._dtype) - self._scale.stop_gradient = True + if self._layer is None or not hasattr(self._layer, "_quant_out_scale"): + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + self._scale = self.create_parameter( + shape=[1], attr=scale_attr, dtype=self._dtype) + self._scale.stop_gradient = True + if self._layer is not None: + setattr(self._layer, "_quant_out_scale", self._scale) + else: + self._scale = self._layer._quant_out_scale - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - self._state = self.create_parameter( - shape=[1], attr=state_attr, dtype=self._dtype) - self._state.stop_gradient = True + if self._layer is None or not hasattr(self._layer, "_quant_out_state"): + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + self._state = self.create_parameter( + shape=[1], attr=state_attr, dtype=self._dtype) + self._state.stop_gradient = True + if self._layer is not None: + setattr(self._layer, "_quant_out_state", self._state) + else: 
+ self._state = self._layer._quant_out_state - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - self._accum = self.create_parameter( - shape=[1], attr=accum_attr, dtype=self._dtype) - self._accum.stop_gradient = True - MovingAverageAbsMaxScale._has_create = True + if self._layer is None or not hasattr(self._layer, "_quant_out_accum"): + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + self._accum = self.create_parameter( + shape=[1], attr=accum_attr, dtype=self._dtype) + self._accum.stop_gradient = True + if self._layer is not None: + setattr(self._layer, "_quant_out_accum", self._accum) + else: + self._accum = self._layer._quant_out_accum def forward(self, input): if in_dygraph_mode(): @@ -549,18 +564,17 @@ class MovingAverageAbsMaxScale(layers.Layer): state = self._state if self.training else None accum = self._accum if self.training else None - out_scale, _, _ = core.ops.moving_average_abs_max_scale( + self._scale, _, _ = core.ops.moving_average_abs_max_scale( input, accum, state, self._scale, state, accum, *attrs) - return out_scale + return self._scale check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'MovingAverageAbsMaxScale') - scale_out = self._scale attrs = {'moving_rate': self._moving_rate, 'is_test': not self.training} inputs = {"X": [input]} - outputs = {"OutScale": [scale_out]} + outputs = {"OutScale": [self._scale]} if self.training: inputs['InState'] = [self._state] @@ -574,4 +588,4 @@ class MovingAverageAbsMaxScale(layers.Layer): outputs=outputs, attrs=attrs) - return scale_out + return self._scale diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 47e21910b48..83ddac41965 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -19,6 +19,8 @@ import numpy as np import random import unittest import logging +import warnings + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -29,7 +31,7 @@ from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware from paddle.fluid.contrib.slim.quantization import OutScaleForTrainingPass, OutScaleForInferencePass, QuantizationTransformPass from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, ReLU6 +from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D from paddle.fluid.dygraph.nn import Pool2D from paddle.fluid.log_helper import get_logger @@ -45,6 +47,14 @@ _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') +def get_vaild_warning_num(warning, w): + num = 0 + for i in range(len(w)): + if warning in str(w[i].message): + num += 1 + return num + + def StaticLenet(data, num_classes=10, classifier_activation='softmax'): conv2d_w1_attr = fluid.ParamAttr(name="conv2d_w_1") conv2d_w2_attr = fluid.ParamAttr(name="conv2d_w_2") @@ -76,9 +86,9 @@ def StaticLenet(data, num_classes=10, classifier_activation='softmax'): param_attr=conv2d_w2_attr, 
bias_attr=conv2d_b2_attr) batch_norm2 = layers.batch_norm(conv2) - relu6_1 = layers.relu6(batch_norm2) + prelu1 = layers.prelu(batch_norm2, mode='all') pool2 = fluid.layers.pool2d( - relu6_1, pool_size=2, pool_type='max', pool_stride=2) + prelu1, pool_size=2, pool_type='max', pool_stride=2) fc1 = fluid.layers.fc(input=pool2, size=120, @@ -132,7 +142,7 @@ class ImperativeLenet(fluid.dygraph.Layer): weight_attr=conv2d_w2_attr, bias_attr=conv2d_b2_attr), BatchNorm2D(16), - ReLU6(), + PReLU(), MaxPool2D( kernel_size=2, stride=2)) @@ -246,6 +256,10 @@ class TestImperativeOutSclae(unittest.TestCase): lenet.eval() + param_save_path = "test_save_quantized_model/lenet.pdparams" + save_dict = lenet.state_dict() + paddle.save(save_dict, param_save_path) + path = "./dynamic_outscale_infer_model/lenet" dynamic_save_dir = "./dynamic_outscale_infer_model" @@ -285,6 +299,8 @@ class TestImperativeOutSclae(unittest.TestCase): for param in main.all_parameters(): if "batch_norm" in param.name: param_name = param.name.replace("norm", "norm2d") + elif 'prelu' in param.name: + param_name = param.name.replace("prelu", 'p_re_lu') else: param_name = param.name param_tensor = scope.var(param.name).get_tensor() @@ -384,5 +400,94 @@ class TestImperativeOutSclae(unittest.TestCase): static_ops[i].attr("out_threshold")) +class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): + def test_save_quantized_model(self): + weight_quantize_type = 'abs_max' + activation_quantize_type = 'moving_average_abs_max' + load_param_path = "test_save_quantized_model/lenet.pdparams" + path = "./dynamic_outscale_infer_model_from_checkpoint/lenet" + dynamic_model_save_dir = "./dynamic_outscale_infer_model_from_checkpoint" + static_model_save_dir = "./static_outscale_infer_model" + + imperative_out_scale = ImperativeQuantAware( + weight_quantize_type=weight_quantize_type, + activation_quantize_type=activation_quantize_type) + + with fluid.dygraph.guard(): + lenet = ImperativeLenet() + load_dict = paddle.load(load_param_path) + imperative_out_scale.quantize(lenet) + lenet.set_dict(load_dict) + + imperative_out_scale.save_quantized_model( + layer=lenet, + path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + else: + place = core.CPUPlace() + exe = fluid.Executor(place) + + # load dynamic model + [dynamic_inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=dynamic_model_save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX)) + # load static model + [static_inference_program, feed_target_names, fetch_targets] = ( + fluid.io.load_inference_model( + dirname=static_model_save_dir, + executor=exe, + model_filename="lenet" + INFER_MODEL_SUFFIX, + params_filename="lenet" + INFER_PARAMS_SUFFIX)) + + dynamic_ops = dynamic_inference_program.global_block().ops + static_ops = static_inference_program.global_block().ops + + for op in dynamic_ops[:]: + if op.type == "flatten2" or 'fake' in op.type: + dynamic_ops.remove(op) + + for op in static_ops[:]: + if 'fake' in op.type: + static_ops.remove(op) + + for i in range(len(dynamic_ops)): + if dynamic_ops[i].has_attr("out_threshold"): + self.assertTrue(dynamic_ops[i].type == static_ops[i].type) + self.assertTrue(dynamic_ops[i].attr("out_threshold") == + static_ops[i].attr("out_threshold")) + + +class TestSaveQuantizedModel_Warning(unittest.TestCase): + def test_warning(self): + 
path = "./dynamic_outscale_infer_model_with_warnings/lenet" + imperative_out_scale = ImperativeQuantAware() + with fluid.dygraph.guard(): + lenet = ImperativeLenet() + + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + imperative_out_scale.save_quantized_model( + layer=lenet, + path=path, + input_spec=[ + paddle.static.InputSpec( + shape=[None, 1, 28, 28], dtype='float32') + ]) + + warning_message = "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " \ + "so the generated inference model would not contain the out_threshold." + num = get_vaild_warning_num(warning_message, w) + assert num == 1 + + if __name__ == '__main__': unittest.main() -- GitLab From 50ac7dbfd05e9a5fd8bf8a87111faa1f33590f67 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Fri, 12 Mar 2021 18:05:38 +0800 Subject: [PATCH 010/486] Trt elementwise plugin serialize (#31587) * add serialize unittest * fix element_op trt plugin serialize bug --- .../tensorrt/plugin/elementwise_op_plugin.cu | 9 +++- .../tensorrt/plugin/elementwise_op_plugin.h | 47 ++++++++++++++++- .../ir/inference/test_trt_subgraph_pass.py | 52 +++++++++++++++++++ 3 files changed, 105 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 457d9dd8737..cc17f8aa248 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -152,9 +152,14 @@ int ElementWisePlugin::enqueue(int batch_size, const void *const *inputs, int ElementwisePluginDynamic::initialize() { return 0; } -size_t ElementwisePluginDynamic::getSerializationSize() const { return 0; } +size_t ElementwisePluginDynamic::getSerializationSize() const { + return SerializedSize(type_.c_str()) + SerializedSize(axis_); +} -void ElementwisePluginDynamic::serialize(void *buffer) const {} +void ElementwisePluginDynamic::serialize(void *buffer) const { + SerializeValue(&buffer, type_.c_str()); + SerializeValue(&buffer, axis_); +} nvinfer1::DimsExprs ElementwisePluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index e37511868d8..49212aae9aa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -92,7 +92,12 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { public: explicit ElementwisePluginDynamic(const std::string& type, int axis) : type_(type), axis_(axis) {} - ElementwisePluginDynamic(void const* serialData, size_t serialLength) {} + ElementwisePluginDynamic(void const* serialData, size_t serialLength) { + const char* elementwise_type; + DeserializeValue(&serialData, &serialLength, &elementwise_type); + type_ = std::string(elementwise_type); + DeserializeValue(&serialData, &serialLength, &axis_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { return new ElementwisePluginDynamic(type_, axis_); } @@ -138,6 +143,46 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { std::string type_; int axis_; }; + +class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { + public: + ElementwisePluginV2Creator() {} + const char* getPluginName() const override { return "elementwise_plugin"; } + + const char* 
getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new ElementwisePluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator); #endif } // namespace plugin diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py index 2c77ce17231..bdcdeee8dcb 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_subgraph_pass.py @@ -414,6 +414,58 @@ class TensorRTSubgraphPassElementwiseMulTest( return fluid.layers.elementwise_mul(x=data1, y=data2) +class TensorRTSubgraphPassElementwiseSerializeTest( + TensorRTSubgraphPassElementwiseTest): + def setUp(self): + super(TensorRTSubgraphPassElementwiseSerializeTest, self).setUp() + self.trt_parameters = TensorRTSubgraphPassElementwiseTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + + def test_check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + super(TensorRTSubgraphPassElementwiseSerializeTest, + self).test_check_output() + + +class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data1 = fluid.data( + name="data1", shape=[-1, 3, 64, 64], dtype="float32") + data2 = fluid.data(name="data2", shape=[64, 64], dtype="float32") + eltwise_out = self.append_eltwise(data1, data2) + out = fluid.layers.batch_norm(eltwise_out, is_test=True) + self.feeds = { + "data1": np.random.random([1, 3, 64, 64]).astype("float32"), + "data2": np.random.random([64, 64]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + self.dynamic_shape_params = TensorRTSubgraphPassElementwiseBroadcastDynamicTest.DynamicShapeParam( + { + 'data1': [1, 3, 8, 64], + 'data2': [8, 64] + }, {'data1': [1, 3, 512, 64], + 'data2': + [512, 64]}, {'data1': [1, 3, 256, 64], + 'data2': [256, 64]}, False) + self.fetch_list = [out] + + def append_eltwise(self, data1, data2): + return fluid.layers.elementwise_add(x=data1, y=data2) + + def test_check_output(self): + if os.path.exists(self.path + "_opt_cache"): + shutil.rmtree(self.path + "_opt_cache") + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + class TensorRTSubgraphPassShuffleChannelTest(InferencePassTest): def setUp(self): with fluid.program_guard(self.main_program, 
self.startup_program): -- GitLab From cac9635a6733ffbbd816b33e21c3054e0cd81ab1 Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Fri, 12 Mar 2021 18:48:31 +0800 Subject: [PATCH 011/486] [Paddle-TRT] Fix engine key in trt int8 calibration (#31513) * fix engine key in trt int8 calibration * fix unit test --- .../ir_passes/tensorrt_subgraph_pass.cc | 26 ++++++++++++------- .../fluid/inference/api/analysis_predictor.cc | 4 +-- .../operators/tensorrt/tensorrt_engine_op.h | 12 ++++++--- .../tensorrt/tensorrt_engine_op_test.cc | 4 +++ 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 8a14e168ca4..59ed09b96cc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -86,7 +86,7 @@ std::string GenerateEngineKey(const std::set &engine_inputs, const std::string &predictor_id, const std::string &max_batch_size, const std::string &precision, - const std::string &use_calib_mode) { + const bool for_calibration) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -97,12 +97,13 @@ std::string GenerateEngineKey(const std::set &engine_inputs, engine_hash_key += "#"; } engine_hash_key += predictor_id; - engine_hash_key += "#"; - engine_hash_key += max_batch_size; + if (!for_calibration) { + engine_hash_key += "#"; + engine_hash_key += max_batch_size; + } engine_hash_key += "#"; engine_hash_key += precision; - engine_hash_key += "#"; - engine_hash_key += use_calib_mode; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); VLOG(2) << "TRT engine hash key: " << engine_hash_key; VLOG(2) << "TRT engine key: " << engine_key; @@ -258,24 +259,31 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // TODO(NHZlX) // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. + // serialization is affected by max_batch_size, but calibration is not. + // So we use seperate engine keys in serialization and calibration. auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(Get("max_batch_size")), - std::to_string(static_cast(precision_mode)), - std::to_string(static_cast(use_calib_mode))); + std::to_string(static_cast(precision_mode)), false); + auto calibration_engine_key = GenerateEngineKey( + input_names_with_id, output_names_with_id, std::to_string(0), + std::to_string(Get("max_batch_size")), + std::to_string(static_cast(precision_mode)), true); auto predictor_id = Get("predictor_id"); // Get "" when there is no cached calibration table data. 
std::string calibration_data = ""; if (enable_int8 && use_calib_mode) { - calibration_data = GetTrtCalibTableData( - Get("model_opt_cache_dir"), engine_key, enable_int8); + calibration_data = + GetTrtCalibTableData(Get("model_opt_cache_dir"), + calibration_engine_key, enable_int8); } op_desc->SetAttr("calibration_data", calibration_data); op_desc->SetAttr("enable_int8", enable_int8); op_desc->SetAttr("enable_fp16", enable_fp16); op_desc->SetAttr("use_calib_mode", use_calib_mode); op_desc->SetAttr("engine_key", engine_key); + op_desc->SetAttr("calibration_engine_key", calibration_engine_key); op_desc->SetAttr("predictor_id", predictor_id); std::string trt_engine_serialized_data = ""; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2a1dacedca8..d6080bd6928 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1017,8 +1017,8 @@ bool AnalysisPredictor::SaveTrtCalibToDisk() { auto &block = inference_program_->Block(0); for (auto &op_desc : block.AllOps()) { if (op_desc->Type() == "tensorrt_engine") { - std::string engine_name = - BOOST_GET_CONST(std::string, op_desc->GetAttr("engine_key")); + std::string engine_name = BOOST_GET_CONST( + std::string, op_desc->GetAttr("calibration_engine_key")); if (!Singleton::Global().Has(engine_name)) { LOG(ERROR) << "You should run the predictor(with trt) on the real data " "to generate calibration info"; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index b8805c025a7..1f0ae40798e 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -89,6 +89,7 @@ class TensorRTEngineOp : public framework::OperatorBase { bool use_calib_mode_; std::string calibration_data_; std::string engine_key_; + std::string calibration_engine_key_; bool calibration_mode_; int predictor_id_; int device_id_; @@ -109,6 +110,7 @@ class TensorRTEngineOp : public framework::OperatorBase { use_calib_mode_ = Attr("use_calib_mode"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); + calibration_engine_key_ = Attr("calibration_engine_key"); predictor_id_ = Attr("predictor_id"); auto params = Attr>("parameters"); @@ -172,9 +174,11 @@ class TensorRTEngineOp : public framework::OperatorBase { "Paddle TRT int8..."; int runtime_batch = 1; - if (!Singleton::Global().Has(engine_key_)) { + if (!Singleton::Global().Has( + calibration_engine_key_)) { TRTCalibratorEngine *calib_res = - Singleton::Global().Create(engine_key_); + Singleton::Global().Create( + calibration_engine_key_); std::unordered_map calib_buffers; for (auto &x : input_names_) { if (param_names_.count(x)) continue; @@ -185,7 +189,7 @@ class TensorRTEngineOp : public framework::OperatorBase { runtime_batch = t_shape[0]; } calib_res->calib_.reset(new TRTInt8Calibrator( - calib_buffers, runtime_batch, engine_key_, dev_place)); + calib_buffers, runtime_batch, calibration_engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( max_batch_size_, workspace_size_, precision_mode_, @@ -198,7 +202,7 @@ class TensorRTEngineOp : public framework::OperatorBase { TRTInt8Calibrator *temp_calibrator = Singleton::Global() - .Get(engine_key_) + .Get(calibration_engine_key_) ->calib_.get(); std::unordered_map calib_data; diff --git 
a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 1dcaccd6e92..4e88d79dfe4 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -102,6 +102,8 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); engine_op_desc.SetAttr("parameters", std::vector({})); engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("a_calib_engine")); engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); @@ -204,6 +206,8 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("parameters", std::vector({"y0", "y1", "y2", "y3"})); engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("b_calib_engine")); engine_op_desc.SetAttr("predictor_id", 1); engine_op_desc.SetAttr("calibration_data", std::string("")); engine_op_desc.SetAttr("enable_int8", static_cast(false)); -- GitLab From 30a627aaf3af775620cda524058a4baccf7b109b Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 15 Mar 2021 10:06:20 +0800 Subject: [PATCH 012/486] Normalized function parameter writing (#31588) --- .../extension/include/ext_op_meta_info.h | 251 ++++++++++-------- .../fluid/tests/custom_op/attr_test_op.cc | 181 ++++++++++--- .../fluid/tests/custom_op/custom_concat_op.cc | 7 +- .../tests/custom_op/test_custom_attrs_jit.py | 38 ++- 4 files changed, 316 insertions(+), 161 deletions(-) diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index 5b8d5a0bf5a..bad1d6ad9f0 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -80,30 +80,31 @@ inline std::string Vec(const std::string& t_name) { ////////////////////// Kernel Function (PD_KERNEL) //////////////////////// // Record Op kernel core function -using KernelFunc = std::vector (*)( - std::vector inputs, std::vector> vec_inputs, - std::vector attrs); - -#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ - template \ - struct ComputeCallHelper { \ - template \ - static Return Compute(std::vector inputs, \ - std::vector> vec_inputs, \ - std::vector attrs, \ - const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return ComputeCallHelper::template Compute< \ - in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ - pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator. Expected " #attr_type \ - " value."); \ - } \ - } \ +using KernelFunc = + std::vector (*)(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs); + +#define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ + template \ + struct ComputeCallHelper { \ + template \ + static Return Compute(const std::vector& inputs, \ + const std::vector>& vec_inputs, \ + const std::vector& attrs, \ + const PreviousArgs&... 
pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return ComputeCallHelper::template Compute< \ + in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ + pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator. Expected " #attr_type \ + " value."); \ + } \ + } \ } template @@ -114,9 +115,9 @@ struct KernelFuncImpl; template struct KernelFuncImpl { - static Return Compute(std::vector inputs, - std::vector> vec_inputs, - std::vector attrs) { + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs) { return ComputeCallHelper>::template Compute<0, 0, 0>( inputs, vec_inputs, attrs); } @@ -125,14 +126,13 @@ struct KernelFuncImpl { template struct ComputeCallHelper; - // for Tensor input template struct ComputeCallHelper { template - static Return Compute(std::vector inputs, - std::vector> vec_inputs, - std::vector attrs, + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, const PreviousArgs&... pargs) { const Tensor& arg = inputs[in_idx]; return ComputeCallHelper::template Compute { } }; - // for std::vector input template struct ComputeCallHelper&, Tail...> { template - static Return Compute(std::vector inputs, - std::vector> vec_inputs, - std::vector attrs, + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, const PreviousArgs&... pargs) { const std::vector& arg = vec_inputs[vec_in_idx]; return ComputeCallHelper::template Compute< @@ -157,6 +156,23 @@ struct KernelFuncImpl { } }; + PD_SPECIALIZE_ComputeCallHelper(const bool&); + PD_SPECIALIZE_ComputeCallHelper(const int&); + PD_SPECIALIZE_ComputeCallHelper(const float&); + PD_SPECIALIZE_ComputeCallHelper(const int64_t&); + PD_SPECIALIZE_ComputeCallHelper(const std::string&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + PD_SPECIALIZE_ComputeCallHelper(const std::vector&); + // TODO(chenweihang): support other attribute type if needed. + // Why not support other attribute type here? + // - boost::blank, std::vector and std::vector + // are not used in op + // - BlockDesc* and std::vector are used in framework + + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future PD_SPECIALIZE_ComputeCallHelper(bool); PD_SPECIALIZE_ComputeCallHelper(int); PD_SPECIALIZE_ComputeCallHelper(float); @@ -166,18 +182,15 @@ struct KernelFuncImpl { PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); PD_SPECIALIZE_ComputeCallHelper(std::vector); - // TODO(chenweihang): support other attribute type if needed. - // Why not support other attribute type here? - // - boost::blank, std::vector and std::vector - // are not used in op - // - BlockDesc* and std::vector are used in framework + // end: base template template struct ComputeCallHelper> { template - static Return Compute(std::vector inputs, - std::vector> vec_inputs, - std::vector attrs, const Args&... args) { + static Return Compute(const std::vector& inputs, + const std::vector>& vec_inputs, + const std::vector& attrs, + const Args&... 
args) { return impl_fn(args...); } }; @@ -190,8 +203,40 @@ struct KernelFuncImpl { // Record Op infershape core function using InferShapeFunc = std::vector> (*)( - std::vector> input_shapes, - std::vector>> vec_input_shapes); + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes); + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape( \ + input_shapes, vec_input_shapes, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1>(input_shapes, vec_input_shapes, pargs..., \ + arg); \ + } \ + } template struct InferShapeFuncImpl; @@ -199,8 +244,8 @@ struct InferShapeFuncImpl; template struct InferShapeFuncImpl { static Return InferShape( - std::vector> input_shapes, - std::vector>> vec_input_shapes) { + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes) { return InferShapeCallHelper>::template InferShape<0, 0>( input_shapes, vec_input_shapes); @@ -210,41 +255,23 @@ struct InferShapeFuncImpl { template struct InferShapeCallHelper; - template - struct InferShapeCallHelper, Tail...> { - template - static Return InferShape( - std::vector> input_shapes, - std::vector>> vec_input_shapes, - const PreviousArgs&... pargs) { - std::vector arg = input_shapes[in_idx]; - return InferShapeCallHelper::template InferShape( - input_shapes, vec_input_shapes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + const std::vector>&); - template - struct InferShapeCallHelper>, Tail...> { - template - static Return InferShape( - std::vector> input_shapes, - std::vector>> vec_input_shapes, - const PreviousArgs&... pargs) { - std::vector> arg = vec_input_shapes[vec_in_idx]; - return InferShapeCallHelper::template InferShape( - input_shapes, vec_input_shapes, pargs..., arg); - } - }; + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(std::vector); + PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( + std::vector>); // end: base template template struct InferShapeCallHelper> { template static Return InferShape( - std::vector> input_shapes, - std::vector>> vec_input_shapes, + const std::vector>& input_shapes, + const std::vector>>& vec_input_shapes, const Args&... 
args) { return impl_fn(args...); } @@ -258,8 +285,38 @@ struct InferShapeFuncImpl { // Record Op Infer dtype core function using InferDtypeFunc = std::vector (*)( - std::vector input_dtypes, - std::vector> vec_input_dtypes); + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes); + +#define PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = input_dtypes[in_idx]; \ + return InferDtypeCallHelper::template InferDtype( \ + input_dtypes, vec_input_dtypes, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(input_type) \ + template \ + struct InferDtypeCallHelper { \ + template \ + static Return InferDtype( \ + const std::vector& input_dtypes, \ + const std::vector>& vec_input_dtypes, \ + const PreviousArgs&... pargs) { \ + input_type arg = vec_input_dtypes[vec_in_idx]; \ + return InferDtypeCallHelper::template InferDtype< \ + in_idx, vec_in_idx + 1>(input_dtypes, vec_input_dtypes, pargs..., \ + arg); \ + } \ + } template struct InferDtypeFuncImpl; @@ -267,8 +324,8 @@ struct InferDtypeFuncImpl; template struct InferDtypeFuncImpl { static Return InferDtype( - std::vector input_dtypes, - std::vector> vec_input_dtypes) { + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes) { return InferDtypeCallHelper>::template InferDtype<0, 0>( input_dtypes, vec_input_dtypes); @@ -278,41 +335,21 @@ struct InferDtypeFuncImpl { template struct InferDtypeCallHelper; - template - struct InferDtypeCallHelper { - template - static Return InferDtype( - std::vector input_dtypes, - std::vector> vec_input_dtypes, - const PreviousArgs&... pargs) { - DataType arg = input_dtypes[in_idx]; - return InferDtypeCallHelper::template InferDtype( - input_dtypes, vec_input_dtypes, pargs..., arg); - } - }; + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(const DataType&); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(const std::vector&); - template - struct InferDtypeCallHelper, Tail...> { - template - static Return InferDtype( - std::vector input_dtypes, - std::vector> vec_input_dtypes, - const PreviousArgs&... pargs) { - std::vector arg = vec_input_dtypes[vec_in_idx]; - return InferDtypeCallHelper::template InferDtype( - input_dtypes, vec_input_dtypes, pargs..., arg); - } - }; + // NOTE(chenweihang): Used to be compatible with the 2.0.1 released + // interface, and will be deprecated in the future + PD_SPECIALIZE_InferDtypeCallHelper_TO_DTYPE(DataType); + PD_SPECIALIZE_InferDtypeCallHelper_FOR_DTYPES(std::vector); // end: base template template struct InferDtypeCallHelper> { template static Return InferDtype( - std::vector input_dtypes, - std::vector> vec_input_dtypes, + const std::vector& input_dtypes, + const std::vector>& vec_input_dtypes, const Args&... 
args) { return impl_fn(args...); } diff --git a/python/paddle/fluid/tests/custom_op/attr_test_op.cc b/python/paddle/fluid/tests/custom_op/attr_test_op.cc index 97aae106137..1edc10b8a8a 100644 --- a/python/paddle/fluid/tests/custom_op/attr_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/attr_test_op.cc @@ -27,27 +27,15 @@ void assign_cpu_kernel(const data_t* x_data, } } -std::vector AttrTestForward( - const paddle::Tensor& x, - bool bool_attr, - int int_attr, - float float_attr, - int64_t int64_attr, - std::string str_attr, - std::vector int_vec_attr, - std::vector float_vec_attr, - std::vector int64_vec_attr, - std::vector str_vec_attr) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU); - out.reshape(x.shape()); - - PD_DISPATCH_FLOATING_TYPES( - x.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - x.data(), out.mutable_data(), x.size()); - })); - - // Check attrs value +void CheckAllForwardAttrs(const bool& bool_attr, + const int& int_attr, + const float& float_attr, + const int64_t& int64_attr, + const std::string& str_attr, + const std::vector& int_vec_attr, + const std::vector& float_vec_attr, + const std::vector& int64_vec_attr, + const std::vector& str_vec_attr) { if (bool_attr != true) { throw std::runtime_error("bool_attr value error."); } @@ -103,26 +91,11 @@ std::vector AttrTestForward( } } } - - return {out}; } -// The attrs of backward op must be the subset of attrs of forward op -std::vector AttrTestBackward( - const paddle::Tensor& grad_out, - int int_attr, - std::vector float_vec_attr, - std::vector str_vec_attr) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU); - grad_x.reshape(grad_out.shape()); - - PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { - assign_cpu_kernel( - grad_out.data(), - grad_x.mutable_data(), - grad_out.size()); - })); - +void CheckAllBackwardAttrs(const int& int_attr, + const std::vector& float_vec_attr, + const std::vector& str_vec_attr) { if (int_attr != 10) { throw std::runtime_error("int_attr value error."); } @@ -146,6 +119,114 @@ std::vector AttrTestBackward( } } } +} + +std::vector AttrTestForward( + const paddle::Tensor& x, + bool bool_attr, + int int_attr, + float float_attr, + int64_t int64_attr, + std::string str_attr, + std::vector int_vec_attr, + std::vector float_vec_attr, + std::vector int64_vec_attr, + std::vector str_vec_attr) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + // Check attrs value + CheckAllForwardAttrs(bool_attr, + int_attr, + float_attr, + int64_attr, + str_attr, + int_vec_attr, + float_vec_attr, + int64_vec_attr, + str_vec_attr); + + return {out}; +} + +// The attrs of backward op must be the subset of attrs of forward op +std::vector AttrTestBackward( + const paddle::Tensor& grad_out, + int int_attr, + std::vector float_vec_attr, + std::vector str_vec_attr) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU); + grad_x.reshape(grad_out.shape()); + + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); + + CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); + + return {grad_x}; +} + +std::vector ConstAttrTestForward( + const paddle::Tensor& x, + const bool& bool_attr, + const int& int_attr, + const float& float_attr, + const int64_t& int64_attr, + const std::string& 
str_attr, + const std::vector& int_vec_attr, + const std::vector& float_vec_attr, + const std::vector& int64_vec_attr, + const std::vector& str_vec_attr) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + // Check attrs value + CheckAllForwardAttrs(bool_attr, + int_attr, + float_attr, + int64_attr, + str_attr, + int_vec_attr, + float_vec_attr, + int64_vec_attr, + str_vec_attr); + + return {out}; +} + +// The attrs of backward op must be the subset of attrs of forward op +std::vector ConstAttrTestBackward( + const paddle::Tensor& grad_out, + const int& int_attr, + const std::vector& float_vec_attr, + const std::vector& str_vec_attr) { + auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU); + grad_x.reshape(grad_out.shape()); + + PD_DISPATCH_FLOATING_TYPES(grad_out.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + grad_out.data(), + grad_x.mutable_data(), + grad_out.size()); + })); + + CheckAllBackwardAttrs(int_attr, float_vec_attr, str_vec_attr); return {grad_x}; } @@ -171,3 +252,25 @@ PD_BUILD_GRAD_OP(attr_test) "float_vec_attr: std::vector", "str_vec_attr: std::vector"}) .SetKernelFn(PD_KERNEL(AttrTestBackward)); + +PD_BUILD_OP(const_attr_test) + .Inputs({"X"}) + .Outputs({"Out"}) + .Attrs({"bool_attr: bool", + "int_attr: int", + "float_attr: float", + "int64_attr: int64_t", + "str_attr: std::string", + "int_vec_attr: std::vector", + "float_vec_attr: std::vector", + "int64_vec_attr: std::vector", + "str_vec_attr: std::vector"}) + .SetKernelFn(PD_KERNEL(AttrTestForward)); + +PD_BUILD_GRAD_OP(const_attr_test) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .Attrs({"int_attr: int", + "float_vec_attr: std::vector", + "str_vec_attr: std::vector"}) + .SetKernelFn(PD_KERNEL(AttrTestBackward)); diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc index 4ea39303991..2d8d0ccb88f 100644 --- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc @@ -122,13 +122,14 @@ std::vector ConcatBackwardDynamicAxis( } std::vector> ConcatInferShapeDynamicAxis( - std::vector> input_shapes, - std::vector axis_shape) { + const std::vector>& input_shapes, + const std::vector& axis_shape) { return {std::vector(input_shapes[0].size(), -1)}; } std::vector ConcatInferDtypeDynamicAxis( - std::vector input_dtypes, paddle::DataType axis_dtype) { + const std::vector& input_dtypes, + const paddle::DataType& axis_dtype) { return {input_dtypes[0]}; } diff --git a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py index a6278e3ffc3..1c9c6eedbae 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_attrs_jit.py @@ -40,24 +40,38 @@ custom_attrs = load( class TestJitCustomAttrs(unittest.TestCase): - def test_attr_value(self): + def setUp(self): paddle.set_device('cpu') # prepare test value - bool_attr = True - int_attr = 10 - float_attr = 3.14 - int64_attr = 10000000000 - str_attr = "StrAttr" - int_vec_attr = [10, 10, 10] - float_vec_attr = [3.14, 3.14, 3.14] - int64_vec_attr = [10000000000, 10000000000, 10000000000] - str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"] + self.bool_attr = True + self.int_attr = 10 + self.float_attr = 3.14 + 
self.int64_attr = 10000000000 + self.str_attr = "StrAttr" + self.int_vec_attr = [10, 10, 10] + self.float_vec_attr = [3.14, 3.14, 3.14] + self.int64_vec_attr = [10000000000, 10000000000, 10000000000] + self.str_vec_attr = ["StrAttr", "StrAttr", "StrAttr"] + def test_attr_value(self): x = paddle.ones([2, 2], dtype='float32') x.stop_gradient = False out = custom_attrs.attr_test( - x, bool_attr, int_attr, float_attr, int64_attr, str_attr, - int_vec_attr, float_vec_attr, int64_vec_attr, str_vec_attr) + x, self.bool_attr, self.int_attr, self.float_attr, self.int64_attr, + self.str_attr, self.int_vec_attr, self.float_vec_attr, + self.int64_vec_attr, self.str_vec_attr) + out.stop_gradient = False + out.backward() + + self.assertTrue(np.array_equal(x.numpy(), out.numpy())) + + def test_const_attr_value(self): + x = paddle.ones([2, 2], dtype='float32') + x.stop_gradient = False + out = custom_attrs.const_attr_test( + x, self.bool_attr, self.int_attr, self.float_attr, self.int64_attr, + self.str_attr, self.int_vec_attr, self.float_vec_attr, + self.int64_vec_attr, self.str_vec_attr) out.stop_gradient = False out.backward() -- GitLab From a32e8bf1e7fb45b9bae85e80fe7742eae8739fac Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Mon, 15 Mar 2021 10:29:54 +0800 Subject: [PATCH 013/486] DataLoader supprot dict str (#31481) * add dict/str/list supprot for DataLoader. test=develop --- paddle/fluid/imperative/data_loader.cc | 24 +- .../fluid/operators/reader/blocking_queue.h | 12 +- paddle/fluid/pybind/reader_py.cc | 10 +- python/paddle/fluid/dataloader/collate.py | 87 +++++ .../fluid/dataloader/dataloader_iter.py | 342 +++--------------- python/paddle/fluid/dataloader/flat.py | 150 ++++++++ python/paddle/fluid/dataloader/worker.py | 253 +++++++++++++ python/paddle/fluid/multiprocess_utils.py | 4 + .../test_multiprocess_dataloader_dataset.py | 57 +++ ...ocess_dataloader_iterable_dataset_split.py | 4 +- 10 files changed, 646 insertions(+), 297 deletions(-) create mode 100644 python/paddle/fluid/dataloader/collate.py create mode 100644 python/paddle/fluid/dataloader/flat.py create mode 100644 python/paddle/fluid/dataloader/worker.py diff --git a/paddle/fluid/imperative/data_loader.cc b/paddle/fluid/imperative/data_loader.cc index 71ea82e9a19..c43149c9b56 100644 --- a/paddle/fluid/imperative/data_loader.cc +++ b/paddle/fluid/imperative/data_loader.cc @@ -71,9 +71,12 @@ void EraseLoadProcessPIDs(int64_t key) { } \ } while (0) -#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ - static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ - SIGNAL_HANDLE(SIGNAL); \ +#define REGISTER_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME, ERROR_MSG) \ + static void HANDLER_NAME(int sig, siginfo_t *info, void *ctx) { \ + auto _w = \ + write(STDERR_FILENO, ERROR_MSG, sizeof(ERROR_MSG) / sizeof(char)); \ + (void)_w; \ + SIGNAL_HANDLE(SIGNAL); \ } #define REGISTER_SPEC_SIGNAL_HANDLER(SIGNAL, HANDLER_NAME) \ @@ -84,8 +87,18 @@ void EraseLoadProcessPIDs(int64_t key) { SIGNAL_HANDLE(SIGNAL); \ } -REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler); -REGISTER_SIGNAL_HANDLER(SIGBUS, SIGBUS_handler); +REGISTER_SIGNAL_HANDLER(SIGSEGV, SIGSEGV_handler, + "ERROR: Unexpected segmentation fault encountered in " + "DataLoader workers.\n"); +REGISTER_SIGNAL_HANDLER( + SIGBUS, SIGBUS_handler, + "ERROR: Unexpected BUS error encountered in DataLoader worker. 
" + "This might be caused by insufficient shared memory (shm), " + "please check whether use_shared_memory is set and storage space " + "in /dev/shm is enough\n"); +REGISTER_SIGNAL_HANDLER(SIGFPE, SIGFPE_handler, + "ERROR: Unexpected floating-point exception " + "encountered in DataLoader worker.\n") REGISTER_SPEC_SIGNAL_HANDLER(SIGTERM, SIGTERM_handler); static inline void setSignalHandler(int signal, @@ -105,6 +118,7 @@ static inline void setSignalHandler(int signal, void SetLoadProcessSignalHandler() { setSignalHandler(SIGSEGV, &SIGSEGV_handler, nullptr); setSignalHandler(SIGBUS, &SIGBUS_handler, nullptr); + setSignalHandler(SIGFPE, &SIGFPE_handler, nullptr); setSignalHandler(SIGTERM, &SIGTERM_handler, nullptr); } diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 8929da20b53..f126070a7eb 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -45,7 +45,11 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait( lock, [&] { return queue_.size() < capacity_ || closed_ || killed_; }); - EnforceNotKilled(); + if (killed_) { + VLOG(3) + << "WARNING:: Sending an element to a killed reader::BlokcingQueue"; + return false; + } if (closed_) { VLOG(5) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; @@ -66,7 +70,11 @@ class BlockingQueue { std::unique_lock lock(mutex_); send_cv_.wait( lock, [&] { return queue_.size() < capacity_ || closed_ || killed_; }); - EnforceNotKilled(); + if (killed_) { + VLOG(3) + << "WARNING:: Sending an element to a killed reader::BlokcingQueue"; + return false; + } if (closed_) { VLOG(5) << "WARNING: Sending an element to a closed reader::BlokcingQueue."; diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc index 856c5aac5eb..abe1977eb69 100644 --- a/paddle/fluid/pybind/reader_py.cc +++ b/paddle/fluid/pybind/reader_py.cc @@ -223,6 +223,10 @@ class MultiDeviceFeedReader { ReadAsync(); } + void Shutdown() { + for (auto &r : readers_) r->Shutdown(); + } + ~MultiDeviceFeedReader() { queue_->Close(); pool_.reset(); @@ -266,10 +270,6 @@ class MultiDeviceFeedReader { } } - void Shutdown() { - for (auto &r : readers_) r->Shutdown(); - } - void Start() { for (auto &r : readers_) r->Start(); } @@ -362,6 +362,8 @@ void BindMultiDeviceReader(py::module *module, const char *reader_name) { }, py::call_guard()) .def("reset", &ReaderType::Reset, + py::call_guard()) + .def("shutdown", &ReaderType::Shutdown, py::call_guard()); } diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py new file mode 100644 index 00000000000..ddc010d0428 --- /dev/null +++ b/python/paddle/fluid/dataloader/collate.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numbers +import numpy as np +from ..framework import in_dygraph_mode +from .. 
import core, layers + +try: + from collections.abc import Sequence, Mapping +except: + from collections import Sequence, Mapping + + +def default_collate_fn(batch): + """ + Default batch collating function for :code:`paddle.io.DataLoader`, + batch should be a list of samples, and each sample should be a list + of fields as follows: + + [[filed1, filed2, ...], [filed1, filed2, ...], ...] + + This default collate function zipped each filed together and stack + each filed as the batch field as follows: + + [batch_filed1, batch_filed2, ...] + + Args: + batch(list of list of numpy array|paddle.Tensor): the batch data, each fields + should be a numpy array, each sample should be a list of + fileds, and batch should be a list of sample. + + Returns: + a list of numpy array|Paddle.Tensor: collated batch of input batch data, + fields data type as same as fields in each sample. + """ + sample = batch[0] + if isinstance(sample, np.ndarray): + batch = np.stack(batch, axis=0) + return batch + elif isinstance(sample, paddle.Tensor): + return layers.stack(batch, axis=0) + elif isinstance(sample, numbers.Number): + batch = np.array(batch) + return batch + elif isinstance(sample, (str, bytes)): + return batch + elif isinstance(sample, Mapping): + return { + key: default_collate_fn([d[key] for d in batch]) + for key in sample + } + elif isinstance(sample, Sequence): + sample_fields_num = len(sample) + if not all(len(sample) == sample_fields_num for sample in iter(batch)): + raise RuntimeError( + "fileds number not same among samples in a batch") + return [default_collate_fn(fields) for fields in zip(*batch)] + + raise TypeError("batch data con only contains: tensor, numpy.ndarray, " + "dict, list, number, but got {}".format(type(sample))) + return outputs + + +def default_convert_fn(batch): + if isinstance(batch, (paddle.Tensor, np.ndarray)): + return batch + elif isinstance(batch, (str, bytes)): + return batch + elif isinstance(batch, Mapping): + return {key: default_convert_fn(batch[key]) for key in batch} + elif isinstance(batch, Sequence): + return [default_convert_fn(d) for d in batch] + else: + return batch diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 0dd2420691a..0cd12e874d9 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -35,181 +35,16 @@ else: import paddle from .. 
import core, layers from ..framework import in_dygraph_mode -from ..multiprocess_utils import CleanupFuncRegistrar, _cleanup_mmap, _set_SIGCHLD_handler +from ..multiprocess_utils import _set_SIGCHLD_handler, MP_STATUS_CHECK_INTERVAL from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher from .batch_sampler import _InfiniteIterableSampler +from .collate import default_collate_fn, default_convert_fn +from .worker import ParentWatchDog, get_worker_info, _worker_loop, \ + _DatasetKind, _IterableDatasetStopIteration, _WorkerException +from .flat import _flatten_batch, _restore_batch __all__ = ['get_worker_info'] -# multi-process worker check indices queue interval, avoid -# hanging in subprocess data loading -MP_INDICES_CHECK_INTERVAL = 5 - -_IterableDatasetStopIteration = namedtuple('_IterableDatasetStopIteration', - ['worker_id']) - - -def default_collate_fn(batch): - """ - Default batch collating function for :code:`fluid.io.DataLoader`, - batch should be a list of samples, and each sample should be a list - of fields as follows: - - [[filed1, filed2, ...], [filed1, filed2, ...], ...] - - This default collate function zipped each filed together and stack - each filed as the batch field as follows: - - [batch_filed1, batch_filed2, ...] - - Args: - batch(list of list of numpy array): the batch data, each fields - should be a numpy array, each sample should be a list of - fileds, and batch should be a list of sample. - - Returns: - a list of numpy array: collated batch - """ - sample = batch[0] - # dataset has only 1 field - if isinstance(sample, np.ndarray): - return [np.stack(batch, axis=0)] - - # batch each field - slots = [] - for items in batch: - for i, item in enumerate(items): - if len(slots) < len(items): - slots.append([item]) - else: - slots[i].append(item) - - outputs = [] - for slot in slots: - if isinstance(slot[0], (np.ndarray, np.bool, numbers.Number)): - tmp = np.stack(slot, axis=0) - outputs.append(tmp) - elif isinstance(slot[0], paddle.Tensor): - tmp = layers.stack(slot, axis=0) - outputs.append(tmp) - else: - raise RuntimeError("Unknown data type {}".format(type(slot[0]))) - return outputs - - -class _DatasetKind(object): - MAP = 0 - ITER = 1 - - @staticmethod - def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, - drop_last): - if kind == _DatasetKind.MAP: - return _MapDatasetFetcher(dataset, auto_collate_batch, collate_fn, - drop_last) - elif kind == _DatasetKind.ITER: - return _IterableDatasetFetcher(dataset, auto_collate_batch, - collate_fn, drop_last) - else: - raise NotImplementedError("unknown Dataset kind {}".format(kind)) - - -class ParentWatchDog(object): - def __init__(self): - self._parent_pid = os.getppid() - self._parent_alive = True - - def is_alive(self): - if self._parent_alive: - self._parent_alive = os.getppid() == self._parent_pid - return self._parent_alive - - -# worker information for each workers, used for splitting data copy -# for IteratorDataset in worker processes. 
-_worker_info = None - - -def get_worker_info(): - """ - Get DataLoader worker process information function, this function is - used to split data copy in worker process for IterableDataset - (see :code:`paddle.io.IterableDataset`), worker information contains - following fields: - - :attr:`num_workers`: total worker process number, see `paddle.io.DataLoader` - - :attr:`id`: the worker processs id, count from 0 to :attr:`num_workers - 1` - - :attr:`dataset`: the dataset object in this worker process - - Returns: - WorkerInfo: an instance of WorkerInfo which contains fields above. - - .. note:: - For mode usage and exampls, please see :code:`paddle.io.IterableDataset` - - Example: - - .. code-block:: python - - import math - import paddle - import numpy as np - from paddle.io import IterableDataset, DataLoader, get_worker_info - - class SplitedIterableDataset(IterableDataset): - def __init__(self, start, end): - self.start = start - self.end = end - - def __iter__(self): - worker_info = get_worker_info() - if worker_info is None: - iter_start = self.start - iter_end = self.end - else: - per_worker = int( - math.ceil((self.end - self.start) / float( - worker_info.num_workers))) - worker_id = worker_info.id - iter_start = self.start + worker_id * per_worker - iter_end = min(iter_start + per_worker, self.end) - - for i in range(iter_start, iter_end): - yield np.array([i]) - - place = paddle.CPUPlace() - dataset = SplitedIterableDataset(start=2, end=9) - dataloader = DataLoader( - dataset, - places=place, - num_workers=2, - batch_size=1, - drop_last=True) - - for data in dataloader: - print(data) - # outputs: [2, 5, 3, 6, 4, 7] - - """ - return _worker_info - - -class WorkerInfo(object): - __initialized = False - - def __init__(self, **kwargs): - for k, v in kwargs.items(): - setattr(self, k, v) - self.__initialized = True - - def __setattr__(self, key, val): - if self.__initialized: - raise RuntimeError("Cannot assign attributes to {} objects".format( - self.__class__.__name__)) - return super(WorkerInfo, self).__setattr__(key, val) - class _DataLoaderIterBase(object): """ @@ -230,7 +65,7 @@ class _DataLoaderIterBase(object): self._num_workers = loader.num_workers self._use_buffer_reader = loader.use_buffer_reader self._use_shared_memory = loader.use_shared_memory - self._timeout = loader.timeout if loader.timeout > 0 else MP_INDICES_CHECK_INTERVAL + self._timeout = loader.timeout if loader.timeout > 0 else MP_STATUS_CHECK_INTERVAL self._worker_init_fn = loader.worker_init_fn self._dataset_kind = loader.dataset_kind self._pin_memory = loader.pin_memory @@ -244,7 +79,7 @@ class _DataLoaderIterBase(object): else: self._sampler_iter = iter( _InfiniteIterableSampler(self._dataset, 1)) - self._collate_fn = loader.collate_fn + self._collate_fn = loader.collate_fn or default_convert_fn # LoDTensorBlockingQueue instance for create_py_reader and a thread # to put mini-batch data to self._blocking_queue, mini-batch data @@ -275,6 +110,14 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): self._dataset_kind, self._dataset, self._auto_collate_batch, self._collate_fn, True) + # NOTE: _structrue_infos used to record the data structure of + # batch to restore batch structure after reading Tensor + # from blocking_queue in single-process mode. 
Note that + # only single process is used in single-process mode, we + # can record the data structure sequencely in a list without + # recording the send and recv index + self._structure_infos = [] + # NOTE: len(self._places) batch data compose as an output # iteration, set blocking_queue can cache 2 iteration datas # at most here @@ -316,16 +159,14 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): # read data from dataset in mini-batch batch = self._dataset_fetcher.fetch(indices) + # flat batch and record structure infos + batch, structure = _flatten_batch(batch) + self._structure_infos.append(structure) + # pack as LoDTensorArray array = core.LoDTensorArray() for slot in batch: if not isinstance(slot, core.LoDTensor): - # FIXME(dkp): blocking_queue only support - # core.LoDTensorArray as input now, read - # numpy data into a LoDTensorArray here, - # should support paddle.Tensor list later - if isinstance(slot, paddle.Tensor): - slot = slot.numpy() tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp @@ -348,20 +189,29 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): def __next__(self): try: if in_dygraph_mode(): - return self._reader.read_next_var_list() + data = self._reader.read_next_var_list() + data = _restore_batch(data, self._structure_infos.pop(0)) else: if self._return_list: + data = self._reader.read_next_list() + data = [ + _restore_batch(d, s) + for d, s in zip(data, self._structure_infos[:len( + self._places)]) + ] + self._structure_infos = self._structure_infos[len( + self._places):] # static graph organized data on multi-device with list, if # place number is 1, there is only 1 device, extra the data # from list for devices to be compatible with dygraph mode if len(self._places) == 1: - return self._reader.read_next_list()[0] - else: - return self._reader.read_next_list() + data = data[0] else: - return self._reader.read_next() + data = self._reader.read_next() + + return data except StopIteration: - self._reader.reset() + self._reader.shutdown() six.reraise(*sys.exc_info()) # python2 compatibility @@ -375,97 +225,6 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): self._blocking_queue.close() -# NOTE(chenweihang): _worker_loop must be top level method to be pickled -def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, - auto_collate_batch, collate_fn, init_fn, worker_id, - num_workers, use_shared_memory): - try: - # NOTE: [ mmap files clear ] When the child process exits unexpectedly, - # some shared memory objects may have been applied for but have not yet - # been put into the inter-process Queue. This part of the object needs - # to be cleaned up when the process ends. 
- CleanupFuncRegistrar.register(_cleanup_mmap) - - # set signal handler - core._set_process_signal_handler() - - global _worker_info - _worker_info = WorkerInfo( - id=worker_id, num_workers=num_workers, dataset=dataset) - - init_exception = None - try: - if init_fn is not None: - init_fn(worker_id) - fetcher = _DatasetKind.create_fetcher( - dataset_kind, dataset, auto_collate_batch, collate_fn, True) - except: - init_exception = Exception("init_fn failed in worker {}: " \ - "{}".format(worker_id, sys.exc_info())) - - iterator_drained = False - parent_watch_dog = ParentWatchDog() - - while parent_watch_dog.is_alive(): - try: - data = indices_queue.get(MP_INDICES_CHECK_INTERVAL) - except queue.Empty: - continue - - # None as poison piil, so worker event should be set - if data is None: - assert done_event.is_set() or iterator_drained, \ - "get None when worker done_event set" - break - # If worker done event is set but get still get data in - # indices_queue, remaining data should be get and skipped. - if done_event.is_set() or iterator_drained: - continue - - idx, indices = data - try: - if init_exception is not None: - batch = init_exception - init_exception = None - else: - batch = fetcher.fetch(indices) - except Exception as e: - if isinstance( - e, StopIteration) and dataset_kind == _DatasetKind.ITER: - out_queue.put(_IterableDatasetStopIteration(worker_id)) - iterator_drained = True - else: - out_queue.put((idx, e)) - else: - if use_shared_memory: - # FIXME(dkp): _convert_to_tensor_list only support np.array - # list now, should support paddle.Tensor list - new_batch = [] - for sample in batch: - new_sample = [] - for s in sample: - if isinstance(s, paddle.Tensor): - new_sample.append(s.numpy()) - else: - new_sample.append(s) - new_batch.append(new_sample) - batch = new_batch - - tensor_list = core._convert_to_tensor_list(batch) - out_queue.put((idx, tensor_list)) - core._remove_tensor_list_mmap_fds(tensor_list) - else: - out_queue.put((idx, batch)) - except KeyboardInterrupt: - # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process - pass - except: - six.reraise(*sys.exc_info()) - finally: - if use_shared_memory: - _cleanup_mmap() - - class _DataLoaderIterMultiProcess(_DataLoaderIterBase): def __init__(self, loader): super(_DataLoaderIterMultiProcess, self).__init__(loader) @@ -483,6 +242,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._rcvd_idx = 0 self._batches_outstanding = 0 self._task_infos = {} + self._structure_infos = [] # indices outstand as _outstanding_capacity at first, and # blocking_queue capacity is also _outstanding_capacity. 
@@ -617,8 +377,6 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): if not self._thread_done_event.is_set(): if batch is None: self._exit_thread_expectedly() - elif isinstance(batch, Exception): - self._exit_thread_unexpectedly() else: try: # pack as LoDTensorArray @@ -654,8 +412,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # batch indices and increase _rcvd_idx if self._dataset_kind == _DatasetKind.ITER: while self._rcvd_idx < self._send_idx: + sys.stdout.flush() info = self._task_infos[self._rcvd_idx] - if len(info) == 2 or self._worker_status[info[0]]: + if len(info) == 3 or self._worker_status[info[0]]: break del self._task_infos[self._rcvd_idx] self._rcvd_idx += 1 @@ -669,13 +428,15 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): continue if self._rcvd_idx in self._task_infos and \ - len(self._task_infos[self._rcvd_idx]) == 2: - return self._task_infos.pop(self._rcvd_idx)[1] + len(self._task_infos[self._rcvd_idx]) == 3: + info = self._task_infos.pop(self._rcvd_idx) + self._structure_infos.append(info[2]) + return info[1] try: # [ avoid hang ]: main process may blocking at _reader.read_next when # KeyboardInterrupt, we do following tradeoff: - # 1. get data with timeout, MP_INDICES_CHECK_INTERVAL(5s) as timeout + # 1. get data with timeout, MP_STATUS_CHECK_INTERVAL(5s) as timeout # default, if KeyboardInterrupt blocking, failed workers will be # checked and raise RuntimeError to quit DataLoader in timeout # exception handling. @@ -721,12 +482,17 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._try_put_indices() continue - idx, batch = data + idx, batch, structure = data + if isinstance(batch, _WorkerException): + self._exit_thread_unexpectedly() + batch.reraise() + if idx == self._rcvd_idx: del self._task_infos[idx] + self._structure_infos.append(structure) return batch else: - self._task_infos[idx] += (batch, ) + self._task_infos[idx] += (batch, structure) continue def _try_put_indices(self): @@ -777,9 +543,17 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): if in_dygraph_mode(): data = self._reader.read_next_var_list() + data = _restore_batch(data, self._structure_infos.pop(0)) else: if self._return_list: data = self._reader.read_next_list() + data = [ + _restore_batch(d, s) + for d, s in zip(data, self._structure_infos[:len( + self._places)]) + ] + self._structure_infos = self._structure_infos[len( + self._places):] # static graph organized data on multi-device with list, if # place number is 1, there is only 1 device, extra the data # from list for devices to be compatible with dygraph mode @@ -790,7 +564,7 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): self._on_output_batch() return data except StopIteration: - self._reader.reset() + self._reader.shutdown() self._try_shutdown_all() six.reraise(*sys.exc_info()) diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/fluid/dataloader/flat.py new file mode 100644 index 00000000000..6cccbc7ee4e --- /dev/null +++ b/python/paddle/fluid/dataloader/flat.py @@ -0,0 +1,150 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numbers +import numpy as np + +try: + from collections.abc import Sequence, Mapping +except: + from collections import Sequence, Mapping + +FIELD_PREFIX = "_paddle_field_" + + +def _flatten_batch(batch): + """ + For lod_blocking_queue only receive tensor array, flatten batch + data, extract numpy.array data out as a list of numpy.array to + send to lod_blocking_queue, and save the batch data structure + such as fields in other types (str, int, etc) or key-value map + of dictionaries + """ + + def _flatten(batch, flat_batch, structure, field_idx): + if isinstance(batch, Sequence): + for field in batch: + if isinstance(field, np.ndarray): + structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) + flat_batch.append(field) + field_idx += 1 + elif isinstance(field, paddle.Tensor): + structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) + flat_batch.append(field.numpy()) + field_idx += 1 + elif isinstance(field, (str, bytes, numbers.Number)): + structure.append(field) + elif isinstance(field, Sequence): + field_struct, field_idx = _flatten(field, flat_batch, [], + field_idx) + structure.append(field_struct) + elif isinstance(field, Mapping): + field_struct, field_idx = _flatten(field, flat_batch, {}, + field_idx) + structure.append(field_struct) + else: + structure.append(field) + elif isinstance(batch, Mapping): + for k, field in batch.items(): + if isinstance(field, np.ndarray): + structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) + flat_batch.append(field) + field_idx += 1 + elif isinstance(field, paddle.Tensor): + structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) + flat_batch.append(field.numpy()) + field_idx += 1 + elif isinstance(field, (str, bytes, numbers.Number)): + structure[k] = field + elif isinstance(field, Sequence): + field_struct, field_idx = _flatten(field, flat_batch, [], + field_idx) + structure[k] = field_struct + elif isinstance(field, Mapping): + field_struct, field_idx = _flatten(field, flat_batch, {}, + field_idx) + structure[k] = field_struct + else: + structure[k] = field + else: + raise TypeError("wrong flat data type: {}".format(type(batch))) + + return structure, field_idx + + # sample only contains single fields + if not isinstance(batch, Sequence): + flat_batch = [] + structure, _ = _flatten([batch], flat_batch, [], 0) + return flat_batch, structure[0] + flat_batch = [] + structure, _ = _flatten(batch, flat_batch, [], 0) + return flat_batch, structure + + +def _restore_batch(flat_batch, structure): + """ + After reading list of Tensor data from lod_blocking_queue outputs, + use this function to restore the batch data structrue, replace + :attr:`_paddle_field_x` with data from flat_batch + """ + + def _restore(structure, field_idx): + if isinstance(structure, Sequence): + for i, field in enumerate(structure): + if isinstance(field, str) and field.startswith(FIELD_PREFIX): + cur_field_idx = int(field.replace(FIELD_PREFIX, '')) + field_idx = max(field_idx, cur_field_idx) + assert flat_batch[cur_field_idx] is not None, \ + "flat_batch[{}] parsed repeatly" + structure[i] = flat_batch[cur_field_idx] + flat_batch[cur_field_idx] 
= None + elif isinstance(field, (str, bytes, numbers.Number)): + continue + elif isinstance(field, (Sequence, Mapping)): + field_idx = _restore(structure[i], field_idx) + elif isinstance(structure, Mapping): + for k, field in structure.items(): + if isinstance(field, str) and field.startswith(FIELD_PREFIX): + cur_field_idx = int(field.replace(FIELD_PREFIX, '')) + field_idx = max(field_idx, cur_field_idx) + assert flat_batch[cur_field_idx] is not None, \ + "flat_batch[{}] parsed repeatly" + structure[k] = flat_batch[cur_field_idx] + flat_batch[cur_field_idx] = None + elif isinstance(field, (str, bytes, numbers.Number)): + continue + elif isinstance(field, (Sequence, Mapping)): + field_idx = _restore(structure[k], field_idx) + else: + raise TypeError("wrong flat data type: {}".format(type(batch))) + + return field_idx + + assert isinstance(flat_batch, Sequence), \ + "flat_batch is not a list or tuple" + + # no np.array in dataset, no output tensor from blocking queue + # simply return structure + if len(flat_batch) == 0: + return structure + + # sample only contains single fields + if isinstance(structure, (str, bytes)): + assert structure == '{}{}'.format(FIELD_PREFIX, 0), \ + "invalid structure: {}".format(structure) + return flat_batch[0] + field_idx = _restore(structure, 0) + assert field_idx + 1 == len(flat_batch), "Tensor parse incomplete" + return structure diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py new file mode 100644 index 00000000000..2d1b554e53d --- /dev/null +++ b/python/paddle/fluid/dataloader/worker.py @@ -0,0 +1,253 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import six +import sys +import paddle +import numpy as np +import traceback +from collections import namedtuple +from .. 
import core +from .fetcher import _IterableDatasetFetcher, _MapDatasetFetcher +from ..multiprocess_utils import _cleanup_mmap, CleanupFuncRegistrar, MP_STATUS_CHECK_INTERVAL +from ..framework import in_dygraph_mode +from .flat import _flatten_batch + +# NOTE: queue has a different name in python2 and python3 +if six.PY2: + import Queue as queue +else: + import queue + +__all__ = ['get_worker_info'] + + +class _IterableDatasetStopIteration(object): + def __init__(self, worker_id): + self.worker_id = worker_id + + +class _DatasetKind(object): + MAP = 0 + ITER = 1 + + @staticmethod + def create_fetcher(kind, dataset, auto_collate_batch, collate_fn, + drop_last): + if kind == _DatasetKind.MAP: + return _MapDatasetFetcher(dataset, auto_collate_batch, collate_fn, + drop_last) + elif kind == _DatasetKind.ITER: + return _IterableDatasetFetcher(dataset, auto_collate_batch, + collate_fn, drop_last) + else: + raise NotImplementedError("unknown Dataset kind {}".format(kind)) + + +class ParentWatchDog(object): + def __init__(self): + self._parent_pid = os.getppid() + self._parent_alive = True + + def is_alive(self): + if self._parent_alive: + self._parent_alive = os.getppid() == self._parent_pid + return self._parent_alive + + +# worker information for each workers, used for splitting data copy +# for IteratorDataset in worker processes. +_worker_info = None + + +def get_worker_info(): + """ + Get DataLoader worker process information function, this function is + used to split data copy in worker process for IterableDataset + (see :code:`paddle.io.IterableDataset`), worker information contains + following fields: + + :attr:`num_workers`: total worker process number, see `paddle.io.DataLoader` + + :attr:`id`: the worker processs id, count from 0 to :attr:`num_workers - 1` + + :attr:`dataset`: the dataset object in this worker process + + Returns: + WorkerInfo: an instance of WorkerInfo which contains fields above. + + .. note:: + For more usage and examples, please see :code:`paddle.io.IterableDataset` + + Example: + + .. 
code-block:: python + + import math + import paddle + import numpy as np + from paddle.io import IterableDataset, DataLoader, get_worker_info + + class SplitedIterableDataset(IterableDataset): + def __init__(self, start, end): + self.start = start + self.end = end + + def __iter__(self): + worker_info = get_worker_info() + if worker_info is None: + iter_start = self.start + iter_end = self.end + else: + per_worker = int( + math.ceil((self.end - self.start) / float( + worker_info.num_workers))) + worker_id = worker_info.id + iter_start = self.start + worker_id * per_worker + iter_end = min(iter_start + per_worker, self.end) + + for i in range(iter_start, iter_end): + yield np.array([i]) + + place = paddle.CPUPlace() + dataset = SplitedIterableDataset(start=2, end=9) + dataloader = DataLoader( + dataset, + places=place, + num_workers=2, + batch_size=1, + drop_last=True) + + for data in dataloader: + print(data) + # outputs: [2, 5, 3, 6, 4, 7] + + """ + return _worker_info + + +class WorkerInfo(object): + __initialized = False + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + setattr(self, k, v) + self.__initialized = True + + def __setattr__(self, key, val): + if self.__initialized: + raise RuntimeError("Cannot assign attributes to {} objects".format( + self.__class__.__name__)) + return super(WorkerInfo, self).__setattr__(key, val) + + +class _WorkerException(object): + def __init__(self, worker_id, exc_info=None): + self.worker_id = worker_id + exc_info = exc_info or sys.exc_info() + self.exc_type = exc_info[0] + self.exc_msg = "".join(traceback.format_exception(*exc_info)) + + def reraise(self): + msg = "DataLoader worker({}) caught {} with message:\n{}".format( + self.worker_id, self.exc_type.__name__, self.exc_msg) + if getattr(self.exc_type, "message", None): + raise self.exc_type(message=msg) + raise self.exc_type(msg) + + +def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, + auto_collate_batch, collate_fn, init_fn, worker_id, + num_workers, use_shared_memory): + try: + # NOTE: [ mmap files clear ] When the child process exits unexpectedly, + # some shared memory objects may have been applied for but have not yet + # been put into the inter-process Queue. This part of the object needs + # to be cleaned up when the process ends. + CleanupFuncRegistrar.register(_cleanup_mmap) + + # set signal handler + core._set_process_signal_handler() + + global _worker_info + _worker_info = WorkerInfo( + id=worker_id, num_workers=num_workers, dataset=dataset) + + init_exception = None + try: + if init_fn is not None: + init_fn(worker_id) + fetcher = _DatasetKind.create_fetcher( + dataset_kind, dataset, auto_collate_batch, collate_fn, True) + except: + init_exception = _WorkerException(worker_id) + + iterator_drained = False + parent_watch_dog = ParentWatchDog() + + while parent_watch_dog.is_alive(): + try: + data = indices_queue.get(MP_STATUS_CHECK_INTERVAL) + except queue.Empty: + continue + + # None as poison piil, so worker event should be set + if data is None: + assert done_event.is_set() or iterator_drained, \ + "get None when worker done_event set" + break + # If worker done event is set but get still get data in + # indices_queue, remaining data should be get and skipped. 
+ if done_event.is_set() or iterator_drained: + continue + + idx, indices = data + try: + if init_exception is not None: + batch = init_exception + init_exception = None + else: + # NOTE: GPU tensor operation is not supported in sub-process + # but default device is GPU in paddle-gpu version, which + # may copy CPU tensor to GPU even if users want to use + # CPU tensor operation, so we add CPUPlace guard here + # to make sure tensor will be operated only on CPU + with paddle.fluid.dygraph.guard(place=paddle.CPUPlace()): + batch = fetcher.fetch(indices) + except Exception as e: + if isinstance( + e, StopIteration) and dataset_kind == _DatasetKind.ITER: + out_queue.put(_IterableDatasetStopIteration(worker_id)) + iterator_drained = True + else: + out_queue.put((idx, _WorkerException(worker_id), None)) + else: + if isinstance(batch, _WorkerException): + out_queue.put((idx, batch, None)) + batch, structure = _flatten_batch(batch) + if use_shared_memory: + tensor_list = core._convert_to_tensor_list(batch) + out_queue.put((idx, tensor_list, structure)) + core._remove_tensor_list_mmap_fds(tensor_list) + else: + out_queue.put((idx, batch, structure)) + except KeyboardInterrupt: + # NOTE: Main process will raise KeyboardInterrupt anyways, ignore it in child process + pass + except: + six.reraise(*sys.exc_info()) + finally: + if use_shared_memory: + _cleanup_mmap() diff --git a/python/paddle/fluid/multiprocess_utils.py b/python/paddle/fluid/multiprocess_utils.py index a63825e7363..82fb0f60b06 100644 --- a/python/paddle/fluid/multiprocess_utils.py +++ b/python/paddle/fluid/multiprocess_utils.py @@ -25,6 +25,10 @@ if six.PY2: else: import queue +# multi-process worker check indices queue interval, avoid +# hanging in subprocess data loading +MP_STATUS_CHECK_INTERVAL = 5. + # NOTE: [ mmap files clear ] If there is still data in the multiprocess queue when the main process finishes reading, # the data in the queue needs to be popped. Then the LoDTensor read by the main process # from the child process will automatically clear the memory-mapped file. 
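The flatten/restore pair introduced in flat.py above is what lets nested samples (numbers, strings, arrays, lists and dicts) cross the lod_blocking_queue, which only carries tensors. A minimal round-trip sketch, assuming the patch is applied and the helpers remain importable from paddle.fluid.dataloader.flat; the sample values below are hypothetical:

import numpy as np
from paddle.fluid.dataloader.flat import _flatten_batch, _restore_batch

# a nested sample mixing scalars, a string, an array and a dict
sample = (3.1, 'abc', np.ones([4], dtype='float32'),
          {'a': 2.0, 'b': np.zeros([2], dtype='float32')})

flat, structure = _flatten_batch(sample)
# `flat` holds only the numpy arrays (the part sent through the queue);
# `structure` keeps the scalars/strings plus '_paddle_field_i' placeholders.
restored = _restore_batch(flat, structure)  # placeholders swapped back for the arrays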
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 39fc965e5ed..977882543a8 100755 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py @@ -273,5 +273,62 @@ class TestNumpyMixTensorDataset(TestTensorDataset): assert isinstance(label, paddle.Tensor) +class ComplextDataset(Dataset): + def __init__(self, sample_num): + self.sample_num = sample_num + + def __len__(self): + return self.sample_num + + def __getitem__(self, idx): + return (3.1, 'abc', paddle.to_tensor( + np.random.random([IMAGE_SIZE]).astype('float32'), + place=paddle.CPUPlace()), + [1, np.random.random([2]).astype('float32')], { + 'a': 2.0, + 'b': np.random.random([2]).astype('float32') + }) + + +class TestComplextDataset(unittest.TestCase): + def run_main(self, num_workers): + paddle.static.default_startup_program().random_seed = 1 + paddle.static.default_main_program().random_seed = 1 + place = paddle.CPUPlace() + with fluid.dygraph.guard(place): + dataset = ComplextDataset(16) + assert len(dataset) == 16 + dataloader = DataLoader( + dataset, + places=place, + num_workers=num_workers, + batch_size=2, + drop_last=True) + + for i, data in enumerate(dataloader()): + assert len(data) == 5 + # data[0]: collate 3.1 + assert data[0].shape == [2] + assert isinstance(data[1], list) + # data[1]: collate 'abc' + assert len(data[1]) == 2 + assert isinstance(data[1][0], str) + assert isinstance(data[1][1], str) + # data[2]: collate tensor + assert data[2].shape == [2, IMAGE_SIZE] + # data[3]: collate list + assert isinstance(data[3], list) + assert data[3][0].shape == [2] + assert data[3][1].shape == [2, 2] + # data[4]: collate dict + assert isinstance(data[4], dict) + assert data[4]['a'].shape == [2] + assert data[4]['b'].shape == [2, 2] + + def test_main(self): + for num_workers in [0, 2]: + self.run_main(num_workers) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py index 56205133585..d2b7971a85d 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_split.py @@ -58,7 +58,7 @@ class TestDynamicDataLoaderIterSplit(unittest.TestCase): rets = [] for d in dataloader: - rets.append(d[0].numpy()[0][0]) + rets.append(d.numpy()[0][0]) assert tuple(sorted(rets)) == tuple(range(0, 10)) @@ -102,7 +102,7 @@ class TestDynamicDataLoaderIterInitFuncSplit(unittest.TestCase): rets = [] for d in dataloader: - rets.append(d[0].numpy()[0][0]) + rets.append(d.numpy()[0][0]) assert tuple(sorted(rets)) == tuple(range(0, 10)) -- GitLab From 9066b74f58ad7163dfc0ad8ef912cc50264997d1 Mon Sep 17 00:00:00 2001 From: WangXi Date: Mon, 15 Mar 2021 10:54:51 +0800 Subject: [PATCH 014/486] c_gen_nccl_id add SocketServer to persit server (#31589) --- .../operators/collective/c_gen_nccl_id_op.cc | 3 ++- paddle/fluid/platform/gen_comm_id_helper.cc | 18 +++++++++++++++++ paddle/fluid/platform/gen_comm_id_helper.h | 20 +++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 
485a6d7ec4e..1592d809f91 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -75,7 +75,8 @@ class CGenNCCLIdOp : public framework::OperatorBase { platform::SendBroadCastCommID(endpoint_list, &nccl_ids); } else { std::string endpoint = Attr("endpoint"); - platform::RecvBroadCastCommID(endpoint, &nccl_ids); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); } CopyNCCLIDToVar(nccl_ids, func, scope); diff --git a/paddle/fluid/platform/gen_comm_id_helper.cc b/paddle/fluid/platform/gen_comm_id_helper.cc index ffe82371b18..f38603e80fb 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.cc +++ b/paddle/fluid/platform/gen_comm_id_helper.cc @@ -36,6 +36,8 @@ limitations under the License. */ namespace paddle { namespace platform { +std::once_flag SocketServer::init_flag_; + constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; // Check system calls, such as socket, bind. @@ -330,6 +332,22 @@ void RecvBroadCastCommID(int server_fd, std::string endpoint, CloseSocket(client); } +SocketServer& SocketServer::GetInstance(const std::string& end_point) { + static SocketServer instance; + std::call_once(init_flag_, [&]() { + instance.server_fd_ = CreateListenSocket(end_point); + instance.end_point_ = end_point; + }); + PADDLE_ENFORCE_NE(instance.server_fd_, -1, + platform::errors::Unavailable( + "listen socket failed with end_point=%s", end_point)); + PADDLE_ENFORCE_EQ(instance.end_point_, end_point, + platform::errors::InvalidArgument( + "old end_point=%s must equal with new end_point=%s", + instance.end_point_, end_point)); + return instance; +} + /// template instantiation #define INSTANT_TEMPLATE(Type) \ template void SendBroadCastCommID(std::vector servers, \ diff --git a/paddle/fluid/platform/gen_comm_id_helper.h b/paddle/fluid/platform/gen_comm_id_helper.h index 6014a2b4ff9..c51c5ac6c8a 100644 --- a/paddle/fluid/platform/gen_comm_id_helper.h +++ b/paddle/fluid/platform/gen_comm_id_helper.h @@ -17,6 +17,8 @@ limitations under the License. 
*/ #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) #include +#include +#include #include #include @@ -39,6 +41,24 @@ void RecvBroadCastCommID(std::string endpoint, template void RecvBroadCastCommID(int server_fd, std::string endpoint, std::vector* nccl_ids); + +class SocketServer { + public: + SocketServer() = default; + + ~SocketServer() { CloseSocket(server_fd_); } + + int socket() const { return server_fd_; } + + static SocketServer& GetInstance(const std::string& end_point); + + private: + int server_fd_{-1}; + std::string end_point_; + + static std::once_flag init_flag_; +}; + } // namespace platform } // namespace paddle -- GitLab From 027b574a0e28e3096e5735a92defa627e11895ce Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 15 Mar 2021 11:30:27 +0800 Subject: [PATCH 015/486] [CustomOp] Remove the dependence of the underlying data types on eigen (#31602) * init commit * move eigen of bfloat16 * add complex header --- paddle/fluid/framework/data_type.h | 1 + paddle/fluid/platform/bfloat16.h | 124 ++-------- paddle/fluid/platform/bfloat16_test.cc | 1 + paddle/fluid/platform/complex128.h | 122 ++-------- paddle/fluid/platform/complex64.h | 125 ++-------- paddle/fluid/platform/eigen_ext.h | 306 +++++++++++++++++++++++++ 6 files changed, 357 insertions(+), 322 deletions(-) create mode 100644 paddle/fluid/platform/eigen_ext.h diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 7aa7b7b2d96..c8f73a5469a 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index f373e5ddb6d..d1257f853e0 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -15,22 +15,26 @@ #pragma once #include + +#include +#include #include + #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else #define PADDLE_ALIGN(x) __declspec(align(x)) #endif -#include - -#include "paddle/fluid/platform/hostdevice.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace Eigen { -template -struct NumTraits; -} // namespace Eigen +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif namespace paddle { namespace platform { @@ -351,105 +355,3 @@ struct numeric_limits { }; } // namespace std - -namespace Eigen { - -using bfloat16 = paddle::platform::bfloat16; - -template <> -struct NumTraits : GenericNumTraits { - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - HOSTDEVICE static inline bfloat16 epsilon() { - return paddle::platform::raw_uint16_to_bfloat16(0x3400); - } - HOSTDEVICE static inline bfloat16 dummy_precision() { - return bfloat16(1e-5f); - } - HOSTDEVICE static inline bfloat16 highest() { - return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); - } - HOSTDEVICE static inline bfloat16 lowest() { - return paddle::platform::raw_uint16_to_bfloat16(0xff7f); - } - HOSTDEVICE static inline bfloat16 infinity() { - return 
paddle::platform::raw_uint16_to_bfloat16(0x7f80); - } - HOSTDEVICE static inline bfloat16 quiet_NaN() { - return paddle::platform::raw_uint16_to_bfloat16(0xffc1); - } -}; -namespace numext { - -template <> -HOSTDEVICE inline bool(isnan)(const bfloat16& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const bfloat16& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { - return bfloat16(::expf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { - return bfloat16(::erff(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 log(const bfloat16& a) { - return bfloat16(::logf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { - return bfloat16(::tanhf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { - return bfloat16(::sqrtf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { - return bfloat16(::ceilf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { - return bfloat16(::floorf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 round(const bfloat16& a) { - return bfloat16(::roundf(static_cast(a))); -} - -template <> -HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { - return bfloat16(::powf(static_cast(a), static_cast(b))); -} - -template <> -HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { - return bfloat16(::fabs(static_cast(a))); -} - -} // namespace numext -} // namespace Eigen diff --git a/paddle/fluid/platform/bfloat16_test.cc b/paddle/fluid/platform/bfloat16_test.cc index 3adfcd89be9..dc2d3aa73ba 100644 --- a/paddle/fluid/platform/bfloat16_test.cc +++ b/paddle/fluid/platform/bfloat16_test.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/eigen_ext.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h index c50ff2f8103..d6fddd672a0 100644 --- a/paddle/fluid/platform/complex128.h +++ b/paddle/fluid/platform/complex128.h @@ -16,12 +16,10 @@ #include +#include +#include +#include #include -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif #ifdef PADDLE_WITH_CUDA #include @@ -33,15 +31,21 @@ #include // NOLINT #endif -#include - -#include "paddle/fluid/platform/hostdevice.h" -#include "unsupported/Eigen/CXX11/Tensor" +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif -namespace Eigen { -template -struct NumTraits; -} // namespace Eigen +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif namespace paddle { namespace platform { @@ -509,97 +513,5 @@ struct numeric_limits { }; } // namespace std -namespace Eigen { - -using complex128 = paddle::platform::complex128; - -template <> -struct NumTraits : GenericNumTraits> { - typedef double Real; - typedef typename NumTraits::Literal Literal; - enum { - IsComplex = 1, - RequireInitialization = NumTraits::RequireInitialization, - ReadCost = 2 * NumTraits::ReadCost, - AddCost = 2 * NumTraits::AddCost, - MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost - }; - - EIGEN_DEVICE_FUNC - static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC - static inline Real dummy_precision() { - return NumTraits::dummy_precision(); - } - EIGEN_DEVICE_FUNC - static inline int digits10() { return NumTraits::digits10(); } -}; -namespace numext { - -template <> -HOSTDEVICE inline bool(isnan)(const complex128& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const complex128& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const complex128& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline complex128 exp(const complex128& a) { - double com = ::expf(a.real); - double res_real = com * ::cosf(a.imag); - double res_imag = com * ::sinf(a.imag); - return complex128(res_real, res_imag); -} - -template <> -HOSTDEVICE inline complex128 log(const complex128& a) { - return paddle::platform::log(a); -} - -template <> -HOSTDEVICE inline complex128 tanh(const complex128& a) { - return paddle::platform::tanh(a); -} - -template <> -HOSTDEVICE inline complex128 sqrt(const complex128& a) { - return paddle::platform::sqrt(a); -} - -template <> -HOSTDEVICE inline complex128 ceil(const complex128& a) { - return complex128(::ceilf(a.real), ::ceilf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 floor(const complex128& a) { - return complex128(::floorf(a.real), ::floor(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 round(const complex128& a) { - return complex128(::roundf(a.real), ::roundf(a.imag)); -} - -template <> -HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { - return paddle::platform::pow(a, b); -} - -template <> -HOSTDEVICE inline double abs(const complex128& a) { - return 
paddle::platform::abs(a); -} - -} // namespace numext -} // namespace Eigen #define MKL_Complex16 paddle::platform::complex128 diff --git a/paddle/fluid/platform/complex64.h b/paddle/fluid/platform/complex64.h index b91fdbab28b..9d55ba19105 100644 --- a/paddle/fluid/platform/complex64.h +++ b/paddle/fluid/platform/complex64.h @@ -15,12 +15,11 @@ #pragma once #include + +#include +#include +#include #include -#if !defined(_WIN32) -#define PADDLE_ALIGN(x) __attribute__((aligned(x))) -#else -#define PADDLE_ALIGN(x) __declspec(align(x)) -#endif #ifdef PADDLE_WITH_CUDA #include @@ -32,16 +31,23 @@ #include // NOLINT #endif -#include +#if !defined(_WIN32) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) +#else +#define PADDLE_ALIGN(x) __declspec(align(x)) +#endif -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "unsupported/Eigen/CXX11/Tensor" +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif -namespace Eigen { -template -struct NumTraits; -} // namespace Eigen +#include "complex128.h" // NOLINT namespace paddle { namespace platform { @@ -510,98 +516,5 @@ struct numeric_limits { }; } // namespace std -namespace Eigen { - -using complex64 = paddle::platform::complex64; - -template <> -struct NumTraits : GenericNumTraits> { - typedef float Real; - typedef typename NumTraits::Literal Literal; - enum { - IsComplex = 1, - RequireInitialization = NumTraits::RequireInitialization, - ReadCost = 2 * NumTraits::ReadCost, - AddCost = 2 * NumTraits::AddCost, - MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost - }; - - EIGEN_DEVICE_FUNC - static inline Real epsilon() { return NumTraits::epsilon(); } - EIGEN_DEVICE_FUNC - static inline Real dummy_precision() { - return NumTraits::dummy_precision(); - } - EIGEN_DEVICE_FUNC - static inline int digits10() { return NumTraits::digits10(); } -}; - -namespace numext { - -template <> -HOSTDEVICE inline bool(isnan)(const complex64& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const complex64& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const complex64& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline complex64 exp(const complex64& a) { - float com = ::expf(a.real); - float res_real = com * ::cosf(a.imag); - float res_imag = com * ::sinf(a.imag); - return complex64(res_real, res_imag); -} - -template <> -HOSTDEVICE inline complex64 log(const complex64& a) { - return paddle::platform::log(a); -} - -template <> -HOSTDEVICE inline complex64 tanh(const complex64& a) { - return paddle::platform::tanh(a); -} - -template <> -HOSTDEVICE inline complex64 sqrt(const complex64& a) { - return paddle::platform::sqrt(a); -} - -template <> -HOSTDEVICE inline complex64 ceil(const complex64& a) { - return complex64(::ceilf(a.real), ::ceilf(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 floor(const complex64& a) { - return complex64(::floorf(a.real), ::floor(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 round(const complex64& a) { - return complex64(::roundf(a.real), ::roundf(a.imag)); -} - -template <> -HOSTDEVICE inline complex64 pow(const complex64& a, const complex64& b) { - return paddle::platform::pow(a, b); -} - -template <> -HOSTDEVICE inline float abs(const complex64& a) { - return paddle::platform::abs(a); 
-} - -} // namespace numext -} // namespace Eigen #define MKL_Complex8 paddle::platform::complex64 diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h new file mode 100644 index 00000000000..9e2c3630468 --- /dev/null +++ b/paddle/fluid/platform/eigen_ext.h @@ -0,0 +1,306 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/hostdevice.h" + +#include "unsupported/Eigen/CXX11/Tensor" + +namespace Eigen { + +using bfloat16 = paddle::platform::bfloat16; +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; + +template +struct NumTraits; + +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline bfloat16 epsilon() { + return paddle::platform::raw_uint16_to_bfloat16(0x3400); + } + HOSTDEVICE static inline bfloat16 dummy_precision() { + return bfloat16(1e-5f); + } + HOSTDEVICE static inline bfloat16 highest() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); + } + HOSTDEVICE static inline bfloat16 lowest() { + return paddle::platform::raw_uint16_to_bfloat16(0xff7f); + } + HOSTDEVICE static inline bfloat16 infinity() { + return paddle::platform::raw_uint16_to_bfloat16(0x7f80); + } + HOSTDEVICE static inline bfloat16 quiet_NaN() { + return paddle::platform::raw_uint16_to_bfloat16(0xffc1); + } +}; + +template <> +struct NumTraits : GenericNumTraits> { + typedef float Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + +template <> +struct NumTraits : GenericNumTraits> { + typedef double Real; + typedef typename NumTraits::Literal Literal; + enum { + IsComplex = 1, + RequireInitialization = NumTraits::RequireInitialization, + ReadCost = 2 * NumTraits::ReadCost, + AddCost = 2 * NumTraits::AddCost, + MulCost = 4 * NumTraits::MulCost + 2 * NumTraits::AddCost + }; + + EIGEN_DEVICE_FUNC + static inline Real epsilon() { return NumTraits::epsilon(); } + EIGEN_DEVICE_FUNC + static inline Real dummy_precision() { + return NumTraits::dummy_precision(); + } + EIGEN_DEVICE_FUNC + static inline int digits10() { return NumTraits::digits10(); } +}; + +namespace numext { + +//////////// bfloat methods ///////////// + +template <> +HOSTDEVICE inline 
bool(isnan)(const bfloat16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const bfloat16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { + return bfloat16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { + return bfloat16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 log(const bfloat16& a) { + return bfloat16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { + return bfloat16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { + return bfloat16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { + return bfloat16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { + return bfloat16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 round(const bfloat16& a) { + return bfloat16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { + return bfloat16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { + return bfloat16(::fabs(static_cast(a))); +} + +//////////// complex64 methods ///////////// + +template <> +HOSTDEVICE inline bool(isnan)(const complex64& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const complex64& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex64& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex64 exp(const complex64& a) { + float com = ::expf(a.real); + float res_real = com * ::cosf(a.imag); + float res_imag = com * ::sinf(a.imag); + return complex64(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex64 log(const complex64& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex64 tanh(const complex64& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex64 sqrt(const complex64& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex64 ceil(const complex64& a) { + return complex64(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 floor(const complex64& a) { + return complex64(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 round(const complex64& a) { + return complex64(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex64 pow(const complex64& a, const complex64& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline float abs(const complex64& a) { + return paddle::platform::abs(a); +} + +//////////// complex128 methods ///////////// + +template <> +HOSTDEVICE inline bool(isnan)(const complex128& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const complex128& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const complex128& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline complex128 exp(const complex128& a) { + double com = ::expf(a.real); + double 
res_real = com * ::cosf(a.imag); + double res_imag = com * ::sinf(a.imag); + return complex128(res_real, res_imag); +} + +template <> +HOSTDEVICE inline complex128 log(const complex128& a) { + return paddle::platform::log(a); +} + +template <> +HOSTDEVICE inline complex128 tanh(const complex128& a) { + return paddle::platform::tanh(a); +} + +template <> +HOSTDEVICE inline complex128 sqrt(const complex128& a) { + return paddle::platform::sqrt(a); +} + +template <> +HOSTDEVICE inline complex128 ceil(const complex128& a) { + return complex128(::ceilf(a.real), ::ceilf(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 floor(const complex128& a) { + return complex128(::floorf(a.real), ::floor(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 round(const complex128& a) { + return complex128(::roundf(a.real), ::roundf(a.imag)); +} + +template <> +HOSTDEVICE inline complex128 pow(const complex128& a, const complex128& b) { + return paddle::platform::pow(a, b); +} + +template <> +HOSTDEVICE inline double abs(const complex128& a) { + return paddle::platform::abs(a); +} + +} // namespace numext +} // namespace Eigen -- GitLab From c3634c6b0a45430e083deca42b796568514b6d81 Mon Sep 17 00:00:00 2001 From: ShenLiang Date: Mon, 15 Mar 2021 11:45:51 +0800 Subject: [PATCH 016/486] fix amp bug of fleet (#31532) --- python/paddle/distributed/fleet/base/fleet_base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index f4075e92c4c..19ba637cc96 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -637,6 +637,11 @@ class Fleet(object): self._user_defined_strategy = copy.deepcopy(strategy) self._context = {} + + # TODO(shenliang03): This is a temporary solution to support amp. In the case of a dynamic graph, + # the optimizer is returned directly. This problem will be fixed in the future. 
+ if paddle.fluid.framework.in_dygraph_mode(): + return optimizer return self @dygraph_only -- GitLab From 75433126df2f6adfaf90c4a0b853ec37ed729892 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Mon, 15 Mar 2021 14:24:46 +0800 Subject: [PATCH 017/486] Fix summary bug when calaculating output shape (#31549) * fix summary bug --- python/paddle/hapi/model_summary.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index babbe962a95..9f2769e1ca2 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -341,10 +341,12 @@ def summary_string(model, input_size, dtypes=None): total_params += summary[layer]["nb_params"] try: - total_output += np.prod(summary[layer]["output_shape"]) + total_output += np.sum( + np.prod( + summary[layer]["output_shape"], axis=-1)) except: for output_shape in summary[layer]["output_shape"]: - total_output += np.prod(output_shape) + total_output += np.sum(np.prod(output_shape, axis=-1)) if "trainable" in summary[layer]: if summary[layer]["trainable"] == True: -- GitLab From da10c5cf8b91b84c4f6f5e0f05879df0567c616a Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 16 Mar 2021 10:14:59 +0800 Subject: [PATCH 018/486] [ROCM] fix softmax_with_cross_entropy_op, test=develop (#31629) --- .../softmax_with_cross_entropy_op.cu | 13 +++------ .../test_softmax_with_cross_entropy_op.py | 28 +++++++++---------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index eaded93cce7..2257d816d89 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -452,12 +452,7 @@ struct HardLabelCrossEntropyFunctorWithIgnoreIdx { // labels, loss view as [n, remain] int idx_lbl = idx_n * remain + idx_remain; - if (idx_axis == ignore_idx_) { - loss_[idx_lbl] = 0; - return; - } - - if (idx_axis == labels_[idx_lbl]) { + if (idx_axis == labels_[idx_lbl] && idx_axis != ignore_idx_) { loss_[idx_lbl] = -log_on_device(logits_data_[idx]); } } @@ -732,7 +727,7 @@ static void SoftmaxWithCrossEntropyFusedKernel( template static void CrossEntropyFusedKernel(const T* logits_data, const T* labels_data, T* loss_data, int n, int d, int axis_dim, - cudaStream_t stream) { + gpuStream_t stream) { constexpr int kMaxBlockDim = 512; int block_dim = axis_dim >= kMaxBlockDim ? 
kMaxBlockDim @@ -792,11 +787,11 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* softmax_out_data = softmax_out->mutable_data(context.GetPlace()); auto* loss_data = loss->mutable_data(context.GetPlace()); + math::SetConstant set_constant; + set_constant(context.cuda_device_context(), loss, static_cast(0)); if (axis_dim == 1) { - math::SetConstant set_constant; set_constant(context.cuda_device_context(), softmax_out, static_cast(1)); - set_constant(context.cuda_device_context(), loss, static_cast(0)); return; } diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py index 5bfc422da82..e1f5ecf2683 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py @@ -116,7 +116,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_1D( self.shape = [13, 8] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -129,7 +129,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_1D( self.shape = [13, 8] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -145,7 +145,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D( self.shape = [3, 5, 7, 11] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -155,7 +155,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis2( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = True - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 1 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -168,7 +168,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis3( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = True - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 2 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -181,7 +181,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_SoftLabel_2D_Axis4( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = True - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 3 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -206,7 +206,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D( self.shape = [3, 5, 7, 11] self.axis = -1 self.ignore_index = -1 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -216,7 +216,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis2( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 1 self.ignore_index = -1 
self.shape = [3, 5, 7, 11] @@ -229,7 +229,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis3( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 2 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -242,7 +242,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Axis4( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 3 self.ignore_index = -1 self.shape = [3, 5, 7, 11] @@ -267,7 +267,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_Ignore( self.shape = [13, 8] self.axis = -1 self.ignore_index = 2 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -280,7 +280,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_Ignore_Axis( self.shape = [13, 8] self.axis = 1 self.ignore_index = 2 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -293,7 +293,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Ignore( self.shape = [3, 5, 7, 11] self.axis = -1 self.ignore_index = 2 - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.softmax_switch = False #default is true, means "with softmax" @@ -303,7 +303,7 @@ class TestSoftmaxWithCrossEntropyOp_NotWithSoftmax_HardLabel_2D_Ignore_Axis3( self.op_type = "softmax_with_cross_entropy" self.numeric_stable_mode = True self.soft_label = False - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.axis = 2 self.ignore_index = 2 self.shape = [3, 5, 7, 11] -- GitLab From 580442cebafa80af93f4fe350dfcd00d4768096e Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 16 Mar 2021 10:23:46 +0800 Subject: [PATCH 019/486] fix wget with no proxy on windows (#31505) * fix wget with no proxy on windows * modified import packages * fix format error * fix bug * fix format error * fix format error --- tools/get_pr_ut.py | 53 ++++++++++++++++++++++++++++------ tools/windows/run_unittests.sh | 2 ++ 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index e97f69faf02..001f380049f 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -20,12 +20,15 @@ import sys import time import subprocess import requests +import urllib.request +import ssl import platform from github import Github PADDLE_ROOT = os.getenv('PADDLE_ROOT', '/paddle/') PADDLE_ROOT += '/' PADDLE_ROOT = PADDLE_ROOT.replace('//', '/') +ssl._create_default_https_context = ssl._create_unverified_context class PRChecker(object): @@ -75,7 +78,10 @@ class PRChecker(object): if ix // 2 == 0: proxy = '' else: - proxy = '--no-proxy' + if platform.system() == 'Windows': + proxy = '-Y off' + else: + proxy = '--no-proxy' code = subprocess.call( 'wget -q {} --no-check-certificate {}'.format(proxy, url), shell=True) @@ -88,6 +94,33 @@ class PRChecker(object): ix += 1 return False + def __urlretrieve(self, url, filename): + ix = 1 + with_proxy = urllib.request.getproxies() + without_proxy = {'http': '', 'http': ''} + while 
ix < 6: + if ix // 2 == 0: + cur_proxy = urllib.request.ProxyHandler(without_proxy) + else: + cur_proxy = urllib.request.ProxyHandler(with_proxy) + opener = urllib.request.build_opener(cur_proxy, + urllib.request.HTTPHandler) + urllib.request.install_opener(opener) + try: + urllib.request.urlretrieve(url, filename) + except Exception as e: + print(e) + print( + 'PREC download {} error, retry {} time(s) after {} secs.[proxy_option={}]'. + format(url, ix, ix * 10, proxy)) + continue + else: + return True + time.sleep(ix * 10) + ix += 1 + + return False + def get_pr_files(self): """ Get files in pull request. """ page = 0 @@ -202,9 +235,9 @@ class PRChecker(object): check_added_ut = False ut_list = [] file_ut_map = None - ret = self.__wget_with_retry( + ret = self.__urlretrieve( 'https://sys-p0.bj.bcebos.com/prec/file_ut.json{}'.format( - self.suffix)) + self.suffix), 'file_ut.json{}'.format(self.suffix)) if not ret: print('PREC download file_ut.json failed') exit(1) @@ -213,9 +246,11 @@ class PRChecker(object): for f in self.get_pr_files(): current_system = platform.system() if current_system == "Darwin" or current_system == "Windows": - f = f.replace(PADDLE_ROOT, '/paddle/', 1) - f = f.replace('//', '/') - if f not in file_ut_map: + f_judge = f.replace(PADDLE_ROOT, '/paddle/', 1) + f_judge = f_judge.replace('//', '/') + else: + f_judge = f + if f_judge not in file_ut_map: if f.endswith('.md'): ut_list.append('md_placeholder') elif f.endswith('.h') or f.endswith('.cu'): @@ -245,7 +280,7 @@ class PRChecker(object): if self.is_only_comment(f): ut_list.append('map_comment_placeholder') else: - ut_list.extend(file_ut_map.get(f)) + ut_list.extend(file_ut_map.get(f_judge)) ut_list = list(set(ut_list)) if check_added_ut: @@ -255,9 +290,9 @@ class PRChecker(object): ut_list.append(ut.rstrip('\r\n')) if ut_list: - ret = self.__wget_with_retry( + ret = self.__urlretrieve( 'https://sys-p0.bj.bcebos.com/prec/prec_delta{}'.format( - self.suffix)) + self.suffix), 'prec_delta{}'.format(self.suffix)) if ret: with open('prec_delta' + self.suffix) as delta: for ut in delta: diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 71b5e65214f..312711c5141 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -221,6 +221,7 @@ if [ ${PRECISION_TEST:-OFF} == "ON" ]; then fi fi +set +e if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then UT_list_prec='' re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') @@ -238,6 +239,7 @@ if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then done UT_list=$UT_list_prec fi +set -e output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") eight_parallel_job=$(echo $output | cut -d ";" -f 1) -- GitLab From 9c624b16d5aa4c938fc7bd81a3e51d5f76f5226b Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 16 Mar 2021 11:24:53 +0800 Subject: [PATCH 020/486] Extend unittest time of (#31570) --- python/paddle/fluid/tests/book/CMakeLists.txt | 2 +- .../tests/unittests/test_fleet_launch_ps.sh | 36 +++++++++---------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index 6f717302468..09c650f16e2 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -9,7 +9,7 @@ endforeach() set_tests_properties(test_word2vec_book 
PROPERTIES TIMEOUT 120) set_tests_properties(test_recognize_digits PROPERTIES TIMEOUT 120) set_tests_properties(test_image_classification PROPERTIES TIMEOUT 200) -set_tests_properties(test_label_semantic_roles PROPERTIES TIMEOUT 120) +set_tests_properties(test_label_semantic_roles PROPERTIES TIMEOUT 240) set_tests_properties(test_machine_translation PROPERTIES TIMEOUT 120) set_tests_properties(test_rnn_encoder_decoder PROPERTIES TIMEOUT 120) set_tests_properties(test_fit_a_line PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh index 21875851bf5..67a8d7e5750 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh @@ -16,27 +16,19 @@ set -e -function test_launch_ps(){ - server_port_0=${PADDLE_DIST_UT_PORT} - server_port_1=$(( PADDLE_DIST_UT_PORT + 1 )) - echo "server_port_0:${server_port_0} server_port_1=${server_port_1}" - python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 fleet_ps_training.py 2> ut.elog - if grep -q "server are killed" ut.elog; then - echo "test pserver launch succeed" - else - echo "test pserver launch failed" - exit -1 - fi +server_port_0=${PADDLE_DIST_UT_PORT} +server_port_1=$(( PADDLE_DIST_UT_PORT + 1 )) +worker_port_0=$(( PADDLE_DIST_UT_PORT + 2 )) +worker_port_1=$(( PADDLE_DIST_UT_PORT + 3 )) +heter_worker_port_0=$(( PADDLE_DIST_UT_PORT + 4 )) +heter_worker_port_1=$(( PADDLE_DIST_UT_PORT + 5 )) - python -m paddle.distributed.fleet.launch --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog - if grep -q "server are killed" ut.elog; then - echo "test pserver launch succeed" - else - echo "test pserver launch failed" - exit -1 - fi +function test_launch_ps(){ - python -m paddle.distributed.fleet.launch --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog + python -m paddle.distributed.fleet.launch \ + --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \ + --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \ + fleet_ps_training.py 2> ut.elog if grep -q "server are killed" ut.elog; then echo "test pserver launch succeed" else @@ -46,7 +38,11 @@ function test_launch_ps(){ } function test_launch_ps_heter(){ - python -m paddle.distributed.fleet.launch --server_num=2 --worker_num=2 --heter_worker_num=2 fleet_ps_training.py 2> ut.elog + python -m paddle.distributed.fleet.launch \ + --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \ + --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \ + --heter_workers="127.0.0.1:${heter_worker_port_0},127.0.0.1:${heter_worker_port_1}" \ + fleet_ps_training.py 2> ut.elog if grep -q "server are killed" ut.elog; then echo "test heter pserver launch succeed" else -- GitLab From c1b1ccfbf562ebcb04e29966076202e2a062549c Mon Sep 17 00:00:00 2001 From: yiak Date: Tue, 16 Mar 2021 14:56:14 +0800 Subject: [PATCH 021/486] Update tinyformat.h (#31612) Quick fix to https://github.com/PaddlePaddle/Paddle/issues/13860 --- paddle/fluid/string/tinyformat/tinyformat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/string/tinyformat/tinyformat.h b/paddle/fluid/string/tinyformat/tinyformat.h index a5c1798e100..7498c6a46e3 100644 --- a/paddle/fluid/string/tinyformat/tinyformat.h +++ 
b/paddle/fluid/string/tinyformat/tinyformat.h @@ -777,7 +777,7 @@ inline void formatImpl(std::ostream &out, const char *fmt, // Print remaining part of format string. fmt = printFormatStringLiteral(out, fmt); - if (*fmt != '\0') + if (fmt != nullptr && *fmt != '\0' && *fmt != 0) TINYFORMAT_ERROR( "tinyformat: Too many conversion specifiers in format string"); -- GitLab From 41e9ecfd1fcfee1bb1f77c5ab29c5d14184110be Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Tue, 16 Mar 2021 15:17:45 +0800 Subject: [PATCH 022/486] Optimize compilation with Ninja (#31449) * Optimize compilation with Ninja, notest, test=windows_ci, test=windows_op * no cache on windows ci, notest, test=windows_ci, test=windows_op * delete /Zc:inline compiled in NVCC, notest, test=windows_ci, test=windows_op * fix test_warpctc_op, notest, test=windows_ci * remove test code, test=develop --- CMakeLists.txt | 7 +++++++ cmake/external/warpctc.cmake | 12 ++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f24513d605c..992c3f1c4fa 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,6 +72,13 @@ if(WIN32) set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + if (MSVC_STATIC_CRT) message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /MTd") diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 0ee3e2116a9..e633cae5401 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -49,12 +49,12 @@ ExternalProject_Add( BUILD_ALWAYS 1 CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} -- GitLab From 1a6e3b04cdb4b9c99f0bc81c92e0995e5c0483fd Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 16 Mar 2021 15:48:21 +0800 Subject: [PATCH 023/486] Second optimization of retry method (#31646) * Second optimization of retry method * fix show_ut_retry_result repeat execuate --- paddle/scripts/paddle_build.sh | 14 ++++++++++++++ tools/windows/run_unittests.sh | 6 ++++++ 2 files changed, 20 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 3b20a403b71..3fd93a664d4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -637,6 +637,13 @@ EOF do retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}"` + if [[ "${exec_times}" == "1" ]];then + if [[ "${failed_test_lists}" == "" ]];then + break + else + read 
retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + fi + fi echo "=========================================" echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" @@ -1250,6 +1257,13 @@ set +x do retry_unittests_record="$retry_unittests_record$failed_test_lists" failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + if [[ "${exec_times}" == "1" ]];then + if [[ "${failed_test_lists}" == "" ]];then + break + else + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + fi + fi echo "=========================================" echo "This is the ${exec_time_array[$exec_times]} time to re-run" echo "=========================================" diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 312711c5141..dd4b21c80d9 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -305,6 +305,12 @@ function unittests_retry(){ cur_order='first' elif ( [[ "$exec_times" == "1" ]] );then cur_order='second' + if [[ "$failed_test_lists" == "" ]]; then + break + else + retry_unittests=$(echo "${failed_test_lists}" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + retry_unittests_regular=$(echo "$retry_unittests" |awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') + fi elif ( [[ "$exec_times" == "2" ]] );then cur_order='third' fi -- GitLab From d9b50f664f31f978222317d1bec38f673893806a Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 16 Mar 2021 16:35:31 +0800 Subject: [PATCH 024/486] [ROCM] update ci scripts and dockefile, test=develop (#31551) --- paddle/scripts/paddle_build.sh | 22 +++++++++++++++++++--- tools/dockerfile/Dockerfile.rocm | 16 +++++++++------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 3fd93a664d4..7a360ac2296 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -205,6 +205,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so" pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "conda-python3.7" ]; then + export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/conda/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/conda/bin/python + -DPYTHON_INCLUDE_DIR:PATH=/opt/conda/include/python3.7m + -DPYTHON_LIBRARIES:FILEPATH=/opt/conda/lib/libpython3.so" + /opt/conda/bin/pip install -r ${PADDLE_ROOT}/python/requirements.txt fi else pip install -r ${PADDLE_ROOT}/python/requirements.txt @@ -230,7 +237,8 @@ function cmake_base() { ${PYTHON_FLAGS} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_TENSORRT=${WITH_TENSORRT:-ON} - -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} + -DWITH_ROCM=${WITH_ROCM:-OFF} + -DWITH_RCCL=${WITH_RCCL:-OFF} -DWITH_DISTRIBUTE=${distibuted_flag} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} @@ -267,7 +275,8 @@ EOF ${PYTHON_FLAGS} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_TENSORRT=${WITH_TENSORRT:-ON} \ - -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ + -DWITH_ROCM=${WITH_ROCM:-OFF} \ + -DWITH_RCCL=${WITH_RCCL:-OFF} \ -DWITH_DISTRIBUTE=${distibuted_flag} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ @@ -1028,6 +1037,8 @@ function card_test() { # get the 
CUDA device count, XPU device count is one if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 + elif [ "${WITH_ROCM}" == "ON" ];then + CUDA_DEVICE_COUNT=4 else CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi @@ -1423,7 +1434,7 @@ function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl - if [ "$WITH_GPU" == "ON" ];then + if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else if [ "$WITH_XPU" == "ON" ];then @@ -1982,6 +1993,11 @@ function main() { parallel_test check_coverage ;; + check_rocm_coverage) + cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + parallel_test + check_coverage + ;; cmake_gen) cmake_gen ${PYTHON_ABI:-""} ;; diff --git a/tools/dockerfile/Dockerfile.rocm b/tools/dockerfile/Dockerfile.rocm index eab4ef07c87..5df66b9ea63 100644 --- a/tools/dockerfile/Dockerfile.rocm +++ b/tools/dockerfile/Dockerfile.rocm @@ -5,7 +5,6 @@ # Build: ROCM 4.0.1 # cd Paddle/tools/dockerfile # docker build -f Dockerfile.rocm \ -# --build-arg ROCM_VERSION=4.0.1 \ # -t paddlepaddle/paddle-centos-rocm401-dev:latest . # # docker run -it --device=/dev/kfd --device=/dev/dri \ @@ -22,7 +21,7 @@ ENV LANGUAGE en_US.UTF-8 RUN yum install -y epel-release deltarpm sudo openssh-server gettext-devel sqlite-devel \ zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel \ make bzip2 git patch unzip bison yasm diffutils automake which file kernel-headers kernel-devel \ - net-tools numactl-devel chrpath + net-tools numactl-devel chrpath screen initscripts # Install devtoolset-7 RUN yum install -y yum-utils centos-release-scl && \ @@ -45,11 +44,10 @@ RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && \ ENV PATH=/opt/cmake-3.16/bin:${PATH} # ROCM -ARG ROCM_VERSION RUN yum install -y kmod wget openblas-devel epel-release RUN echo "[ROCm]" > /etc/yum.repos.d/rocm.repo && \ echo "name=ROCm" >> /etc/yum.repos.d/rocm.repo && \ - echo "baseurl=http://repo.radeon.com/rocm/yum/${ROCM_VERSION}" >> /etc/yum.repos.d/rocm.repo && \ + echo "baseurl=http://repo.radeon.com/rocm/yum/4.0.1" >> /etc/yum.repos.d/rocm.repo && \ echo "enabled=1" >> /etc/yum.repos.d/rocm.repo && \ echo "gpgcheck=0" >> /etc/yum.repos.d/rocm.repo RUN yum install -y rocm-dev rocm-utils rocfft miopen-hip rocblas hipsparse rocrand rccl hipcub rocthrust rocprofiler-dev roctracer-dev @@ -89,10 +87,14 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p cd .. 
&& rm -f protobuf-cpp-3.6.1.tar.gz && rm -rf protobuf-3.6.1 # conda -RUN cd /opt && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && chmod +x Miniconda3-latest-Linux-x86_64.sh -RUN mkdir /opt/conda && ./Miniconda3-latest-Linux-x86_64.sh -b -f -p "/opt/conda" && rm -rf Miniconda3-latest-Linux-x86_64.sh +ENV CONDA_FILE=Miniconda3-py37_4.9.2-Linux-x86_64.sh +RUN cd /opt && wget https://repo.anaconda.com/miniconda/${CONDA_FILE} && chmod +x ${CONDA_FILE} +RUN mkdir /opt/conda && ./${CONDA_FILE} -b -f -p "/opt/conda" && rm -rf ${CONDA_FILE} ENV PATH=/opt/conda/bin:${PATH} -RUN conda init bash && conda install -n base jupyter +RUN conda init bash && conda install -n base jupyter jupyterlab + +# install pylint and pre-commit +RUN /opt/conda/bin/pip install pre-commit pylint pytest astroid isort protocol PyGithub # install Paddle requirement RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt -- GitLab From cdc5a55ac1c929920fb204e5e57023e5fab0a947 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 16 Mar 2021 19:40:46 +0800 Subject: [PATCH 025/486] turn off added ut check on windows (#31660) --- tools/get_pr_ut.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 001f380049f..58d7d2c0d6b 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -265,7 +265,8 @@ class PRChecker(object): '.cu'): if f.find('test_') != -1 or f.find('_test') != -1: print('PREC {} need check new ut'.format(f)) - check_added_ut = True + if current_system != "Windows": + check_added_ut = True elif self.is_only_comment(f): ut_list.append('nomap_comment_placeholder') else: -- GitLab From 4c0c55bba14dd0b0e4197a8dcda5a71b76ee020a Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 17 Mar 2021 11:48:18 +0800 Subject: [PATCH 026/486] support Geforce RTX 30+ GPU (#31529) --- CMakeLists.txt | 6 +++++- cmake/cuda.cmake | 12 ++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 992c3f1c4fa..10b3b0aba4e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,7 +95,7 @@ if(WIN32) endif() endforeach(flag_var) endif() - + # NOTE(Avin0323): Less parallel count result in faster compilation. math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") # windows build turn off warnings, use parallel compiling. 
@@ -123,6 +123,10 @@ if(WIN32) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + string(APPEND ${flag_var} "/ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + endforeach(flag_var) + if (WITH_WIN_DUMP_DBG) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 2f4f5449f48..c4d1384312e 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -91,7 +91,7 @@ function(select_nvcc_arch_flags out_variable) if(${CUDA_ARCH_NAME} STREQUAL "Manual") set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported") - set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") + set(CUDA_ARCH_PTX "" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for") mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX) else() unset(CUDA_ARCH_BIN CACHE) @@ -175,14 +175,22 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.2) # CUDA 11.0/11.1 set(paddle_known_gpu_archs ${paddle_known_gpu_archs11}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") +elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.2+ + set(paddle_known_gpu_archs "${paddle_known_gpu_archs11} 86") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") endif() if (NOT ${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) -- GitLab From 19592d2b7108e8afe618a7a5cfd14e1d93acc378 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Wed, 17 Mar 2021 15:26:06 +0800 Subject: [PATCH 027/486] Refine dygraph qat, test=develop (#31680) --- .../slim/quantization/imperative/qat.py | 483 ++++++++++-------- .../slim/quantization/imperative/utils.py | 46 ++ 2 files changed, 303 insertions(+), 226 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/quantization/imperative/utils.py diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index afe8a3de667..04aec158eac 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ 
-25,101 +25,99 @@ from paddle.fluid.executor import Executor from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D, BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm +from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D +from paddle.nn import BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm from paddle.fluid.dygraph.nn import BatchNorm, Pool2D from paddle.fluid.io import load_inference_model, save_inference_model -from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6, Tanh, Softmax, PReLU, Swish +from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6 +from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish from paddle.fluid.log_helper import get_logger from . import quant_nn from .. import quantization_pass +from . import utils -__all__ = ['ImperativeQuantAware', 'ImperativeCalcOutScale'] +__all__ = ['ImperativeQuantAware'] _logger = get_logger( __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') -_op_real_in_out_name = { - "conv2d": [["Input", "Filter"], ["Output"]], - "depthwise_conv2d": [["Input", "Filter"], ["Output"]], - "pool2d": [["X"], ["Out"]], - "elementwise_add": [["X", "Y"], ["Out"]], - "softmax": [["X"], ["Out"]], - "relu": [["X"], ["Out"]], - "relu6": [["X"], ["Out"]], - "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], - "tanh": [["X"], ["Out"]], - "batch_norm": [["X"], ["Y"]], - "sigmoid": [["X"], ["Out"]], - "swish": [["X"], ["Out"]], -} - class ImperativeQuantAware(object): """ - Add the fake quant logic for given quantizable layers, namely add the quant_dequant - computational logic both for activation inputs and weight inputs. + Applying quantization aware training (QAT) to dgraph model. """ def __init__(self, - weight_bits=8, - activation_bits=8, + quantizable_layer_type=['Conv2D', 'Linear'], weight_quantize_type='abs_max', activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, moving_rate=0.9, - quantizable_layer_type=['Conv2D', 'Linear'], weight_preprocess_layer=None, act_preprocess_layer=None, weight_quantize_layer=None, act_quantize_layer=None): - r""" + """ The constructor for ImperativeQuantAware. Args: - weight_bits(int): quantization bit number for weights, - whereas the bias is not quantized. - activation_bits(int): quantization bit number for activations. + quantizable_layer_type(list[str]): List the type of layers that + will be quantized. Default is ['Conv2D', 'Linear']. + The quantizable_op_type in QuantizationFreezePass and + ConvertToInt8Pass must be the same as this. weight_quantize_type(str): quantization type for weights, which supports 'abs_max' now. The 'moving_average_abs_max' - usually is not used for weights, since weights are fixed once the - model is well trained. + usually is not used for weights, since weights are fixed + once the model is well trained. activation_quantize_type(str): quantization type for activations, which supports 'abs_max' and 'moving_average_abs_max' now. - If using 'abs_max' mode, the quantization scale will be calculated - dynamically each step in both training and testing period. If using - 'moving_average_abs_max', the static quantization scale will be calculated - during training and used in inference. - moving_rate(float): the parameter for 'moving_average_abs_max' quantization. 
- quantizable_layer_type(list[str]): List the type of layers that will be quantized. - Default is ['Conv2D', 'Linear']. The quantizable_op_type in - QuantizationFreezePass and ConvertToInt8Pass must be the same as this. - weight_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess - weight before quantization. Using this can quickly test if user's - preprocess method works or not. The input is non-quantized - weight and function returns processed weight to be quantized. - If None, the weight will be quantized directly. Default is None. - act_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to preprocess - activation before quantization. Using this can quickly test if user's - preprocess method works or not. The input is non-quantized - activation and function returns processed activation to be quantized. - If None, the activation will be quantized directly. Default is None. - weight_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize weight. + If using 'abs_max' mode, the quantization scale will be + calculated dynamically each step in both training and testing + period. If using 'moving_average_abs_max', the static + quantization scale will be calculated during training and + used in inference. + weight_bits(int): quantization bit number for weights, + whereas the bias is not quantized. + activation_bits(int): quantization bit number for activations. + moving_rate(float): the parameter for 'moving_average_abs_max' + quantization. + weight_preprocess_layer(paddle.nn.Layer, optional): A paddle + Layer that defines how to preprocess weight before quantization. + Using this can quickly test if user's preprocess method works + or not. The input is non-quantized weight and function returns + processed weight to be quantized. + If None, the weight will be quantized directly. + Default is None. + act_preprocess_layer(paddle.nn.Layer, optional): A paddle Layer + that defines how to preprocess activation before quantization. + Using this can quickly test if user's preprocess method works + or not. The input is non-quantized activation and function returns + processed activation to be quantized. + If None, the activation will be quantized directly. + Default is None. + weight_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that + defines how to quantize weight. Using this can quickly test if user's quantization method works or not. In this layer, user should both define quantization method and dequantization method, that is, the function's input is non-quantized - weight and returns dequantized weight. If None, will use - quantization op defined by 'weight_quantize_type'. Default is None. - act_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines how to quantize activation. + weight and returns dequantized weight. + If None, will use uantization op defined by 'weight_quantize_type'. + Default is None. + act_quantize_layer(paddle.nn.Layer, optional): A paddle Layer that defines + how to quantize activation. Using this can quickly test if user's quantization method works or not. In this layer, user should both define quantization method and dequantization method, that is, the function's input is non-quantized - activation and returns dequantized activation. If None, will use - quantization op defined by 'activation_quantize_type'. Default is None. + activation and returns dequantized activation. + If None, will use quantization op defined by 'activation_quantize_type'. 
+ Default is None. Note: - If user sets attribute 'skip_quant' to a Layer that support dynamic quantization and sets - it to true, the layer would not be quantized during training. If this attribute is not sets - or the attribute is false, the Layer would be qunatized in training. + If user sets attribute 'skip_quant' to a Layer that support dynamic + quantization and sets it to true, the layer would not be quantized + during training. If this attribute is not sets or the attribute is + false, the Layer would be qunatized in training. Examples 1: .. code-block:: python @@ -196,141 +194,175 @@ class ImperativeQuantAware(object): model_path="./imperative_model_qat") """ super(ImperativeQuantAware, self).__init__() - self._weight_bits = weight_bits - self._activation_bits = activation_bits - self._moving_rate = moving_rate - self._activation_quantize_type = activation_quantize_type - self._weight_quantize_type = weight_quantize_type - - self._weight_pre_layer = weight_preprocess_layer - self._act_pre_layer = act_preprocess_layer - self._weight_quant_layer = weight_quantize_layer - self._act_quant_layer = act_quantize_layer - self._out_scale = ImperativeCalcOutScale() - - t_check = lambda method: method is None or issubclass(method, dygraph.layers.Layer) - assert t_check( - self._weight_pre_layer), "weight_preprocess should be nn.Layer" - assert t_check(self._act_pre_layer), "act_preprocess should be nn.Layer" - assert t_check( - self._weight_quant_layer), "weight_quantize should be nn.Layer" - assert t_check(self._act_quant_layer), "act_quantize should be nn.Layer" - - quant_type = { - 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' - } - assert activation_quantize_type != 'channel_wise_abs_max', \ - "The activation quantization type does not support 'channel_wise_abs_max'." - if activation_quantize_type not in quant_type: - raise ValueError( - "Unknown activation_quantize_type : '%s'. It can only be " - "'abs_max' or 'moving_average_abs_max' now." % - (str(activation_quantize_type))) - if weight_quantize_type not in quant_type: - raise ValueError( - "Unknown weight_quantize_type: '%s'. It can only be " - "'abs_max' or 'moving_average_abs_max' or 'channel_wise_abs_max' now." 
- % (str(weight_quantize_type))) - - self._quant_layers_map = { - 'Conv2D': Conv2D, - 'Linear': Linear, - 'Pool2D': Pool2D, - 'ReLU': ReLU, - 'LeakyReLU': LeakyReLU, - 'ReLU6': ReLU6, - 'Softmax': Softmax, - 'Tanh': Tanh, - 'Swish': Swish + kwargs = { + "quantizable_layer_type": quantizable_layer_type, + "weight_quantize_type": weight_quantize_type, + "activation_quantize_type": activation_quantize_type, + "weight_bits": weight_bits, + "activation_bits": activation_bits, + "moving_rate": moving_rate, + "weight_preprocess_layer": weight_preprocess_layer, + "act_preprocess_layer": act_preprocess_layer, + "weight_quantize_layer": weight_quantize_layer, + "act_quantize_layer": act_quantize_layer } - self._quantizable_layer_type = tuple( - self._quant_layers_map[layer] - if layer in self._quant_layers_map else layer - for layer in quantizable_layer_type) - for layer in self._quantizable_layer_type: - assert not isinstance( - layer, str), "{} is unspported to be quantized.".format(layer) + + self._quantize_inputs = ImperativeQuantizeInputs(**kwargs) + + self._calc_output_scale = ImperativeCalcOutputScale() def quantize(self, model): """ - According to weights' and activations' quantization types, the model will be added some fake - quant ops, such as fake_quantize_dequantize_moving_average_abs_max, fake_quantize_dequantize_abs_max - and so on. At the same time, the out_scale value of outputs would be calculated. + According to weights' and activations' quantization types, + the model will be added some fake quant ops, such as + fake_quantize_dequantize_moving_average_abs_max, + fake_quantize_dequantize_abs_max and so on. At the same time, + the out_scale value of outputs would be calculated. Args: model(fluid.dygraph.Layer): the model to be quantized. Returns: None """ + assert isinstance(model, dygraph.Layer), \ + "The model must be the instance of dygraph.Layer." + self._quantize_inputs.apply(model) + self._calc_output_scale.apply(model) + + def save_quantized_model(self, layer, path, input_spec=None, **config): + self._calc_output_scale.save_quantized_model(layer, path, input_spec, + **config) + + +class ImperativeQuantizeInputs(object): + """ + Based on the input params, add the quant_dequant computational + logic both for activation inputs and weight inputs. + """ + + def __init__(self, + quantizable_layer_type=['Conv2D', 'Linear'], + weight_quantize_type='abs_max', + activation_quantize_type='moving_average_abs_max', + weight_bits=8, + activation_bits=8, + moving_rate=0.9, + weight_preprocess_layer=None, + act_preprocess_layer=None, + weight_quantize_layer=None, + act_quantize_layer=None): + """ + The constructor for ImperativeQuantizeInputs. + + Please refer to the args of ImperativeQuantAware. + """ + super(ImperativeQuantizeInputs, self).__init__() + + self._quantizable_layer_type = tuple( + utils._quant_layers_map[layer] + if layer in utils._quant_layers_map else layer + for layer in quantizable_layer_type) + for layer in self._quantizable_layer_type: + assert not isinstance(layer, str), \ + "%s is unspported to be quantized." % layer + + quantize_type = { + 'abs_max', 'moving_average_abs_max', 'channel_wise_abs_max' + } + assert weight_quantize_type in quantize_type, \ + "Unsupported weight_quantize_type: %s. It can only " \ + "be abs_max or moving_average_abs_max or " \ + "channel_wise_abs_max." % weight_quantize_type + assert activation_quantize_type != 'channel_wise_abs_max' \ + and activation_quantize_type in quantize_type, \ + "Unsupported activation_quantize_type: %s. 
It can " \ + "only be abs_max or moving_average_abs_max now." \ + % activation_quantize_type + + bits_check = lambda bits: isinstance(bits, int) \ + and bits >= 0 and bits <= 16 + assert bits_check(weight_bits), \ + "weight_bits should be 1, 2,... or 16." + assert bits_check(activation_bits), \ + "activation_bits should be 1, 2,... or 16." + + layer_check = lambda method: method is None or \ + issubclass(method, dygraph.layers.Layer) + assert layer_check(weight_preprocess_layer), \ + "weight_preprocess should be nn.Layer." + assert layer_check(act_preprocess_layer), \ + "act_preprocess should be nn.Layer." + assert layer_check(weight_quantize_layer), \ + "weight_quantize should be nn.Layer." + assert layer_check(act_quantize_layer), \ + "act_quantize should be nn.Layer." + + self._kwargs = { + "weight_quantize_type": weight_quantize_type, + "activation_quantize_type": activation_quantize_type, + "weight_bits": weight_bits, + "activation_bits": activation_bits, + "moving_rate": moving_rate, + "weight_pre_layer": weight_preprocess_layer, + "act_pre_layer": act_preprocess_layer, + "weight_quant_layer": weight_quantize_layer, + "act_quant_layer": act_quantize_layer + } + + def apply(self, model): + assert isinstance(model, dygraph.Layer), \ + "The model must be the instance of dygraph.Layer." + for name, layer in model.named_sublayers(): - if not isinstance(layer, self._quantizable_layer_type): - continue - if hasattr(layer, "skip_quant") and layer.skip_quant == True: + if not isinstance(layer, self._quantizable_layer_type) \ + or (hasattr(layer, "skip_quant") \ + and layer.skip_quant == True): continue + # TODO(jc): optimize this module last_idx = 0 idx = 0 obj = model - parent = model - while idx < len(name): if (name[idx] == '.'): - if hasattr(parent, name[last_idx:idx]): + if hasattr(obj, name[last_idx:idx]): obj = getattr(obj, name[last_idx:idx]) - parent = obj last_idx = idx + 1 idx += 1 target = name[last_idx:idx] - quant_layer = self._get_quantized_counterpart(layer) + quant_layer = self._get_quantized_layer(layer) setattr(quant_layer, "layer_name", layer.full_name()) setattr(obj, target, quant_layer) - self._out_scale.calc_out_scale(model) - - def _get_quantized_counterpart(self, layer): - quant_layers = tuple(self._quant_layers_map.values()) - quantized_counterpart = tuple('Quantized' + k - for k in self._quant_layers_map.keys()) - - predicate = lambda value: isinstance(layer, value) - index_generator = (i for i, v in enumerate(quant_layers) - if predicate(v)) - - try: - index = next(index_generator) - except StopIteration: - _logger.fatal("The layer {} is unsupported to be quantized.".format( - layer.full_name())) - sys.exit(-1) + def _get_quantized_layer(self, layer): + quant_layer_name = None + for key, value in utils._quant_layers_map.items(): + if isinstance(layer, value): + quant_layer_name = 'Quantized' + key + break + assert quant_layer_name is not None, \ + "The layer %s is unsupported to be quantized." 
\ + % layer.full_name() layer_with_weight = ['QuantizedConv2D', 'QuantizedLinear'] - if quantized_counterpart[index] not in layer_with_weight: - quant_layer_class_name = 'QuantizedNoweightLayer' - else: - quant_layer_class_name = quantized_counterpart[index] - quantized_layer = quant_nn.__dict__[quant_layer_class_name]( - layer, self._weight_bits, self._activation_bits, self._moving_rate, - self._weight_quantize_type, self._activation_quantize_type, - self._weight_pre_layer, self._act_pre_layer, - self._weight_quant_layer, self._act_quant_layer) - return quantized_layer + if quant_layer_name not in layer_with_weight: + quant_layer_name = 'QuantizedNoweightLayer' - def save_quantized_model(self, layer, path, input_spec=None, **config): - self._out_scale.save_quantized_model(layer, path, input_spec, **config) + return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) -class ImperativeCalcOutScale(object): +class ImperativeCalcOutputScale(object): def __init__(self, moving_rate=0.9): """ - Add the logic of calculating and setting output quantization scales of some layers. - These output quantization scales may be used by tensorRT or some other inference engines. + Add the logic of calculating and setting output scales of some layers. Args: - moving_rate(float): The decay coefficient of moving average. The default value is 0.9. + moving_rate(float): The decay coefficient of moving average. + The default value is 0.9. """ - super(ImperativeCalcOutScale, self).__init__() + super(ImperativeCalcOutputScale, self).__init__() self._moving_rate = moving_rate self._out_scale_layer_type_list = ( BatchNorm, BatchNorm1D, BatchNorm2D, BatchNorm3D, Conv2D, LeakyReLU, @@ -339,83 +371,22 @@ class ImperativeCalcOutScale(object): self._register_hook_handle_list = [] self._out_scale_dict = collections.OrderedDict() - # Determine whether layer supports calculation out_scale - def _is_matched_layer(self, layer): - if not isinstance(layer, self._out_scale_layer_type_list): - if 'quantized_' not in layer.full_name(): - return False - return True - - # When inferenc model is saved, the logic in hook would not be executed - # in program translation, so that some parameters can not created in - # __init__, which would cause the model to fail to save. Therefore, the - # parameters creation in the hook is advanced to be exected outside the hook. 
- def _add_new_parameters(self, layer, name=None): - dtype = layer._dtype if layer._dtype is not None else "float32" - if dtype not in ["float32", "float64"]: - return - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - scale_name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=scale_name, initializer=Constant(1), trainable=False) - layer._quant_out_scale = layer.create_parameter( - shape=[1], attr=scale_attr, dtype=dtype) - layer._quant_out_scale.stop_gradient = True - - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - layer._quant_out_state = layer.create_parameter( - shape=[1], attr=state_attr, dtype=dtype) - layer._quant_out_state.stop_gradient = True - - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - layer._quant_out_accum = layer.create_parameter( - shape=[1], attr=accum_attr, dtype=dtype) - layer._quant_out_accum.stop_gradient = True - - # Judge whether the op in program matches the Layer in dynamic model - def _is_op_matched(self, layer_name, op, block): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in [ - core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32 - ]: - return False - - # Because the naming styles of static and dynamic graph are different, - # in order to avoid mistakes, we unify the name here. - op_type = output_var_names[0].split(".")[0] - op_type = op_type.rsplit("_", 1)[0] - if op_type == 'depthwise_conv2d': - op_type = 'conv2d' - if 'prelu' in op_type: - op_type = op_type.replace('prelu', 'p_re_lu') - if 'relu' in op_type: - op_type = op_type.replace('relu', 're_lu') - return op_type in layer_name - - def calc_out_scale(self, model): + def apply(self, model): """ - Insert the `moving_average_abs_max_scale` op to calculate output scale of Specific layers in model. + Insert the `moving_average_abs_max_scale` op to calculate output + scale of specific layers in model. Args: - model(fluid.dygraph.Layer): The target model which would be calculate the output quantization scale. + model(fluid.dygraph.Layer): The target model which would be + calculate the output quantization scale. Returns: None """ - assert isinstance( - model, dygraph.Layer), "model must be the instance of dygraph.Layer" + assert isinstance(model, dygraph.Layer), \ + "The model must be the instance of dygraph.Layer." 
for _, layer in model.named_sublayers(): - if self._is_matched_layer(layer): + if self._is_target_layer(layer): self._add_new_parameters(layer) forward_post_hook_handle = layer.register_forward_post_hook( self._forward_post_hook) @@ -459,7 +430,7 @@ class ImperativeCalcOutScale(object): .numpy()) else: for _, sub_layer in self._layer.named_sublayers(): - if self._is_matched_layer(sub_layer): + if self._is_target_layer(sub_layer): layer_name = sub_layer.full_name() if hasattr(sub_layer, "layer_name"): layer_name = sub_layer.layer_name @@ -510,7 +481,7 @@ class ImperativeCalcOutScale(object): forward_op = None for block in inference_program.blocks: for op in block.ops: - if op.type in _op_real_in_out_name: + if op.type in utils._op_real_in_out_name: if op_count > len(ops_list): warnings.warn( "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" @@ -567,6 +538,66 @@ class ImperativeCalcOutScale(object): if is_dynamic_mode: paddle.disable_static() + def _is_target_layer(self, layer): + return isinstance(layer, self._out_scale_layer_type_list) \ + or 'quantized_' in layer.full_name() + + # When inferenc model is saved, the logic in hook would not be executed + # in program translation, so that some parameters can not created in + # __init__, which would cause the model to fail to save. Therefore, the + # parameters creation in the hook is advanced to be exected outside the hook. + def _add_new_parameters(self, layer, name=None): + dtype = layer._dtype if layer._dtype is not None else "float32" + if dtype not in ["float32", "float64"]: + return + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + layer._quant_out_scale = layer.create_parameter( + shape=[1], attr=scale_attr, dtype=dtype) + layer._quant_out_scale.stop_gradient = True + + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + layer._quant_out_state = layer.create_parameter( + shape=[1], attr=state_attr, dtype=dtype) + layer._quant_out_state.stop_gradient = True + + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + layer._quant_out_accum = layer.create_parameter( + shape=[1], attr=accum_attr, dtype=dtype) + layer._quant_out_accum.stop_gradient = True + + # Judge whether the op in program matches the Layer in dynamic model + def _is_op_matched(self, layer_name, op, block): + output_var_names = quantization_pass._get_op_output_var_names(op) + for output_var_name in output_var_names: + output_var_tensor = block.var(output_var_name) + if output_var_tensor.dtype not in [ + core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32 + ]: + return False + + # Because the naming styles of static and dynamic graph are different, + # in order to avoid mistakes, we unify the name here. 
+ op_type = output_var_names[0].split(".")[0] + op_type = op_type.rsplit("_", 1)[0] + if op_type == 'depthwise_conv2d': + op_type = 'conv2d' + if 'prelu' in op_type: + op_type = op_type.replace('prelu', 'p_re_lu') + if 'relu' in op_type: + op_type = op_type.replace('relu', 're_lu') + return op_type in layer_name + def _forward_post_hook(self, layer, input, output): assert isinstance( output, (core.VarBase, framework.Variable) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py new file mode 100644 index 00000000000..a732181db7d --- /dev/null +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.nn import Linear, Conv2D +from paddle.fluid.dygraph.nn import Pool2D +from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6 +from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish + +_op_real_in_out_name = { + "conv2d": [["Input", "Filter"], ["Output"]], + "depthwise_conv2d": [["Input", "Filter"], ["Output"]], + "pool2d": [["X"], ["Out"]], + "elementwise_add": [["X", "Y"], ["Out"]], + "softmax": [["X"], ["Out"]], + "relu": [["X"], ["Out"]], + "relu6": [["X"], ["Out"]], + "leaky_relu": [["X"], ["Out"]], + "prelu": [["X"], ["Out"]], + "tanh": [["X"], ["Out"]], + "batch_norm": [["X"], ["Y"]], + "sigmoid": [["X"], ["Out"]], + "swish": [["X"], ["Out"]], +} + +_quant_layers_map = { + 'Conv2D': Conv2D, + 'Linear': Linear, + 'Pool2D': Pool2D, + 'ReLU': ReLU, + 'LeakyReLU': LeakyReLU, + 'ReLU6': ReLU6, + 'Softmax': Softmax, + 'Tanh': Tanh, + 'Swish': Swish +} -- GitLab From 2fbe9b097a41bff2b8c73296bf52e387ec88842a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 17 Mar 2021 16:21:03 +0800 Subject: [PATCH 028/486] [CustomOp] Remove Eigen dependencies of float16 (#31669) * remove eigen deps dof float16 * add cstdlib header * replace stdlib header by cmath --- paddle/fluid/platform/eigen_ext.h | 96 ++++++++++++++++ paddle/fluid/platform/float16.h | 152 +++----------------------- paddle/fluid/platform/float16_test.cc | 14 +-- paddle/fluid/platform/float16_test.cu | 1 + 4 files changed, 112 insertions(+), 151 deletions(-) diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index 9e2c3630468..a8ad729a31a 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -17,6 +17,7 @@ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -26,6 +27,7 @@ namespace Eigen { using bfloat16 = paddle::platform::bfloat16; using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; +using float16 = paddle::platform::float16; template 
struct NumTraits; @@ -103,6 +105,33 @@ struct NumTraits : GenericNumTraits> { static inline int digits10() { return NumTraits::digits10(); } }; +template <> +struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + + HOSTDEVICE static inline float16 epsilon() { + return paddle::platform::raw_uint16_to_float16(0x0800); + } + HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); } + HOSTDEVICE static inline float16 highest() { + return paddle::platform::raw_uint16_to_float16(0x7bff); + } + HOSTDEVICE static inline float16 lowest() { + return paddle::platform::raw_uint16_to_float16(0xfbff); + } + HOSTDEVICE static inline float16 infinity() { + return paddle::platform::raw_uint16_to_float16(0x7c00); + } + HOSTDEVICE static inline float16 quiet_NaN() { + return paddle::platform::raw_uint16_to_float16(0x7c01); + } +}; + namespace numext { //////////// bfloat methods ///////////// @@ -302,5 +331,72 @@ HOSTDEVICE inline double abs(const complex128& a) { return paddle::platform::abs(a); } +//////////// float16 methods ///////////// + +template <> +HOSTDEVICE inline bool(isnan)(const float16& a) { + return (paddle::platform::isnan)(a); +} + +template <> +HOSTDEVICE inline bool(isinf)(const float16& a) { + return (paddle::platform::isinf)(a); +} + +template <> +HOSTDEVICE inline bool(isfinite)(const float16& a) { + return (paddle::platform::isfinite)(a); +} + +template <> +HOSTDEVICE inline float16 exp(const float16& a) { + return float16(::expf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 erf(const float16& a) { + return float16(::erff(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 log(const float16& a) { + return float16(::logf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 tanh(const float16& a) { + return float16(::tanhf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 sqrt(const float16& a) { + return float16(::sqrtf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 ceil(const float16& a) { + return float16(::ceilf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 floor(const float16& a) { + return float16(::floorf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 round(const float16& a) { + return float16(::roundf(static_cast(a))); +} + +template <> +HOSTDEVICE inline float16 pow(const float16& a, const float16& b) { + return float16(::powf(static_cast(a), static_cast(b))); +} + +template <> +HOSTDEVICE inline float16 abs(const float16& a) { + return float16(::fabs(static_cast(a))); +} + } // namespace numext } // namespace Eigen diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index df2a24400b4..bdd4d54b3d1 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -15,6 +15,9 @@ limitations under the License. */ #pragma once #include + +#include +#include #include #ifdef PADDLE_WITH_CUDA @@ -25,18 +28,6 @@ limitations under the License. */ #include #endif -#ifdef __GNUC__ -#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) -#else -#define PADDLE_GNUC_VER 0 -#endif // __GNUC__ - -#ifdef __clang__ -#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) -#else -#define PADDLE_CLANG_VER 0 -#endif // __clang__ - #if defined(__CUDACC__) && CUDA_VERSION >= 7050 #define PADDLE_CUDA_FP16 #include @@ -55,17 +46,15 @@ limitations under the License. 
*/ #define CUDA_ARCH_FP16_SUPPORTED(CUDA_ARCH) (CUDA_ARCH >= 600) -namespace paddle { -namespace platform { - -// Forward declare float16 for eigen.h -struct float16; - -} // namespace platform -} // namespace paddle - -#include "paddle/fluid/platform/hostdevice.h" -#include "unsupported/Eigen/CXX11/Tensor" +#if (defined(__CUDACC__) || defined(__HIPCC__)) +#define HOSTDEVICE __host__ __device__ +#define DEVICE __device__ +#define HOST __host__ +#else +#define HOSTDEVICE +#define DEVICE +#define HOST +#endif namespace paddle { namespace platform { @@ -73,7 +62,7 @@ namespace platform { // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible -// with CUDA half, ARM float16_t, and Eigen::half data types. +// with CUDA half, ARM float16_t data types. struct PADDLE_ALIGN(2) float16 { public: uint16_t x; @@ -100,8 +89,6 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} - #ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 @@ -163,11 +150,6 @@ struct PADDLE_ALIGN(2) float16 { } #endif - HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { - x = rhs.x; - return *this; - } - #ifdef PADDLE_WITH_NATIVE_FP16 HOSTDEVICE inline float16& operator=(const float16_t& rhs) { x = *reinterpret_cast(&rhs); @@ -245,12 +227,6 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - HOSTDEVICE inline explicit operator Eigen::half() const { - Eigen::half h; - h.x = x; - return h; - } - #ifdef PADDLE_WITH_NATIVE_FP16 HOSTDEVICE inline explicit operator float16_t() const { return *reinterpret_cast(this); @@ -1108,105 +1084,3 @@ HOSTDEVICE inline paddle::platform::float16 abs( } } // namespace std - -namespace Eigen { - -using float16 = paddle::platform::float16; - -template <> -struct NumTraits : GenericNumTraits { - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - HOSTDEVICE static inline float16 epsilon() { - return paddle::platform::raw_uint16_to_float16(0x0800); - } - HOSTDEVICE static inline float16 dummy_precision() { return float16(1e-2f); } - HOSTDEVICE static inline float16 highest() { - return paddle::platform::raw_uint16_to_float16(0x7bff); - } - HOSTDEVICE static inline float16 lowest() { - return paddle::platform::raw_uint16_to_float16(0xfbff); - } - HOSTDEVICE static inline float16 infinity() { - return paddle::platform::raw_uint16_to_float16(0x7c00); - } - HOSTDEVICE static inline float16 quiet_NaN() { - return paddle::platform::raw_uint16_to_float16(0x7c01); - } -}; - -namespace numext { - -template <> -HOSTDEVICE inline bool(isnan)(const float16& a) { - return (paddle::platform::isnan)(a); -} - -template <> -HOSTDEVICE inline bool(isinf)(const float16& a) { - return (paddle::platform::isinf)(a); -} - -template <> -HOSTDEVICE inline bool(isfinite)(const float16& a) { - return (paddle::platform::isfinite)(a); -} - -template <> -HOSTDEVICE inline float16 exp(const float16& a) { - return float16(::expf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 erf(const float16& a) { - return float16(::erff(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 log(const float16& a) { - return float16(::logf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 tanh(const float16& a) { - return 
float16(::tanhf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 sqrt(const float16& a) { - return float16(::sqrtf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 ceil(const float16& a) { - return float16(::ceilf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 floor(const float16& a) { - return float16(::floorf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 round(const float16& a) { - return float16(::roundf(static_cast(a))); -} - -template <> -HOSTDEVICE inline float16 pow(const float16& a, const float16& b) { - return float16(::powf(static_cast(a), static_cast(b))); -} - -template <> -HOSTDEVICE inline float16 abs(const float16& a) { - return float16(::fabs(static_cast(a))); -} - -} // namespace numext - -} // namespace Eigen diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index f607988d920..56633a35116 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -8,26 +8,19 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include "paddle/fluid/platform/float16.h" #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { TEST(float16, conversion_cpu) { - // Explicit conversion from Eigen::half - EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); - // Conversion from float EXPECT_EQ(float16(1.0f).x, 0x3c00); EXPECT_EQ(float16(0.5f).x, 0x3800); @@ -61,8 +54,6 @@ TEST(float16, conversion_cpu) { float16 v_assign; v_assign = float16(0); EXPECT_EQ(v_assign.x, 0x0000); - v_assign = Eigen::half(1.0f); - EXPECT_EQ(v_assign.x, 0x3c00); v_assign = 0.5f; EXPECT_EQ(v_assign.x, 0x3800); v_assign = 0.33333; @@ -73,7 +64,6 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(v_assign.x, 0x3c00); // Conversion operator - EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); EXPECT_EQ(static_cast(float16(0.5f)), 0.5f); EXPECT_NEAR(static_cast(float16(0.33333)), 0.33333, 0.0001); EXPECT_EQ(static_cast(float16(-1)), -1); diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 527da790414..d181660e311 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/eigen_ext.h" #include "paddle/fluid/platform/enforce.h" #define ARITHMETIC_KERNEL(op_type, sign) \ -- GitLab From 402288ad6525f08d00a0b05eb66ed52dc1ad3e3a Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Wed, 17 Mar 2021 19:13:53 +0800 Subject: [PATCH 029/486] In __getitem__, convert integers to int64 Tensor not int32 to be compatible with Lite(#31658) --- python/paddle/fluid/framework.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 04ed384846f..036e8ab3044 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -877,7 +877,7 @@ def _getitem_impl_(var, item): new_list_tensor.append(dim) else: assert (isinstance(dim, int)) - temp_out = var.block.create_var(dtype='int32') + temp_out = var.block.create_var(dtype='int64') fill_constant([1], dim, force_cpu=True, out=temp_out) new_list_tensor.append(temp_out) return new_list_tensor -- GitLab From 7f50bb7ec162c42285d3822e643c93685a9c917e Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Wed, 17 Mar 2021 19:22:29 +0800 Subject: [PATCH 030/486] support NHWC for temporal_shift op (#31642) --- paddle/fluid/operators/temporal_shift_op.cc | 19 +- paddle/fluid/operators/temporal_shift_op.cu | 179 +++++++++++---- paddle/fluid/operators/temporal_shift_op.h | 211 ++++++++++++------ python/paddle/fluid/layers/nn.py | 22 +- .../tests/unittests/test_temporal_shift_op.py | 33 ++- 5 files changed, 338 insertions(+), 126 deletions(-) diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 2e87447ed16..acf99d09ffb 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -80,7 +80,8 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor of temporal shift operator. " - "This is a 4-D tensor with shape of [N*T, C, H, W]. " + "This is a 4-D tensor with shape of [N*T, C, H, W] " + "or [N*T, H, W, C]. " "While N is the batch size, T is the temporal segment " "number, C is the channel number, H is the height of " "features and W is the width of features. " @@ -100,15 +101,23 @@ class TemporalShiftOpMaker : public framework::OpProtoAndCheckerMaker { "by 1 along the temporal dimension. :attr:`shift_ratio` should be in " "range [0, 0.5]. Default 0.25.") .SetDefault(0.25); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "an optional string from: \"NHWC\", \"NCHW\". " + "Specify that the data format of the input and output data is " + "channel_first or channel_last.") + .SetDefault("NCHW"); AddComment(R"DOC( This operator calculates the temporal shifting features for Input(X). - Input(X) should be in shape of [N*T, C, H, W], while N is the batch - size, T is the temporal segment number specified by :attr:`seg_num`, - C is the channel number, H and W is the height and width of features. + Input(X) should be in shape of [N*T, C, H, W] or [N*T, H, W, C], while + N is the batch size, T is the temporal segment number specified by + :attr:`seg_num`, C is the channel number, H and W is the height and + width of features. - Temporal Shifting is calculated as follows: + Temporal Shifting is calculated as follows when data format is NCHW: Step 1: Reshape Input(X) to [N, T, C, H, W]. 
diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu
index 4f2d7ce3cff..cb1ff5335cd 100644
--- a/paddle/fluid/operators/temporal_shift_op.cu
+++ b/paddle/fluid/operators/temporal_shift_op.cu
@@ -19,22 +19,46 @@ namespace operators {
 using framework::Tensor;
 
 template <typename T>
-__global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
-                                  const int tchw, const int chw, const int hw,
-                                  const int w, const int t, const int c,
-                                  const float shift_ratio) {
+__global__ void KeTemporalShiftFwNCHW(const T* input, T* output,
+                                      const int ntchw, const int tchw,
+                                      const int chw, const int hw, const int t,
+                                      const int c1, const int c2) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   int src_it = 0;
+
   for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
     int it = (tid % tchw) / chw;
     int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
+    if (ic < c1) {
+      src_it = it - 1;
+    } else if (ic < c2) {
+      src_it = it + 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it < 0 || src_it >= t) {
+      output[tid] = 0;
+    } else {
+      output[tid] = input[tid + (src_it - it) * chw];
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTemporalShiftFwNHWC(const T* input, T* output,
+                                      const int nthwc, const int thwc,
+                                      const int hwc, const int t, const int c,
+                                      const int c1, const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
 
     if (ic < c1) {
       src_it = it - 1;
@@ -47,42 +71,65 @@ __global__ void KeTemporalShiftFw(const T* input, T* output, const int ntchw,
     if (src_it < 0 || src_it >= t) {
       output[tid] = 0;
     } else {
-      int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w);
-      output[tid] = input[src_idx];
+      output[tid] = input[tid + (src_it - it) * hwc];
     }
   }
 }
 
 template <typename T>
-__global__ void KeTemporalShiftBw(const T* output_grad, T* input_grad,
-                                  const int ntchw, const int tchw,
-                                  const int chw, const int hw, const int w,
-                                  const int t, const int c,
-                                  const float shift_ratio) {
+__global__ void KeTemporalShiftBwNCHW(const T* output_grad, T* input_grad,
+                                      const int ntchw, const int tchw,
+                                      const int chw, const int hw, const int t,
+                                      const int c1, const int c2) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int stride = blockDim.x * gridDim.x;
   int src_it = 0;
+
   for (; tid < ntchw; tid += stride) {
-    int in = tid / tchw;
     int it = (tid % tchw) / chw;
     int ic = (tid % chw) / hw;
-    int ih = (tid % hw) / w;
-    int iw = tid % w;
-
-    const int c1 = static_cast<int>(c * shift_ratio);
-    const int c2 = static_cast<int>(c * 2 * shift_ratio);
 
     if (ic < c1) {
-      src_it = it - 1;
+      src_it = it + 1;
    } else if (ic < c2) {
+      src_it = it - 1;
+    } else {
+      src_it = it;
+    }
+
+    if (src_it >= 0 && src_it < t) {
+      input_grad[tid] = output_grad[tid + (src_it - it) * chw];
+    } else {
+      input_grad[tid] = 0;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeTemporalShiftBwNHWC(const T* output_grad, T* input_grad,
+                                      const int nthwc, const int thwc,
+                                      const int hwc, const int t, const int c,
+                                      const int c1, const int c2) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int src_it = 0;
+
+  for (; tid < nthwc; tid += stride) {
+    int it = (tid % thwc) / hwc;
+    int ic = tid % c;
+
+    if (ic < c1) {
       src_it = it + 1;
+    } else if (ic < c2) {
+      src_it = it - 1;
     } else {
src_it = it; } if (src_it >= 0 && src_it < t) { - int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w); - input_grad[src_idx] = output_grad[tid]; + input_grad[tid] = output_grad[tid + (src_it - it) * hwc]; + } else { + input_grad[tid] = 0; } } } @@ -98,27 +145,48 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_layout = + framework::StringToDataLayout(data_format_str); const int nt = input->dims()[0]; - const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; + const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1] + : input->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2] + : input->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3] + : input->dims()[2]); const int hw = h * w; const int chw = c * hw; const int tchw = t * chw; const int ntchw = nt * chw; + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + framework::DDim out_dims = (data_layout == DataLayout::kNCHW + ? framework::make_ddim({nt, c, h, w}) + : framework::make_ddim({nt, h, w, c})); const T* input_data = input->data(); - T* output_data = output->mutable_data({nt, c, h, w}, ctx.GetPlace()); + T* output_data = output->mutable_data(out_dims, ctx.GetPlace()); int pixelNum = nt * chw; - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); + int threads = 1024; + int grid = (pixelNum + threads - 1) / threads; + const auto& dev_ctx = ctx.cuda_device_context(); + int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads; + grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); - KeTemporalShiftFw<<>>( - input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio); + if (data_layout == DataLayout::kNCHW) { + KeTemporalShiftFwNCHW< + T><<>>( + input_data, output_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + KeTemporalShiftFwNHWC< + T><<>>( + input_data, output_data, ntchw, tchw, chw, t, c, c1, c2); + } } }; @@ -130,32 +198,49 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { auto* output_grad = ctx.Input(framework::GradVarName("Out")); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_layout = + framework::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; - const int c = output_grad->dims()[1]; - const int h = output_grad->dims()[2]; - const int w = output_grad->dims()[3]; + const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] + : output_grad->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2] + : output_grad->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3] + : output_grad->dims()[2]); const int hw = h * w; const int chw = c * hw; const int tchw = t * chw; const int ntchw = nt * chw; + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + framework::DDim in_grad_dims = (data_layout == DataLayout::kNCHW + ? 
framework::make_ddim({nt, c, h, w}) + : framework::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); T* input_grad_data = - input_grad->mutable_data({nt, c, h, w}, ctx.GetPlace()); - math::SetConstant()( - ctx.template device_context(), input_grad, - static_cast(0)); + input_grad->mutable_data(in_grad_dims, ctx.GetPlace()); int pixelNum = nt * chw; - platform::GpuLaunchConfig config = - platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); + int threads = 1024; + int grid = (pixelNum + threads - 1) / threads; + const auto& dev_ctx = ctx.cuda_device_context(); + int blocks_per_sm = dev_ctx.GetMaxPhysicalThreadCount() / threads; + grid = std::min(dev_ctx.GetSMCount() * blocks_per_sm, grid); - KeTemporalShiftBw<<>>( - output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c, - shift_ratio); + if (data_layout == DataLayout::kNCHW) { + KeTemporalShiftBwNCHW< + T><<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, hw, t, c1, c2); + } else { + KeTemporalShiftBwNHWC< + T><<>>( + output_grad_data, input_grad_data, ntchw, tchw, chw, t, c, c1, c2); + } } }; diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 4c7eed5af47..05364b94c92 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -17,12 +17,106 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; -static HOSTDEVICE inline int GetEntryIndex(int in, int it, int ic, int ih, - int iw, const int tchw, - const int chw, const int hw, - const int w) { - return in * tchw + it * chw + ic * hw + ih * w + iw; +template +void TemporalShiftFwNCHW(const T* input, T* output, const int ntchw, + const int tchw, const int chw, const int hw, + const int t, const int c1, const int c2) { + int src_it = 0; + for (int i = 0; i < ntchw; i++) { + int it = (i % tchw) / chw; + int ic = (i % chw) / hw; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[i] = 0; + } else { + output[i] = input[i + (src_it - it) * chw]; + } + } +} + +template +void TemporalShiftFwNHWC(const T* input, T* output, const int nthwc, + const int thwc, const int hwc, const int t, + const int c, const int c1, const int c2) { + int src_it = 0; + for (int i = 0; i < nthwc; i++) { + int it = (i % thwc) / hwc; + int ic = i % c; + + if (ic < c1) { + src_it = it - 1; + } else if (ic < c2) { + src_it = it + 1; + } else { + src_it = it; + } + + if (src_it < 0 || src_it >= t) { + output[i] = 0; + } else { + output[i] = input[i + (src_it - it) * hwc]; + } + } +} + +template +void TemporalShiftBwNCHW(const T* output_grad, T* input_grad, const int ntchw, + const int tchw, const int chw, const int hw, + const int t, const int c1, const int c2) { + int src_it = 0; + for (int i = 0; i < ntchw; i++) { + int it = (i % tchw) / chw; + int ic = (i % chw) / hw; + + if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[i] = output_grad[i + (src_it - it) * chw]; + } else { + input_grad[i] = 0; + } + } +} + +template +void TemporalShiftBwNHWC(const T* output_grad, T* input_grad, const int nthwc, + const int thwc, const int hwc, const int t, + const int c, const int c1, const int c2) { + int src_it = 0; + for (int i = 0; i < nthwc; i++) { + int it = (i % thwc) / hwc; + int ic = i % c; + + 
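    // Backward mapping, mirroring TemporalShiftFwNHWC above: channels [0, c1)
    // were read from frame it-1 in the forward pass, so their gradient comes
    // from frame it+1; channels [c1, c2) were read from frame it+1, so their
    // gradient comes from frame it-1; the remaining channels pass straight
    // through. Source frames outside [0, t) contribute zero.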
if (ic < c1) { + src_it = it + 1; + } else if (ic < c2) { + src_it = it - 1; + } else { + src_it = it; + } + + if (src_it >= 0 && src_it < t) { + input_grad[i] = output_grad[i + (src_it - it) * hwc]; + } else { + input_grad[i] = 0; + } + } } template @@ -33,44 +127,38 @@ class TemporalShiftKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_layout = + framework::StringToDataLayout(data_format_str); const int nt = input->dims()[0]; - const int c = input->dims()[1]; - const int h = input->dims()[2]; - const int w = input->dims()[3]; - - const int c1 = static_cast(c * shift_ratio); - const int c2 = static_cast(c * 2 * shift_ratio); + const int c = (data_layout == DataLayout::kNCHW ? input->dims()[1] + : input->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? input->dims()[2] + : input->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? input->dims()[3] + : input->dims()[2]); const int hw = h * w; const int chw = c * hw; const int tchw = t * chw; + const int ntchw = nt * chw; + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + + framework::DDim out_dims = (data_layout == DataLayout::kNCHW + ? framework::make_ddim({nt, c, h, w}) + : framework::make_ddim({nt, h, w, c})); const T* input_data = input->data(); - T* output_data = output->mutable_data({nt, c, h, w}, ctx.GetPlace()); - - int src_it = 0; - for (int i = 0; i < output->numel(); i++) { - int in = i / tchw; - int it = (i % tchw) / chw; - int ic = (i % chw) / hw; - int ih = (i % hw) / w; - int iw = i % w; - - if (ic < c1) { - src_it = it - 1; - } else if (ic < c2) { - src_it = it + 1; - } else { - src_it = it; - } - - if (src_it < 0 || src_it >= t) { - output_data[i] = 0; - } else { - int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w); - output_data[i] = input_data[src_idx]; - } + T* output_data = output->mutable_data(out_dims, ctx.GetPlace()); + + if (data_layout == DataLayout::kNCHW) { + TemporalShiftFwNCHW(input_data, output_data, ntchw, tchw, chw, hw, t, + c1, c2); + } else { + TemporalShiftFwNHWC(input_data, output_data, ntchw, tchw, chw, t, c, + c1, c2); } } }; @@ -83,44 +171,39 @@ class TemporalShiftGradKernel : public framework::OpKernel { auto* output_grad = ctx.Input(framework::GradVarName("Out")); int t = ctx.Attr("seg_num"); float shift_ratio = ctx.Attr("shift_ratio"); + const std::string data_format_str = ctx.Attr("data_format"); + const DataLayout data_layout = + framework::StringToDataLayout(data_format_str); const int nt = output_grad->dims()[0]; - const int c = output_grad->dims()[1]; - const int h = output_grad->dims()[2]; - const int w = output_grad->dims()[3]; - - const int c1 = static_cast(c * shift_ratio); - const int c2 = static_cast(c * 2 * shift_ratio); + const int c = (data_layout == DataLayout::kNCHW ? output_grad->dims()[1] + : output_grad->dims()[3]); + const int h = (data_layout == DataLayout::kNCHW ? output_grad->dims()[2] + : output_grad->dims()[1]); + const int w = (data_layout == DataLayout::kNCHW ? output_grad->dims()[3] + : output_grad->dims()[2]); const int hw = h * w; const int chw = c * hw; const int tchw = t * chw; + const int ntchw = nt * chw; + + const int c1 = static_cast(c * shift_ratio); + const int c2 = static_cast(c * 2 * shift_ratio); + framework::DDim in_grad_dims = (data_layout == DataLayout::kNCHW + ? 
framework::make_ddim({nt, c, h, w}) + : framework::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); T* input_grad_data = - input_grad->mutable_data({nt, c, h, w}, ctx.GetPlace()); - memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); - - int src_it = 0; - for (int i = 0; i < output_grad->numel(); i++) { - int in = i / tchw; - int it = (i % tchw) / chw; - int ic = (i % chw) / hw; - int ih = (i % hw) / w; - int iw = i % w; - - if (ic < c1) { - src_it = it - 1; - } else if (ic < c2) { - src_it = it + 1; - } else { - src_it = it; - } - - if (src_it >= 0 && src_it < t) { - int src_idx = GetEntryIndex(in, src_it, ic, ih, iw, tchw, chw, hw, w); - input_grad_data[src_idx] = output_grad_data[i]; - } + input_grad->mutable_data(in_grad_dims, ctx.GetPlace()); + + if (data_layout == DataLayout::kNCHW) { + TemporalShiftBwNCHW(output_grad_data, input_grad_data, ntchw, tchw, + chw, hw, t, c1, c2); + } else { + TemporalShiftBwNHWC(output_grad_data, input_grad_data, ntchw, tchw, + chw, t, c, c1, c2); } } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8d96e46f833..fa8df14c866 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13334,7 +13334,7 @@ def shuffle_channel(x, group, name=None): @templatedoc() -def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): +def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): """ **Temporal Shift Operator** @@ -13348,6 +13348,8 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + data_format(str, optional): Data format that specifies the layout of input. + It can be "NCHW" or "NHWC". Default: "NCHW". Returns: out(Tensor): The temporal shifting result is a tensor with the @@ -13365,6 +13367,13 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ + if data_format not in ["NCHW", "NHWC"]: + raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. 
" + "Received Attr(data_format): {}.".format(data_format)) + if in_dygraph_mode(): + return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', + shift_ratio, 'data_format', data_format) + helper = LayerHelper("temporal_shift", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') check_type(seg_num, 'seg_num', int, 'temporal_shift') @@ -13375,16 +13384,15 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None): if not isinstance(seg_num, int): raise TypeError("seg_num must be int type.") - if in_dygraph_mode(): - return core.ops.temporal_shift(x, 'seg_num', seg_num, 'shift_ratio', - shift_ratio) - helper.append_op( type="temporal_shift", inputs={"X": x}, outputs={"Out": out}, - attrs={"seg_num": seg_num, - "shift_ratio": shift_ratio}) + attrs={ + "seg_num": seg_num, + "shift_ratio": shift_ratio, + "data_format": data_format + }) return out diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 050c38e5499..5bab4a52bf0 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -22,7 +22,9 @@ import paddle from paddle.fluid import core -def temporal_shift(x, seg_num, shift_ratio): +def temporal_shift(x, seg_num, shift_ratio, data_format): + if data_format == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) shape = x.shape reshape_x = x.reshape((-1, seg_num, shape[1], shape[2], shape[3])) pad_x = np.pad(reshape_x, ((0, 0), (1, 1), (0, 0), (0, 0), (0, 0)), @@ -33,7 +35,10 @@ def temporal_shift(x, seg_num, shift_ratio): slice2 = pad_x[:, 2:seg_num + 2, c1:c2, :, :] slice3 = pad_x[:, 1:seg_num + 1, c2:, :, :] concat_x = np.concatenate([slice1, slice2, slice3], axis=2) - return concat_x.reshape(shape) + out = concat_x.reshape(shape) + if data_format == "NHWC": + out = np.transpose(out, (0, 2, 3, 1)) + return out class TestTemporalShift(OpTest): @@ -45,11 +50,13 @@ class TestTemporalShift(OpTest): self.attrs = { "seg_num": self.seg_num, "shift_ratio": self.shift_ratio, + "data_format": self.data_format } self.inputs = {"X": x, } - output = temporal_shift(x, self.seg_num, self.shift_ratio) + output = temporal_shift(x, self.seg_num, self.shift_ratio, + self.data_format) self.outputs = {"Out": output} def test_check_output(self): @@ -63,6 +70,7 @@ class TestTemporalShift(OpTest): self.seg_num = 3 self.shift_ratio = 0.25 self.dtype = 'float64' + self.data_format = 'NCHW' class TestTemporalShift2(TestTemporalShift): @@ -70,6 +78,7 @@ class TestTemporalShift2(TestTemporalShift): self.x_shape = (4, 9, 7, 7) self.seg_num = 2 self.shift_ratio = 0.2 + self.data_format = 'NCHW' class TestTemporalShift3(TestTemporalShift): @@ -77,6 +86,15 @@ class TestTemporalShift3(TestTemporalShift): self.x_shape = (3, 10, 5, 5) self.seg_num = 1 self.shift_ratio = 0.3 + self.data_format = 'NCHW' + + +class TestTemporalShift4(TestTemporalShift): + def initTestCase(self): + self.x_shape = (6, 5, 5, 4) + self.seg_num = 3 + self.shift_ratio = 0.25 + self.data_format = 'NHWC' @unittest.skipIf(not core.is_compiled_with_cuda(), @@ -87,6 +105,7 @@ class TestTemporalShiftFP16(TestTemporalShift): self.seg_num = 1 self.shift_ratio = 0.3 self.dtype = 'float16' + self.data_format = 'NCHW' def test_check_output(self): place = core.CUDAPlace(0) @@ -114,6 +133,14 @@ class TestTemporalShiftAPI(unittest.TestCase): out = paddle.nn.functional.temporal_shift( x=input, seg_num=2, shift_ratio=0.2) + def test_error(self): + def 
attr_data_format(): + input = paddle.randn([6, 4, 2, 2]) + out = paddle.nn.functional.temporal_shift( + x=input, seg_num=2, shift_ratio=0.2, data_format="HWC") + + self.assertRaises(ValueError, attr_data_format) + if __name__ == "__main__": unittest.main() -- GitLab From 740359edaf819e679611968cf2ae13a25ccf5066 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Thu, 18 Mar 2021 10:15:39 +0800 Subject: [PATCH 031/486] remove useless import (#31700) * remove useless import. test=develop --- python/paddle/fluid/dataloader/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index ac90cbafe17..e46083295d1 100755 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py @@ -14,8 +14,8 @@ from __future__ import print_function +import paddle from .. import framework -import paddle.dataset.common __all__ = [ "Dataset", "IterableDataset", "TensorDataset", "ComposeDataset", -- GitLab From 09482ddec47bf844cde67aec2bf9f860573de4c0 Mon Sep 17 00:00:00 2001 From: Chengmo Date: Thu, 18 Mar 2021 10:50:46 +0800 Subject: [PATCH 032/486] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91Fix=20one?= =?UTF-8?q?=20ps=20gradient=20clip=20=20(#31664)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix one ps gradient clip --- .../distributed/fleet/runtime/the_one_ps.py | 3 +- .../fleet/parameter_server/ir/public.py | 2 +- .../fleet/parameter_server/ir/trainer_pass.py | 2 +- .../tests/unittests/test_dist_fleet_base.py | 15 ++-- .../unittests/test_dist_fleet_grad_clip.py | 87 +++++++++++-------- 5 files changed, 62 insertions(+), 47 deletions(-) diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index abec4710f5d..a5686806005 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -150,7 +150,8 @@ class CommonAccessor: oop = None for op in optimizer_ops: - if op.input("Param")[0] == param_name: + if ("Param" in op.input_names) and ( + op.input("Param")[0] == param_name): oop = op break diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py index b987e01bba4..baf8add04ca 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/public.py @@ -31,7 +31,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.ps_dispatcher import RoundR from paddle.fluid.transpiler.details.program_utils import delete_ops OP_NAME_SCOPE = "op_namescope" -CLIP_OP_NAME_SCOPE = "@CLIP" +CLIP_OP_NAME_SCOPE = "gradient_clip" STEP_COUNTER = "@PS_STEP_COUNTER@" LEARNING_RATE_DECAY_COUNTER = "@LR_DECAY_COUNTER@" diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 2292d4c0a4d..08e64c15c48 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -32,7 +32,7 @@ from paddle.fluid.incubate.fleet.parameter_server.ir.public import get_sparse_ta from paddle.fluid.incubate.fleet.parameter_server.mode import DistributedMode OP_NAME_SCOPE = "op_namescope" -CLIP_OP_NAME_SCOPE = "@CLIP" +CLIP_OP_NAME_SCOPE = "gradient_clip" STEP_COUNTER = 
"@PS_STEP_COUNTER@" OP_ROLE_VAR_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleVarAttrName() RPC_OP_ROLE_ATTR_NAME = core.op_proto_and_checker_maker.kOpRoleAttrName() diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py index 03d7251f829..e84e91de0ba 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_base.py @@ -18,6 +18,7 @@ from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler.distribu import paddle.distributed.fleet as fleet import paddle.distributed.fleet.base.role_maker as role_maker import paddle.fluid as fluid +import paddle """ high level unit test for distribute fleet. """ @@ -112,23 +113,21 @@ class FleetDistRunnerBase(object): def build_optimizer(self, avg_cost, strategy): use_grad_clip = int(os.getenv('GRAD_CLIP', 0)) + grad_clip = None if use_grad_clip: # 1: clip_by_value; 2: clip_by_norm; 3:clip_by_global_norm if use_grad_clip == 1: - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByValue(2.0)) + grad_clip = paddle.nn.ClipGradByValue(min=-5.0, max=5.0) elif use_grad_clip == 2: - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByNorm(2.0)) + grad_clip = paddle.nn.ClipGradByNorm(2.0) elif use_grad_clip == 3: - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(2.0)) + grad_clip = paddle.nn.ClipGradByGlobalNorm(2.0) use_decay = int(os.getenv("USE_DECAY", "0")) if use_decay: scheduler = paddle.optimizer.lr.ExponentialDecay( learning_rate=LEARNING_RATE, gamma=0.999, verbose=True) - optimizer = fluid.optimizer.SGD(scheduler) + optimizer = fluid.optimizer.SGD(scheduler, grad_clip=grad_clip) """ # learning rate decay method before 2.0 optimizer = fluid.optimizer.SGD( @@ -139,7 +138,7 @@ class FleetDistRunnerBase(object): staircase=True)) """ else: - optimizer = fluid.optimizer.SGD(LEARNING_RATE) + optimizer = fluid.optimizer.SGD(LEARNING_RATE, grad_clip=grad_clip) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py index 3c68af474cf..f9509d60072 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_grad_clip.py @@ -16,53 +16,66 @@ from __future__ import print_function import os import unittest -import paddle.fluid as fluid -import paddle.fluid.incubate.fleet.base.role_maker as role_maker -from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet -from paddle.fluid.transpiler.distribute_transpiler import DistributeTranspilerConfig from test_dist_fleet_base import TestFleetBase -from dist_fleet_simnet_bow import train_network -@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged") -class TestDistGeoClipByGlobalNormTranspiler(unittest.TestCase): - def test_pserver(self): - role = role_maker.UserDefinedRoleMaker( - current_id=0, - role=role_maker.Role.SERVER, - worker_num=2, - server_endpoints=["127.0.0.1:36011", "127.0.0.1:36012"]) +class TestDistGeoClipByGlobalNorm(TestFleetBase): + def _setup_config(self): + self._mode = "geo" + self._reader = "dataset" + self._geo_sgd_need_push_nums = 5 + self._grad_clip_mode = 3 - fleet.init(role) + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs 
= { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "" + } + required_envs.update(need_envs) - batch_size = 128 - is_sparse = True - is_distribute = False + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - strategy = DistributeTranspilerConfig() - strategy.sync_mode = False - strategy.geo_sgd_mode = True - strategy.geo_sgd_need_push_nums = 5 + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) - avg_cost, _, _, _ = train_network(batch_size, is_distribute, is_sparse) - fluid.clip.set_gradient_clip( - clip=fluid.clip.GradientClipByGlobalNorm(2.0)) + def _setup_config(self): + self._sync_mode = False + self._grad_clip_mode = 2 - optimizer = fluid.optimizer.SGD(0.1) - optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer.minimize(avg_cost) + def check_with_place(self, + model_file, + delta=1e-3, + check_error_log=False, + need_envs={}): + required_envs = { + "PATH": os.getenv("PATH", ""), + "PYTHONPATH": os.getenv("PYTHONPATH", ""), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast + "http_proxy": "" + } + required_envs.update(need_envs) + + tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs) - pserver_startup_program = fleet.startup_program - pserver_mian_program = fleet.main_program + def test_dist_train(self): + self.check_with_place( + "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) -@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged") -class TestDistGeoClipByGlobalNorm(TestFleetBase): +class TestDistASyncClipByValue(TestFleetBase): def _setup_config(self): - self._mode = "geo" + self._mode = "async" self._reader = "dataset" - self._geo_sgd_need_push_nums = 5 - self._grad_clip_mode = 3 + self._grad_clip_mode = 1 def check_with_place(self, model_file, @@ -84,8 +97,11 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase): self.check_with_place( "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) + +class TestDistASyncClipByNorm(TestFleetBase): def _setup_config(self): - self._sync_mode = False + self._mode = "async" + self._reader = "dataset" self._grad_clip_mode = 2 def check_with_place(self, @@ -109,7 +125,6 @@ class TestDistGeoClipByGlobalNorm(TestFleetBase): "dist_fleet_ctr.py", delta=1e-5, check_error_log=True) -@unittest.skip(reason="Skip unstable ut, add it after PR 22957 merged") class TestDistASyncClipByGlobalNorm(TestFleetBase): def _setup_config(self): self._mode = "async" -- GitLab From d4282ea97ece945b1d1d72aca4ed2aa794534c13 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 18 Mar 2021 12:44:34 +0800 Subject: [PATCH 033/486] fix multi cuda environment bug (#31694) --- python/paddle/utils/cpp_extension/extension_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index b68100fe521..1ff42a7bcbc 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -442,7 +442,8 @@ def find_cuda_home(): [which_cmd, 'nvcc'], stderr=devnull) if six.PY3: nvcc_path = nvcc_path.decode() - nvcc_path = nvcc_path.rstrip('\r\n') + # Multi CUDA, select the first + nvcc_path = nvcc_path.split('\r\n')[0] # for 
example: /usr/local/cuda/bin/nvcc cuda_home = os.path.dirname(os.path.dirname(nvcc_path)) -- GitLab From 4ea342786528b95c31135afd411c6bd81e89298b Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Thu, 18 Mar 2021 14:11:41 +0800 Subject: [PATCH 034/486] [Paddle-TRT] support batch axis concatenation when using dynamic shape (#31627) * support batch axis concatenation when using dynamic shape * opteller can't return true early, or some test will not be executed --- paddle/fluid/inference/tensorrt/op_teller.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 052d17878a5..72338bcef11 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -159,7 +159,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } else { int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (axis <= 0) return false; + if (with_dynamic_shape) { + if (axis < 0) return false; + } else { + if (axis <= 0) return false; + } } } if (op_type == "transpose2" || op_type == "transpose") { -- GitLab From fe241fd02f1c33ddba99c694f818a300fe8c371d Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Thu, 18 Mar 2021 14:32:43 +0800 Subject: [PATCH 035/486] [Paddle-TRT] gather converter (#31640) * trt gather converter * add trt gather unit_test --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/gather_op.cc | 78 +++++++++++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 5 ++ .../ir/inference/test_trt_gather_op.py | 70 +++++++++++++++++ 5 files changed, 155 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/gather_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index d6080bd6928..fc436311f07 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1191,6 +1191,7 @@ USE_TRT_CONVERTER(slice); USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); +USE_TRT_CONVERTER(gather); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index f9586ca1701..59205529ef4 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_converter pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + gather_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/gather_op.cc b/paddle/fluid/inference/tensorrt/convert/gather_op.cc new file mode 100644 index 00000000000..346a8bffa00 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/gather_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
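The cpp_extension change above is easiest to see with a concrete value: with several CUDA toolkits installed, the which/where lookup for nvcc can return one path per line, and rstrip only trims the trailing newline while split picks a single usable path. A small sketch with made-up paths:

    raw = "C:\\cuda_v10.2\\bin\\nvcc.exe\r\nC:\\cuda_v11.0\\bin\\nvcc.exe\r\n"

    print(raw.rstrip('\r\n'))     # both paths, still joined by the inner \r\n
    print(raw.split('\r\n')[0])   # first path only, safe for the os.path.dirname calls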
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Gather Op + */ +class GatherOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid gather op to tensorrt gather layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string index_name = op_desc.Input("Index").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto index_tensor = engine_->GetITensor(index_name); + + const int axis = 0; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Gather, *input_tensor, + *index_tensor, axis); + + auto odim = layer->getOutput(0)->getDimensions(); + + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + + nvinfer1::Dims target_shape{}; + target_shape.nbDims = odim.nbDims - 1; + for (int i = 0; i < axis; ++i) { + target_shape.d[i] = odim.d[i]; + } + target_shape.d[axis] = 0; + for (int i = axis + 1; i < target_shape.nbDims; ++i) { + target_shape.d[i] = odim.d[i + 1]; + } + + reshape_layer->setReshapeDimensions(target_shape); + + RreplenishLayerAndOutput(reshape_layer, "gather", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(gather, GatherOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 72338bcef11..44939606b49 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -109,6 +109,7 @@ struct SimpleOpTypeSetTeller : public Teller { "transpose", "flatten2", "flatten", + "gather", }; }; @@ -186,6 +187,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + if (op_type == "gather") { + // current not support axis from input, use default 0 + if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; + } if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py new file mode 100644 index 00000000000..fec15ea7295 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
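The converter above only handles the case the teller admits: dynamic shape and no Axis input, so gathering always happens along axis 0 with a 1-D index. A NumPy sketch of the semantics the TRT gather plus reshape has to reproduce:

    import numpy as np

    data = np.arange(12, dtype=np.float32).reshape(4, 3)
    index = np.array([2, 0], dtype=np.int32)

    out = data[index]          # same result as fluid.layers.gather(data, index=index)
    print(out.shape)           # (2, 3): rows 2 and 0 of data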
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTGatherTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name='data', shape=[-1, 512], dtype='float32') + index = fluid.data(name='index', shape=[-1], dtype='int32') + scale_out = self.append_gather(data, index) + out = fluid.layers.batch_norm(scale_out, is_test=True) + + index = np.arange(self.num_gather, dtype='int32') + np.random.shuffle(index) + + self.feeds = { + "data": np.random.random([self.bs, 512]).astype("float32"), + "index": index, + } + + self.enable_trt = True + self.trt_parameters = TRTGatherTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.num_gather = 16 + self.bs = 32 + + def append_gather(self, data, index): + return fluid.layers.gather(data, index=index) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTGatherTest1(TRTGatherTest): + def set_params(self): + self.num_gather = 32 + self.bs = 32 + + +if __name__ == "__main__": + unittest.main() -- GitLab From 87852616aaf2517567a68d6b7dd5a61ab3857380 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 18 Mar 2021 16:22:08 +0800 Subject: [PATCH 036/486] [CustomOp] Support complex dtype in custom op (#31657) * support custom complex op * fix detail error * add inference support * fix setup windows failed --- cmake/inference_lib.cmake | 6 + paddle/fluid/extension/include/ext_dispatch.h | 65 +++++++++ paddle/fluid/extension/include/ext_dtype.h | 31 ++-- paddle/fluid/extension/src/ext_tensor.cc | 34 +++++ paddle/fluid/framework/CMakeLists.txt | 7 +- paddle/fluid/framework/custom_operator.cc | 35 ++++- paddle/fluid/framework/custom_tensor_test.cc | 22 +++ paddle/fluid/framework/custom_tensor_utils.h | 8 ++ paddle/fluid/inference/CMakeLists.txt | 4 + paddle/fluid/pybind/CMakeLists.txt | 4 + .../fluid/tests/custom_op/CMakeLists.txt | 3 + .../fluid/tests/custom_op/custom_conj_op.cc | 94 ++++++++++++ .../fluid/tests/custom_op/dispatch_test_op.cc | 56 ++++++++ .../fluid/tests/custom_op/test_custom_conj.py | 136 ++++++++++++++++++ .../tests/custom_op/test_dispatch_jit.py | 20 +++ python/setup.py.in | 20 ++- 16 files changed, 530 insertions(+), 15 deletions(-) create mode 100644 python/paddle/fluid/tests/custom_op/custom_conj_op.cc create mode 100644 python/paddle/fluid/tests/custom_op/test_custom_conj.py diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 2cba3d06936..570b37ff118 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -192,6 +192,12 @@ 
include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include/* DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex64.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h index eed73604649..7b3893e2839 100644 --- a/paddle/fluid/extension/include/ext_dispatch.h +++ b/paddle/fluid/extension/include/ext_dispatch.h @@ -68,6 +68,22 @@ namespace paddle { } \ }() +///////// Complex Dispatch Marco /////////// + +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + ///////// Floating and Integral Dispatch Marco /////////// #define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ @@ -93,6 +109,55 @@ namespace paddle { } \ }() +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + // TODO(chenweihang): Add more Marcos in the future if needed } // namespace paddle diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h index 46c4bac2360..a1e58fbacdf 100644 --- a/paddle/fluid/extension/include/ext_dtype.h +++ b/paddle/fluid/extension/include/ext_dtype.h @@ -16,10 +16,15 @@ limitations under the License. */ #include #include +#include "complex128.h" // NOLINT +#include "complex64.h" // NOLINT #include "ext_exception.h" // NOLINT namespace paddle { +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; + enum class DataType { BOOL, INT8, @@ -29,6 +34,8 @@ enum class DataType { INT64, FLOAT32, FLOAT64, + COMPLEX64, + COMPLEX128, // TODO(JiabinYang) support more data types if needed. }; @@ -50,20 +57,26 @@ inline std::string ToString(DataType dtype) { return "float"; case DataType::FLOAT64: return "double"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; default: PD_THROW("Unsupported paddle enum data type."); } } -#define PD_FOR_EACH_DATA_TYPE(_) \ - _(bool, DataType::BOOL) \ - _(int8_t, DataType::INT8) \ - _(uint8_t, DataType::UINT8) \ - _(int16_t, DataType::INT16) \ - _(int, DataType::INT32) \ - _(int64_t, DataType::INT64) \ - _(float, DataType::FLOAT32) \ - _(double, DataType::FLOAT64) +#define PD_FOR_EACH_DATA_TYPE(_) \ + _(bool, DataType::BOOL) \ + _(int8_t, DataType::INT8) \ + _(uint8_t, DataType::UINT8) \ + _(int16_t, DataType::INT16) \ + _(int, DataType::INT32) \ + _(int64_t, DataType::INT64) \ + _(float, DataType::FLOAT32) \ + _(double, DataType::FLOAT64) \ + _(complex64, DataType::COMPLEX64) \ + _(complex128, DataType::COMPLEX128) template struct DataTypeToCPPType; diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 4434a3bf594..cb37bf180c3 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/extension/include/ext_tensor.h" + #include + #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/transform.h" @@ -162,6 +166,10 @@ DataType Tensor::type() const { return DataType::FLOAT64; } else if (type == framework::proto::VarType::BOOL) { return DataType::BOOL; + } else if (type == framework::proto::VarType::COMPLEX64) { + return DataType::COMPLEX64; + } else if (type == framework::proto::VarType::COMPLEX128) { + return DataType::COMPLEX128; } // TODO(JiabinYang) Support more dtype here return DataType::FLOAT32; @@ -217,6 +225,10 @@ template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( + const PlaceType &target_place) const; template PD_DLL_DECL float *Tensor::data() const; template PD_DLL_DECL double *Tensor::data() const; @@ -226,6 +238,10 @@ template PD_DLL_DECL uint8_t *Tensor::data() const; template PD_DLL_DECL int8_t *Tensor::data() const; template PD_DLL_DECL int16_t *Tensor::data() const; template PD_DLL_DECL bool *Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::data() const; template PD_DLL_DECL float *Tensor::mutable_data(); template PD_DLL_DECL double *Tensor::mutable_data(); @@ -235,6 +251,10 @@ template PD_DLL_DECL uint8_t *Tensor::mutable_data(); template PD_DLL_DECL int8_t *Tensor::mutable_data(); template PD_DLL_DECL int16_t *Tensor::mutable_data(); template PD_DLL_DECL bool *Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(); template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); template PD_DLL_DECL double *Tensor::mutable_data( @@ -250,6 +270,10 @@ template PD_DLL_DECL int8_t *Tensor::mutable_data( template PD_DLL_DECL int16_t *Tensor::mutable_data( const PlaceType &place); template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex64 * +Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::complex128 * +Tensor::mutable_data(const PlaceType &place); std::vector Tensor::shape() const { GET_CASTED_TENSOR @@ -310,6 +334,16 @@ Tensor Tensor::cast(const DataType &target_type) const { framework::VisitDataType( dst_type, CastDataType(*tensor, rlt_tensor_, ctx)); break; + case framework::proto::VarType::COMPLEX64: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; + case framework::proto::VarType::COMPLEX128: + framework::VisitDataType(dst_type, + CastDataType( + *tensor, rlt_tensor_, ctx)); + break; // TODO(JiabinYang) Support more dtype here default: PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 43bbc06787e..1fa4ce9b573 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -346,13 +346,16 @@ message(STATUS "branch: ${PADDLE_BRANCH}") 
configure_file(commit.h.in commit.h) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) + cc_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) cc_library(op_meta_info SRCS ../extension/src/ext_op_meta_info.cc DEPS custom_tensor) cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper custom_tensor op_meta_info) cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../extension/include) - set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 0baacd46213..69a9be603e6 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -757,10 +757,39 @@ void RegisterOperatorWithMetaInfo( return new CustomOperator(type, inputs, outputs, attrs); }; - // Grad InferShape (gradient's shape is same with forward input default) - grad_info.infer_shape_ = [grad_op_outputs](InferShapeContext* ctx) { + // Grad InferShape + grad_info.infer_shape_ = [grad_op_inputs, + grad_op_outputs](InferShapeContext* ctx) { + // 1. if forward input exists, gradient's shape is same with forward input + // default + // [Suitable for most situations] + // 2. if forward input not exists, and only contains one grad input and + // output, + // use grad input shape as grad output shape + // [Suitable for the situation that forward input is not used as + // backward input] + // TODO(chenweihang): support set grad op infershape func if needed for (auto& out_name : grad_op_outputs) { - ctx->ShareDim(detail::NoGrad(out_name), out_name); + auto fwd_name = detail::NoGrad(out_name); + if (detail::IsDuplicableVar(fwd_name)) { + // Duplicable forward var must as backward input + ctx->ShareDim(fwd_name, out_name); + } else { + if (ctx->HasInput(fwd_name)) { + ctx->ShareDim(fwd_name, out_name); + } else { + PADDLE_ENFORCE_EQ( + grad_op_inputs.size() == 1UL && grad_op_outputs.size() == 1UL, + true, + platform::errors::Unavailable( + "Custom grad operator infershape error. " + "If a custom grad operator contains only one input and " + "only one output, the input shape will be directly set to " + "the output shape. 
Otherwise, Please set the forward input " + "as the grad operator's input.")); + ctx->ShareDim(grad_op_inputs[0], out_name); + } + } } }; diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 2e42248f64b..7da56588600 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -109,6 +109,10 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "uint8 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); + VLOG(2) << "complex64 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); + VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); } void GroupTestCast() { @@ -126,6 +130,10 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "float cast"; TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex64 cast"; + TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "complex128 cast"; + TestCast(paddle::DataType::FLOAT32); } void GroupTestDtype() { @@ -136,6 +144,8 @@ void GroupTestDtype() { CHECK(TestDtype() == paddle::DataType::INT16); CHECK(TestDtype() == paddle::DataType::INT8); CHECK(TestDtype() == paddle::DataType::UINT8); + CHECK(TestDtype() == paddle::DataType::COMPLEX64); + CHECK(TestDtype() == paddle::DataType::COMPLEX128); } void GroupTestDtypeConvert() { @@ -162,6 +172,12 @@ void GroupTestDtypeConvert() { paddle::framework::proto::VarType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( paddle::DataType::BOOL) == paddle::framework::proto::VarType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX64) == + paddle::framework::proto::VarType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::COMPLEX128) == + paddle::framework::proto::VarType::COMPLEX128); // proto -> enum CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::FP64) == @@ -185,6 +201,12 @@ void GroupTestDtypeConvert() { paddle::DataType::INT16); CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX64) == + paddle::DataType::COMPLEX64); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::COMPLEX128) == + paddle::DataType::COMPLEX128); } TEST(CustomTensor, copyTest) { diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index 919a3a1a49c..a252d6aef4e 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -56,6 +56,10 @@ class CustomTensorUtils { return framework::proto::VarType::INT64; case paddle::DataType::INT16: return framework::proto::VarType::INT16; + case paddle::DataType::COMPLEX64: + return framework::proto::VarType::COMPLEX64; + case paddle::DataType::COMPLEX128: + return framework::proto::VarType::COMPLEX128; case paddle::DataType::BOOL: return framework::proto::VarType::BOOL; default: @@ -83,6 +87,10 @@ class CustomTensorUtils { return paddle::DataType::UINT8; case framework::proto::VarType::INT16: return paddle::DataType::INT16; + case framework::proto::VarType::COMPLEX64: + return paddle::DataType::COMPLEX64; + case framework::proto::VarType::COMPLEX128: + return paddle::DataType::COMPLEX128; case 
framework::proto::VarType::BOOL: return paddle::DataType::BOOL; default: diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7a8bfc1a8c7..93fd85f13cb 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -36,6 +36,10 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) + add_subdirectory(api) # Create static inference library if needed diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 7a63217d678..5452b2160ab 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,3 +1,7 @@ +# Adapt to custom op mechanism: Include the header files related to the data type +# to avoid exposing the path of the underlying file +include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) + set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index 620bff11a28..4ba537930ce 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -26,6 +26,9 @@ set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120) py_test(test_custom_concat SRCS test_custom_concat.py) set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120) +py_test(test_custom_conj SRCS test_custom_conj.py) +set_tests_properties(test_custom_conj PROPERTIES TIMEOUT 120) + py_test(test_check_abi SRCS test_check_abi.py) cc_test(test_check_error SRCS test_check_error.cc DEPS gtest) diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc new file mode 100644 index 00000000000..4feb887ca03 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WIdata_tHOUdata_t WARRANdata_tIES OR CONDIdata_tIONS OF ANY KIND, either +// express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
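The custom_conj_op.cc test operator that starts here implements elementwise complex conjugation, and it is also the case the relaxed grad-op shape inference above targets, since its backward op takes only grad(Out). For reference, the behavior the kernel has to reproduce, in NumPy:

    import numpy as np

    x = np.array([1 + 2j, 3 - 4j], dtype=np.complex64)
    out = np.conj(x)        # real part kept, imaginary part negated
    print(out)              # [1.-2.j  3.+4.j]

For real dtypes the functor below degenerates to a plain copy, which is why the same ConjFunction is registered for both the forward and the backward kernel.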
+ +#include +#include + +#include "paddle/extension.h" + +#define CHECK_INPUT(x) \ + PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") + +template +using EnableComplex = typename std::enable_if< + std::is_same::value || + std::is_same::value>::type; + +template +using DisableComplex = typename std::enable_if< + !std::is_same::value && + !std::is_same::value>::type; + +template +struct ConjFunctor; + +template +struct ConjFunctor> { + ConjFunctor(const data_t* input, int64_t numel, data_t* output) + : input_(input), numel_(numel), output_(output) {} + + void operator()(size_t idx) const { + output_[idx] = data_t(input_[idx].real, -input_[idx].imag); + } + + const data_t* input_; + int64_t numel_; + data_t* output_; +}; + +template +struct ConjFunctor> { + ConjFunctor(const data_t* input, int64_t numel, data_t* output) + : input_(input), numel_(numel), output_(output) {} + + void operator()(size_t idx) const { output_[idx] = input_[idx]; } + + const data_t* input_; + int64_t numel_; + data_t* output_; +}; + +template +void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) { + ConjFunctor conj(x_data, numel, out_data); + for (int64_t i = 0; i < numel; ++i) { + conj(i); + } +} + +std::vector ConjFunction(const paddle::Tensor& x) { + CHECK_INPUT(x); + + paddle::Tensor out(x.place()); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + x.type(), "ConjCPUKernel", ([&] { + ConjCPUKernel( + x.data(), x.size(), out.mutable_data()); + })); + + return {out}; +} + +PD_BUILD_OP(custom_conj) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(ConjFunction)); + +PD_BUILD_GRAD_OP(custom_conj) + .Inputs({paddle::Grad("Out")}) + .Outputs({paddle::Grad("X")}) + .SetKernelFn(PD_KERNEL(ConjFunction)); diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc index 33ca6ee86f0..fbf5442ac02 100644 --- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -62,3 +62,59 @@ PD_BUILD_OP(dispatch_test_float_and_integer) .Inputs({"X"}) .Outputs({"Out"}) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)); + +std::vector DispatchTestComplex(const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP(dispatch_test_complex) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestComplex)); + +std::vector DispatchTestFloatAndComplex( + const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP(dispatch_test_float_and_complex) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex)); + +std::vector DispatchTestFloatAndIntegerAndComplex( + const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP(dispatch_test_float_and_integer_and_complex) + .Inputs({"X"}) + 
.Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_conj.py b/python/paddle/fluid/tests/custom_op/test_custom_conj.py new file mode 100644 index 00000000000..3a8f79a06fc --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_custom_conj.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import numpy as np + +import paddle +import paddle.static as static +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd +from utils import paddle_includes, extra_cc_args, extra_nvcc_args + +# Because Windows don't use docker, the shared lib already exists in the +# cache dir, it will not be compiled again unless the shared lib is removed. +file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( + get_build_directory()) +if os.name == 'nt' and os.path.isfile(file): + cmd = 'del {}'.format(file) + run_cmd(cmd, True) + +custom_ops = load( + name='custom_conj_jit', + sources=['custom_conj_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cxx_cflags=extra_cc_args, # test for cc flags + extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags + verbose=True) + + +def is_complex(dtype): + return dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX64 or \ + dtype == paddle.fluid.core.VarDesc.VarType.COMPLEX128 + + +def to_complex(dtype): + if dtype == "float32": + return np.complex64 + elif dtype == "float64": + return np.complex128 + else: + return dtype + + +def conj_dynamic(func, dtype, np_input): + paddle.set_device("cpu") + x = paddle.to_tensor(np_input) + out = func(x) + out.stop_gradient = False + sum_out = paddle.sum(out) + if is_complex(sum_out.dtype): + sum_out.real().backward() + else: + sum_out.backward() + return out.numpy(), x.grad + + +def conj_static(func, shape, dtype, np_input): + paddle.enable_static() + paddle.set_device("cpu") + with static.scope_guard(static.Scope()): + with static.program_guard(static.Program()): + x = static.data(name="x", shape=shape, dtype=dtype) + x.stop_gradient = False + out = func(x) + sum_out = paddle.sum(out) + static.append_backward(sum_out) + + exe = static.Executor() + exe.run(static.default_startup_program()) + + out_v, x_grad_v = exe.run(static.default_main_program(), + feed={"x": np_input}, + fetch_list=[out.name, x.name + "@GRAD"]) + paddle.disable_static() + return out_v, x_grad_v + + +class TestCustomConjJit(unittest.TestCase): + def setUp(self): + self.dtypes = ['float32', 'float64'] + self.shape = [2, 20, 2, 3] + + def check_output(self, out, pd_out, name): + self.assertTrue( + np.array_equal(out, pd_out), + "custom op {}: {},\n paddle api {}: {}".format(name, out, name, + pd_out)) + + def run_dynamic(self, dtype, np_input): + out, x_grad = conj_dynamic(custom_ops.custom_conj, dtype, np_input) + pd_out, pd_x_grad = 
conj_dynamic(paddle.conj, dtype, np_input) + + self.check_output(out, pd_out, "out") + self.check_output(x_grad, pd_x_grad, "x's grad") + + def run_static(self, dtype, np_input): + out, x_grad = conj_static(custom_ops.custom_conj, self.shape, dtype, + np_input) + pd_out, pd_x_grad = conj_static(paddle.conj, self.shape, dtype, + np_input) + + self.check_output(out, pd_out, "out") + self.check_output(x_grad, pd_x_grad, "x's grad") + + def test_dynamic(self): + for dtype in self.dtypes: + np_input = np.random.random(self.shape).astype(dtype) + self.run_dynamic(dtype, np_input) + + def test_static(self): + for dtype in self.dtypes: + np_input = np.random.random(self.shape).astype(dtype) + self.run_static(dtype, np_input) + + # complex only used in dynamic mode now + def test_complex_dynamic(self): + for dtype in self.dtypes: + np_input = np.random.random(self.shape).astype( + dtype) + 1j * np.random.random(self.shape).astype(dtype) + self.run_dynamic(to_complex(dtype), np_input) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index 6cdbc61620d..bc36372c6a7 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -55,6 +55,11 @@ class TestJitDispatch(unittest.TestCase): for dtype in dtypes: self.run_dispatch_test(dispatch_op.dispatch_test_integer, dtype) + def test_dispatch_complex(self): + dtypes = ["complex64", "complex128"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_complex, dtype) + def test_dispatch_float_and_integer(self): dtypes = [ "float32", "float64", "int32", "int64", "int8", "uint8", "int16" @@ -63,6 +68,21 @@ class TestJitDispatch(unittest.TestCase): self.run_dispatch_test(dispatch_op.dispatch_test_float_and_integer, dtype) + def test_dispatch_float_and_complex(self): + dtypes = ["float32", "float64", "complex64", "complex128"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_float_and_complex, + dtype) + + def test_dispatch_float_and_integer_and_complex(self): + dtypes = [ + "float32", "float64", "int32", "int64", "int8", "uint8", "int16", + "complex64", "complex128" + ] + for dtype in dtypes: + self.run_dispatch_test( + dispatch_op.dispatch_test_float_and_integer_and_complex, dtype) + if __name__ == '__main__': unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 0e214c5c65f..0afc3956a01 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -451,12 +451,30 @@ class InstallHeaders(Command): ('install_headers', 'install_dir'), ('force', 'force')) + def copy_data_type_headers(self, header): + if os.name == 'nt': + data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h'] + else: + data_type_headers = ['platform/complex64.h', 'platform/complex128.h'] + for dtype_header in data_type_headers: + if dtype_header in header: + if os.name == 'nt': + install_dir = os.path.join(self.install_dir, "paddle\\fluid\\extension\\include") + else: + install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") + if not os.path.exists(install_dir): + self.mkpath(install_dir) + return self.copy_file(header, install_dir) + def mkdir_and_copy_file(self, header): if 'pb.h' in header: install_dir = re.sub('${PADDLE_BINARY_DIR}/', '', header) elif 'third_party' not in header: - # framework + # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) + # For paddle data type 
headers, we also need to copy to `extension/incude`, + # used for new custom operator + self.copy_data_type_headers(header) else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) -- GitLab From 420527f0d972ad6aa01bcc708c2eb184eda4480f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 19 Mar 2021 10:10:38 +0800 Subject: [PATCH 037/486] [ROCM] fix layer_norm, norm, p_norm, test_sequence_softmax_op, test_math_op_patch_var_base (#31709) --- paddle/fluid/operators/layer_norm_op.cu | 9 ++++++++- paddle/fluid/operators/norm_op.cu | 9 ++++++++- paddle/fluid/operators/p_norm_op.cu | 10 ++++++++++ .../unittests/sequence/test_sequence_softmax_op.py | 6 +++--- .../tests/unittests/test_math_op_patch_var_base.py | 7 +++++-- 5 files changed, 34 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d0f7dca98af..3656de3525d 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -43,7 +43,11 @@ template using LayerNormParamType = typename CudnnDataType::BatchNormParamType; inline static int GetDesiredBlockDim(int block_dim) { +#ifdef __HIPCC__ + const int kMaxBlockDim = 256; +#else const int kMaxBlockDim = 512; +#endif return block_dim >= kMaxBlockDim ? kMaxBlockDim : (1 << (static_cast(std::log2f(block_dim)))); @@ -698,8 +702,11 @@ static void LayerNormBackward(const T *x, const T *d_y, const U *scale, const framework::ExecutionContext &ctx) { auto &dev_ctx = ctx.cuda_device_context(); auto stream = dev_ctx.stream(); - +#ifdef __HIPCC__ + const int kMaxBlockDim = 256; +#else const int kMaxBlockDim = 512; +#endif const int kMaxBlockNum = 128; int gradient_flag = ((d_x != nullptr ? 1 : 0) << 2) | ((d_scale != nullptr ? 
1 : 0) << 1) | diff --git a/paddle/fluid/operators/norm_op.cu b/paddle/fluid/operators/norm_op.cu index 6b5c70c9258..4c1674ded1a 100644 --- a/paddle/fluid/operators/norm_op.cu +++ b/paddle/fluid/operators/norm_op.cu @@ -79,8 +79,11 @@ class NormCUDAKernel : public framework::OpKernel { GetDims(xdim, axis, &pre, &n, &post); auto& dev_ctx = ctx.cuda_device_context(); - +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); @@ -146,7 +149,11 @@ class NormGradCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 918f0bb1e49..bd6694abdbf 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -142,7 +142,12 @@ class PnormCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); @@ -244,7 +249,12 @@ class PnormGradCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(max_blocks, pre * post); diff --git a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py index 92146820da1..cb92a68bde6 100644 --- a/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/sequence/test_sequence_softmax_op.py @@ -28,10 +28,10 @@ class TestSequenceSoftmaxOp(OpTest): self.op_type = "sequence_softmax" self.use_cudnn = False self.init_op_type() - - x = np.random.uniform(0.1, 1, (110, 1)).astype("float64") + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" + x = np.random.uniform(0.1, 1, (110, 1)).astype(self.dtype) self.init_lod() - out = np.zeros((110, 1)).astype("float64") + out = np.zeros((110, 1)).astype(self.dtype) offset = 0 for i in range(len(self.lod[0])): if (self.lod[0][i] == 0): diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py index e908f1a60a0..4b097f6359f 100644 --- a/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_math_op_patch_var_base.py @@ -354,8 +354,11 @@ class TestMathOpPatchesVarBase(unittest.TestCase): [1.30058, 1.0688717, 1.4928783], [1.0958099, 1.3724753, 1.8926544]]) d = d.matmul(d.t()) - self.assertTrue( - np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy())) + # ROCM not support cholesky + if not fluid.core.is_compiled_with_rocm(): + self.assertTrue( + np.array_equal(d.cholesky().numpy(), paddle.cholesky(d).numpy( + ))) self.assertTrue( 
np.array_equal(x.is_empty().numpy(), paddle.is_empty(x).numpy())) -- GitLab From 1d197f6c97675471ac803cb07251d50cb20521c7 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 19 Mar 2021 10:19:54 +0800 Subject: [PATCH 038/486] [dgraph qat] Refine calculating output scale of dygraph qat (#31710) * Refine calculating output scale of dygraph qat, test=develop --- .../slim/quantization/imperative/qat.py | 221 +++++++++--------- .../slim/quantization/imperative/quant_nn.py | 4 + .../slim/quantization/imperative/utils.py | 43 ++-- .../test_imperative_qat_addquantdequant.py | 4 +- 4 files changed, 138 insertions(+), 134 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 04aec158eac..abfe06a3326 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -25,12 +25,7 @@ from paddle.fluid.executor import Executor from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX -from paddle.nn import Linear, Conv2D, Conv2DTranspose, MaxPool2D, MaxPool1D -from paddle.nn import BatchNorm1D, BatchNorm2D, BatchNorm3D, SyncBatchNorm -from paddle.fluid.dygraph.nn import BatchNorm, Pool2D from paddle.fluid.io import load_inference_model, save_inference_model -from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6 -from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish from paddle.fluid.log_helper import get_logger from . import quant_nn from .. import quantization_pass @@ -62,14 +57,10 @@ class ImperativeQuantAware(object): The constructor for ImperativeQuantAware. Args: - quantizable_layer_type(list[str]): List the type of layers that - will be quantized. Default is ['Conv2D', 'Linear']. - The quantizable_op_type in QuantizationFreezePass and - ConvertToInt8Pass must be the same as this. + quantizable_layer_type(list[str | layer]): List the type of + layers that will be quantized. Default is ['Conv2D', 'Linear']. weight_quantize_type(str): quantization type for weights, - which supports 'abs_max' now. The 'moving_average_abs_max' - usually is not used for weights, since weights are fixed - once the model is well trained. + which supports 'abs_max' and 'channel_wise_abs_max'. activation_quantize_type(str): quantization type for activations, which supports 'abs_max' and 'moving_average_abs_max' now. If using 'abs_max' mode, the quantization scale will be @@ -77,8 +68,8 @@ class ImperativeQuantAware(object): period. If using 'moving_average_abs_max', the static quantization scale will be calculated during training and used in inference. - weight_bits(int): quantization bit number for weights, - whereas the bias is not quantized. + weight_bits(int): quantization bit number for weights, whereas + the bias is not quantized. activation_bits(int): quantization bit number for activations. moving_rate(float): the parameter for 'moving_average_abs_max' quantization. 
@@ -260,8 +251,8 @@ class ImperativeQuantizeInputs(object): super(ImperativeQuantizeInputs, self).__init__() self._quantizable_layer_type = tuple( - utils._quant_layers_map[layer] - if layer in utils._quant_layers_map else layer + utils.supported_quant_layers_map[layer] + if layer in utils.supported_quant_layers_map else layer for layer in quantizable_layer_type) for layer in self._quantizable_layer_type: assert not isinstance(layer, str), \ @@ -338,7 +329,7 @@ class ImperativeQuantizeInputs(object): def _get_quantized_layer(self, layer): quant_layer_name = None - for key, value in utils._quant_layers_map.items(): + for key, value in utils.supported_quant_layers_map.items(): if isinstance(layer, value): quant_layer_name = 'Quantized' + key break @@ -364,10 +355,6 @@ class ImperativeCalcOutputScale(object): """ super(ImperativeCalcOutputScale, self).__init__() self._moving_rate = moving_rate - self._out_scale_layer_type_list = ( - BatchNorm, BatchNorm1D, BatchNorm2D, BatchNorm3D, Conv2D, LeakyReLU, - Linear, PReLU, Pool2D, MaxPool1D, MaxPool2D, ReLU, ReLU6, Sigmoid, - Softmax, SyncBatchNorm, Tanh, Swish) self._register_hook_handle_list = [] self._out_scale_dict = collections.OrderedDict() @@ -378,7 +365,7 @@ class ImperativeCalcOutputScale(object): Args: model(fluid.dygraph.Layer): The target model which would be - calculate the output quantization scale. + calculate the output quantization scale. Returns: None @@ -387,10 +374,10 @@ class ImperativeCalcOutputScale(object): "The model must be the instance of dygraph.Layer." for _, layer in model.named_sublayers(): if self._is_target_layer(layer): - self._add_new_parameters(layer) - forward_post_hook_handle = layer.register_forward_post_hook( - self._forward_post_hook) - self._register_hook_handle_list.append(forward_post_hook_handle) + self._init_scale_params(layer) + hook_handle = layer.register_forward_post_hook( + self._calc_output_scale_hook) + self._register_hook_handle_list.append(hook_handle) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -398,63 +385,64 @@ class ImperativeCalcOutputScale(object): Args: layer (Layer): The Layer to be saved. - path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. - input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward - method, which can be described by InputSpec or example Tensor. If None, all input variables of - the original Layer's forward method would be the inputs of the saved model. Default None. - **configs (dict, optional): Other save configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, - DO NOT use them. Default None. + path (str): The path prefix to save model. The format is + ``dirname/file_prefix`` or ``file_prefix``. + input_spec (list[InputSpec|Tensor], optional): Describes the input + of the saved model's forward method, which can be described by + InputSpec or example Tensor. If None, all input variables of + the original Layer's forward method would be the inputs of + the saved model. Default None. + **configs (dict, optional): Other save configuration options for + compatibility. We do not recommend using these configurations, + they may be removed in the future. If not necessary, DO NOT use + them. Default None. The following options are currently supported: - (1) output_spec (list[Tensor]): Selects the output targets of the saved model. 
- By default, all return variables of original Layer's forward method are kept as the - output of the saved model. If the provided ``output_spec`` list is not all output variables, - the saved model will be pruned according to the given ``output_spec`` list. + (1) output_spec (list[Tensor]): Selects the output targets of + the saved model. By default, all return variables of original + Layer's forward method are kept as the output of the saved model. + If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given + ``output_spec`` list. Returns: None """ - assert isinstance( - layer, dygraph.Layer), "model must be the instance of dygraph.Layer" - self._layer = layer - is_dynamic_mode = False + assert isinstance(layer, dygraph.Layer), \ + "The model must be the instance of dygraph.Layer." + + # remove handles and collect output scales with dygraph.guard(): - self._layer.eval() - if self._register_hook_handle_list is not None: - for handle in self._register_hook_handle_list: - handle.remove() - if self._out_scale_dict: - for key in self._out_scale_dict: - self._out_scale_dict[key] = float(self._out_scale_dict[key] - .numpy()) - else: - for _, sub_layer in self._layer.named_sublayers(): - if self._is_target_layer(sub_layer): + layer.eval() + for handle in self._register_hook_handle_list: + handle.remove() + for _, sub_layer in layer.named_sublayers(): + if self._is_target_layer(sub_layer): + if hasattr(sub_layer, "layer_name"): + layer_name = sub_layer.layer_name + else: layer_name = sub_layer.full_name() - if hasattr(sub_layer, "layer_name"): - layer_name = sub_layer.layer_name - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) + if hasattr(sub_layer, "_quant_out_scale"): + self._out_scale_dict[layer_name] = float( + sub_layer._quant_out_scale) + # save the quantized model that doesn't have output scales + paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) + + # load static model + is_dynamic_mode = False if paddle.in_dynamic_mode(): is_dynamic_mode = True paddle.enable_static() - paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) - - if core.is_compiled_with_cuda(): - place = core.CUDAPlace(0) - else: - place = core.CPUPlace() + place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \ + else core.CPUPlace() exe = Executor(place) - file_prefix = os.path.basename(path) dirname = os.path.dirname(path) - model_filename = file_prefix + INFER_MODEL_SUFFIX - params_filename = file_prefix + INFER_PARAMS_SUFFIX - + basename = os.path.basename(path) + model_filename = basename + INFER_MODEL_SUFFIX + params_filename = basename + INFER_PARAMS_SUFFIX [inference_program, feed_target_names, fetch_targets] = ( load_inference_model( dirname=dirname, @@ -462,14 +450,15 @@ class ImperativeCalcOutputScale(object): model_filename=model_filename, params_filename=params_filename)) + # set output scales to the static model check_behind_op = False op_count = 0 ops_list = [key for key, _ in self._out_scale_dict.items()] if len(ops_list) == 0: warnings.warn( - "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " - "so the generated inference model would not contain the out_threshold." 
- ) + "Warning: No Layer of the model while to be saved contains " + "the out_threshold attribute, so the generated inference " + "model would not contain the out_threshold.") else: # Because the Layer in dygraph may correspond to multiple ops # in static program after being saved. To ensure correctness, @@ -481,11 +470,12 @@ class ImperativeCalcOutputScale(object): forward_op = None for block in inference_program.blocks: for op in block.ops: - if op.type in utils._op_real_in_out_name: + if op.type in utils.op_real_in_out_name: if op_count > len(ops_list): warnings.warn( - "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" - ) + "The number of Layer which has " + "out_threshold attribute should be bigger than " + "the op in inference model") break if check_behind_op: check_behind_op = False @@ -525,7 +515,7 @@ class ImperativeCalcOutputScale(object): self._out_scale_dict[ops_list[op_count]]) op_count += 1 - # Save the processed program. + # save the final quantized model that has output scales save_inference_model( dirname=dirname, feeded_var_names=feed_target_names, @@ -539,41 +529,40 @@ class ImperativeCalcOutputScale(object): paddle.disable_static() def _is_target_layer(self, layer): - return isinstance(layer, self._out_scale_layer_type_list) \ + return isinstance(layer, utils.out_scale_layers_list) \ or 'quantized_' in layer.full_name() - # When inferenc model is saved, the logic in hook would not be executed - # in program translation, so that some parameters can not created in - # __init__, which would cause the model to fail to save. Therefore, the - # parameters creation in the hook is advanced to be exected outside the hook. - def _add_new_parameters(self, layer, name=None): + def _init_scale_params(self, layer, name=None): + """ + Init the scale params for calculating output scales and save them in the + target layer. + After the users define the dygraph model, the hooks for calculating output + scales will not execute immediately. If the users load the checkpoint now, + the scale params have not been created, so them cann't be loaded. + Therefore, define the scale params in the beginning. 
+ """ + + def _create_param(in_layer, first_name, last_name, dtype): + prefix = '{}.{}'.format(first_name, last_name) \ + if first_name else 'outscale.{}'.format(last_name) + attr = ParamAttr( + name=unique_name.generate(prefix), + initializer=Constant(1), + trainable=False) + param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype) + return param + dtype = layer._dtype if layer._dtype is not None else "float32" if dtype not in ["float32", "float64"]: return - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - scale_name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=scale_name, initializer=Constant(1), trainable=False) - layer._quant_out_scale = layer.create_parameter( - shape=[1], attr=scale_attr, dtype=dtype) + + layer._quant_out_scale = _create_param(layer, name, "scale", dtype) layer._quant_out_scale.stop_gradient = True - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - layer._quant_out_state = layer.create_parameter( - shape=[1], attr=state_attr, dtype=dtype) + layer._quant_out_state = _create_param(layer, name, "state", dtype) layer._quant_out_state.stop_gradient = True - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - layer._quant_out_accum = layer.create_parameter( - shape=[1], attr=accum_attr, dtype=dtype) + layer._quant_out_accum = _create_param(layer, name, "accum", dtype) layer._quant_out_accum.stop_gradient = True # Judge whether the op in program matches the Layer in dynamic model @@ -598,20 +587,18 @@ class ImperativeCalcOutputScale(object): op_type = op_type.replace('relu', 're_lu') return op_type in layer_name - def _forward_post_hook(self, layer, input, output): - assert isinstance( - output, (core.VarBase, framework.Variable) - ), "Multiple outputs are not currently supported in ImperativeOutScale." - if output.dtype not in [ - core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64 - ]: - return - if not hasattr(layer, "_out_scale"): - self._out_scale = quant_nn.MovingAverageAbsMaxScale( - layer, output.name, self._moving_rate, output.dtype) - scale_out = self._out_scale(output) - if hasattr(layer, 'layer_name'): - layer_name = layer.layer_name - else: - layer_name = layer.full_name() - self._out_scale_dict[layer_name] = scale_out + def _calc_output_scale_hook(self, layer, input, output): + """ + Create the MovingAverageAbsMaxScale layer for the target layer if needed. + Execute MovingAverageAbsMaxScale layer to calculate the output scale. + """ + assert isinstance(output, (core.VarBase, framework.Variable)), \ + "Multiple outputs are not currently supported in ImperativeOutScale." 
+ + fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64] + if output.dtype in fp_types: + if not hasattr(layer, "_out_scale"): + self._out_scale = quant_nn.MovingAverageAbsMaxScale( + layer, output.name, self._moving_rate, output.dtype) + # TODO (jc): consider the ops that have several outputs + self._out_scale(output) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 0b052d5dd0d..3c4fb323bc5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -499,6 +499,10 @@ class QuantizedNoweightLayer(layers.Layer): def forward(self, input): quant_input = self._fake_quant_input(input) + # TODO (jc): support ops that have several inputs + if isinstance(input, list): + assert len(input) == 1, \ + "The QuantizedNoweightLayer should only have one input." return self._layer.forward(quant_input) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index a732181db7d..1ff4a408e05 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -12,12 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from paddle.nn import Linear, Conv2D -from paddle.fluid.dygraph.nn import Pool2D -from paddle.nn.layer.activation import ReLU, LeakyReLU, Sigmoid, ReLU6 -from paddle.nn.layer.activation import Tanh, Softmax, PReLU, Swish +import paddle -_op_real_in_out_name = { +op_real_in_out_name = { "conv2d": [["Input", "Filter"], ["Output"]], "depthwise_conv2d": [["Input", "Filter"], ["Output"]], "pool2d": [["X"], ["Out"]], @@ -33,14 +30,30 @@ _op_real_in_out_name = { "swish": [["X"], ["Out"]], } -_quant_layers_map = { - 'Conv2D': Conv2D, - 'Linear': Linear, - 'Pool2D': Pool2D, - 'ReLU': ReLU, - 'LeakyReLU': LeakyReLU, - 'ReLU6': ReLU6, - 'Softmax': Softmax, - 'Tanh': Tanh, - 'Swish': Swish +supported_quant_layers_map = { + 'Conv2D': paddle.nn.Conv2D, + 'Linear': paddle.nn.Linear, + 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, + 'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D, + 'AvgPool2D': paddle.nn.AvgPool2D, + 'MaxPool2D': paddle.nn.MaxPool2D, + 'Hardswish': paddle.nn.Hardswish, + 'LeakyReLU': paddle.nn.LeakyReLU, + 'PReLU': paddle.nn.PReLU, + 'ReLU': paddle.nn.ReLU, + 'ReLU6': paddle.nn.ReLU6, + 'Sigmoid': paddle.nn.Sigmoid, + 'Softmax': paddle.nn.Softmax, + 'Swish': paddle.nn.Swish, + 'Tanh': paddle.nn.Tanh, + 'Hardswish': paddle.nn.Hardswish, + 'BatchNorm': paddle.nn.BatchNorm, + 'GroupNorm': paddle.nn.GroupNorm, + 'LayerNorm': paddle.nn.LayerNorm, } + +out_scale_layers_list = ( + paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D, + paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm, + paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, + paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py index 9d2b2d726e3..d76e4825e0d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py @@ -191,8 +191,8 @@ class 
TestImperativeAddQuantDequant(unittest.TestCase): weight_quantize_type='abs_max', activation_quantize_type='moving_average_abs_max', quantizable_layer_type=[ - 'Conv2D', 'Linear', 'ReLU', 'Pool2D', 'LeakyReLU', 'ReLU6', - 'Tanh', 'Swish' + 'Conv2D', 'Linear', 'ReLU', 'LeakyReLU', 'ReLU6', 'Tanh', + 'Swish' ]) with fluid.dygraph.guard(): -- GitLab From 50cafa0b0c03116903016552630a818230cce003 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 19 Mar 2021 10:45:55 +0800 Subject: [PATCH 039/486] remove redundant sync, set collect/dist kernel to context stream, sub_lod memcpy opt (#31641) --- .../detection/collect_fpn_proposals_op.cu | 4 ++-- .../detection/distribute_fpn_proposals_op.cu | 16 ++++++++-------- .../operators/detection/generate_proposals_op.cu | 1 - 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index bc74c80e031..1796a79b71b 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -198,8 +198,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; // get length-based lod by batch ids - GetLengthLoD<<>>(real_post_num, out_id_data, - length_lod_data); + GetLengthLoD<<>>( + real_post_num, out_id_data, length_lod_data); std::vector length_lod_cpu(lod_size); memory::Copy(platform::CPUPlace(), length_lod_cpu.data(), place, length_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index cc61035309e..1bec37e7112 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -131,11 +131,10 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int dist_blocks = NumBlocks(roi_num); int threads = kNumCUDAThreads; // get target levels and sub_lod list - GPUDistFpnProposalsHelper<<>>( + GPUDistFpnProposalsHelper<<>>( roi_num, fpn_rois->data(), lod_size, refer_level, refer_scale, max_level, min_level, roi_batch_id_list_gpu.data(), sub_lod_list_data, target_lvls_data, pixel_offset); - dev_ctx.Wait(); auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); Tensor index_in_t; @@ -172,17 +171,18 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); + std::vector sub_lod_list_cpu(lod_size * num_level); + memory::Copy(platform::CPUPlace(), sub_lod_list_cpu.data(), place, + sub_lod_list_data, sizeof(int) * lod_size * num_level, + dev_ctx.stream()); + dev_ctx.Wait(); + for (int i = 0; i < num_level; ++i) { Tensor sub_lod = sub_lod_list.Slice(i, i + 1); - int* sub_lod_data = sub_lod.data(); // transfer length-based lod to offset-based lod std::vector offset(1, 0); - std::vector sub_lod_cpu(lod_size); - memory::Copy(platform::CPUPlace(), sub_lod_cpu.data(), place, - sub_lod_data, sizeof(int) * lod_size, dev_ctx.stream()); - dev_ctx.Wait(); for (int j = 0; j < lod_size; ++j) { - offset.emplace_back(offset.back() + sub_lod_cpu[j]); + offset.emplace_back(offset.back() + sub_lod_list_cpu[i * lod_size + j]); } int sub_rois_num = offset.back(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 
8359fbab519..e8ab628db16 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -198,7 +198,6 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { memory::Copy(place, rpn_roi_probs_data + num_proposals, place, scores.data(), sizeof(T) * scores.numel(), dev_ctx.stream()); - dev_ctx.Wait(); num_proposals += proposals.dims()[0]; offset.emplace_back(num_proposals); tmp_num.push_back(proposals.dims()[0]); -- GitLab From c86e771e9498674a3c8686f1a6d455ee6e294607 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 19 Mar 2021 10:52:06 +0800 Subject: [PATCH 040/486] NMS Performance Optimization (#31634) * replace mask vector to raw ptr * launch nms on context stream * remove redundant mask declaration --- paddle/fluid/operators/detection/bbox_util.cu.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 27852d43948..6d271766b0e 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -275,15 +275,19 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, const T *boxes = proposals.data(); auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace())), - pixel_offset); + auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); + + NMSKernel<<>>( + boxes_num, nms_threshold, boxes, mask_dev, pixel_offset); std::vector remv(col_blocks); memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + std::vector mask_host(boxes_num * col_blocks); + memory::Copy(platform::CPUPlace(), mask_host.data(), place, mask_dev, + boxes_num * col_blocks * sizeof(uint64_t), ctx.stream()); + std::vector keep_vec; int num_to_keep = 0; for (int i = 0; i < boxes_num; i++) { @@ -293,7 +297,7 @@ static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, if (!(remv[nblock] & (1ULL << inblock))) { ++num_to_keep; keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; + uint64_t *p = mask_host.data() + i * col_blocks; for (int j = nblock; j < col_blocks; j++) { remv[j] |= p[j]; } -- GitLab From a4a2b77defe3ef1697794ca60911be45078798da Mon Sep 17 00:00:00 2001 From: Adam Osewski Date: Fri, 19 Mar 2021 03:54:24 +0100 Subject: [PATCH 041/486] [oneDNN] lookup_table op with support for BF16 data type. 
(#31558) --- .../ir/mkldnn/cpu_bfloat16_placement_pass.cc | 4 +- .../ir/mkldnn/cpu_bfloat16_placement_pass.h | 2 +- paddle/fluid/operators/lookup_table_op.cc | 7 +- paddle/fluid/operators/lookup_table_op.h | 6 +- paddle/fluid/operators/math/blas_impl.h | 11 ++ .../paddle/fluid/tests/unittests/op_test.py | 16 +- .../unittests/test_lookup_table_bf16_op.py | 176 ++++++++++++++++++ tools/static_mode_white_list.py | 1 + 8 files changed, 213 insertions(+), 10 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc index 3d7a9c1107b..531a04e1a0d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.cc @@ -53,7 +53,7 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType( gpd(graph, handler); } -void CPUBfloat16PlacementPass::RemoveOrhanedOperators( +void CPUBfloat16PlacementPass::RemoveOrphanedOperators( ir::Graph* graph, int* bfloat16_operators) const { // find orphaned bfloat16 operator that is between two float32 operators // revert mkldnn_data_type attr to float32 @@ -74,7 +74,7 @@ void CPUBfloat16PlacementPass::RemoveOrhanedOperators( void CPUBfloat16PlacementPass::ApplyImpl(ir::Graph* graph) const { int bfloat16_operators = 0; SetMkldnnDataType(graph, &bfloat16_operators); - RemoveOrhanedOperators(graph, &bfloat16_operators); + RemoveOrphanedOperators(graph, &bfloat16_operators); PrettyLogDetail("--- marked %d operators to bfloat16 ", bfloat16_operators); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h index 1911b1a3cb3..53b97f0e972 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass.h @@ -28,7 +28,7 @@ class CPUBfloat16PlacementPass : public Pass { protected: void SetMkldnnDataType(ir::Graph* graph, int* bfloat16_operators) const; - void RemoveOrhanedOperators(ir::Graph* graph, int* bfloat16_operators) const; + void RemoveOrphanedOperators(ir::Graph* graph, int* bfloat16_operators) const; void ApplyImpl(ir::Graph* graph) const override; }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 1b482235da5..2e8b551ea4e 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/fluid/platform/bfloat16.h" namespace paddle { namespace operators { @@ -222,9 +223,11 @@ REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, ops::LookupTableKernel, - ops::LookupTableKernel); + ops::LookupTableKernel, + ops::LookupTableKernel); REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel, - ops::LookupTableGradKernel); + ops::LookupTableGradKernel, + ops::LookupTableGradKernel); /* ========================== register checkpoint ===========================*/ diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 8baa3bccceb..e385d72d1f4 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -102,7 +102,8 @@ class LookupTableKernel : public framework::OpKernel { auto id_index = table_t.GetIndexFromId(ids[i]); if (id_index != -1) { - if (input_data_type == framework::proto::VarType::INT8) { + if (input_data_type == framework::proto::VarType::INT8 || + input_data_type == framework::proto::VarType::BF16) { memcpy(output + i * row_width, table + id_index * row_width, row_width * sizeof(T)); } else { @@ -128,7 +129,8 @@ class LookupTableKernel : public framework::OpKernel { "the input key should be exists. But received %d.", id_index)); - if (input_data_type == framework::proto::VarType::INT8) { + if (input_data_type == framework::proto::VarType::INT8 || + input_data_type == framework::proto::VarType::BF16) { memcpy(output + i * row_width, table + id_index * row_width, row_width * sizeof(T)); } else { diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 4847c1f05b0..64b533de098 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -21,6 +21,7 @@ #include #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" @@ -40,6 +41,16 @@ struct CBlas { } }; +template <> +struct CBlas { + template + static void VCOPY(ARGS... 
args) { + PADDLE_THROW(platform::errors::Unimplemented( + "Blas VCOPY do not supported on CPU with bfloat16," + " please check your code")); + } +}; + #ifdef PADDLE_WITH_MKLML template <> struct CBlas { diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8ca83d08d64..939e2ac0f59 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -33,10 +33,19 @@ from paddle.fluid.backward import append_backward from paddle.fluid.op import Operator from paddle.fluid.executor import Executor from paddle.fluid.framework import Program, OpProtoHolder, Variable -from testsuite import create_op, set_input, append_input_output, append_loss_ops +from paddle.fluid.tests.unittests.testsuite import ( + create_op, + set_input, + append_input_output, + append_loss_ops, ) from paddle.fluid import unique_name -from white_list import op_accuracy_white_list, check_shape_white_list, compile_vs_runtime_white_list, no_check_set_white_list -from white_list import op_threshold_white_list, no_grad_set_white_list +from paddle.fluid.tests.unittests.white_list import ( + op_accuracy_white_list, + check_shape_white_list, + compile_vs_runtime_white_list, + no_check_set_white_list, + op_threshold_white_list, + no_grad_set_white_list, ) def check_out_dtype(api_fn, in_specs, expect_dtypes, target_index=0, **configs): @@ -1452,6 +1461,7 @@ class OpTest(unittest.TestCase): analytic_grads = self._get_gradient(inputs_to_check, place, output_names, no_grad_set, user_defined_grad_outputs) + # comparison of bf16 results will happen as fp32 # loop over list of grads and convert bf16 to fp32 fp32_grads = [] diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py new file mode 100644 index 00000000000..13c4aa6d767 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_bf16_op.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import ( + OpTest, convert_float_to_uint16, convert_uint16_to_float, + skip_check_grad_ci) +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle import enable_static + + +def _lookup(weights, ids, flat_ids): + w_shape = weights.shape + out_shape = list(ids.shape[:-1]) + out_shape.append(w_shape[-1]) + out = weights[flat_ids].reshape(out_shape) + return out + + +def _get_grad(weights, ids, flat_ids): + w_shape = weights.shape + w_grad = np.zeros((w_shape), dtype=weights.dtype) + out_grad_shape = (np.prod(ids.shape[:-1]), w_shape[-1]) + out_grad = weights[flat_ids].reshape(out_grad_shape) + for i, idx in enumerate(flat_ids): + w_grad[idx, :] += out_grad[i] + return w_grad + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16Op(OpTest): + def setUp(self): + self.op_type = "lookup_table" + self.dtype = np.uint16 + + table = np.random.random((17, 31)).astype("float32") + self.ids = np.random.randint(0, 17, (4, 1)).astype("int64") + self.flat_ids = self.ids.flatten() + + self.w_bf16 = convert_float_to_uint16(table) + self.out_bf16 = _lookup(self.w_bf16, self.ids, self.flat_ids) + self.out_fp32 = _lookup(table, self.ids, self.flat_ids) + self.w_grad_fp32 = _get_grad(table, self.ids, self.flat_ids) + + self.inputs = {'W': self.w_bf16, 'Ids': self.ids} + self.outputs = {'Out': self.out_fp32} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) + + def test_check_grad(self): + self.check_grad_with_place( + core.CPUPlace(), ['W'], + 'Out', + no_grad_set=set('Ids'), + check_dygraph=False, + max_relative_error=1.5e-2, + user_defined_grads=[self.w_grad_fp32], + user_defined_grad_outputs=[self.out_bf16]) + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16OpIds4D(TestLookupTableBF16Op): + def setUp(self): + super(TestLookupTableBF16OpIds4D, self).setUp() + self.ids = np.random.randint(0, 17, (2, 4, 5, 1)).astype("int64") + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16OpWIsSelectedRows(unittest.TestCase): + def setUp(self): + self.ids = np.random.randint( + low=0, high=15, size=(10, 1)).astype("int64") + self.flat_ids = self.ids.flatten() + self.w_fp32 = np.random.random((15, 32)).astype("float32") + self.w_bf16 = convert_float_to_uint16(self.w_fp32) + self.scope = core.Scope() + self.place = core.CPUPlace() + + def prepare_w(self): + rows = [a for a in range(self.w_bf16.shape[0])] + row_numel = self.w_bf16.shape[1] + + w_selected_rows = self.scope.var('W').get_selected_rows() + w_selected_rows.set_height(len(rows)) + w_selected_rows.set_rows(rows) + w_tensor = w_selected_rows.get_tensor() + w_tensor.set(self.w_bf16, self.place) + + def prepare_ids(self): + ids_tensor = self.scope.var('Ids').get_tensor() + ids_tensor.set(self.ids, self.place) + + def _check_output(self, reference, result_array): + result_array_fp32 = convert_uint16_to_float(result_array) + np.testing.assert_allclose(result_array_fp32, reference, rtol=1.5e-2) + + def test_check_output(self): + self.prepare_ids() + self.prepare_w() + out_tensor = self.scope.var('Out').get_tensor() + + # create and run lookup_table operator + lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') + 
lookup_table.run(self.scope, self.place) + + # get result from Out + result_array = np.array(out_tensor) + ref = _lookup(self.w_fp32, self.ids, self.flat_ids) + self._check_output(ref, result_array) + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16OpWIsSelectedRows4DIds( + TestLookupTableBF16OpWIsSelectedRows): + def setUp(self): + super(TestLookupTableBF16OpWIsSelectedRows4DIds, self).setUp() + self.ids = np.random.randint( + low=0, high=15, size=(3, 4, 5, 1)).astype("int64") + self.flat_ids = self.ids.flatten() + + +@skip_check_grad_ci( + reason="Since paddings are not trainable and fixed in forward," + "the gradient of paddings makes no sense and we don't " + "test the gradient here.") +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16OpWithPadding(TestLookupTableBF16Op): + def test_check_output(self): + ids = np.squeeze(self.inputs['Ids']) + padding_idx = np.random.choice(ids, 1)[0] + self.outputs['Out'][ids == padding_idx] = np.zeros(31) + self.attrs = {'padding_idx': int(padding_idx)} + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) + + +@skip_check_grad_ci( + reason="Since paddings are not trainable and fixed in forward," + "the gradient of paddings makes no sense and we don't " + "test the gradient here.") +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestLookupTableBF16OpIds4DPadding(TestLookupTableBF16OpIds4D): + def test_check_output(self): + ids = self.inputs['Ids'] + flatten_idx = ids.flatten() + padding_idx = np.random.choice(flatten_idx, 1)[0] + self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) + self.attrs = {'padding_idx': int(padding_idx)} + self.check_output_with_place(core.CPUPlace(), check_dygraph=False) + + +if __name__ == "__main__": + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index dc537cb2684..2ea3f7654af 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -21,6 +21,7 @@ STATIC_MODE_TESTING_LIST = [ 'test_linear_chain_crf_op', 'test_lod_reset_op', 'test_lookup_table_op', + 'test_lookup_table_bf16_op', 'test_pad2d_op', 'test_scatter_op', 'test_sequence_concat', -- GitLab From e429deb0c42c51a647bfc9d90e41be0382bded8a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 19 Mar 2021 15:27:15 +0800 Subject: [PATCH 042/486] [CustomOp] Support attribute in infershape function (#31713) * support attribute in infershape * polish details --- .../extension/include/ext_op_meta_info.h | 112 ++++++++++----- paddle/fluid/framework/custom_operator.cc | 50 ++++++- .../fluid/tests/custom_op/custom_concat_op.cc | 90 ++++++++++++ .../tests/custom_op/test_custom_concat.py | 128 +++++++++++------- 4 files changed, 289 insertions(+), 91 deletions(-) diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index bad1d6ad9f0..c400164c754 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -204,38 +204,68 @@ struct KernelFuncImpl { // Record Op infershape core function using InferShapeFunc = std::vector> (*)( const std::vector>& input_shapes, - const std::vector>>& vec_input_shapes); + const std::vector>>& vec_input_shapes, + const std::vector& attrs); -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ - template \ - 
struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const PreviousArgs&... pargs) { \ - input_type arg = input_shapes[in_idx]; \ - return InferShapeCallHelper::template InferShape( \ - input_shapes, vec_input_shapes, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ } -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const PreviousArgs&... pargs) { \ - input_type arg = vec_input_shapes[vec_in_idx]; \ - return InferShapeCallHelper::template InferShape< \ - in_idx, vec_in_idx + 1>(input_shapes, vec_input_shapes, pargs..., \ - arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ + } + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + try { \ + attr_type arg = boost::any_cast(attrs[attr_idx]); \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } catch (boost::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator InferShapeFn. " \ + "Expected " #attr_type \ + " value. 
InferShapeFn's attribute list must be exactly same as " \ + "Forward " \ + "KernelFn's attribute list except std::vector " \ + "attribute."); \ + } \ + } \ } template @@ -245,10 +275,10 @@ template struct InferShapeFuncImpl { static Return InferShape( const std::vector>& input_shapes, - const std::vector>>& vec_input_shapes) { - return InferShapeCallHelper>::template InferShape<0, - 0>( - input_shapes, vec_input_shapes); + const std::vector>>& vec_input_shapes, + const std::vector& attrs) { + return InferShapeCallHelper>::template InferShape< + 0, 0, 0>(input_shapes, vec_input_shapes, attrs); } private: @@ -265,14 +295,26 @@ struct InferShapeFuncImpl { PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES( std::vector>); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const bool&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const float&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const int64_t&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::string&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(const std::vector&); + // NOTE(chenweihang): InferShape can't support std::vector attr type, + // because the input type is std::vector, only can use one rule to + // parse std::vector parameter + // end: base template template struct InferShapeCallHelper> { - template + template static Return InferShape( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const Args&... args) { + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 69a9be603e6..1ebb8998c85 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -178,7 +178,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, "Unsupported `%s` type value as custom attribute now. " "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); @@ -327,7 +327,7 @@ class CustomOpMaker : public OpProtoAndCheckerMaker { "Unsupported `%s` type value as custom attribute now. 
" "Supported data types include `bool`, `int`, `float`, " "`int64_t`, `std::string`, `std::vector`, " - "`std::vector`, `std::vector, " + "`std::vector`, `std::vector`, " "`std::vector`, Please check whether " "the attribute data type and data type string are matched.", attr_type_str)); @@ -581,7 +581,7 @@ void RegisterOperatorWithMetaInfo( ctx->ShareDim(op_inputs[0], op_outputs[0]); }; } else { - info.infer_shape_ = [op_inputs, op_outputs, + info.infer_shape_ = [op_inputs, op_outputs, op_attrs, infer_shape_func](InferShapeContext* ctx) { std::vector> input_shapes; std::vector>> vec_input_shapes; @@ -606,8 +606,50 @@ void RegisterOperatorWithMetaInfo( } } + std::vector custom_attrs; + for (auto& attr_str : op_attrs) { + auto attr_name_and_type = detail::ParseAttrStr(attr_str); + auto attr_name = attr_name_and_type[0]; + auto attr_type_str = attr_name_and_type[1]; + if (attr_type_str == "bool") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "float") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "int64_t") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::string") { + custom_attrs.emplace_back(ctx->Attrs().Get(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else if (attr_type_str == "std::vector") { + // NOTE(chenweihang): InferShape can't support std::vector + // attr type, because the input type is std::vector, only + // can use one rule to parse std::vector parameter + continue; + } else if (attr_type_str == "std::vector") { + custom_attrs.emplace_back( + ctx->Attrs().Get>(attr_name)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type value as custom attribute now. 
" + "Supported data types include `bool`, `int`, `float`, " + "`int64_t`, `std::string`, `std::vector`, " + "`std::vector`, `std::vector`, " + "Please check whether the attribute data type and " + "data type string are matched.", + attr_type_str)); + } + } + VLOG(1) << "Custom Operator: InferShape - calc output ddim."; - auto output_shapes = infer_shape_func(input_shapes, vec_input_shapes); + auto output_shapes = + infer_shape_func(input_shapes, vec_input_shapes, custom_attrs); VLOG(1) << "Custom Operator: InferShape - set output ddim."; for (size_t i = 0; i < op_outputs.size(); ++i) { diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc index 2d8d0ccb88f..a01e01f2bc5 100644 --- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc @@ -144,3 +144,93 @@ PD_BUILD_GRAD_OP(custom_concat) .Inputs({paddle::Vec("X"), paddle::Grad("Out"), "Axis"}) .Outputs({paddle::Grad(paddle::Vec("X"))}) .SetKernelFn(PD_KERNEL(ConcatBackwardDynamicAxis)); + +std::vector ConcatForwardStaticAxis( + const std::vector& inputs, const int64_t& axis) { + // check inputs + PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat."); + for (auto& t : inputs) { + CHECK_INPUT(t); + } + + // compute output shape + int64_t rank = static_cast(inputs[0].shape().size()); + auto final_axis = ComputeAxis(axis, rank); + std::vector> in_shapes; + for (auto& t : inputs) { + in_shapes.emplace_back(t.shape()); + } + auto out_shape = ComputeOutShape(in_shapes, final_axis); + + // create output + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(out_shape); + + // calc + PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( + inputs[0].type(), "ConcatCpuKernel", ([&] { + ConcatCpuKernel(inputs, &out, final_axis); + })); + + return {out}; +} + +std::vector ConcatBackwardStaticAxis( + const std::vector& inputs, + const paddle::Tensor& grad_out, + const int64_t& axis) { + // check input + PD_CHECK(inputs.size() >= 1, "No Tensor need to be concat."); + for (auto& t : inputs) { + CHECK_INPUT(t); + } + CHECK_INPUT(grad_out); + + // compate axis + int64_t rank = static_cast(inputs[0].shape().size()); + auto final_axis = ComputeAxis(axis, rank); + + // create outputs + std::vector grad_inputs; + for (auto& t : inputs) { + auto grad = paddle::Tensor(paddle::PlaceType::kCPU); + grad.reshape(t.shape()); + grad_inputs.emplace_back(grad); + } + + // calc + PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( + grad_out.type(), "SplitCpuKernel", ([&] { + SplitCpuKernel(grad_out, inputs, &grad_inputs, final_axis); + })); + + return grad_inputs; +} + +std::vector> ConcatInferShapeStaticAxis( + const std::vector>& input_shapes, + const int64_t& axis) { + int64_t rank = static_cast(input_shapes[0].size()); + auto final_axis = ComputeAxis(axis, rank); + auto out_shape = ComputeOutShape(input_shapes, final_axis); + return {out_shape}; +} + +std::vector ConcatInferDtypeStaticAxis( + const std::vector& input_dtypes) { + return {input_dtypes[0]}; +} + +PD_BUILD_OP(custom_concat_with_attr) + .Inputs({paddle::Vec("X")}) + .Outputs({"Out"}) + .Attrs({"axis: int64_t"}) + .SetKernelFn(PD_KERNEL(ConcatForwardStaticAxis)) + .SetInferShapeFn(PD_INFER_SHAPE(ConcatInferShapeStaticAxis)) + .SetInferDtypeFn(PD_INFER_DTYPE(ConcatInferDtypeStaticAxis)); + +PD_BUILD_GRAD_OP(custom_concat_with_attr) + .Inputs({paddle::Vec("X"), paddle::Grad("Out")}) + .Outputs({paddle::Grad(paddle::Vec("X"))}) + .Attrs({"axis: int64_t"}) + 
.SetKernelFn(PD_KERNEL(ConcatBackwardStaticAxis)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_concat.py b/python/paddle/fluid/tests/custom_op/test_custom_concat.py index 4086224cd7b..ea41126c1c4 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_concat.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_concat.py @@ -45,14 +45,16 @@ custom_ops = load( verbose=True) -def concat_dynamic(func, device, dtype, np_inputs, axis_v): - paddle.set_device(device) +def concat_dynamic(func, dtype, np_inputs, axis_v, with_attr=False): + paddle.set_device("cpu") inputs = [ paddle.to_tensor( - x, dtype=dtype, place=device, stop_gradient=False) - for x in np_inputs + x, dtype=dtype, stop_gradient=False) for x in np_inputs ] - axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) + if with_attr: + axis = axis_v + else: + axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) out = func(inputs, axis) out.stop_gradient = False out.backward() @@ -60,14 +62,17 @@ def concat_dynamic(func, device, dtype, np_inputs, axis_v): return out.numpy(), grad_inputs -def concat_static(func, device, dtype, np_inputs, axis_v): +def concat_static(func, dtype, np_inputs, axis_v, with_attr=False): paddle.enable_static() - paddle.set_device(device) + paddle.set_device("cpu") with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): x1 = static.data(name="x1", shape=[2, 3], dtype=dtype) x2 = static.data(name="x2", shape=[2, 3], dtype=dtype) - axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) + if with_attr: + axis = axis_v + else: + axis = paddle.full(shape=[1], dtype='int64', fill_value=axis_v) x1.stop_gradient = False x2.stop_gradient = False out = func([x1, x2], axis) @@ -78,13 +83,20 @@ def concat_static(func, device, dtype, np_inputs, axis_v): exe = static.Executor() exe.run(static.default_startup_program()) - out_v, x1_grad_v, x2_grad_v = exe.run( - static.default_main_program(), - feed={ + if with_attr: + feed_dict = { + "x1": np_inputs[0].astype(dtype), + "x2": np_inputs[1].astype(dtype) + } + else: + feed_dict = { "x1": np_inputs[0].astype(dtype), "x2": np_inputs[1].astype(dtype), "axis": axis - }, + } + out_v, x1_grad_v, x2_grad_v = exe.run( + static.default_main_program(), + feed=feed_dict, fetch_list=[out.name, x1.name + "@GRAD", x2.name + "@GRAD"]) paddle.disable_static() return out_v, x1_grad_v, x2_grad_v @@ -93,55 +105,67 @@ def concat_static(func, device, dtype, np_inputs, axis_v): class TestCustomConcatDynamicAxisJit(unittest.TestCase): def setUp(self): self.dtypes = ['float32', 'float64', 'int32', 'int64'] - self.devices = ['cpu'] self.np_inputs = [ np.array([[1, 2, 3], [4, 5, 6]]), np.array([[11, 12, 13], [14, 15, 16]]) ] self.axises = [0, 1] + def check_output(self, out, pd_out, name): + self.assertTrue( + np.array_equal(out, pd_out), + "custom op {}: {},\n paddle api {}: {}".format(name, out, name, + pd_out)) + def test_dynamic(self): - for device in self.devices: - for dtype in self.dtypes: - for axis in self.axises: - out, grad_inputs = concat_dynamic(custom_ops.custom_concat, - device, dtype, - self.np_inputs, axis) - pd_out, pd_grad_inputs = concat_dynamic( - paddle.concat, device, dtype, self.np_inputs, axis) - - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) - for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): - self.assertTrue( - np.array_equal(x_grad, pd_x_grad), - "custom op x grad: {},\n paddle api x grad: {}". 
- format(x_grad, pd_x_grad)) + for dtype in self.dtypes: + for axis in self.axises: + out, grad_inputs = concat_dynamic(custom_ops.custom_concat, + dtype, self.np_inputs, axis) + pd_out, pd_grad_inputs = concat_dynamic(paddle.concat, dtype, + self.np_inputs, axis) + + self.check_output(out, pd_out, "out") + for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): + self.check_output(x_grad, pd_x_grad, "x_grad") def test_static(self): - for device in self.devices: - for dtype in self.dtypes: - for axis in self.axises: - out, x1_grad, x2_grad = concat_static( - custom_ops.custom_concat, device, dtype, self.np_inputs, - axis) - pd_out, pd_x1_grad, pd_x2_grad = concat_static( - paddle.concat, device, dtype, self.np_inputs, axis) - - self.assertTrue( - np.array_equal(out, pd_out), - "custom op out: {},\n paddle api out: {}".format( - out, pd_out)) - self.assertTrue( - np.array_equal(x1_grad, pd_x1_grad), - "custom op x1_grad: {},\n paddle api x1_grad: {}". - format(x1_grad, pd_x1_grad)) - self.assertTrue( - np.array_equal(x2_grad, pd_x2_grad), - "custom op x2_grad: {},\n paddle api x2_grad: {}". - format(x2_grad, pd_x2_grad)) + for dtype in self.dtypes: + for axis in self.axises: + out, x1_grad, x2_grad = concat_static( + custom_ops.custom_concat, dtype, self.np_inputs, axis) + pd_out, pd_x1_grad, pd_x2_grad = concat_static( + paddle.concat, dtype, self.np_inputs, axis) + + self.check_output(out, pd_out, "out") + self.check_output(x1_grad, pd_x1_grad, "x1_grad") + self.check_output(x2_grad, pd_x2_grad, "x2_grad") + + def test_dynamic_with_attr(self): + for dtype in self.dtypes: + for axis in self.axises: + out, grad_inputs = concat_dynamic( + custom_ops.custom_concat_with_attr, dtype, self.np_inputs, + axis, True) + pd_out, pd_grad_inputs = concat_dynamic( + paddle.concat, dtype, self.np_inputs, axis, True) + + self.check_output(out, pd_out, "out") + for x_grad, pd_x_grad in zip(grad_inputs, pd_grad_inputs): + self.check_output(x_grad, pd_x_grad, "x_grad") + + def test_static_with_attr(self): + for dtype in self.dtypes: + for axis in self.axises: + out, x1_grad, x2_grad = concat_static( + custom_ops.custom_concat_with_attr, dtype, self.np_inputs, + axis, True) + pd_out, pd_x1_grad, pd_x2_grad = concat_static( + paddle.concat, dtype, self.np_inputs, axis, True) + + self.check_output(out, pd_out, "out") + self.check_output(x1_grad, pd_x1_grad, "x1_grad") + self.check_output(x2_grad, pd_x2_grad, "x2_grad") if __name__ == "__main__": -- GitLab From 1c67cf0c987b0b47f846554c148690a4ef08b9d4 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Fri, 19 Mar 2021 15:27:23 +0800 Subject: [PATCH 043/486] run radix sort of proposals layer on context stream (#31631) --- paddle/fluid/operators/detection/bbox_util.cu.h | 5 +++-- .../operators/detection/collect_fpn_proposals_op.cu | 11 +++++++---- .../detection/distribute_fpn_proposals_op.cu | 10 +++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 6d271766b0e..725983f8153 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -66,7 +66,8 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Determine temporary device storage requirements size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num, 0, + 
sizeof(T) * 8, ctx.stream()); // Allocate temporary storage auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -74,7 +75,7 @@ static void SortDescending(const platform::CUDADeviceContext &ctx, // Run sorting operation cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); + idx_out, num, 0, sizeof(T) * 8, ctx.stream()); } template diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index 1796a79b71b..ffd9ac6b2af 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -144,7 +144,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { size_t temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairsDescending( nullptr, temp_storage_bytes, concat_scores.data(), keys_out, idx_in, - idx_out, total_roi_num); + idx_out, total_roi_num, 0, sizeof(T) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -152,7 +152,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort score to get corresponding index cub::DeviceRadixSort::SortPairsDescending( d_temp_storage->ptr(), temp_storage_bytes, concat_scores.data(), - keys_out, idx_in, idx_out, total_roi_num); + keys_out, idx_in, idx_out, total_roi_num, 0, sizeof(T) * 8, + dev_ctx.stream()); index_out_t.Resize({real_post_num}); Tensor sorted_rois; sorted_rois.mutable_data({real_post_num, kBBoxSize}, dev_ctx.GetPlace()); @@ -176,7 +177,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { temp_storage_bytes = 0; cub::DeviceRadixSort::SortPairs( nullptr, temp_storage_bytes, sorted_batch_id.data(), out_id_data, - batch_idx_in, index_out_t.data(), real_post_num); + batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -184,7 +186,8 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { // sort batch_id to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, sorted_batch_id.data(), - out_id_data, batch_idx_in, index_out_t.data(), real_post_num); + out_id_data, batch_idx_in, index_out_t.data(), real_post_num, 0, + sizeof(int) * 8, dev_ctx.stream()); GPUGather(dev_ctx, sorted_rois, index_out_t, fpn_rois); diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 1bec37e7112..7ccb354e177 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -149,9 +149,9 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { // Determine temporary device storage requirements size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, - target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, target_lvls_data, keys_out, idx_in, + idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); // Allocate temporary storage auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); @@ -159,14 +159,14 @@ class GPUDistributeFpnProposalsOpKernel : public 
framework::OpKernel { // sort target level to get corresponding index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, target_lvls_data, keys_out, - idx_in, idx_out, roi_num); + idx_in, idx_out, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int* restore_idx_data = restore_index->mutable_data({roi_num, 1}, dev_ctx.GetPlace()); // sort current index to get restore index cub::DeviceRadixSort::SortPairs( d_temp_storage->ptr(), temp_storage_bytes, idx_out, keys_out, idx_in, - restore_idx_data, roi_num); + restore_idx_data, roi_num, 0, sizeof(int) * 8, dev_ctx.stream()); int start = 0; auto multi_rois_num = ctx.MultiOutput("MultiLevelRoIsNum"); -- GitLab From c9e1d9dc314ad72c33b3dc1b272c0a1de9784471 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 19 Mar 2021 15:29:04 +0800 Subject: [PATCH 044/486] [ROCM] fix test_rnn_op (#31735) --- paddle/fluid/operators/rnn_op.cu.cc | 7 ++--- paddle/fluid/platform/dynload/miopen.h | 1 + .../fluid/tests/unittests/test_rnn_op.py | 27 ++++++++++++++++--- 3 files changed, 29 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index ccf619a074a..2be59c62044 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -117,10 +117,11 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, miopenRNNlinear, + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), + miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, mode_, - miopenRNNNoBias, miopenRNNdefault, cudnn_type)); + miopenRNNwithBias, miopenRNNdefault, cudnn_type)); #elif CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 43a3e1a1079..15de4c64e3e 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -125,6 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenCreateRNNDescriptor); \ __macro(miopenDestroyRNNDescriptor); \ __macro(miopenSetRNNDescriptor); \ + __macro(miopenSetRNNDescriptor_V2); \ __macro(miopenGetRNNParamsSize); \ __macro(miopenGetRNNWorkspaceSize); \ __macro(miopenGetRNNTrainingReserveSize); \ diff --git a/python/paddle/fluid/tests/unittests/test_rnn_op.py b/python/paddle/fluid/tests/unittests/test_rnn_op.py index 5ad2ffec982..22e07b0bc48 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_op.py @@ -47,8 +47,10 @@ class TestRNNOp(OpTest): def setUp(self): self.op_type = "rnn" - self.dtype = np.float64 - self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 + self.sequence_length = None if core.is_compiled_with_rocm( + ) else np.array( + [12, 11, 10, 9, 8], dtype=np.int32) self.num_layers = 1 self.is_bidirec = False self.mode = "LSTM" @@ -78,12 +80,31 @@ class TestRNNOp(OpTest): num_layers=self.num_layers, time_major=True, direction=direction, - dropout=self.dropout) + dropout=self.dropout, + dtype=self.dtype) flat_w = get_params_for_net(rnn1) output, 
(last_hidden, last_cell) = rnn1( input, sequence_length=self.sequence_length) + if core.is_compiled_with_rocm(): + + def rocm_rnn_get_place(): + places = [core.CUDAPlace(0)] + return places + + self._get_places = rocm_rnn_get_place + + if self.is_bidirec: + for i in range(0, len(flat_w), 4): + flat_w[i + 1], flat_w[i + 2] = flat_w[i + 2], flat_w[i + 1] + + for i in range(len(flat_w)): + w = np.split(flat_w[i][1], 4, 0) + w = [w[0], w[1], w[3], w[2]] + w = np.concatenate(w) + flat_w[i] = (flat_w[i][0], w) + init_h = np.zeros((self.num_layers * self.direction_num, batch_size, hidden_size)).astype(self.dtype) init_c = np.zeros((self.num_layers * self.direction_num, batch_size, -- GitLab From 878e117b6d54a5b9e277688aef5b9b625dbdc20d Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 19 Mar 2021 18:42:14 +0800 Subject: [PATCH 045/486] [CustomOp] Support float16 in custom op (#31725) * support float16 in custom op * fix failed unittests --- cmake/inference_lib.cmake | 3 +++ paddle/fluid/extension/include/ext_dispatch.h | 16 +++++++++++++++ paddle/fluid/extension/include/ext_dtype.h | 6 ++++++ paddle/fluid/extension/src/ext_tensor.cc | 16 +++++++++++++++ paddle/fluid/framework/custom_tensor_test.cc | 11 ++++++++++ paddle/fluid/framework/custom_tensor_utils.h | 4 ++++ .../fluid/tests/custom_op/CMakeLists.txt | 13 ++---------- .../fluid/tests/custom_op/custom_relu_op.cu | 9 +++++---- .../fluid/tests/custom_op/dispatch_test_op.cc | 18 +++++++++++++++++ .../custom_op/test_custom_relu_op_jit.py | 20 +++++++++++++------ .../custom_op/test_custom_relu_op_setup.py | 14 +++++++++++-- .../tests/custom_op/test_dispatch_jit.py | 6 ++++++ python/setup.py.in | 9 +++------ 13 files changed, 116 insertions(+), 29 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 570b37ff118..4864e04fa05 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -198,6 +198,9 @@ copy(inference_lib_dist copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/complex128.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/paddle/fluid/extension/include/ext_dispatch.h b/paddle/fluid/extension/include/ext_dispatch.h index 7b3893e2839..9b3e199708a 100644 --- a/paddle/fluid/extension/include/ext_dispatch.h +++ b/paddle/fluid/extension/include/ext_dispatch.h @@ -47,6 +47,22 @@ namespace paddle { } \ }() +#define PD_DISPATCH_FLOATING_AND_HALF_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT16, paddle::float16, \ + __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + ::paddle::ToString(__dtype__), "`"); \ + } \ + }() + ///////// Integral Dispatch Marco /////////// #define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ diff --git a/paddle/fluid/extension/include/ext_dtype.h b/paddle/fluid/extension/include/ext_dtype.h index a1e58fbacdf..3890631a6f8 100644 --- a/paddle/fluid/extension/include/ext_dtype.h +++ b/paddle/fluid/extension/include/ext_dtype.h @@ -19,11 +19,13 @@ limitations under the License. */ #include "complex128.h" // NOLINT #include "complex64.h" // NOLINT #include "ext_exception.h" // NOLINT +#include "float16.h" // NOLINT namespace paddle { using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; +using float16 = paddle::platform::float16; enum class DataType { BOOL, @@ -32,6 +34,7 @@ enum class DataType { INT16, INT32, INT64, + FLOAT16, FLOAT32, FLOAT64, COMPLEX64, @@ -53,6 +56,8 @@ inline std::string ToString(DataType dtype) { return "int32_t"; case DataType::INT64: return "int64_t"; + case DataType::FLOAT16: + return "float16"; case DataType::FLOAT32: return "float"; case DataType::FLOAT64: @@ -73,6 +78,7 @@ inline std::string ToString(DataType dtype) { _(int16_t, DataType::INT16) \ _(int, DataType::INT32) \ _(int64_t, DataType::INT64) \ + _(float16, DataType::FLOAT16) \ _(float, DataType::FLOAT32) \ _(double, DataType::FLOAT64) \ _(complex64, DataType::COMPLEX64) \ diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index cb37bf180c3..0cae8f4af7b 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -170,6 +171,8 @@ DataType Tensor::type() const { return DataType::COMPLEX64; } else if (type == framework::proto::VarType::COMPLEX128) { return DataType::COMPLEX128; + } else if (type == framework::proto::VarType::FP16) { + return DataType::FLOAT16; } // TODO(JiabinYang) Support more dtype here return DataType::FLOAT32; @@ -229,6 +232,8 @@ template PD_DLL_DECL Tensor Tensor::copy_to( const PlaceType &target_place) const; template PD_DLL_DECL Tensor Tensor::copy_to( const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; template PD_DLL_DECL float *Tensor::data() const; template PD_DLL_DECL double *Tensor::data() const; @@ -242,6 +247,8 @@ template PD_DLL_DECL paddle::platform::complex64 * Tensor::data() const; template PD_DLL_DECL paddle::platform::complex128 * Tensor::data() const; +template PD_DLL_DECL paddle::platform::float16 * +Tensor::data() const; template PD_DLL_DECL float *Tensor::mutable_data(); template PD_DLL_DECL double *Tensor::mutable_data(); @@ -255,6 +262,8 @@ template PD_DLL_DECL paddle::platform::complex64 * Tensor::mutable_data(); template PD_DLL_DECL paddle::platform::complex128 * Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(); template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); template PD_DLL_DECL double *Tensor::mutable_data( @@ -274,6 +283,8 @@ template PD_DLL_DECL paddle::platform::complex64 * Tensor::mutable_data(const PlaceType &place); template PD_DLL_DECL paddle::platform::complex128 * Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL paddle::platform::float16 * +Tensor::mutable_data(const PlaceType &place); std::vector Tensor::shape() const { GET_CASTED_TENSOR @@ -344,6 +355,11 @@ 
Tensor Tensor::cast(const DataType &target_type) const { CastDataType( *tensor, rlt_tensor_, ctx)); break; + case framework::proto::VarType::FP16: + framework::VisitDataType( + dst_type, + CastDataType(*tensor, rlt_tensor_, ctx)); + break; // TODO(JiabinYang) Support more dtype here default: PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc index 7da56588600..8d6fd4efd5a 100644 --- a/paddle/fluid/framework/custom_tensor_test.cc +++ b/paddle/fluid/framework/custom_tensor_test.cc @@ -113,6 +113,8 @@ void GroupTestCopy() { TestCopyTensor(); VLOG(2) << "complex128 cpu-cpu-gpu-gpu-cpu"; TestCopyTensor(); + VLOG(2) << "Fp16 cpu-cpu-gpu-gpu-cpu"; + TestCopyTensor(); } void GroupTestCast() { @@ -134,6 +136,8 @@ void GroupTestCast() { TestCast(paddle::DataType::FLOAT32); VLOG(2) << "complex128 cast"; TestCast(paddle::DataType::FLOAT32); + VLOG(2) << "float16 cast"; + TestCast(paddle::DataType::FLOAT16); } void GroupTestDtype() { @@ -146,6 +150,7 @@ void GroupTestDtype() { CHECK(TestDtype() == paddle::DataType::UINT8); CHECK(TestDtype() == paddle::DataType::COMPLEX64); CHECK(TestDtype() == paddle::DataType::COMPLEX128); + CHECK(TestDtype() == paddle::DataType::FLOAT16); } void GroupTestDtypeConvert() { @@ -178,6 +183,9 @@ void GroupTestDtypeConvert() { CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( paddle::DataType::COMPLEX128) == paddle::framework::proto::VarType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertEnumDTypeToInnerDType( + paddle::DataType::FLOAT16) == + paddle::framework::proto::VarType::FP16); // proto -> enum CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::FP64) == @@ -207,6 +215,9 @@ void GroupTestDtypeConvert() { CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( paddle::framework::proto::VarType::COMPLEX128) == paddle::DataType::COMPLEX128); + CHECK(paddle::framework::CustomTensorUtils::ConvertInnerDTypeToEnumDType( + paddle::framework::proto::VarType::FP16) == + paddle::DataType::FLOAT16); } TEST(CustomTensor, copyTest) { diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index a252d6aef4e..fad1e3ee349 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -60,6 +60,8 @@ class CustomTensorUtils { return framework::proto::VarType::COMPLEX64; case paddle::DataType::COMPLEX128: return framework::proto::VarType::COMPLEX128; + case paddle::DataType::FLOAT16: + return framework::proto::VarType::FP16; case paddle::DataType::BOOL: return framework::proto::VarType::BOOL; default: @@ -91,6 +93,8 @@ class CustomTensorUtils { return paddle::DataType::COMPLEX64; case framework::proto::VarType::COMPLEX128: return paddle::DataType::COMPLEX128; + case framework::proto::VarType::FP16: + return paddle::DataType::FLOAT16; case framework::proto::VarType::BOOL: return paddle::DataType::BOOL; default: diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index 4ba537930ce..36496ec499f 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -13,24 +13,15 @@ endif() py_test(test_sysconfig SRCS test_sysconfig.py) -# 'test_dispatch' compile .cc file +# CPU custom op tests: only compile .cc file py_test(test_dispatch_jit SRCS 
test_dispatch_jit.py) -set_tests_properties(test_dispatch_jit PROPERTIES TIMEOUT 120) - py_test(test_multi_out_jit SRCS test_multi_out_jit.py) -set_tests_properties(test_multi_out_jit PROPERTIES TIMEOUT 120) - py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py) -set_tests_properties(test_custom_attrs_jit PROPERTIES TIMEOUT 120) - py_test(test_custom_concat SRCS test_custom_concat.py) -set_tests_properties(test_custom_concat PROPERTIES TIMEOUT 120) - py_test(test_custom_conj SRCS test_custom_conj.py) -set_tests_properties(test_custom_conj PROPERTIES TIMEOUT 120) +# other tests py_test(test_check_abi SRCS test_check_abi.py) - cc_test(test_check_error SRCS test_check_error.cc DEPS gtest) if(NOT LINUX) diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index be3309d84f5..4ec7d088458 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -20,7 +20,7 @@ __global__ void relu_cuda_forward_kernel(const data_t* x, const int num) { int gid = blockIdx.x * blockDim.x + threadIdx.x; for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - y[i] = max(x[i], static_cast(0.)); + y[i] = x[i] > static_cast(0.) ? x[i] : static_cast(0.); } } @@ -31,7 +31,8 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy, const int num) { int gid = blockIdx.x * blockDim.x + threadIdx.x; for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.); + dx[i] = dy[i] * (y[i] > static_cast(0.) ? static_cast(1.) + : static_cast(0.)); } } @@ -42,7 +43,7 @@ std::vector relu_cuda_forward(const paddle::Tensor& x) { int numel = x.size(); int block = 512; int grid = (numel + block - 1) / block; - PD_DISPATCH_FLOATING_TYPES( + PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( x.data(), out.mutable_data(x.place()), numel); @@ -60,7 +61,7 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, int numel = out.size(); int block = 512; int grid = (numel + block - 1) / block; - PD_DISPATCH_FLOATING_TYPES( + PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( grad_out.data(), diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc index fbf5442ac02..0435f50b7c7 100644 --- a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -118,3 +118,21 @@ PD_BUILD_OP(dispatch_test_float_and_integer_and_complex) .Inputs({"X"}) .Outputs({"Out"}) .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)); + +std::vector DispatchTestFloatAndHalf(const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_HALF_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP(dispatch_test_float_and_half) + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndHalf)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 1a96fc5f0ae..23733d20841 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -50,11 +50,17 @@ class 
TestJITLoad(unittest.TestCase): custom_module.custom_relu, custom_module.custom_relu_dup ] self.dtypes = ['float32', 'float64'] - self.devices = ['cpu', 'gpu'] + if paddle.is_compiled_with_cuda(): + self.dtypes.append('float16') + self.devices = ['cpu'] + if paddle.is_compiled_with_cuda(): + self.devices.append('gpu') def test_static(self): for device in self.devices: for dtype in self.dtypes: + if device == 'cpu' and dtype == 'float16': + continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out = custom_relu_static(custom_op, device, dtype, x) @@ -68,6 +74,8 @@ class TestJITLoad(unittest.TestCase): def test_dynamic(self): for device in self.devices: for dtype in self.dtypes: + if device == 'cpu' and dtype == 'float16': + continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out, x_grad = custom_relu_dynamic(custom_op, device, dtype, @@ -87,7 +95,7 @@ class TestJITLoad(unittest.TestCase): caught_exception = False try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') - custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'float32', x) + custom_relu_dynamic(custom_module.custom_relu, 'cpu', 'int32', x) except OSError as e: caught_exception = True self.assertTrue( @@ -105,15 +113,15 @@ class TestJITLoad(unittest.TestCase): caught_exception = False try: - x = np.random.uniform(-1, 1, [4, 8]).astype('int64') - custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'float32', x) + x = np.random.uniform(-1, 1, [4, 8]).astype('int32') + custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x) except OSError as e: caught_exception = True self.assertTrue( - "function \"relu_cuda_forward_kernel\" is not implemented for data type `int64_t`" + "function \"relu_cuda_forward_kernel\" is not implemented for data type `int32_t`" in str(e)) self.assertTrue( - "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:49" in + "python/paddle/fluid/tests/custom_op/custom_relu_op.cu:50" in str(e)) self.assertTrue(caught_exception) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 6781915e021..5c5c2d65a59 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -26,7 +26,7 @@ from paddle.utils.cpp_extension.extension_utils import run_cmd def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): paddle.set_device(device) - t = paddle.to_tensor(np_x) + t = paddle.to_tensor(np_x, dtype=dtype) t.stop_gradient = False out = func(t) if use_func else paddle.nn.functional.relu(t) @@ -171,7 +171,11 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): ] self.dtypes = ['float32', 'float64'] - self.devices = ['cpu', 'gpu'] + if paddle.is_compiled_with_cuda(): + self.dtypes.append('float16') + self.devices = ['cpu'] + if paddle.is_compiled_with_cuda(): + self.devices.append('gpu') # config seed SEED = 2021 @@ -181,6 +185,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): def test_static(self): for device in self.devices: for dtype in self.dtypes: + if device == 'cpu' and dtype == 'float16': + continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out = custom_relu_static(custom_op, device, dtype, x) @@ -194,6 +200,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): def test_static_pe(self): for device in self.devices: for dtype in self.dtypes: + if device == 'cpu' and dtype == 
'float16': + continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out = custom_relu_static_pe(custom_op, device, dtype, x) @@ -207,6 +215,8 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): def test_dynamic(self): for device in self.devices: for dtype in self.dtypes: + if device == 'cpu' and dtype == 'float16': + continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out, x_grad = custom_relu_dynamic(custom_op, device, dtype, diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py index bc36372c6a7..12e9f50a5e4 100644 --- a/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_dispatch_jit.py @@ -83,6 +83,12 @@ class TestJitDispatch(unittest.TestCase): self.run_dispatch_test( dispatch_op.dispatch_test_float_and_integer_and_complex, dtype) + def test_dispatch_float_and_half(self): + dtypes = ["float32", "float64", "float16"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_float_and_half, + dtype) + if __name__ == '__main__': unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 0afc3956a01..71d4afdb283 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -453,15 +453,12 @@ class InstallHeaders(Command): def copy_data_type_headers(self, header): if os.name == 'nt': - data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h'] + data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h', 'platform\\float16.h'] else: - data_type_headers = ['platform/complex64.h', 'platform/complex128.h'] + data_type_headers = ['platform/complex64.h', 'platform/complex128.h', 'platform/float16.h'] for dtype_header in data_type_headers: if dtype_header in header: - if os.name == 'nt': - install_dir = os.path.join(self.install_dir, "paddle\\fluid\\extension\\include") - else: - install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") + install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") if not os.path.exists(install_dir): self.mkpath(install_dir) return self.copy_file(header, install_dir) -- GitLab From 25fc2a1fdb4b949f94f97a6d954ba13862f6c38a Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 19 Mar 2021 13:28:04 +0100 Subject: [PATCH 046/486] [oneDNN] Added Elementwise Mul grad fp32/bf16 (#31647) --- .../operators/elementwise/elementwise_op.h | 5 +- .../mkldnn/elementwise_add_mkldnn_op.cc | 11 ++ .../mkldnn/elementwise_mkldnn_op.h | 1 - .../mkldnn/elementwise_mul_mkldnn_op.cc | 116 ++++++++++++++++++ paddle/fluid/platform/mkldnn_reuse.h | 10 +- .../test_elementwise_mul_bf16_mkldnn_op.py | 66 ++++++++-- .../mkldnn/test_elementwise_mul_mkldnn_op.py | 12 +- 7 files changed, 206 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 6ec73b02ade..e09f94a6c0f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -276,7 +276,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN // If broadcasting is needed, use native implementation - auto CanMKLDNNElementwiseAddGradBeUsed = [&]() { + auto CanMKLDNNElementwiseGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); auto dy_dims = ctx.Input("Y")->dims(); // No broadcast or broadcasting of data on inner dims is supported 
@@ -284,8 +284,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { }; if (this->CanMKLDNNBeUsed(ctx, input_data_type) && - (ctx.Type() != "elementwise_add_grad" || - CanMKLDNNElementwiseAddGradBeUsed())) { + CanMKLDNNElementwiseGradBeUsed()) { return framework::OpKernelType(input_data_type, ctx.GetPlace(), framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 4db4adfe9e9..b43dddfcf19 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -61,6 +61,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); + + dx->set_layout(DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); } if (dy) { @@ -75,6 +78,9 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); } else { // Broadcasting platform::ReductionMKLDNNHandler handler_sum( @@ -86,6 +92,11 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p}, {DNNL_ARG_DST, *dy_memory_p}}); astream.wait(); + + dy->set_layout(DataLayout::kMKLDNN); + dy->set_format( + platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( + paddle::framework::vectorize(dy->dims())))); } } } diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 8a646e5865d..df827117a0d 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -15,7 +15,6 @@ #pragma once #include #include -#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc index 293b5a1a2d3..c9209cc39d5 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mul_mkldnn_op.cc @@ -14,6 +14,118 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h" +namespace paddle { +namespace framework { +class ExecutionContext; +} // namespace framework +namespace platform { +class CPUDeviceContext; +struct CPUPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { +template +class EltwiseMulMKLDNNGradKernel : public ElemwiseGradKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + ElemwiseGradKernel::Compute(ctx); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + if (dx) { + // dx = dout*y + platform::BinaryMKLDNNHandler handler( + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, y, dx, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); + + const auto src_dout_memory = handler.AcquireSrcMemory(dout); + const auto src_y_memory = handler.AcquireSecondSrcMemory(y); + const auto dst_dx_memory = handler.AcquireDstMemory(dx); + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_dx_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + dx->set_layout(framework::DataLayout::kMKLDNN); + dx->set_format(platform::GetMKLDNNFormat(*dst_dx_memory)); + } + + if (dy) { + // dy = dout*x + // Handler is having nullptr passed instead of output tensor as + // we want Dst buffer to be allocated by oneDNN not to use Tensor + platform::BinaryMKLDNNHandler handler( + dnnl::algorithm::binary_mul, axis, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, x, nullptr, 1.0f, 1.0f, 1.0f, + ctx.InputName(framework::GradVarName("Out"))); + + const auto src_dout_memory = handler.AcquireSrcMemory(dout); + const auto src_x_memory = handler.AcquireSecondSrcMemory(x); + + // If broadcasting is in use then let's write to temporary + // buffer allocated by oneDNN + const auto dst_dy_memory = (dout->dims() == dy->dims()) + ? 
handler.AcquireDstMemory(dy) + : handler.AcquireDstMemory(); + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dout_memory}, + {DNNL_ARG_SRC_1, *src_x_memory}, + {DNNL_ARG_DST, *dst_dy_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + dy->set_layout(framework::DataLayout::kMKLDNN); + + // Reduction is needed for broadcasting scenario + if (dout->dims() != dy->dims()) { + platform::ReductionMKLDNNHandler handler_sum( + dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, mkldnn_engine, + ctx.GetPlace(), dout, dy, + ctx.InputName(framework::GradVarName("Out"))); + auto dy_memory_p = handler_sum.AcquireDstMemory(dy); + auto reduction_p = handler_sum.AcquireForwardPrimitive(); + // As source we use mem object with results from binary operation + reduction_p->execute(astream, {{DNNL_ARG_SRC, *dst_dy_memory}, + {DNNL_ARG_DST, *dy_memory_p}}); + astream.wait(); + dy->set_format( + platform::GetMKLDNNFormat(dy_memory_p->get_desc().reshape( + paddle::framework::vectorize(dy->dims())))); + + } else { + dy->set_format(platform::GetMKLDNNFormat(*dst_dy_memory)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; REGISTER_OP_KERNEL( @@ -23,3 +135,7 @@ REGISTER_OP_KERNEL( dnnl::algorithm::binary_mul>, ops::EltwiseMKLDNNKernel, ops::EltwiseMKLDNNKernel) + +REGISTER_OP_KERNEL(elementwise_mul_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::EltwiseMulMKLDNNGradKernel, + ops::EltwiseMulMKLDNNGradKernel) diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 0503c3f71a8..c79b642c51b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -87,6 +87,11 @@ class MKLDNNHandlerT { "@dst_mem_p"); } + template + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p"); + } + template std::shared_ptr AcquireDstMemory( const framework::Tensor* output) { @@ -561,7 +566,10 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { const auto src_x_tz = framework::vectorize(x->dims()); const auto src_y_tz = framework::vectorize(y->dims()); - const auto dst_tz = framework::vectorize(z->dims()); + // if output tensor(z) is nullptr then we are computing into oneDNN + // managed buffer + const auto dst_tz = + (z == nullptr) ? 
src_x_tz : framework::vectorize(z->dims()); const auto src0_md = dnnl::memory::desc( src_x_tz, platform::MKLDNNGetDataType(), x->format()); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py index c2716420fba..9b7f4b9b860 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_bf16_mkldnn_op.py @@ -30,10 +30,9 @@ class TestElementwiseMulBf16MklDNNOp(OpTest): self.axis = -1 self.generate_data() - self.inputs = { - 'X': convert_float_to_uint16(self.x), - 'Y': convert_float_to_uint16(self.y) - } + self.x_bf16 = convert_float_to_uint16(self.x) + self.y_bf16 = convert_float_to_uint16(self.y) + self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} self.outputs = {'Out': convert_float_to_uint16(self.out)} @@ -46,13 +45,66 @@ class TestElementwiseMulBf16MklDNNOp(OpTest): self.check_output_with_place(core.CPUPlace()) def test_check_grad_normal(self): - pass + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + check_dygraph=False, + user_defined_grads=[ + np.multiply(self.x, self.y), np.multiply(self.x, self.x) + ], + user_defined_grad_outputs=[self.x_bf16]) def test_check_grad_ingore_x(self): - pass + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + check_dygraph=False, + user_defined_grads=[np.multiply(self.y, self.x)], + user_defined_grad_outputs=[self.y_bf16]) def test_check_grad_ingore_y(self): - pass + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + check_dygraph=False, + user_defined_grads=[np.multiply(self.x, self.y)], + user_defined_grad_outputs=[self.x_bf16]) + + +class TestElementwiseMulBroadcastingBf16MklDNNOp( + TestElementwiseMulBf16MklDNNOp): + def generate_data(self): + self.x = np.random.uniform(1, 2, [1, 2, 3, 100]).astype(np.float32) + self.y = np.random.uniform(1, 2, [100]).astype(np.float32) + self.out = np.multiply(self.x, self.y) + + # Compute partial sums along all axes but last one + def compute_reduced_gradients(self, out_grads): + part_sum = np.add.reduceat(out_grads, [0], axis=0) + part_sum = np.add.reduceat(part_sum, [0], axis=1) + part_sum = np.add.reduceat(part_sum, [0], axis=2) + return part_sum.flatten() + + def test_check_grad_normal(self): + self.check_grad_with_place( + core.CPUPlace(), ["X", "Y"], + "Out", + check_dygraph=False, + user_defined_grads=[ + np.multiply(self.x, self.y), + self.compute_reduced_gradients(np.multiply(self.x, self.x)) + ], + user_defined_grad_outputs=[self.x_bf16]) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + core.CPUPlace(), ["Y"], + "Out", + check_dygraph=False, + user_defined_grads=[ + self.compute_reduced_gradients(np.multiply(self.x, self.x)) + ], + user_defined_grad_outputs=[self.x_bf16]) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py index d66f3dfb891..03dc2421b65 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci from paddle.fluid.tests.unittests.test_elementwise_mul_op import 
ElementwiseMulOp +from paddle import enable_static class TestMKLDNNElementwiseMulOp(ElementwiseMulOp): @@ -51,13 +52,17 @@ class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp): def test_check_grad_normal(self): pass - def test_check_grad_ingore_x(self): - pass - def test_check_grad_ingore_y(self): pass +class TestMKLDNNElementwiseMulOp5(TestMKLDNNElementwiseMulOp): + def init_input_output(self): + self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) + self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + ''' INT8 Tests ''' @@ -140,4 +145,5 @@ class TestUint8Scales(TestInt8Scales): if __name__ == '__main__': + enable_static() unittest.main() -- GitLab From a45c8ca69d7dbcc116b76ea9ecc1ec1d98c6b2b2 Mon Sep 17 00:00:00 2001 From: Ouyang Chao Date: Sun, 21 Mar 2021 11:52:06 +0800 Subject: [PATCH 047/486] fix bug of DepthwiseConvTransposeGradKernel (#31762) --- paddle/fluid/operators/conv_transpose_op.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index 651719f1052..ecf5b6d774a 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -682,9 +682,9 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { if (input_grad) { math::DepthwiseConvFunctor depthwiseConv; depthwiseConv( - dev_ctx, *output_grad, filter, strides, paddings, + dev_ctx, *output_grad, filter, strides, std::vector{paddings[0], paddings[2], paddings[1], paddings[3]}, - input_grad, data_layout); + dilations, input_grad, data_layout); } if (filter_grad) { -- GitLab From 8c19d7aa2f89a38b3a68e53c73d88af16a3de8ce Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Sun, 21 Mar 2021 15:42:58 +0800 Subject: [PATCH 048/486] [ROCM] fix test_conv2d_transpose_op (#31749) --- paddle/fluid/operators/conv_transpose_cudnn_op.cu | 4 ++-- .../paddle/fluid/tests/unittests/test_conv2d_transpose_op.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 5781dd18b7b..a712d31cf7e 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -202,7 +202,7 @@ class CUDNNConvTransposeOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; @@ -452,7 +452,7 @@ class CUDNNConvTransposeGradOpKernel : public framework::OpKernel { int iwo_groups = groups; int c_groups = 1; -#if CUDNN_VERSION_MIN(7, 0, 1) +#if defined(PADDLE_WITH_HIP) || CUDNN_VERSION_MIN(7, 0, 1) iwo_groups = 1; c_groups = groups; groups = 1; diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index fb6058c0f03..4e582d74c24 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -116,7 +116,7 @@ def conv2dtranspose_forward_naive(input_, filter_, attrs): class TestConv2DTransposeOp(OpTest): def setUp(self): # init as conv transpose - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.need_check_grad = True self.is_test = False self.use_cudnn = False -- 
GitLab From ed7956a816130f4eb37ba3e235c09d1105ed1807 Mon Sep 17 00:00:00 2001 From: guofei <52460041+gfwm2013@users.noreply.github.com> Date: Sun, 21 Mar 2021 19:59:44 +0800 Subject: [PATCH 049/486] Fix skip_quant in QAT (#31704) * Fix skip_quant in QAT --- .../slim/quantization/imperative/qat.py | 38 +++++++++++++++++-- .../slim/quantization/imperative/utils.py | 6 +++ .../slim/tests/test_imperative_out_scale.py | 7 ++++ .../slim/tests/test_imperative_skip_op.py | 16 ++++++-- 4 files changed, 60 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index abfe06a3326..68b4cfdc661 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -515,6 +515,8 @@ class ImperativeCalcOutputScale(object): self._out_scale_dict[ops_list[op_count]]) op_count += 1 + self._set_skip_quant_attr(inference_program) + # save the final quantized model that has output scales save_inference_model( dirname=dirname, @@ -537,9 +539,12 @@ class ImperativeCalcOutputScale(object): Init the scale params for calculating output scales and save them in the target layer. After the users define the dygraph model, the hooks for calculating output - scales will not execute immediately. If the users load the checkpoint now, - the scale params have not been created, so them cann't be loaded. - Therefore, define the scale params in the beginning. + scales will not execute immediately. If the users load parameters form + checkpoint and save the quantized inference model immediately, the inference + model would not be saved successfully. Beacuse the dygraph_to_static requires + that the parameters created in __init__, but the uniqueness of hook make it + impossible to create parameters in __init__. To avoid this mistake, we define + the scale parameters in the beginning instead of hook. """ def _create_param(in_layer, first_name, last_name, dtype): @@ -587,6 +592,33 @@ class ImperativeCalcOutputScale(object): op_type = op_type.replace('relu', 're_lu') return op_type in layer_name + def _set_skip_quant_attr(self, program): + block = program.global_block() + for op in block.ops: + if self._is_skip_quant_op(block, op): + op._set_attr("skip_quant", True) + + def _is_skip_quant_op(self, block, in_op): + """ + The input op should be skipped quantization. + 1. the type of input op should be conv2d, depthwise_conv2d or matmul + 2. the previous ops of the input op are not fake_quantize_dequantize ops + """ + + def _find_previous_op(block, var_name): + for op in block.ops: + if var_name in op.output_arg_names: + return op + + target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] + if in_op.type not in target_op_types: + return False + + previous_ops = [_find_previous_op(block, arg_name) \ + for arg_name in in_op.input_arg_names] + return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \ + for op in previous_ops ) + def _calc_output_scale_hook(self, layer, input, output): """ Create the MovingAverageAbsMaxScale layer for the target layer if needed. 
diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 1ff4a408e05..3bf655265c6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -52,6 +52,12 @@ supported_quant_layers_map = { 'LayerNorm': paddle.nn.LayerNorm, } +fake_quantize_dequantize_types = [ + "fake_quantize_dequantize_abs_max", + "fake_quantize_dequantize_channel_wise_abs_max", + "fake_quantize_dequantize_moving_average_abs_max" +] + out_scale_layers_list = ( paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D, paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm, diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 83ddac41965..ed29375d22b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -393,12 +393,16 @@ class TestImperativeOutSclae(unittest.TestCase): if 'fake' in op.type: static_ops.remove(op) + op_count = 0 for i in range(len(dynamic_ops)): if dynamic_ops[i].has_attr("out_threshold"): + op_count += 1 self.assertTrue(dynamic_ops[i].type == static_ops[i].type) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) + self.assertTrue(op_count == 13) + class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): def test_save_quantized_model(self): @@ -459,11 +463,14 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): if 'fake' in op.type: static_ops.remove(op) + op_count = 0 for i in range(len(dynamic_ops)): if dynamic_ops[i].has_attr("out_threshold"): + op_count += 1 self.assertTrue(dynamic_ops[i].type == static_ops[i].type) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) + self.assertTrue(op_count == 13) class TestSaveQuantizedModel_Warning(unittest.TestCase): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py index 0561055e6e0..bda02769cea 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_skip_op.py @@ -200,9 +200,12 @@ class TestImperativeOutSclae(unittest.TestCase): params_filename="lenet" + INFER_PARAMS_SUFFIX)) model_ops = inference_program.global_block().ops - conv2d_count, mul_count = 0, 0 + conv2d_count, matmul_count = 0, 0 + conv2d_skip_count, matmul_skip_count = 0, 0 for i, op in enumerate(model_ops): if op.type == 'conv2d': + if op.has_attr("skip_quant"): + conv2d_skip_count += 1 if conv2d_count > 0: self.assertTrue( 'fake_quantize_dequantize' in model_ops[i - 1].type) @@ -211,14 +214,19 @@ class TestImperativeOutSclae(unittest.TestCase): 'fake_quantize_dequantize' not in model_ops[i - 1].type) conv2d_count += 1 - if op.type == 'mul': - if mul_count > 0: + if op.type == 'matmul': + if op.has_attr("skip_quant"): + matmul_skip_count += 1 + if matmul_count > 0: self.assertTrue( 'fake_quantize_dequantize' in model_ops[i - 1].type) else: self.assertTrue( 'fake_quantize_dequantize' not in model_ops[i - 1].type) - mul_count += 1 + matmul_count += 1 + + self.assertTrue(conv2d_skip_count == 1) + self.assertTrue(matmul_skip_count == 1) if __name__ == '__main__': -- GitLab From a501a7b0caadcfbbcb2f637ed58b52aa07f7d2dc 
Mon Sep 17 00:00:00 2001 From: lilong12 Date: Mon, 22 Mar 2021 13:20:33 +0800 Subject: [PATCH 050/486] [3D-parallel] add 1f1b scheduler for pipeline (#31566) * add 1f1b scheduler for pp, test=develop --- paddle/fluid/framework/device_worker.h | 20 +- .../framework/distributed_strategy.proto | 1 + paddle/fluid/framework/pipeline_trainer.cc | 10 +- paddle/fluid/framework/section_worker.cc | 173 ++++++++++++------ paddle/fluid/framework/trainer_desc.proto | 3 + .../meta_optimizers/pipeline_optimizer.py | 9 +- python/paddle/fluid/device_worker.py | 12 ++ python/paddle/fluid/optimizer.py | 5 + .../fluid/tests/unittests/pipeline_mnist.py | 23 ++- .../unittests/pipeline_mnist_one_device.py | 4 + .../fluid/tests/unittests/test_pipeline.py | 6 +- 11 files changed, 193 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 30387195392..05c54a90f7e 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -28,6 +28,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/heter_service.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" @@ -454,7 +455,7 @@ class HeterBoxWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } virtual void TrainFilesWithProfiler() {} @@ -555,7 +556,7 @@ class PSGPUWorker : public HogwildWorker { virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual void ProduceTasks() override; + void ProduceTasks() override; virtual void SetStream(const gpuStream_t stream) { copy_stream_ = stream; } virtual void SetEvent(const gpuEvent_t event) { event_ = event; } void ResetStat(); @@ -659,6 +660,9 @@ class SectionWorker : public DeviceWorker { void SetDeviceIndex(int tid) override {} void SetThreadIndex(int thread_id) { thread_id_ = thread_id; } void SetMicrobatchNum(int num) { num_microbatches_ = num; } + void SetPipelineStageNum(int num) { num_pipeline_stages_ = num; } + void SetPipelineStage(int stage) { pipeline_stage_ = stage; } + void SetScheduleMode(int mode) { schedule_mode_ = mode; } void SetMicrobatchScopes(const std::vector& scope) { microbatch_scopes_ = scope; } @@ -666,11 +670,23 @@ class SectionWorker : public DeviceWorker { void SetSkipVars(const std::vector& skip_vars) { skip_vars_ = skip_vars; } + void RunBackward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunForward( + int micro_id, std::unique_ptr&, + std::unordered_map>&); + void RunUpdate( + std::unique_ptr&, + std::unordered_map>&); protected: int section_id_; int thread_id_; int num_microbatches_; + int num_pipeline_stages_; + int pipeline_stage_; + int schedule_mode_; // 0 for F-then-B and 1 for 1F1B std::vector microbatch_scopes_; std::vector skip_vars_; const Scope* minibatch_scope_; diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 300f0eb0dbb..b36793507f5 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ 
-120,6 +120,7 @@ message AsyncConfig { message PipelineConfig { optional int32 micro_batch_size = 1 [ default = 1 ]; optional int32 accumulate_steps = 2 [ default = 1 ]; + optional string schedule_mode = 3 [ default = '1F1B' ]; } message DistributedStrategy { diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 8d350f70165..a97fc2e75aa 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ b/paddle/fluid/framework/pipeline_trainer.cc @@ -24,6 +24,9 @@ namespace framework { void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { const auto& section_params = trainer_desc.section_param(); + const int num_pipeline_stages_ = section_params.num_pipeline_stages(); + const int pipeline_stage_ = section_params.pipeline_stage(); + const int schedule_mode_ = section_params.schedule_mode(); num_microbatches_ = section_params.num_microbatches(); VLOG(3) << "Number of microbatches per minibatch: " << num_microbatches_; trainer_desc_ = trainer_desc; @@ -39,6 +42,9 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, this_worker->SetPlace(place_); this_worker->Initialize(trainer_desc); this_worker->SetMicrobatchNum(num_microbatches_); + this_worker->SetPipelineStageNum(num_pipeline_stages_); + this_worker->SetPipelineStage(pipeline_stage_); + this_worker->SetScheduleMode(schedule_mode_); } void PipelineTrainer::InitOtherEnv(const ProgramDesc& main_program) { @@ -75,7 +81,9 @@ void PipelineTrainer::CopyParameters(int microbatch_id, for (auto& var : global_block.AllVars()) { bool is_param_grad = false; size_t pos = 0; - if ((pos = var->Name().find(kGradVarSuffix)) != std::string::npos) { + // A magic suffix to indicate the merged gradient + std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED"; + if ((pos = var->Name().find(magicSuffix)) != std::string::npos) { auto prefix_name = var->Name().substr(0, pos); if (param_map.find(prefix_name) != param_map.end()) { is_param_grad = true; diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 735c86faf08..90a371e4747 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -22,15 +22,79 @@ class TrainerDesc; uint64_t SectionWorker::batch_id_(0); -void SectionWorker::Initialize(const TrainerDesc& desc) { +void SectionWorker::Initialize(const TrainerDesc &desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); program_.reset( new ProgramDesc(desc.section_param().section_config().program_desc())); - for (auto& op_desc : program_->Block(0).AllOps()) { + for (auto &op_desc : program_->Block(0).AllOps()) { ops_.push_back(OpRegistry::CreateOp(*op_desc)); } } +void SectionWorker::RunForward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + // We run op with op_role = kLRSched only for the first microbatch + // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
+ bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || + op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss)) || + op_role == static_cast(OpRole::kLRSched); + bool run_others = op_role == static_cast(OpRole::kForward) || + op_role == (static_cast(OpRole::kForward) | + static_cast(OpRole::kLoss)); + if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) { + VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunBackward( + int micro_id, std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if (op_role == static_cast(OpRole::kBackward) || + op_role == (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss))) { + VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " + << micro_id; + op->Run(*microbatch_scopes_[micro_id], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[micro_id], op.get(), + unused_vars_, gc.get()); + } + } + } +} + +void SectionWorker::RunUpdate( + std::unique_ptr &gc, + std::unordered_map> + &unused_vars_) { + for (auto &op : ops_) { + int op_role = op->Attr(std::string("op_role")); + if (op_role == static_cast(OpRole::kOptimize)) { + VLOG(3) << "Update: running op " << op->Type(); + op->Run(*microbatch_scopes_[num_microbatches_ - 1], place_); + if (gc) { + DeleteUnusedTensors(*microbatch_scopes_[num_microbatches_ - 1], + op.get(), unused_vars_, gc.get()); + } + } + } +} + void SectionWorker::TrainFiles() { VLOG(5) << "begin section_worker TrainFiles"; @@ -48,69 +112,56 @@ void SectionWorker::TrainFiles() { #endif } - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - // We run op with op_role = kLRSched only for the first microbatch - // to avoid increasing the @LR_DECAY_STEP@ multiple times. - bool run_first_mbatch = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)) || - op_role == static_cast(OpRole::kLRSched); - bool run_others = op_role == static_cast(OpRole::kForward) || - op_role == (static_cast(OpRole::kForward) | - static_cast(OpRole::kLoss)); - if ((i == 0 && run_first_mbatch) || (i != 0 && run_others)) { - VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + if (schedule_mode_ == 0) { + // F-then-B scheduler which runs Forward phase for all microbatches, + // then runs Backward phase for all microbatches. 
+ // step1: run forward + for (int i = 0; i < num_microbatches_; ++i) { + RunForward(i, gc, unused_vars_); } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - - // backward pass - for (int i = 0; i < num_microbatches_; ++i) { - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kBackward) || - op_role == (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss))) { - VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch " - << i; - op->Run(*microbatch_scopes_[i], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[i], op.get(), unused_vars_, - gc.get()); - } - } + // step2: run backward + for (int i = 0; i < num_microbatches_; ++i) { + RunBackward(i, gc, unused_vars_); + } + // step3: run update + RunUpdate(gc, unused_vars_); + } else { + // 1F1B scheduler, which runs forward phase and backward phase altertively + // after startup phase. For a stage, the number of microbatches for + // startup is num_pipeline_stages_ - pipeline_stage_ - 1, where + // num_pipeline_stages_ is the total number of pipeline stages and + // pipeline_stage_ is the pipeline stage of the current device. + auto startup_steps = num_pipeline_stages_ - pipeline_stage_ - 1; + VLOG(3) << "startup_steps:" << startup_steps + << ", num_stages: " << num_pipeline_stages_ + << ", stage:" << pipeline_stage_; + PADDLE_ENFORCE_GT( + num_microbatches_, startup_steps, + platform::errors::InvalidArgument( + "To use pipeline with 1F1B scheduler, please make sure number of " + "microbatches (%d) is than startup steps (%d).", + num_microbatches_, startup_steps)); + int fw_step = 0; + int bw_step = 0; + // startup phase + while (fw_step < startup_steps) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; } -#ifdef PADDLE_WITH_RCCL - hipDeviceSynchronize(); -#else - cudaDeviceSynchronize(); -#endif - } - // update pass - for (auto& op : ops_) { - int op_role = op->Attr(std::string("op_role")); - if (op_role == static_cast(OpRole::kOptimize)) { - VLOG(3) << "Update: running op " << op->Type(); - op->Run(*microbatch_scopes_[0], place_); - if (gc) { - DeleteUnusedTensors(*microbatch_scopes_[0], op.get(), unused_vars_, - gc.get()); - } + // 1f1b phase + while (fw_step < num_microbatches_) { + RunForward(fw_step, gc, unused_vars_); + fw_step += 1; + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; + } + // backward phase + while (bw_step < num_microbatches_) { + RunBackward(bw_step, gc, unused_vars_); + bw_step += 1; } + RunUpdate(gc, unused_vars_); } dev_ctx_->Wait(); ++batch_id_; diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 70481cf3727..504885ff5cc 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -93,6 +93,9 @@ message SectionWorkerParameter { optional int32 start_cpu_core_id = 4 [ default = 1 ]; repeated string param_need_sync = 5; optional int32 num_microbatches = 6; + optional int32 num_pipeline_stages = 7 [ default = 1 ]; + optional int32 pipeline_stage = 8 [ default = 1 ]; + optional int32 schedule_mode = 9 [ default = 0 ]; } message SectionConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 1b79de03fdf..9535c9ef53c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ 
b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -138,7 +138,10 @@ class PipelineOptimizer(MetaOptimizerBase): super(PipelineOptimizer, self).__init__(optimizer) self.inner_opt = optimizer # we do not allow meta optimizer to be inner optimizer currently - self.meta_optimizers_white_list = [] + self.meta_optimizers_white_list = [ + "RecomputeOptimizer", + "AMPOptimizer", + ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] def _set_basic_info(self, loss, role_maker, user_defined_optimizer, @@ -149,6 +152,8 @@ class PipelineOptimizer(MetaOptimizerBase): 'micro_batch_size'] self.num_microbatches = user_defined_strategy.pipeline_configs[ 'accumulate_steps'] + self.schedule_mode = user_defined_strategy.pipeline_configs[ + 'schedule_mode'] def _can_apply(self): if not self.role_maker._is_collective: @@ -167,6 +172,7 @@ class PipelineOptimizer(MetaOptimizerBase): dist_strategy.pipeline_configs = { "micro_batch_size": 1, "accumulate_steps": 1, + "schedule_mode": "1F1B", } def minimize_impl(self, @@ -192,6 +198,7 @@ class PipelineOptimizer(MetaOptimizerBase): loss.block.program._pipeline_opt['local_rank'] = self.rank loss.block.program._pipeline_opt[ 'micro_batch_size'] = self.micro_batch_size + loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) assert prog_list diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 838aea37f18..b923f36af8d 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -413,6 +413,18 @@ class Section(DeviceWorker): section_param = trainer_desc.section_param section_param.num_microbatches = pipeline_opt["num_microbatches"] section_param.start_cpu_core_id = pipeline_opt["start_cpu_core_id"] + section_param.pipeline_stage = pipeline_opt["pipeline_stage"] + section_param.num_pipeline_stages = pipeline_opt["num_pipeline_stages"] + schedule_mode_str = pipeline_opt["schedule_mode"] + # F-then-B scheduler which runs Forward phase for all microbatches, + # then runs Backward phase for all microbatches. + # 1F1B scheduler, which runs forward phase and backward phase altertively + # after startup phase. 
+ assert schedule_mode_str in ["F-then-B", "1F1B"], ( + "The schedule mode " + "for pipeline must be one of F-then-B or 1F1B") + schedule_mode = 0 if schedule_mode_str == "F-then-B" else 1 + section_param.schedule_mode = schedule_mode cfg = section_param.section_config program = pipeline_opt["section_program"] cfg.program_desc.ParseFromString(program["program"]._get_desc() diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 80f49ea939b..9c724cbfdd4 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4273,6 +4273,7 @@ class PipelineOptimizer(object): grad_name = self._append_grad_suffix(param_name) if not main_block.has_var(grad_name): continue grad_var = main_block.vars[grad_name] + grad_var.persistable = True main_block._insert_op( index=0, type='fill_constant', @@ -4517,6 +4518,7 @@ class PipelineOptimizer(object): "You must use pipeline with fleet" local_rank = main_program._pipeline_opt['local_rank'] % len( device_specs) + self.schedule_mode = main_program._pipeline_opt['schedule_mode'] place_list = [] for dev_spec in device_specs: @@ -4543,6 +4545,9 @@ class PipelineOptimizer(object): main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", + "pipeline_stage": local_rank, + "num_pipeline_stages": len(device_specs), + "schedule_mode": self.schedule_mode, "inner_parallelism": len(device_specs), "section_program": program_list[local_rank], "place": place_list[local_rank], diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py index d06be76b331..f433af24813 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py @@ -110,22 +110,31 @@ class TestDistMnist2x2(TestDistRunnerBase): lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) - # Reader - train_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=batch_size) - test_reader = paddle.batch( - paddle.dataset.mnist.test(), batch_size=batch_size) - + acc_steps = 2 # accumulated steps for pipeline if dist_strategy: + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size) fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() strategy.pipeline = True - strategy.pipeline_configs = {'micro_batch_size': batch_size, } + strategy.pipeline_configs = { + 'micro_batch_size': batch_size, + 'schedule_mode': '1F1B', + 'accumulate_steps': acc_steps + } dist_opt = fleet.distributed_optimizer( optimizer=opt, strategy=strategy) dist_opt.minimize(avg_cost) else: opt.minimize(avg_cost) + # Reader + train_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps) + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=batch_size * acc_steps) if dist_strategy: return inference_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py index d8d28ac1093..41b3ad34103 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist_one_device.py @@ -122,6 +122,10 @@ class TestDistMnist2x2(TestDistRunnerBase): if dist_strategy: 
strategy = fleet.DistributedStrategy() strategy.pipeline = True + strategy.pipeline_configs = { + 'schedule_mode': 'F-then-B', + 'micro_batch_size': batch_size + } dist_opt = fleet.distributed_optimizer( optimizer=opt, strategy=strategy) dist_opt.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/test_pipeline.py b/python/paddle/fluid/tests/unittests/test_pipeline.py index e6d585e5bc1..cd592416c1a 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline.py @@ -34,9 +34,13 @@ class TestPipeline(TestDistBase): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): + # TODO (sandyhouse) fix the delta value. + # Now pipeline only gets the loss value of the last + # microbatch, so it is not consistable with the + # non-pipeline one. self.check_with_place( "pipeline_mnist.py", - delta=1e-5, + delta=1e0, check_error_log=True, log_name=flag_name) -- GitLab From 7ccf6b60306c700f59f5eb94d21abec323cd06eb Mon Sep 17 00:00:00 2001 From: arlesniak Date: Mon, 22 Mar 2021 07:43:33 +0100 Subject: [PATCH 051/486] [oneDNN] Initial bf16 amp integration (#31093) --- paddle/fluid/operators/cast_op.cc | 1 + paddle/fluid/operators/scale_op.cc | 2 + .../fluid/contrib/mixed_precision/__init__.py | 3 + .../contrib/mixed_precision/bf16/__init__.py | 24 ++ .../contrib/mixed_precision/bf16/amp_lists.py | 97 ++++++ .../contrib/mixed_precision/bf16/amp_utils.py | 296 ++++++++++++++++++ .../contrib/mixed_precision/fp16_lists.py | 2 +- .../fluid/contrib/tests/test_bf16_utils.py | 144 +++++++++ .../contrib/tests/test_model_cast_to_bf16.py | 138 ++++++++ python/paddle/fluid/data_feeder.py | 23 +- python/paddle/fluid/layers/nn.py | 16 +- .../fluid/tests/book/test_fit_a_line.py | 17 +- .../fluid/tests/book/test_word2vec_book.py | 29 +- .../paddle/fluid/tests/unittests/op_test.py | 17 +- python/paddle/static/amp/__init__.py | 3 + python/setup.py.in | 1 + tools/parallel_UT_rule.py | 1 + tools/static_mode_white_list.py | 1 + 18 files changed, 777 insertions(+), 38 deletions(-) create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py create mode 100644 python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py create mode 100644 python/paddle/fluid/contrib/tests/test_bf16_utils.py create mode 100644 python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index c5cfa7a3baf..40f4b969ec0 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -97,5 +97,6 @@ REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, + ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel); diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 281689d3bda..a9b1f299dab 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -128,6 +128,8 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, + ops::ScaleKernel, ops::ScaleKernel, ops::ScaleKernel, ops::ScaleKernel, diff --git a/python/paddle/fluid/contrib/mixed_precision/__init__.py b/python/paddle/fluid/contrib/mixed_precision/__init__.py index a580ae5574c..571b755b50d 100644 --- a/python/paddle/fluid/contrib/mixed_precision/__init__.py +++ 
b/python/paddle/fluid/contrib/mixed_precision/__init__.py @@ -20,7 +20,10 @@ from . import fp16_lists from .fp16_lists import * from . import fp16_utils from .fp16_utils import * +from . import bf16 +from .bf16 import * __all__ = decorator.__all__ __all__ += fp16_lists.__all__ __all__ += fp16_utils.__all__ +__all__ += bf16.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py new file mode 100644 index 00000000000..8c05bc4899c --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import amp_lists +from .amp_lists import * +from . import amp_utils +from .amp_utils import * + +__all__ = [] +__all__ += amp_lists.__all__ +__all__ += amp_utils.__all__ diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py new file mode 100644 index 00000000000..81dc32d114b --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_lists.py @@ -0,0 +1,97 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +from ..fp16_lists import white_list as white_list_fp16, black_list as black_list_fp16,\ + gray_list as gray_list_fp16, unsupported_fp16_list + +__all__ = ["AutoMixedPrecisionListsBF16"] + + +class AutoMixedPrecisionListsBF16(object): + """ + AutoMixedPrecisionListsBF16 is a class for fp32/bf16 op types list. The lists are used for an + algorithm which determines op's execution mode (fp32 or bf16).It can update pre-defined + fp32 list and bf16 list according to users' custom fp32 bf16 lists. + + Args: + custom_bf16_list (set): Users' custom bf16 list. + custom_fp32_list (set): Users' custom fp32 list. + custom_fp32_varnames (set): Users' custom fp32 variables' names. + + Examples: + .. 
code-block:: python + import paddle + paddle.enable_static() + with paddle.static.amp.bf16_guard(): + paddle.static.amp.AutoMixedPrecisionListsBF16(custom_fp32_list={'lstm'}) + """ + + def __init__(self, + custom_bf16_list=None, + custom_fp32_list=None, + custom_fp32_varnames=None): + self._custom_bf16_list = custom_bf16_list + self._custom_fp32_list = custom_fp32_list + self.bf16_list = copy.copy(bf16_list) + self.fp32_list = copy.copy(fp32_list) + self.gray_list = copy.copy(gray_list) + self.unsupported_list = copy.copy(unsupported_list) + self.fp32_varnames = copy.copy(custom_fp32_varnames) + self._update_list() + + def _update_list(self): + """ + Update fp32 and bf16 list according to users' custom list. + """ + if self._custom_bf16_list and self._custom_fp32_list: + for op_name in self._custom_bf16_list: + if op_name in self._custom_fp32_list: + raise ValueError("Custom bf16 list overlap " + "custom fp32 list") + if self._custom_bf16_list: + for op_name in self._custom_bf16_list: + if op_name in self.fp32_list: + self.fp32_list.remove(op_name) + elif op_name in self.gray_list: + self.gray_list.remove(op_name) + self.bf16_list.add(op_name) + if self._custom_fp32_list: + for op_name in self._custom_fp32_list: + if op_name in self.bf16_list: + self.bf16_list.remove(op_name) + elif op_name in self.gray_list: + self.gray_list.remove(op_name) + self.fp32_list.add(op_name) + self.unsupported_list.add(op_name) + + +# always bf16 +bf16_list = {'elementwise_add', } + +# depends on the prev_op type +gray_list = { + 'reshape2', + 'lookup_table', +} + +unsupported_list = unsupported_fp16_list.copy().copy() +fp32_list = black_list_fp16.copy().copy() +fp32_list |= white_list_fp16 +fp32_list |= gray_list_fp16 + +fp32_list -= bf16_list +fp32_list -= gray_list +unsupported_list -= bf16_list +unsupported_list -= gray_list diff --git a/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py new file mode 100644 index 00000000000..c2c01f88c74 --- /dev/null +++ b/python/paddle/fluid/contrib/mixed_precision/bf16/amp_utils.py @@ -0,0 +1,296 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import struct + +from .... import core +from .... 
import framework +from ....log_helper import get_logger +from ....wrapped_decorator import signature_safe_contextmanager +from .amp_lists import AutoMixedPrecisionListsBF16 +from ..fp16_utils import find_true_prev_op, find_true_post_op, _rename_arg, find_op_index +import logging +import numpy as np + +__all__ = ["bf16_guard", "rewrite_program_bf16", "convert_float_to_uint16"] + +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s') + +_valid_types = [ + core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR_ARRAY +] + +_bf16_guard_pattern = "__use_bf16__" + + +def convert_float_to_uint16(in_list): + in_list = np.asarray(in_list) + out = np.vectorize( + lambda x: struct.unpack('> 16, + otypes=[np.uint16])(in_list.flat) + return np.reshape(out, in_list.shape) + + +def _dtype_to_str(dtype): + """ + Convert specific variable type to its corresponding string. + + Args: + dtype (VarType): Variable type. + """ + if dtype == core.VarDesc.VarType.BF16: + return 'bf16' + else: + return 'fp32' + + +def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): + """ + Insert cast op and rename args of input and output. + + Args: + block (Program): The block in which the operator is. + op (Operator): The operator to insert cast op. + idx (int): The index of current operator. + src_dtype (VarType): The input variable dtype of cast op. + dest_dtype (VarType): The output variable dtype of cast op. + + Returns: + num_cast_op (int): The number of cast ops that have been inserted. + """ + num_cast_ops = 0 + + for in_name in op.input_names: + if src_dtype == core.VarDesc.VarType.FP32 and op.type in [ + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' + ]: + if in_name not in {'X', 'Z'}: + continue + for in_var_name in op.input(in_name): + in_var = block.var(in_var_name) + if in_var.type not in _valid_types or in_var.dtype == dest_dtype: + continue + if in_var.dtype == src_dtype: + cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) + out_var = block.vars.get(cast_name) + if out_var is None or out_var.dtype != dest_dtype: + out_var = block.create_var( + name=cast_name, + dtype=dest_dtype, + persistable=False, + stop_gradient=in_var.stop_gradient) + + block._insert_op( + idx, + type="cast", + inputs={"X": in_var}, + outputs={"Out": out_var}, + attrs={ + "in_dtype": in_var.dtype, + "out_dtype": out_var.dtype + }) + num_cast_ops += 1 + _rename_arg(op, in_var.name, out_var.name) + else: + if op.has_attr('in_dtype'): + op._set_attr('in_dtype', dest_dtype) + if src_dtype == core.VarDesc.VarType.FP32 and dest_dtype == core.VarDesc.VarType.BF16: + for out_name in op.output_names: + if op.type in [ + 'batch_norm', 'fused_bn_add_activation', 'layer_norm' + ] and out_name != 'Y': + continue + for out_var_name in op.output(out_name): + out_var = block.var(out_var_name) + if out_var.type not in _valid_types: + continue + if out_var.dtype == core.VarDesc.VarType.FP32: + out_var.desc.set_dtype(core.VarDesc.VarType.BF16) + if op.has_attr('out_dtype'): + op._set_attr('out_dtype', core.VarDesc.VarType.BF16) + return num_cast_ops + + +def _is_in_fp32_varnames(op, amp_lists): + for in_name in op.input_arg_names: + if in_name in amp_lists.fp32_varnames: + return True + + for out_name in op.output_arg_names: + if out_name in amp_lists.fp32_varnames: + return True + + return False + + +def _need_keep_fp32(op, unsupported_op_list, use_bf16_guard): + if op.type in unsupported_op_list: + # the highest priority condition: If ops don't 
have bf16 computing kernels, + # they must be executed in fp32 calculation pattern. + return True + + # process ops about learning rate + in_out_arg_names = [] + in_out_arg_names.extend(list(op.input_arg_names)) + in_out_arg_names.extend(list(op.output_arg_names)) + for name in in_out_arg_names: + if "learning_rate" in name: + return True + + if use_bf16_guard: + if op.has_attr("op_namescope") and \ + (_bf16_guard_pattern in op.attr("op_namescope")): + # op in bf16 guard + return False + else: + # op not in bf16 guard + return True + else: + return False + + +@signature_safe_contextmanager +def bf16_guard(): + """ + As for the pure bf16 training, if users set `use_bf16_guard` to True, + only those ops created in the context manager `bf16_guard` will be + transformed as float16 type. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + import paddle.nn.functional as F + paddle.enable_static() + data = paddle.static.data(name='X', shape=[None, 1, 28, 28], dtype='float32') + conv2d = paddle.static.nn.conv2d(input=data, num_filters=6, filter_size=3) + + with paddle.static.amp.bf16_guard(): + bn = paddle.static.nn.batch_norm(input=conv2d, act="relu") + pool = F.max_pool2d(bn, kernel_size=2, stride=2) + hidden = paddle.static.nn.fc(pool, size=10) + loss = paddle.mean(hidden) + """ + with framework.name_scope(prefix=_bf16_guard_pattern): + yield + + +def rewrite_program_bf16(main_prog, amp_lists=None, use_bf16_guard=False): + """ + Traverse all ops in current block and insert cast op according to + which set current op belongs to. + + 1. When an op belongs to the fp32 list, add it to fp32 set + 2. When an op belongs to the bf16 list, add it to bf16 set + 3. When an op belongs to the gray list. If one + of its inputs is the output of fp32 set op or fp32 list op, + add it to fp32 set. If all of its previous ops are not fp32 + op and one of its inputs is the output of bf16 set op or + bf16 list op, add it to bf16 set. + 4. When an op isn't in the lists, add it to fp32 op set. + 5. Add necessary cast ops to make sure that fp32 set op will be + computed in fp32 mode, while bf16 set op will be computed in + bf16 mode. + + Args: + main_prog (Program): The main program for training. + """ + if amp_lists is None: + amp_lists = AutoMixedPrecisionListsBF16() + block = main_prog.global_block() + ops = block.ops + bf16_op_set = set() + fp32_op_set = set() + for op in ops: + + # NOTE(zhiqiu): 'create_py_reader' and 'read' is used in non-iterable DataLoder, + # we don't need to handle reader op and the input of 'create_py_reader' is not + # in block, which may result in errors. + # See GeneratorLoader._init_non_iterable() for details. 
+ if op.type == 'create_py_reader' or op.type == 'read': + continue + + if amp_lists.fp32_varnames is not None and _is_in_fp32_varnames( + op, amp_lists): + fp32_op_set.add(op) + continue + + if op.type in amp_lists.fp32_list or _need_keep_fp32( + op, amp_lists.unsupported_list, use_bf16_guard): + fp32_op_set.add(op) + elif op.type in amp_lists.bf16_list: + bf16_op_set.add(op) + elif op.type in amp_lists.gray_list: + is_fp32_op = False + is_bf16_op = False + for in_name in op.input_names: + # if this op has inputs + if in_name: + for in_var_name in op.input(in_name): + in_var = block.var(in_var_name) + # this in_var isn't the output of other op + if in_var.op is None: + continue + elif in_var.op is op: + prev_op = find_true_prev_op(ops, op, in_var_name) + if prev_op is None: + continue + else: + prev_op = in_var.op + # if it's one of inputs + if prev_op in fp32_op_set or \ + prev_op.type in amp_lists.fp32_list: + is_fp32_op = True + elif prev_op in bf16_op_set or \ + prev_op.type in amp_lists.bf16_list: + is_bf16_op = True + if is_fp32_op: + fp32_op_set.add(op) + elif is_bf16_op: + bf16_op_set.add(op) + else: + pass + else: + # For numerical safe, we apply fp32 computation on ops that + # are not determined which list they should stay. + fp32_op_set.add(op) + + idx = 0 + while idx < len(ops): + op = ops[idx] + num_cast_ops = 0 + if op in fp32_op_set: + num_cast_ops = _insert_cast_op(block, op, idx, + core.VarDesc.VarType.BF16, + core.VarDesc.VarType.FP32) + elif op in bf16_op_set: + if op.has_attr('use_mkldnn'): + op._set_attr('use_mkldnn', True) + op._set_attr('mkldnn_data_type', 'bfloat16') + elif op.has_attr('dtype') and op.attr( + 'dtype') == core.VarDesc.VarType.FP32: + op._set_attr('dtype', core.VarDesc.VarType.BF16) + + num_cast_ops = _insert_cast_op(block, op, idx, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.BF16) + else: + pass + + idx += num_cast_ops + 1 diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index c88ae2d9cbf..6a524af4ee2 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -69,7 +69,7 @@ class AutoMixedPrecisionLists(object): self.unsupported_list.add(op_name) -# The three sets listed below are changed dynamiclly. They don't contain all +# The three sets listed below are changed dynamiclly. They don't contain all # paddle ops currently. # The set of ops that support fp16 calculation and are considered numerically- diff --git a/python/paddle/fluid/contrib/tests/test_bf16_utils.py b/python/paddle/fluid/contrib/tests/test_bf16_utils.py new file mode 100644 index 00000000000..faf2307f814 --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_bf16_utils.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy +import unittest +import paddle.fluid as fluid +import paddle.fluid.contrib.mixed_precision as amp +from paddle.fluid import core +import paddle + +paddle.enable_static() + + +class AMPTest(unittest.TestCase): + def setUp(self): + self.bf16_list = copy.copy(amp.bf16.amp_lists.bf16_list) + self.fp32_list = copy.copy(amp.bf16.amp_lists.fp32_list) + self.gray_list = copy.copy(amp.bf16.amp_lists.gray_list) + self.amp_lists_ = None + + def tearDown(self): + self.assertEqual(self.amp_lists_.bf16_list, self.bf16_list) + self.assertEqual(self.amp_lists_.fp32_list, self.fp32_list) + self.assertEqual(self.amp_lists_.gray_list, self.gray_list) + + def test_amp_lists(self): + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16() + + def test_amp_lists_1(self): + # 1. w={'exp}, b=None + self.bf16_list.add('exp') + self.fp32_list.remove('exp') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'exp'}) + + def test_amp_lists_2(self): + # 2. w={'tanh'}, b=None + self.fp32_list.remove('tanh') + self.bf16_list.add('tanh') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'tanh'}) + + def test_amp_lists_3(self): + # 3. w={'lstm'}, b=None + self.bf16_list.add('lstm') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16({'lstm'}) + + def test_amp_lists_4(self): + # 4. w=None, b={'elementwise_add'} + self.bf16_list.remove('elementwise_add') + self.fp32_list.add('elementwise_add') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16( + custom_fp32_list={'elementwise_add'}) + + def test_amp_lists_5(self): + # 5. w=None, b={'elementwise_add'} + self.fp32_list.add('elementwise_add') + self.bf16_list.remove('elementwise_add') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16( + custom_fp32_list={'elementwise_add'}) + + def test_amp_lists_6(self): + # 6. w=None, b={'lstm'} + self.fp32_list.add('lstm') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16( + custom_fp32_list={'lstm'}) + + def test_amp_lists_7(self): + self.fp32_list.add('reshape2') + self.gray_list.remove('reshape2') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16( + custom_fp32_list={'reshape2'}) + + def test_amp_list_8(self): + self.bf16_list.add('reshape2') + self.gray_list.remove('reshape2') + + self.amp_lists_ = amp.AutoMixedPrecisionListsBF16( + custom_bf16_list={'reshape2'}) + + +class AMPTest2(unittest.TestCase): + def test_amp_lists_(self): + # 7. 
w={'lstm'} b={'lstm'} + # raise ValueError + self.assertRaises(ValueError, amp.AutoMixedPrecisionListsBF16, + {'lstm'}, {'lstm'}) + + def test_find_op_index(self): + block = fluid.default_main_program().global_block() + op_desc = core.OpDesc() + idx = amp.bf16.amp_utils.find_op_index(block.desc, op_desc) + assert (idx == -1) + + def test_is_in_fp32_varnames(self): + block = fluid.default_main_program().global_block() + + var1 = block.create_var(name="X", shape=[3], dtype='float32') + var2 = block.create_var(name="Y", shape=[3], dtype='float32') + var3 = block.create_var(name="Z", shape=[3], dtype='float32') + op1 = block.append_op( + type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]}) + op2 = block.append_op( + type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]}) + amp_lists_1 = amp.AutoMixedPrecisionListsBF16( + custom_fp32_varnames={'X'}) + assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_1) + amp_lists_2 = amp.AutoMixedPrecisionListsBF16( + custom_fp32_varnames={'Y'}) + assert amp.bf16.amp_utils._is_in_fp32_varnames(op2, amp_lists_2) + assert amp.bf16.amp_utils._is_in_fp32_varnames(op1, amp_lists_2) + + def test_find_true_post_op(self): + + block = fluid.default_main_program().global_block() + + var1 = block.create_var(name="X", shape=[3], dtype='float32') + var2 = block.create_var(name="Y", shape=[3], dtype='float32') + var3 = block.create_var(name="Z", shape=[3], dtype='float32') + op1 = block.append_op( + type="abs", inputs={"X": [var1]}, outputs={"Out": [var2]}) + op2 = block.append_op( + type="abs", inputs={"X": [var2]}, outputs={"Out": [var3]}) + res = amp.bf16.amp_utils.find_true_post_op(block.ops, op1, "Y") + assert (res == [op2]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py new file mode 100644 index 00000000000..40ddcf2e66b --- /dev/null +++ b/python/paddle/fluid/contrib/tests/test_model_cast_to_bf16.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +import contextlib +import unittest +import numpy as np +import paddle.fluid.layers as layers +import paddle.static.amp as amp +from paddle.fluid import core + +paddle.enable_static() + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestModelCastBF16(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.seed = 111 + + @classmethod + def tearDownClass(cls): + pass + + @contextlib.contextmanager + def static_graph(self): + with self.scope_prog_guard(): + paddle.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + yield + + @contextlib.contextmanager + def scope_prog_guard(self): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + def get_static_graph_result(self, feed, fetch_list, amp_fun, + with_lod=False): + exe = fluid.Executor(core.CPUPlace()) + exe.run(fluid.default_startup_program()) + prog = fluid.default_main_program() + if amp_fun is not None: + amp_fun(prog) + return exe.run(prog, + feed=feed, + fetch_list=fetch_list, + return_numpy=(not with_lod)) + + def test_graph_rewrite(self): + size = 3 + n = np.ones([size, size], dtype='float32') * 3.2 + nn = np.ones([size, size], dtype='float32') * -2.7 + + n_bf16 = amp.convert_float_to_uint16(n) + nn_bf16 = amp.convert_float_to_uint16(nn) + + with self.static_graph(): + t_bf16 = layers.data( + name='t_bf16', shape=[size, size], dtype=np.uint16) + tt_bf16 = layers.data( + name='tt_bf16', shape=[size, size], dtype=np.uint16) + t = layers.data(name='t', shape=[size, size], dtype='float32') + tt = layers.data(name='tt', shape=[size, size], dtype='float32') + + ret = layers.elementwise_add(t, tt) + ret = layers.elementwise_mul(ret, t) + ret = layers.reshape(ret, [0, 0]) + + with amp.bf16_guard(): + ret_bf16 = layers.elementwise_add(t_bf16, tt_bf16) + ret_bf16 = layers.elementwise_mul(ret_bf16, t_bf16) + ret_bf16 = layers.reshape(ret_bf16, [0, 0]) + + with amp.bf16_guard(): + ret_fp32bf16 = layers.elementwise_add(t, tt) + ret_fp32bf16 = layers.elementwise_mul(ret_fp32bf16, t) + ret_fp32bf16 = layers.reshape(ret_fp32bf16, [0, 0]) + + static_ret_bf16, static_ret, ret_fp32bf16 = self.get_static_graph_result( + feed={ + 't': n, + 'tt': nn, + 't_bf16': n_bf16, + 'tt_bf16': nn_bf16, + }, + fetch_list=[ret_bf16, ret, ret_fp32bf16], + amp_fun=lambda prog: amp.rewrite_program_bf16(prog, use_bf16_guard=True)) + + self.assertTrue(np.allclose(static_ret_bf16, static_ret, 1e-2)) + self.assertTrue(np.allclose(static_ret_bf16, ret_fp32bf16, 1e-2)) + + with self.static_graph(): + t = layers.data(name='t', shape=[size, size], dtype='float32') + tt = layers.data(name='tt', shape=[size, size], dtype='float32') + + with amp.bf16_guard(): + ret = layers.elementwise_add(t, tt) + ret = layers.reshape(ret, [0, 0], act='elu') + ret = layers.elementwise_mul(ret, t) + ret = layers.elementwise_add(ret, tt) + + static_ret_bf16 = \ + self.get_static_graph_result( + feed={'t': n, 'tt': nn}, + fetch_list=[ret], + amp_fun=lambda prog: amp.rewrite_program_bf16( + prog, + amp.AutoMixedPrecisionListsBF16( + custom_fp32_varnames={'elementwise_add_0.tmp_0'}), + use_bf16_guard=True + ) + ) + self.assertTrue( + static_ret_bf16, np.ones( + [size, size], dtype='float32') * -1.1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/data_feeder.py 
b/python/paddle/fluid/data_feeder.py index b2db00296bf..52be7493cf2 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -29,6 +29,7 @@ __all__ = ['DataFeeder'] _PADDLE_DTYPE_2_NUMPY_DTYPE = { core.VarDesc.VarType.BOOL: 'bool', core.VarDesc.VarType.FP16: 'float16', + core.VarDesc.VarType.BF16: 'uint16', core.VarDesc.VarType.FP32: 'float32', core.VarDesc.VarType.FP64: 'float64', core.VarDesc.VarType.INT8: 'int8', @@ -47,16 +48,18 @@ def convert_dtype(dtype): return _PADDLE_DTYPE_2_NUMPY_DTYPE[dtype] elif isinstance(dtype, type): if dtype in [ - np.bool, np.float16, np.float32, np.float64, np.int8, np.int16, - np.int32, np.int64, np.uint8, np.complex64, np.complex128 + np.bool, np.float16, np.uint16, np.float32, np.float64, np.int8, + np.int16, np.int32, np.int64, np.uint8, np.complex64, + np.complex128 ]: return dtype.__name__ else: if dtype in [ - 'bool', 'float16', 'float32', 'float64', 'int8', 'int16', - 'int32', 'int64', 'uint8', 'complex64', 'complex128', u'bool', - u'float16', u'float32', u'float64', u'int8', u'int16', u'int32', - u'int64', u'uint8', u'complex64', u'complex128' + 'bool', 'float16', 'uint16', 'float32', 'float64', 'int8', + 'int16', 'int32', 'int64', 'uint8', 'complex64', 'complex128', + u'bool', u'float16', u'uint16', u'float32', u'float64', u'int8', + u'int16', u'int32', u'int64', u'uint8', u'complex64', + u'complex128' ]: # this code is a little bit dangerous, since error could happen # when casting no-ascii code to str in python2. @@ -66,7 +69,7 @@ def convert_dtype(dtype): return str(dtype) raise TypeError( - "dtype must be any of [bool, float16, float32, float64, int8, int16, " + "dtype must be any of [bool, float16, uint16, float32, float64, int8, int16, " "int32, int64, uint8, complex64, complex128], but received %s" % dtype) @@ -123,6 +126,12 @@ def check_dtype(input_dtype, warnings.warn( "The data type of '%s' in %s only support float16 in GPU now. %s" % (input_name, op_name, extra_message)) + if convert_dtype(input_dtype) in ['uint16'] and op_name not in [ + 'reshape', 'lookup_table', 'scale' + ]: + warnings.warn( + "The data type of '%s' in %s only support bfloat16 in OneDNN now. %s" + % (input_name, op_name, extra_message)) if convert_dtype(input_dtype) not in expected_dtype: raise TypeError( "The data type of '%s' in %s must be %s, but received %s. 
%s" % diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fa8df14c866..00d1db19fc2 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6137,9 +6137,9 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): return dygraph_utils._append_activation_in_dygraph(out, act) - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', - 'bool'], 'reshape') + check_variable_and_dtype(x, 'x', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'bool', 'uint16' + ], 'reshape') check_type(shape, 'shape', (list, tuple, Variable), 'reshape') check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') @@ -11354,9 +11354,11 @@ def _elementwise_op(helper): assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type) + x, 'x', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type) check_variable_and_dtype( - y, 'y', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type) + y, 'y', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type) axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) @@ -11428,8 +11430,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): return dygraph_utils._append_activation_in_dygraph(out) check_variable_and_dtype(x, "x", [ - 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', 'int64', - 'uint8' + 'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32', + 'int64', 'uint8' ], "scale") inputs = {'X': [x]} attrs = { diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 9a2cc4ab1a1..df43d9366ff 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -26,7 +26,7 @@ import os paddle.enable_static() -def train(use_cuda, save_dirname, is_local): +def train(use_cuda, save_dirname, is_local, use_bf16): x = fluid.layers.data(name='x', shape=[13], dtype='float32') y_predict = fluid.layers.fc(input=x, size=1, act=None) @@ -37,6 +37,8 @@ def train(use_cuda, save_dirname, is_local): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + if use_bf16: + paddle.static.amp.rewrite_program_bf16(fluid.default_main_program()) sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 @@ -133,14 +135,17 @@ def infer(use_cuda, save_dirname=None): print("ground truth: ", test_label) -def main(use_cuda, is_local=True): +def main(use_cuda, is_local=True, use_bf16=False): if use_cuda and not fluid.core.is_compiled_with_cuda(): return + if use_bf16 and not fluid.core.is_compiled_with_mkldnn(): + return + # Directory for saving the trained model save_dirname = "fit_a_line.inference.model" - train(use_cuda, save_dirname, is_local) + train(use_cuda, save_dirname, is_local, use_bf16) infer(use_cuda, save_dirname) @@ -153,6 +158,12 @@ class TestFitALine(unittest.TestCase): with self.program_scope_guard(): main(use_cuda=True) + @unittest.skipIf(not fluid.core.supports_bfloat16(), + "place does not support BF16 evaluation") + def test_bf16(self): + with self.program_scope_guard(): + main(use_cuda=False, use_bf16=True) + @contextlib.contextmanager def program_scope_guard(self): prog = fluid.Program() diff --git 
a/python/paddle/fluid/tests/book/test_word2vec_book.py b/python/paddle/fluid/tests/book/test_word2vec_book.py index e33b1cc514a..ad7550fa9dd 100644 --- a/python/paddle/fluid/tests/book/test_word2vec_book.py +++ b/python/paddle/fluid/tests/book/test_word2vec_book.py @@ -39,7 +39,12 @@ def get_place(target): format(target)) -def train(target, is_sparse, is_parallel, save_dirname, is_local=True): +def train(target, + is_sparse, + is_parallel, + save_dirname, + is_local=True, + use_bf16=False): PASS_NUM = 100 EMBED_SIZE = 32 HIDDEN_SIZE = 256 @@ -101,6 +106,8 @@ def train(target, is_sparse, is_parallel, save_dirname, is_local=True): raise NotImplementedError() sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + if use_bf16: + paddle.static.amp.rewrite_program_bf16(fluid.default_main_program()) sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( @@ -239,12 +246,15 @@ def infer(target, save_dirname=None): assert np.isclose(a, b, rtol=5e-5), "a: {}, b: {}".format(a, b) -def main(target, is_sparse, is_parallel): +def main(target, is_sparse, is_parallel, use_bf16): if target == "cuda" and not fluid.core.is_compiled_with_cuda(): return if target == "xpu" and not fluid.core.is_compiled_with_xpu(): return + if use_bf16 and not fluid.core.is_compiled_with_mkldnn(): + return + if not is_parallel: save_dirname = "word2vec.inference.model" else: @@ -255,7 +265,7 @@ def main(target, is_sparse, is_parallel): # so only inference is turned on. train("cpu", is_sparse, is_parallel, save_dirname) else: - train(target, is_sparse, is_parallel, save_dirname) + train(target, is_sparse, is_parallel, save_dirname, use_bf16=use_bf16) infer(target, save_dirname) @@ -268,10 +278,11 @@ class W2VTest(unittest.TestCase): pass -def inject_test_method(target, is_sparse, is_parallel): - fn_name = "test_{0}_{1}_{2}".format(target, "sparse" - if is_sparse else "dense", "parallel" - if is_parallel else "normal") +def inject_test_method(target, is_sparse, is_parallel, use_bf16=False): + fn_name = "test_{0}_{1}_{2}{3}".format(target, "sparse" + if is_sparse else "dense", "parallel" + if is_parallel else "normal", "_bf16" + if use_bf16 else "") def __impl__(*args, **kwargs): prog = fluid.Program() @@ -279,8 +290,7 @@ def inject_test_method(target, is_sparse, is_parallel): scope = fluid.core.Scope() with fluid.scope_guard(scope): with fluid.program_guard(prog, startup_prog): - main( - target=target, is_sparse=is_sparse, is_parallel=is_parallel) + main(target, is_sparse, is_parallel, use_bf16) if (not fluid.core.is_compiled_with_cuda() or target == "cuda") and is_sparse: @@ -297,6 +307,7 @@ for target in ("cuda", "cpu", "xpu"): for is_sparse in (False, True): for is_parallel in (False, ): inject_test_method(target, is_sparse, is_parallel) +inject_test_method("cpu", False, False, use_bf16=True) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 939e2ac0f59..dff96a8cbc3 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -244,17 +244,12 @@ def convert_float_to_uint16(float_list, data_format="NCHW"): return new_output -def copy_bits_from_uint16_to_float(i): - i = np.uint32(i) << 16 - return struct.unpack(' Date: Mon, 22 Mar 2021 14:49:50 +0800 Subject: [PATCH 052/486] [Paddle-TRT] nearest_interp op (#31626) * nearest_interp op converter w/ dynamic/static * fix data_layout include * add trt nearest unit_test * add nearest_interp NHWC test * update trt 
nearest interp nhwc testcase * remove asterisk for python2 compatibility * add empty line to prevent conflict * nearest_interp op converter w/ dynamic/static * fix data_layout include * add trt nearest unit_test * add nearest_interp NHWC test * update trt nearest interp nhwc testcase * remove asterisk for python2 compatibility * add empty line to prevent conflict * change the priority of out_h, out_w --- .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/tensorrt/convert/CMakeLists.txt | 2 + .../tensorrt/convert/nearest_interp_op.cc | 114 +++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 22 ++ .../inference/test_trt_nearest_interp_op.py | 192 ++++++++++++++++++ 5 files changed, 332 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fc436311f07..8f2b217a2fd 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,8 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); + +USE_TRT_CONVERTER(nearest_interp); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 59205529ef4..b0d0229ec05 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,8 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + + nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc new file mode 100644 index 00000000000..e91a2ee13f4 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -0,0 +1,114 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (scale > 0.f && (out_h <= 0 && out_w <= 0)) { + scale_h = scale; + scale_w = scale; + } else { + // axis are different in static/dynamic mode + PADDLE_ENFORCE_GT( + out_h, 0, platform::errors::InvalidArgument( + "out_h must be greater than 0 if scale is not set.")); + PADDLE_ENFORCE_GT( + out_w, 0, platform::errors::InvalidArgument( + "out_w must be greater than 0 if scale is not set.")); + + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp, NearestInterpolateOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 44939606b49..2ec94f5f98c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/data_layout.h" namespace paddle { namespace framework { @@ -110,6 +111,8 @@ struct 
SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + + "nearest_interp", }; }; @@ -187,10 +190,29 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis != 1) return false; } } + if (op_type == "gather") { // current not support axis from input, use default 0 if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + + if (op_type == "nearest_interp") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py new file mode 100644 index 00000000000..1a58a6c9dda --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_nearest_interp_op.py @@ -0,0 +1,192 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
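+# These unit tests build a small resize_nearest + batch_norm program, feed random NCHW/NHWC data,
+# and compare the TensorRT subgraph output against the native GPU output via InferencePassTest.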
+ +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTNearestInterpTest(InferencePassTest): + def setUp(self): + self.set_params() + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [ + -1, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + -1, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + data = fluid.data(name='data', shape=shape, dtype='float32') + resize_out = self.append_nearest_interp(data) + out = fluid.layers.batch_norm(resize_out, is_test=True) + + if self.data_layout == 'NCHW': + shape = [ + self.bs, self.channels, self.origin_shape[0], + self.origin_shape[1] + ] + else: + shape = [ + self.bs, self.origin_shape[0], self.origin_shape[1], + self.channels + ] + + self.feeds = {'data': np.random.random(shape).astype('float32'), } + self.enable_trt = True + self.trt_parameters = TRTNearestInterpTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.bs = 4 + self.scale = 1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = True + self.data_layout = 'NCHW' + + def append_nearest_interp(self, data): + if self.scale > 0.: + return fluid.layers.resize_nearest( + data, + scale=self.scale, + align_corners=self.align_corners, + data_format=self.data_layout) + return fluid.layers.resize_nearest( + data, + out_shape=self.resize_shape, + align_corners=self.align_corners, + data_format=self.data_layout) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TRTNearestInterpTest1(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = True + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest2(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest3(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest4(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NCHW' + + +class TRTNearestInterpTest5(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = True + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest6(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = 2. 
+ self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest7(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (64, 64) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest8(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +class TRTNearestInterpTest9(TRTNearestInterpTest): + def set_params(self): + self.bs = 4 + self.scale = -1 + self.channels = 3 + self.origin_shape = (32, 32) # HW + self.resize_shape = (47, 48) # HW + self.align_corners = False + self.data_layout = 'NHWC' + + +if __name__ == "__main__": + unittest.main() -- GitLab From 032de0bfd0759d5aa7ae7444025a70a887f2d891 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Tue, 23 Mar 2021 11:10:44 +0800 Subject: [PATCH 053/486] update approval (#31782) --- tools/check_api_approvals.sh | 4 +-- tools/check_file_diff_approvals.sh | 44 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index 1db3f6d3d27..4e8ea257154 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -61,8 +61,8 @@ DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_gr PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC=${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_maker_PR.spec ADDED_OP_USE_DEFAULT_GRAD_MAKER=`python ${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${DEV_OP_USE_DEFAULT_GRAD_MAKER_SPEC} ${PR_OP_USE_DEFAULT_GRAD_MAKER_SPEC}` if [ "${ADDED_OP_USE_DEFAULT_GRAD_MAKER}" != "" ]; then - echo_line="You must have one RD (sneaxiy (Recommend) or luotao1) approval because you use DefaultGradOpMaker for ${ADDED_OP_USE_DEFAULT_GRAD_MAKER}, which manages the grad_op memory optimization.\n" - check_approval 1 32832641 6836917 + echo_line="You must have one RD (zhiqiu (Recommend) or zhhsplendid) approval because you use DefaultGradOpMaker for ${ADDED_OP_USE_DEFAULT_GRAD_MAKER}, which manages the grad_op memory optimization.\n" + check_approval 1 6888866 7913861 fi if [ -n "${echo_list}" ];then diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index fd3175a5729..f3bf3ea508b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -92,11 +92,11 @@ for API_FILE in ${API_FILES[*]}; do # You can use http://caius.github.io/github_id/ to find Github user id. # approval_user_list: XiaoguangHu01 46782768,Xreki 12538138,luotao1 6836917,qingqing01 7845005,guoshengCS 14105589,heavengate 12605721,kuke 3064195,Superjomn 328693,lanxianghit 47554610,cyj1986 39645414,hutuxian 11195205,frankwhzhang 20274488,nepeplwu 45024560,Dianhai 38231817,chenwhql 22561442,zhiqiu 6888866,seiriosPlus 5442383,gongweibao 10721757,saxon-zh 2870059, zhouwei25 52485244, Aurelius84 9301846, liym27 33742067, zhhsplendid 7913861, kolinwei 22165420, liuwei1031 46661762, swtkiwi 27208573, juncaipeng 52520497, zhangting2020 26615455, Shixiaowei02 39303645, Heeenrrry 28379894,XieYunshen 32428676, Dong Daxiang 35550832, phlrain 43953930. 
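  # Each branch below maps a protected file to the set of GitHub user ids that may approve changes to it.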
if [ "${API_FILE}" == "CMakeLists.txt" ];then - echo_line="You must have one RD (luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n" - check_approval 1 6836917 46782768 + echo_line="You must have one RD (wanghuancoder, luotao1 or XiaoguangHu01) approval for CMakeLists.txt, which manages the compilation parameter.\n" + check_approval 1 6836917 46782768 26922892 elif [ "${API_FILE}" == "python/paddle/fluid/__init__.py" ];then - echo_line="You must have one RD (lanxianghit (Recommend) or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n" - check_approval 1 6836917 47554610 + echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for the python/paddle/fluid/init.py, which manages the environment variables.\n" + check_approval 1 6836917 47554610 43953930 elif [ "${API_FILE}" == "python/requirements.txt" ];then echo_line="You must have one RD (phlrain) and one TPM (swtkiwi) and one QA (kolinwei) approval for python/requirements.txt, which manages the third-party python package.\n" check_approval 3 43953930 27208573 22165420 @@ -104,8 +104,8 @@ for API_FILE in ${API_FILES[*]}; do echo_line="You must have one RD (gongweibao or seiriosPlus) approval for the paddle/fluid/operators/distributed/send_recv.proto.in, which manages the environment variables.\n" check_approval 1 10721757 5442383 elif [ "${API_FILE}" == "paddle/fluid/framework/unused_var_check.cc" ];then - echo_line="You must have one RD (zhiqiu (Recommend) or luotao1) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n" - check_approval 1 6888866 6836917 + echo_line="You must have one RD (zhiqiu (Recommend) or chenwhql) approval for the changes of paddle/fluid/framework/unused_var_check.cc, which manages the allow list of operators that have unused input variables. Before change the allow list, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/OP-Should-Not-Have-Unused-Input] and try to refine code first. \n" + check_approval 1 6888866 22561442 elif [ "${API_FILE}" == "paddle/fluid/pybind/op_function_generator.cc" ];then echo_line="You must have one RD (zhiqiu (Recommend) , phlrain) approval for the changes of paddle/fluid/pybind/op_function_generator.cc, which manages the logic of automatic generating op functions for dygraph. \n" check_approval 1 6888866 43953930 @@ -122,14 +122,14 @@ for API_FILE in ${API_FILES[*]}; do echo_line="You must have one RD (cryoco (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py, which manages the white list of setting no_check_set of check_output. \n" check_approval 1 12407750 6836917 43953930 elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_instance_0_input_white_list.py" ]; then - echo_line="You must have one RD (luotao1, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. 
\n" - check_approval 1 6836917 43953930 + echo_line="You must have one RD (luotao1, lanxianghit, phlrain) approval for the ${API_FILE}, which manages the white list of instance size 0 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-instance_size=0-in-sequence-OP-test]. \n" + check_approval 1 6836917 43953930 47554610 elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py" ];then echo_line="It is an Op accuracy problem, please take care of it. You must have one RD (juncaipeng (Recommend), zhangting2020 or luotao1) approval for the python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py, which manages the white list of error threshold for op test with float64 precision. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Upgrade-OP-Precision-to-Float64. \n" check_approval 1 52520497 26615455 6836917 elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" ];then - echo_line="You must have one RD (luotao1 or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n" - check_approval 1 6836917 43953930 + echo_line="You must have one RD (luotao1, lanxianghit or phlrain) approval for ${API_FILE}, which manages the white list of batch size 1 input for sequence op test. For more information, please refer to [https://github.com/PaddlePaddle/Paddle/wiki/It-is-required-to-include-LoDTensor-input-with-batch_size=1-in-sequence-OP-test]. \n" + check_approval 1 6836917 43953930 47554610 elif [ "${API_FILE}" == "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" ];then echo_line="You must have one RD (Shixiaowei02 (Recommend), luotao1 or phlrain) approval for the python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py, which manages the white list of no_grad_set without value in operators. For more information, please refer to[https://github.com/PaddlePaddle/Paddle/wiki/It's-recommend-to-set-no_grad_set-to-be-None].\n" check_approval 1 39303645 6836917 43953930 @@ -143,17 +143,17 @@ for API_FILE in ${API_FILES[*]}; do echo_line="You must have (fuyinno4 (Recommend), raindrops2sea) approval for ${API_FILE} changes" check_approval 1 35824027 38231817 elif [ "${API_FILE}" == "paddle/scripts/paddle_build.bat" ] || [ "${API_FILE}" == "tools/windows/run_unittests.sh" ]; then - echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages the Paddle CI task on Windows.\n" - check_approval 1 52485244 6836917 + echo_line="You must have one RD (zhouwei25 (Recommend), wanghuancoder, luotao1) approval for ${API_FILE} changes, which manages the Paddle CI task on Windows.\n" + check_approval 1 52485244 6836917 26922892 elif [ "${API_FILE}" == "tools/parallel_UT_rule.py" ]; then - echo_line="You must have one RD (zhouwei25 (Recommend), luotao1) approval for ${API_FILE} changes, which manages the rule of running unittest with a same GPU. 
If the unittest failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, you can remove it from ${API_FILE}.\n" - check_approval 1 52485244 6836917 + echo_line="You must have one RD (zhouwei25 (Recommend), wanghuancoder, luotao1) approval for ${API_FILE} changes, which manages the rule of running unittest with a same GPU. If the unittest failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, you can remove it from ${API_FILE}.\n" + check_approval 1 52485244 6836917 26922892 elif [ "${API_FILE}" == "python/paddle/fluid/parallel_executor.py" ]; then echo_line="You must have one RD (Xreki,luotao1,zhhsplendid) approval for ${API_FILE}, which manages the underlying code for PaddlePaddle.\n" check_approval 1 12538138 6836917 7913861 else - echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n" - check_approval 1 46782768 12538138 6836917 + echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1) approval for ${API_FILE}, which manages the underlying code for fluid.\n" + check_approval 1 46782768 12538138 6836917 22561442 6888866 fi fi done @@ -161,8 +161,8 @@ done FILTER=`git diff --name-only upstream/develop | grep -v "tools/"` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH $FILTER | grep '^\+' | grep -o -m 1 "const_cast" || true` if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must have one RD (XiaoguangHu01,Xreki,luotao1) approval for the usage of const_cast.\n" - check_approval 1 46782768 12538138 6836917 + echo_line="You must have one RD (XiaoguangHu01,chenwhql,zhiqiu,Xreki,luotao1) approval for the usage of const_cast.\n" + check_approval 1 46782768 12538138 6836917 22561442 6888866 fi HAS_BOOST_GET=`git diff -U0 upstream/$BRANCH $FILTER |grep "^+" |grep -o -m 1 "boost::get" || true` @@ -185,14 +185,14 @@ fi HAS_UNITTEST_SKIP=`git diff -U0 upstream/$BRANCH | grep "^+[[:space:]]\{0,\}@unittest.skip" || true` if [ "${HAS_UNITTEST_SKIP}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" - check_approval 1 22165420 6836917 46661762 + echo_line="Unittest is not allowed to be disabled.\nYou must have one RD (kolinwei(Recommend), wanghuancoder or luotao1) approval for the usage of @unittest.skip or @unittest.skipIf.\n${HAS_UNITTEST_SKIP}\n" + check_approval 1 22165420 6836917 46661762 26922892 fi HAS_MODIFIED_DEMO_CMAKE=`git diff --name-only upstream/$BRANCH | grep "paddle/fluid/inference/api/demo_ci/CMakeLists.txt" || true` if [ "${HAS_MODIFIED_DEMO_CMAKE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You must have one RD (Superjomn (Recommend), luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n" - check_approval 1 328693 6836917 + echo_line="You must have one RD (Superjomn (Recommend), Shixiaowei02, luotao1) approval for paddle/fluid/inference/api/demo_ci/CMakeLists.txt.\nwhich manages the compilation parameter of inference demo\n" + check_approval 1 328693 6836917 39303645 fi ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` -- GitLab From f72d197ec5a5f1a3314a52bbcd4106e575137ac6 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 23 Mar 2021 11:30:50 +0800 Subject: [PATCH 054/486] fix launch ps 
ut test=develop (#31771) fix launch ps ut test=develop --- .../tests/unittests/test_fleet_launch_ps.sh | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh index 67a8d7e5750..0f28be614c0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ps.sh @@ -16,18 +16,24 @@ set -e -server_port_0=${PADDLE_DIST_UT_PORT} -server_port_1=$(( PADDLE_DIST_UT_PORT + 1 )) -worker_port_0=$(( PADDLE_DIST_UT_PORT + 2 )) -worker_port_1=$(( PADDLE_DIST_UT_PORT + 3 )) -heter_worker_port_0=$(( PADDLE_DIST_UT_PORT + 4 )) -heter_worker_port_1=$(( PADDLE_DIST_UT_PORT + 5 )) +server_port_00=${PADDLE_DIST_UT_PORT} +server_port_10=$(( PADDLE_DIST_UT_PORT + 1 )) +worker_port_00=$(( PADDLE_DIST_UT_PORT + 2 )) +worker_port_10=$(( PADDLE_DIST_UT_PORT + 3 )) + +server_port_01=$(( PADDLE_DIST_UT_PORT + 4 )) +server_port_11=$(( PADDLE_DIST_UT_PORT + 5 )) +worker_port_01=$(( PADDLE_DIST_UT_PORT + 6 )) +worker_port_11=$(( PADDLE_DIST_UT_PORT + 7 )) + +heter_worker_port_0=$(( PADDLE_DIST_UT_PORT + 8 )) +heter_worker_port_1=$(( PADDLE_DIST_UT_PORT + 9 )) function test_launch_ps(){ python -m paddle.distributed.fleet.launch \ - --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \ - --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \ + --servers="127.0.0.1:${server_port_00},127.0.0.1:${server_port_10}" \ + --workers="127.0.0.1:${worker_port_00},127.0.0.1:${worker_port_10}" \ fleet_ps_training.py 2> ut.elog if grep -q "server are killed" ut.elog; then echo "test pserver launch succeed" @@ -39,8 +45,8 @@ function test_launch_ps(){ function test_launch_ps_heter(){ python -m paddle.distributed.fleet.launch \ - --servers="127.0.0.1:${server_port_0},127.0.0.1:${server_port_1}" \ - --workers="127.0.0.1:${worker_port_0},127.0.0.1:${worker_port_1}" \ + --servers="127.0.0.1:${server_port_01},127.0.0.1:${server_port_11}" \ + --workers="127.0.0.1:${worker_port_01},127.0.0.1:${worker_port_11}" \ --heter_workers="127.0.0.1:${heter_worker_port_0},127.0.0.1:${heter_worker_port_1}" \ fleet_ps_training.py 2> ut.elog if grep -q "server are killed" ut.elog; then -- GitLab From 46dd1d4aadedf77c3eaec2eb5eba04faabd448d2 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 23 Mar 2021 12:21:30 +0800 Subject: [PATCH 055/486] [ROCM] fix reduce_sum nan in ROCM platform, test=develop (#31780) --- paddle/fluid/operators/reduce_ops/cub_reduce.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/reduce_ops/cub_reduce.h b/paddle/fluid/operators/reduce_ops/cub_reduce.h index 39cce60faf3..29e46e091d0 100644 --- a/paddle/fluid/operators/reduce_ops/cub_reduce.h +++ b/paddle/fluid/operators/reduce_ops/cub_reduce.h @@ -161,7 +161,11 @@ static inline std::vector GetStrides(const std::vector& dims, return strides; } +#ifdef __HIPCC__ +constexpr int kMaxBlockDim = 256; +#else constexpr int kMaxBlockDim = 512; +#endif static inline int GetDesiredBlockDim(int block_dim) { return block_dim >= kMaxBlockDim -- GitLab From 9d04ef73692f38247e68e121a44bd34f9f28652c Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Tue, 23 Mar 2021 14:00:22 +0800 Subject: [PATCH 056/486] fix tensorrt output varible reshape (#31733) * fix tensorrt output varible reshape * move padding shape x 1 x 1 in ernie to qkv and fc * update layer name * fix softmax when input is dynamic, fc not padding any more * fix varlen * 
move fc x_dim assert to op_teller --- .../ir_passes/tensorrt_subgraph_pass.cc | 8 ++- .../fluid/inference/tensorrt/convert/fc_op.cc | 70 +++++++++++++++++-- .../tensorrt/convert/multihead_matmul_op.cc | 43 +++++++++--- .../inference/tensorrt/convert/softmax_op.cc | 13 ++-- paddle/fluid/inference/tensorrt/op_teller.cc | 12 +++- .../plugin/emb_eltwise_layernorm_plugin.cu | 4 +- .../tensorrt/plugin/qkv_to_context_plugin.cu | 4 +- .../plugin/skip_layernorm_op_plugin.cu | 5 -- .../tensorrt/plugin/special_slice_plugin.cu | 2 + 9 files changed, 125 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 59ed09b96cc..60de4234b41 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -168,11 +168,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::set output_names; std::set output_names_with_id; - std::vector origin_output_dims; + std::map origin_name_output_dims; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); - origin_output_dims.push_back(x->Var()->GetShape().size()); + origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); } std::unordered_map output_name_map; @@ -216,11 +216,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // output_mapping help us copy the data from the renamed ITensor // to Tensor. std::vector output_mapping; + std::vector renamed_output_dims; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); + renamed_output_dims.push_back(origin_name_output_dims[name]); } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, platform::errors::PreconditionNotMet( @@ -243,7 +245,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); - op_desc->SetAttr("origin_output_dims", origin_output_dims); + op_desc->SetAttr("origin_output_dims", renamed_output_dims); op_desc->SetAttr("parameters", params); // we record all inputs' shapes in attr to check if they are consistent diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 41fbbb557d6..527d0ee2085 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -144,7 +144,69 @@ class FcOpConverter : public OpConverter { static_cast(bias_num)}; if (engine_->with_dynamic_shape()) { - regist_fc(X, n_output, weight, bias); + // not NCHW layout, but NLP layout with added 'x 1 x 1' + auto x_dim = X->getDimensions(); + if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; + for (int i = 0; i < x_dim.nbDims; i++) { + reshape_before_fc_dim.d[i] = 0; + } + reshape_before_fc_dim.d[x_dim.nbDims] = 1; + reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + 
("shuffle_before_fc(Output: " + output_name + ")").c_str()); + + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + if (x_dim.nbDims == 3) { + if (x_num_col_dims == 2) { + reshape_after_fc_dim.nbDims = 3; + reshape_after_fc_dim.d[0] = 0; + reshape_after_fc_dim.d[1] = 0; + reshape_after_fc_dim.d[2] = 0; + } else { + reshape_after_fc_dim.nbDims = 2; + reshape_after_fc_dim.d[0] = 0; + auto dim = fc_layer->getOutput(0)->getDimensions(); + reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; + } + // x_dim.nbDims == 2 + } else { + reshape_after_fc_dim.nbDims = 2; + reshape_after_fc_dim.d[0] = 0; + reshape_after_fc_dim.d[1] = 0; + } + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); + } else { + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, test_mode); + } + } else { + regist_fc(X, n_output, weight, bias); + } return; } // in order to handle situations in NLP models(input dims < 3, @@ -154,12 +216,6 @@ class FcOpConverter : public OpConverter { auto input_d = X->getDimensions().d; int reshape_dim3[3] = {0}; int reshape_dim4[4] = {0}; - PADDLE_ENFORCE_EQ( - x_num_col_dims == 1 || x_num_col_dims == 2, true, - platform::errors::InvalidArgument( - "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter " - "expects x_num_col_dims is either 1 or 2, but got %d", - x_num_col_dims)); PADDLE_ENFORCE_LE(x_num_col_dims, input_dims, platform::errors::InvalidArgument( "Params and input dims mismatch. Paddle-TRT FC " diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index ee04fd372c4..8ce46a19d4b 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See +the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" @@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "network structure"; framework::OpDesc op_desc(op, nullptr); // Declare inputs - // Shouble be a 5 dims tensor. 
auto* input = engine_->GetITensor(op_desc.Input("Input").front()); // fc weights and fc bias @@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter { int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number")); nvinfer1::ILayer* layer = nullptr; + auto output_name = op_desc.Output("Out")[0]; if (engine_->with_dynamic_shape()) { if (engine_->use_oss()) { @@ -171,6 +171,12 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin_inputs.data(), plugin_inputs.size(), *plugin); layer = plugin_layer; } else { + PADDLE_ENFORCE_EQ( + input->getDimensions().nbDims, 3, + platform::errors::InvalidArgument( + "The Input dim of the MultiheadMatMul should be 3, " + "but it's (%d) now.", + input->getDimensions().nbDims)); // transpose weight_data from m * n to n * m auto* input_bias_qk = engine_->GetITensor(op_desc.Input("BiasQK").front()); @@ -184,15 +190,37 @@ class MultiheadMatMulOpConverter : public OpConverter { static_cast(bias_data), static_cast(bias_t->numel())}; - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, - n, weight.get(), bias.get()); - auto* fc_out = fc_layer->getOutput(0); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + reshape_before_fc_dim.nbDims = 5; + reshape_before_fc_dim.d[0] = 0; + reshape_before_fc_dim.d[1] = 0; + reshape_before_fc_dim.d[2] = 0; + reshape_before_fc_dim.d[3] = 1; + reshape_before_fc_dim.d[4] = 1; + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_multihead_mamul(Output: " + output_name + ")") + .c_str()); + + // add layer fc + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n, + weight.get(), bias.get()); + fc_layer->setName( + ("multihead_mamul_fc(Output: " + output_name + ")").c_str()); + + // no need to add shuffle after fc, just change it in + // QkvToContextPluginDynamic + // add qkv to context int head_size = hidden_out / head_number; float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); std::vector plugin_inputs; - plugin_inputs.push_back(fc_out); + plugin_inputs.push_back(fc_layer->getOutput(0)); plugin_inputs.push_back(input_bias_qk); bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); @@ -208,7 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter { "You can use the config.SetTRTDynamicShapeInfo(...) interface to set " "the shape information to run the dynamic shape mode.")); } - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name}, test_mode); #else diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 79992065a22..9cefb24751e 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter { uint32_t axes = std::max(0, input_dims - 3); // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers // support Nd. + // Tips: Dynammic shape alreay fixes. 
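+    // In static-shape (implicit batch) mode the TRT dims exclude the batch axis, so a non-negative
+    // Paddle axis k maps to TRT axis k - 1, while a negative axis wraps by the padding-adjusted rank;
+    // in dynamic-shape mode the axes line up directly. E.g. axis=1 on an NCHW input selects the TRT
+    // axes bit (1 << 0) in static mode and (1 << 1) in dynamic mode.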
int padded_dims = 0; int explicit_batch = 0; if (engine_->with_dynamic_shape()) explicit_batch = 1; @@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter { } } if (!engine_->with_dynamic_shape()) { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis - padded_dims; } else { - axes = axis; + axes = axis - 1; } } else { - if (axis == -1) { - axes = input_dims - 1 - padded_dims; + if (axis < 0) { + axes = input_dims + axis; } else { - axes = axis + 1; + axes = axis; } } layer->setAxes(1 << axes); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 2ec94f5f98c..11752d71a45 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -195,7 +195,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, // current not support axis from input, use default 0 if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } - + if (op_type == "fc" || op_type == "mul") { + const int x_num_col_dims = + desc.HasAttr("x_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) + : (desc.HasAttr("in_num_col_dims") + ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) + : 1); + if (x_num_col_dims != 1 && x_num_col_dims != 2) { + return false; + } + } if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 238daa4a886..6d3872aaeb8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( "but it's (%d)", output_index)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(hidden_size_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index 1e7c83f4c60..a5fc9e73c5f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( "it has (%d) inputs", nb_inputs)); nvinfer1::DimsExprs ret; - ret.nbDims = 5; + ret.nbDims = 3; ret.d[0] = inputs[0].d[0]; ret.d[1] = inputs[0].d[1]; ret.d[2] = expr_builder.constant(head_size_ * head_number_); - ret.d[3] = expr_builder.constant(1); - ret.d[4] = expr_builder.constant(1); return ret; } diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 3b9eea22199..7be9e3a740a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() { nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { - PADDLE_ENFORCE_EQ( - inputs[0].nbDims, 
5, - platform::errors::InvalidArgument( - "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.", - inputs[0].nbDims)); return inputs[0]; } diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index 250b944652b..fdb14f9ceaf 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( output.d[1] = one; output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, *inputs[1].d[0], *one); + // remove padding 1 + output.nbDims -= 2; return output; } -- GitLab From 513641e153c5e9bb9eae7f4f202c1271251917cf Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 23 Mar 2021 14:32:06 +0800 Subject: [PATCH 057/486] Delete fast_check_nan_inf (#31788) * Delete fast_check_nan_inf * Delete run_fast_nan_inf_debug --- paddle/fluid/framework/operator.cc | 22 -------- python/paddle/fluid/__init__.py | 1 - python/paddle/fluid/debugger.py | 85 ------------------------------ 3 files changed, 108 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 833a28a7579..834cdb422ad 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -47,9 +47,6 @@ DECLARE_bool(benchmark); DECLARE_bool(check_nan_inf); DECLARE_bool(enable_unused_var_check); DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op"); -DEFINE_bool(fast_check_nan_inf, false, - "Fast checking NAN/INF after each operation. It will be a little" - "bit slow, much faster than check_nan_inf"); namespace paddle { namespace framework { @@ -1173,25 +1170,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, #endif } - if (FLAGS_fast_check_nan_inf) { - for (auto& vname : OutputVars(true)) { - // only check inserted vars, - // please see executor.py for details of fast_check_nan_inf - if (vname.rfind("debug_var") == 0) { - VLOG(3) << "debugging nan/inf in var " << vname; - - auto* var = exec_scope.FindVar(vname); - if (var == nullptr) continue; - if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, var->Get()); - } else if (var->IsType()) { - CheckTensorNANOrInf(type_, vname, - var->Get().value()); - } - } - } - } - if (FLAGS_check_nan_inf) { framework::details::CheckOpHasNanOrInf(*this, exec_scope, place); } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 1a88d3512ea..b24da29d0f5 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -175,7 +175,6 @@ def __bootstrap__(): sysstr = platform.system() read_env_flags = [ 'check_nan_inf', - 'fast_check_nan_inf', 'benchmark', 'eager_delete_scope', 'fraction_of_cpu_memory_to_use', diff --git a/python/paddle/fluid/debugger.py b/python/paddle/fluid/debugger.py index 9110b8daf38..75dc14a1d75 100644 --- a/python/paddle/fluid/debugger.py +++ b/python/paddle/fluid/debugger.py @@ -280,88 +280,3 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"): add_op_link_var(opn, var, True) graph(path, show=False) - - -def prepare_fast_nan_inf_debug(_program): - """ - Given a program to run, insert a (reduce) sum op for every var in that program. - Instead of checking all vars originally defined in the program, - only those inserted ops will be checked in the c++ end, to detect if it contains NAN or INF. 
- Thereforce, the speed of nan/inf checking could be improved. - Please set ``FLAGS_fast_check_nan_inf" to open the fast nan/inf feature. - """ - - helper = LayerHelper('reduce_sum', **locals()) - - if _program is None: - _program = default_main_program() - - for _block in _program.blocks: - # fetch vars in the current block - _vars_in_prog = [] - for _var_name in _block.vars: - _vars_in_prog.append((_var_name, _block.vars[_var_name])) - - # append sum_op in the current block - for _var_name, _var in _vars_in_prog: - - try: - - if _var.dtype == -1: - continue - - ## create a var for holding sum output - _output_var = _block.create_var( - name=unique_name.generate("debug_var_" + _var_name), - dtype=_var.dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=True) - - ## create a sum op, input each existing var in the block - _block.append_op( - type='sum', - outputs={'Out': _output_var}, - inputs={'X': [_var]}) - except Exception as e: - pass - - -def run_fast_nan_inf_debug(executor, - program=None, - feed=None, - fetch_list=None, - feed_var_name='feed', - fetch_var_name='fetch', - scope=None, - return_numpy=True, - use_program_cache=False, - dump_core=True): - """ - Run a program by the given executor. Catch the exception of NAN and INF, and save persistables into the dumped core. - """ - - assert (executor is not None) - - try: - output = executor.run(program=program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - scope=scope, - return_numpy=return_numpy, - use_program_cache=use_program_cache) - - return output - - except Exception as e: - - print("catch an exception:") - print(e) - - core_filename = "core" + str(int(random.random() * 10000)) + ".pdckpt" - io.save_persistables( - executor, "./", main_program=program, filename=core_filename) - - print("dumping a core into ./%s" % core_filename) -- GitLab From 814b38e30f0b548c1a9b07ef33165752ed434a72 Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Tue, 23 Mar 2021 07:47:58 +0100 Subject: [PATCH 058/486] update scale collection and propagation algorithm (#31783) --- .../quantization/quant2_int8_mkldnn_pass.py | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index d93a2059bdc..68cc8106c9c 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -62,9 +62,8 @@ class Quant2Int8MkldnnPass(object): self._ops_to_quantize = _ops_to_quantize self._op_ids_to_skip = _op_ids_to_skip if _op_ids_to_skip is not None else set( [-1]) - self._scale_immutable_ops = [ - 'transpose2', 'reshape2', 'pool2d', 'scale' - ] + self._scale_immutable_ops = ['transpose2', 'reshape2', 'pool2d'] + self._scale_ops = ['scale'] self._conv_ops = ['conv2d', 'depthwise_conv2d'] self._pool_ops = ['pool2d'] self._mul_ops = ['mul'] @@ -87,8 +86,8 @@ class Quant2Int8MkldnnPass(object): self._reset_pass_idx_and_group('int8') graph = self._label_skip_quantized_op(graph) graph = self._gather_weight_thresholds_from_fake(graph) - graph = self._gather_output_scales_from_attr(graph) graph = self._gather_input_scales_from_fake(graph) + graph = self._gather_output_scales_from_attr(graph) graph = self._remove_fake_ops(graph) graph = self._dequantize_weights(graph) graph = self._optimize_fp32_graph(graph) @@ 
-160,12 +159,16 @@ class Quant2Int8MkldnnPass(object): op_node.op()._set_attr("skip_quant", True) return graph - def _gather_input_scales_from_fake(self, graph): - def _add_scale_for_vars(var_names, use_unsigned_int, lod_tensor): - scales = self._var_quant_scales - for var_name in var_names: + def _add_scale_for_vars(self, var_names, use_unsigned_int, lod_tensor): + """ + Save quantization scales for variables. Do not overwrite. + """ + scales = self._var_quant_scales + for var_name in var_names: + if var_name not in scales: scales[var_name] = (use_unsigned_int, lod_tensor) + def _gather_input_scales_from_fake(self, graph): # fake_quantize_dequantize_abs_max doesn't have scale value fake_ops = ['fake_quantize_dequantize_moving_average_abs_max'] fake_ops.extend(self._fake_quantize_types) @@ -185,8 +188,8 @@ class Quant2Int8MkldnnPass(object): scale[scale == np.Inf] = 0.0 lod_tensor = self._convert_scale2tensor(scale) use_unsigned_int = False - _add_scale_for_vars([input_name, output_name], use_unsigned_int, - lod_tensor) + self._add_scale_for_vars([input_name, output_name], + use_unsigned_int, lod_tensor) return graph @@ -219,8 +222,8 @@ class Quant2Int8MkldnnPass(object): use_unsigned_int = False for output_name in op.op().outputs(): for out_var_name in op.op().output(output_name): - self._var_quant_scales[out_var_name] = ( - use_unsigned_int, scale_lod_tensor) + self._add_scale_for_vars( + [out_var_name], use_unsigned_int, scale_lod_tensor) return graph @@ -239,24 +242,21 @@ class Quant2Int8MkldnnPass(object): output_name = op.output("Out")[0] tensor_names = [input_name, output_name] - # Scale is not quantized, so if it doesn't have any scales - # to propagate, its tensors won't be added to the waiting list. - if all(name not in self._var_quant_scales for name in tensor_names) \ - and op.name() != 'scale': + if all(name not in self._var_quant_scales + for name in tensor_names): waiting_for_scale.update(tensor_names) continue - - if input_name in self._var_quant_scales: + elif input_name in self._var_quant_scales: self._var_quant_scales[ output_name] = self._var_quant_scales[input_name] elif output_name in self._var_quant_scales: - if op.name() == 'scale': - _update_scale_op_in_scale(op, input_name, - output_name) - else: - self._var_quant_scales[ - input_name] = self._var_quant_scales[ - output_name] + self._var_quant_scales[ + input_name] = self._var_quant_scales[output_name] + elif op.name() in self._scale_ops: + input_name = op.input("X")[0] + output_name = op.output("Out")[0] + if output_name in self._var_quant_scales: + _update_scale_op_in_scale(op, input_name, output_name) return waiting_for_scale waiting_for_scale = _update_scales(graph) -- GitLab From 372ac08a171d76c745deaab0feed2d587798f734 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 23 Mar 2021 14:51:00 +0800 Subject: [PATCH 059/486] add relu forward kernel and backward kernel (#31613) * add relu forward kernel and backward kernel --- paddle/fluid/operators/activation_op.cu | 284 +++++++++++++++++++++++- 1 file changed, 283 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af22..29498da0f02 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,8 +10,276 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using float16 = paddle::platform::float16; + +template +struct CudaVecType { + using type = T; + static constexpr int vecsize = 1; +}; + +template <> +struct CudaVecType { + using type = __half2; + static constexpr int vecsize = 2; +}; + +template <> +struct CudaVecType { + using type = float4; + static constexpr int vecsize = 4; +}; + +template +class BaseGPUFunctor { + public: + using ELEMENT_TYPE = T; +}; + +/* ========================================================================== */ + +/* =========================== relu forward ============================ */ +template +class ReluGPUFuctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGPUFuctor() { zero_ = static_cast(0.0f); } + + // for relu forward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* x); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T x) { + return x > zero_ ? x : zero_; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFuctor::Compute(const CudaVecType::type* x) { +// relu forward : out = max(x, 0) +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 + return __ldg(x) > zero_ ? __ldg(x) : zero_; +#else + return (*x) > zero_ ? (*x) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFuctor::Compute(const CudaVecType::type* xx) { + // relu forward : out = max(xx, 0) + return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), + (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFuctor::Compute(const CudaVecType::type* in) { +// relu forward : out = max(in, 0) +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); +#else + const float2 xx = __half22float2(*in); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), + (xx.y > 0.0f) * static_cast(xx.y)); +#endif +} +/* ========================================================================== */ + +/* =========================== relu backward ============================ + */ + +template +class ReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* out, + const typename CudaVecType::type* dout); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { + // relu backward : dx = out > 0 ? dout : 0; + return out > zero_ ? dout : zero_; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 + return __ldg(out) > zero_ ? __ldg(dout) : zero_; +#else + return (*out) > zero_ ? 
(*dout) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { + // relu backward : dx = out > 0 ? dout : 0; + return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), + (out->z > zero_) * (dout->z), + (out->w > zero_) * (dout->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); +#else + const float2 xx = __half22float2(*out); + const float2 yy = __half22float2(*dout); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), + (xx.y > 0.0f) * static_cast(yy.y)); +#endif +} + +/* ========================================================================== */ + +template +__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, + T* dx, int num, Functor functor) { + using VecType = typename CudaVecType::type; + constexpr int vecsize = CudaVecType::vecsize; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in_forward = reinterpret_cast(forward_data); + const VecType* in_dout = reinterpret_cast(dout); + VecType* out = reinterpret_cast(dx); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in_forward + i), (in_dout + i)); + } + + while (idx == loop && tail) { + dx[num - tail] = + functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + --tail; + } +} + +template +__global__ void ActivationkernelVec(const T* src, T* dst, int num, + Functor functor) { + constexpr int vecsize = CudaVecType::vecsize; + using VecType = typename CudaVecType::type; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in = reinterpret_cast(src); + VecType* out = reinterpret_cast(dst); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in + i)); + } + + while (idx == loop && tail) { + dst[num - tail] = functor.ComputeRemainder(src[num - tail]); + --tail; + } +} + +template +class ActivationGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(context, &in_x, &out); + auto& dev_ctx = context.template device_context(); + + int num = in_x->numel(); + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(dev_ctx.GetPlace(), + static_cast(num * sizeof(T))); + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((num / vecsize + block - 1) / block, 1); + ActivationkernelVec<<>>(input_data, output_data, + num, functor); + } +}; + +template +class ActivationGradGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x = out = d_out = nullptr; 
+ ExtractActivationGradTensor(context, &x, &out, &d_out, + &d_x); + int numel = d_out->numel(); + auto& dev_ctx = context.template device_context(); + auto* dx_data = d_x->mutable_data( + dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); + auto* dout_data = d_out->data(); + + auto* forward_data = dout_data; + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + forward_data = out->data(); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + // Only need forward input X + forward_data = x->data(); + } + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((numel / vecsize + block - 1) / block, 1); + ActivationGradKernelVec<<>>( + forward_data, dout_data, dx_data, numel, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -60,7 +328,21 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>); + +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, -- GitLab From f4d9212de25a7a8c5b5d3d160ed6ce1c4f40bdd0 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 23 Mar 2021 15:11:02 +0800 Subject: [PATCH 060/486] trt plugin upgrade to pluginv2ext (#31670) --- .../inference/tensorrt/convert/split_op.cc | 2 +- paddle/fluid/inference/tensorrt/engine.cc | 9 +- paddle/fluid/inference/tensorrt/engine.h | 7 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 3 + .../tensorrt/plugin/split_op_plugin.cu | 5 - .../tensorrt/plugin/split_op_plugin.h | 69 +++++++++-- .../tensorrt/plugin/test_split_plugin.cc | 58 +++++++++ .../inference/tensorrt/plugin/trt_plugin.cc | 78 ++++++++++-- .../inference/tensorrt/plugin/trt_plugin.h | 112 +++++++++++++++++- python/setup.py.in | 11 ++ 10 files changed, 322 insertions(+), 32 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 768c6efaa6b..5d494c2093b 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -101,7 +101,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } std::string layer_name = "split (Output: "; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 0bba4581ff9..99549fd6b5c 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include -#include "cuda_runtime_api.h" +#include "cuda_runtime_api.h" // NOLINT #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" @@ -353,6 +353,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return network()->addPluginExt(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( + nvinfer1::ITensor *const *inputs, int num_inputs, + plugin::PluginTensorRTV2Ext *plugin) { + owned_plugin_v2ext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 0e399578fa4..de2924824f0 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -305,8 +305,14 @@ class TensorRTEngine { } int GetDeviceId() { return device_id_; } + nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); + + nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs, + int num_inputs, + plugin::PluginTensorRTV2Ext* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -414,6 +420,7 @@ class TensorRTEngine { itensor_map_; std::vector> owned_plugin_; + std::vector> owned_plugin_v2ext_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e37beb3b8e5..7ee16a598d2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -6,3 +6,6 @@ nv_library(tensorrt_plugin qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + +nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS + paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_plugin) diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 256aa28206a..1b5c39f8fff 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -22,11 +22,6 @@ namespace inference { namespace tensorrt { namespace plugin { -SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { - return new SplitPlugin(buffer, length); -} -REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); - template __device__ int upper_bound(T const* vals, int n, T const& key) { int i = 0; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 5c47ec3a990..e43b57357fb 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -25,7 +25,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SplitPlugin : public PluginTensorRT { +class SplitPlugin : public PluginTensorRTV2Ext { public: SplitPlugin() {} SplitPlugin(int axis, std::vector const& output_lengths, bool with_fp16) @@ -39,13 +39,20 @@ class SplitPlugin : public PluginTensorRT { DeserializeValue(&serial_data, &serial_length, &output_length_); } - 
SplitPlugin* clone() const override { - auto* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const override { + SplitPlugin* ptr = new SplitPlugin(axis_, output_length_, with_fp16_); + ptr->setPluginNamespace(this->getPluginNamespace()); ptr->shareData(this); return ptr; } - const char* getPluginType() const override { return "split_plugin"; } + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const override { + return input_types[0]; + } + + const char* getPluginType() const override { return "split_plugin_v2ext"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* input_dims, @@ -53,17 +60,18 @@ class SplitPlugin : public PluginTensorRT { int initialize() override; void terminate() override; - int enqueue(int batchSize, const void* const* inputs, void** outputs, + int enqueue(int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; + void destroy() override { delete this; } + protected: - size_t getSerializationSize() override { - return SerializedSize(getPluginType()) + SerializedSize(axis_) + - SerializedSize(output_length_) + getBaseSerializationSize(); + size_t getSerializationSize() const override { + return SerializedSize(axis_) + SerializedSize(output_length_) + + getBaseSerializationSize(); } - void serialize(void* buffer) override { - SerializeValue(&buffer, getPluginType()); + void serialize(void* buffer) const override { serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); @@ -83,6 +91,47 @@ class SplitPlugin : public PluginTensorRT { void shareData(const SplitPlugin* another); }; +class SplitPluginCreator : public nvinfer1::IPluginCreator { + public: + SplitPluginCreator() {} + const char* getPluginName() const override { return "split_plugin_v2ext"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + // not implemented + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SplitPlugin(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(SplitPluginCreator); + #if IS_TRT_VERSION_GE(6000) class SplitPluginDynamic : public DynamicPluginTensorRT { public: diff --git a/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc new file mode 100644 index 00000000000..6636513a555 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/test_split_plugin.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +TEST(split_op_plugin, test_plugin) { + int axis = 1; + std::vector output_lengths{1, 1}; + bool with_fp16 = false; + std::vector input_types{nvinfer1::DataType::kFLOAT}; + std::vector input_dims; + + SplitPlugin sp_plugin(axis, output_lengths, with_fp16); + nvinfer1::Dims in_dims; + in_dims.nbDims = 4; + input_dims.push_back(in_dims); + sp_plugin.configurePlugin(input_dims.data(), 1, nullptr, 2, + input_types.data(), nullptr, nullptr, nullptr, + nvinfer1::PluginFormat::kNCHW, 4); + sp_plugin.initialize(); + sp_plugin.getPluginType(); + sp_plugin.canBroadcastInputAcrossBatch(0); + sp_plugin.getNbOutputs(); + auto clone_plugin = sp_plugin.clone(); + clone_plugin->setPluginNamespace("test"); + clone_plugin->destroy(); + sp_plugin.getOutputDataType(0, input_types.data(), 1); + sp_plugin.terminate(); +} + +TEST(split_op_plugin, test_plugin_creater) { + SplitPluginCreator creator; + creator.getFieldNames(); + creator.createPlugin("test", nullptr); + creator.setPluginNamespace("test"); +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc index fd721b16145..55bc786746b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.cc @@ -19,27 +19,50 @@ namespace inference { namespace tensorrt { namespace plugin { +inline void Seria(void*& buffer, // NOLINT + const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + SerializeValue(&buffer, input_dims); + SerializeValue(&buffer, max_batch_size); + SerializeValue(&buffer, data_type); + SerializeValue(&buffer, data_format); + SerializeValue(&buffer, with_fp16); +} + +inline void Deseria(void const*& serial_data, size_t& serial_length, // NOLINT + std::vector* input_dims, + size_t* max_batch_size, nvinfer1::DataType* data_type, + nvinfer1::PluginFormat* data_format, bool* with_fp16) { + DeserializeValue(&serial_data, &serial_length, input_dims); + DeserializeValue(&serial_data, &serial_length, max_batch_size); + DeserializeValue(&serial_data, &serial_length, data_type); + DeserializeValue(&serial_data, &serial_length, data_format); + DeserializeValue(&serial_data, &serial_length, with_fp16); +} + +inline size_t SeriaSize(const std::vector& input_dims, + size_t max_batch_size, nvinfer1::DataType data_type, + nvinfer1::PluginFormat data_format, bool with_fp16) { + return (SerializedSize(input_dims) + SerializedSize(max_batch_size) + + SerializedSize(data_type) + SerializedSize(data_format) + + SerializedSize(with_fp16)); +} + void PluginTensorRT::serializeBase(void*& buffer) { - SerializeValue(&buffer, input_dims_); - SerializeValue(&buffer, max_batch_size_); - SerializeValue(&buffer, data_type_); - SerializeValue(&buffer, data_format_); - SerializeValue(&buffer, with_fp16_); + 
Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } void PluginTensorRT::deserializeBase(void const*& serial_data, size_t& serial_length) { - DeserializeValue(&serial_data, &serial_length, &input_dims_); - DeserializeValue(&serial_data, &serial_length, &max_batch_size_); - DeserializeValue(&serial_data, &serial_length, &data_type_); - DeserializeValue(&serial_data, &serial_length, &data_format_); - DeserializeValue(&serial_data, &serial_length, &with_fp16_); + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); } size_t PluginTensorRT::getBaseSerializationSize() { - return (SerializedSize(input_dims_) + SerializedSize(max_batch_size_) + - SerializedSize(data_type_) + SerializedSize(data_format_) + - SerializedSize(with_fp16_)); + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); } bool PluginTensorRT::supportsFormat(nvinfer1::DataType type, @@ -58,6 +81,35 @@ void PluginTensorRT::configureWithFormat( max_batch_size_ = max_batch_size; } +void PluginTensorRTV2Ext::serializeBase(void*& buffer) const { + Seria(buffer, input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::deserializeBase(void const*& serial_data, + size_t& serial_length) { + Deseria(serial_data, serial_length, &input_dims_, &max_batch_size_, + &data_type_, &data_format_, &with_fp16_); +} + +size_t PluginTensorRTV2Ext::getBaseSerializationSize() const { + return SeriaSize(input_dims_, max_batch_size_, data_type_, data_format_, + with_fp16_); +} + +void PluginTensorRTV2Ext::configurePlugin( + const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int32_t max_batch_size) { + input_dims_.assign(input_dims, input_dims + nb_inputs); + max_batch_size_ = max_batch_size; + data_format_ = float_format; + data_type_ = input_types[0]; +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index b3a3abe5d01..ce3133ae99e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -44,6 +44,7 @@ typedef std::function typedef std::function PluginConstructFunc; +// Deprecated. 
Do not inherit this class, please refer to PluginTensorRTV2Ext class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() : with_fp16_(false) {} @@ -119,6 +120,114 @@ class PluginTensorRT : public nvinfer1::IPluginExt { bool with_fp16_; }; +// TensorRT introduced IPluginV2Ext after 5.1, Paddle no longer supports +// versions before 5.1 +class PluginTensorRTV2Ext : public nvinfer1::IPluginV2Ext { + public: + PluginTensorRTV2Ext() : with_fp16_(false) {} + PluginTensorRTV2Ext(const void* serialized_data, size_t length) {} + + nvinfer1::Dims const& getInputDims(int index) const { + return input_dims_.at(index); + } + size_t getMaxBatchSize() const { return max_batch_size_; } + nvinfer1::DataType getDataType() const { return data_type_; } + nvinfer1::PluginFormat getDataFormat() const { return data_format_; } + + // The Func in IPluginV2Ext + virtual nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const = 0; + + virtual bool isOutputBroadcastAcrossBatch(int32_t output_index, + const bool* input_is_broadcasted, + int32_t nb_inputs) const { + return false; + } + + virtual bool canBroadcastInputAcrossBatch(int32_t input_index) const { + return false; + } + + void configurePlugin(const nvinfer1::Dims* input_dims, int32_t nb_inputs, + const nvinfer1::Dims* output_dims, int32_t nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int32_t max_batch_size) override; + + virtual IPluginV2Ext* clone() const = 0; + + void attachToContext(cudnnContext*, cublasContext*, + nvinfer1::IGpuAllocator*) override {} + + void detachFromContext() override {} + + // The Func in IPluginV2 + virtual const char* getPluginType() const = 0; + const char* getPluginVersion() const override { return "1"; } + virtual int32_t getNbOutputs() const { return 1; } + virtual nvinfer1::Dims getOutputDimensions(int32_t index, + const nvinfer1::Dims* inputs, + int32_t nb_input) = 0; + // Check format support. The default is FLOAT32 and NCHW. + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const override { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kNCHW)); + } + // Initialize the layer for execution. + // This is called when the engine is created. + int initialize() override { return 0; } + + // Shutdown the layer. This is called when the engine is destroyed + void terminate() override {} + + // Find the workspace size required by the layer + size_t getWorkspaceSize(int) const override { return 0; } + + // Execute the layer + virtual int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) = 0; + + // Find the size of the serialization buffer required + virtual size_t getSerializationSize() const = 0; + + // Serialize the layer config to buffer. + // TensorRT will call this func to serialize the configuration of TensorRT + // engine. It should not be called by users. 
+ virtual void serialize(void* buffer) const = 0; + + virtual void destroy() = 0; + + void setPluginNamespace(const char* plugin_namespace) override { + name_space_ = plugin_namespace; + } + + const char* getPluginNamespace() const override { + return name_space_.c_str(); + } + + protected: + void deserializeBase(void const*& serial_data, // NOLINT + size_t& serial_length); // NOLINT + size_t getBaseSerializationSize() const; + void serializeBase(void*& buffer) const; // NOLINT + + protected: + std::vector input_dims_; + size_t max_batch_size_; + nvinfer1::DataType data_type_; + nvinfer1::PluginFormat data_format_; + std::vector inputs_; + bool with_fp16_; + + private: + std::string name_space_; +}; + #if IS_TRT_VERSION_GE(6000) class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { public: @@ -184,6 +293,7 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { std::string name_space_; std::string plugin_base_; }; +#endif template class TrtPluginRegistrarV2 { @@ -203,8 +313,6 @@ class TrtPluginRegistrarV2 { static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ plugin_registrar_##name {} -#endif - } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/python/setup.py.in b/python/setup.py.in index 64cfe6e9ccf..69a8bc771ae 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -336,6 +336,17 @@ if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}'] +# Only for lite xpu inference. +if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '': + xpu_api_lib = os.path.join('${XPU_SDK_ROOT}', 'XTDK/shlib/', 'libxpuapi.so') + xpu_rt_lib = os.path.join('${XPU_SDK_ROOT}', 'XTDK/runtime/shlib/', 'libxpurt.so') + if os.path.exists(xpu_api_lib): + shutil.copy(xpu_api_lib, libs_path) + package_data['paddle.libs']+=['libxpuapi.so'] + if os.path.exists(xpu_rt_lib): + shutil.copy(xpu_rt_lib, libs_path) + package_data['paddle.libs']+=['libxpurt.so'] + ### Old custom op extension mechanism related, will be removed in 2.1.0 ### # copy libpaddle_framework.so to libs on linux if sys.platform.startswith('linux'): -- GitLab From a70de87d766083bb8213dff31d75d310a6cd3d19 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 23 Mar 2021 15:13:36 +0800 Subject: [PATCH 061/486] Update windows compiler and CI from VS2015 to VS2017 (#31652) * modify windows CI to VS2017 * modify windows CI to VS2017 * modify windows CI to VS2017 --- CMakeLists.txt | 4 +- cmake/external/warpctc.cmake | 2 +- cmake/generic.cmake | 10 ++-- cmake/init.cmake | 4 ++ cmake/paddle_win.props | 2 +- paddle/fluid/inference/api/demo_ci/run.sh | 4 +- .../api/demo_ci/run_windows_demo.bat | 8 +-- .../api/demo_ci/windows_inference.md | 2 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/scripts/paddle_build.bat | 49 ++++++++++++------- paddle/scripts/windows_build/build.bat | 12 ++--- paddle/scripts/windows_build/config.ini | 2 +- python/CMakeLists.txt | 1 + 13 files changed, 59 insertions(+), 43 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 10b3b0aba4e..676c94591ee 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,6 +61,8 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast") + if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) 
@@ -124,7 +126,7 @@ if(WIN32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - string(APPEND ${flag_var} "/ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") endforeach(flag_var) if (WITH_WIN_DUMP_DBG) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index e633cae5401..b0ef575f643 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -18,7 +18,7 @@ SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8) +set(WARPCTC_TAG cd828e5b6c3b953b82af73f7f44cddc393a20efa) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index ba86cfabdf1..c85654a5674 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -492,10 +492,8 @@ function(nv_library TARGET_NAME) message(FATAL "Please specify source file or library in nv_library.") endif() endif(nv_library_SRCS) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) - if(${MSVC_VERSION} LESS_EQUAL 1900) - set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) - endif() + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) + set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() endfunction(nv_library) @@ -512,7 +510,7 @@ function(nv_binary TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_binary_DEPS}) common_link(${TARGET_NAME}) endif() - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() @@ -539,7 +537,7 @@ function(nv_test TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) - if (WIN32 AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + if((CUDA_VERSION GREATER 9.2) AND (CUDA_VERSION LESS 11.0) AND (MSVC_VERSION LESS 1910)) set_target_properties(${TARGET_NAME} PROPERTIES VS_USER_PROPS ${WIN_PROPS}) endif() endif() diff --git a/cmake/init.cmake b/cmake/init.cmake index aea02088750..19fdb6c601a 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,6 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() + # It has not been used now, it can specify CUDA compile flag manualy, + # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous + # because CUDA will update by nvidia, then error will occur. 
+ # Now, it's used in CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 0115ad4b59f..296940dc3f5 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -15,7 +15,7 @@ InheritFromHost -ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions] - --use-local-env --cl-version $(CudaClVersion) + --use-local-env $(CudaClVersion) [CodeGeneration] -clean diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index e11a5b9c337..53f92596666 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -88,7 +88,7 @@ for WITH_STATIC_LIB in ON OFF; do return 0 fi # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ -DWITH_GPU=$TEST_GPU_CPU \ @@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do # -----vis_demo on windows----- rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + cmake .. -G "Visual Studio 15 2017" -A x64 -T host=x64 -DPADDLE_LIB=${inference_install_dir} \ -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ -DWITH_GPU=$TEST_GPU_CPU \ diff --git a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat index 523dafa6649..d17f516fcca 100644 --- a/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat +++ b/paddle/fluid/inference/api/demo_ci/run_windows_demo.bat @@ -67,7 +67,7 @@ if /i "%use_gpu%"=="Y" ( rem set_path_vs_command_prompt :set_vcvarsall_dir -SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat =======>" set tmp_var=!vcvarsall_dir! call:remove_space set vcvarsall_dir=!tmp_var! @@ -177,16 +177,16 @@ if /i "%use_mkl%"=="N" ( if /i "%gpu_inference%"=="Y" ( if "%demo_name%"=="trt_mobilenet_demo" ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" -DUSE_TENSORRT=ON ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=ON ^ + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON -DCUDA_LIB="%cuda_lib_dir%" ) ) else ( - cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_GPU=OFF ^ + cmake .. 
-G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=OFF ^ -DWITH_MKL=%use_mkl% -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=%demo_name% ^ -DPADDLE_LIB="%paddle_infernece_lib%" -DMSVC_STATIC_CRT=ON ) diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md index 73938cb995f..c646c351462 100644 --- a/paddle/fluid/inference/api/demo_ci/windows_inference.md +++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md @@ -8,7 +8,7 @@ 3. 进入Paddle/paddle/fluid/inference/api/demo_ci目录,新建build目录,然后使用cmake生成vs2015的solution文件。 其中PADDLE_LIB是前面的paddle_inference.lib对应文件夹, CUDA_LIB指定为x64格式下的cuda系统库目录文件夹。 ```shell - cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64 + cmake .. -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_inference.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64 ``` 然后用vs2015打开对应的项目文件,注意使用静态链接 "/MT",生成对应的exe。将openblas.dll放到exe所在目录。 diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5452b2160ab..5c9655edfb7 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -117,7 +117,7 @@ if(WITH_PYTHON) "${op_function_generator_path}/op_function_generator ${impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" - " if %build_times% GTR 100 (\n" + " if %build_times% GTR 5 (\n" " exit /b 1\n" " ) else (\n" " goto :retry\n" diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 07de8ff6c2f..c5bb7ea472b 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -30,8 +30,13 @@ taskkill /f /im op_function_generator.exe wmic process where name="op_function_generator.exe" call terminate taskkill /f /im python.exe 2>NUL +:: TODO: Temporarily,REMOVE after VS2017 is stable. +set WITH_TPCACHE=OFF +rmdir %cache_dir%\third_party_GPU /s/q +rmdir %cache_dir%\third_party /s/q + rem ------initialize common variable------ -if not defined GENERATOR set GENERATOR="Visual Studio 14 2015 Win64" +if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" if not defined BRANCH set BRANCH=develop if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT @@ -157,9 +162,11 @@ if %GENERATOR% == "Ninja" ( rem ------show summary of current environment---------- cmake --version -nvcc --version -where nvidia-smi -nvidia-smi +if "%WITH_GPU%"=="ON" ( + nvcc --version + where nvidia-smi + nvidia-smi +) python %work_dir%\tools\summary_env.py %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh @@ -241,7 +248,9 @@ echo ======================================== echo Step 1. Cmake ... echo ======================================== -call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 +rem Configure the environment for 64-bit builds. 'DISTUTILS_USE_SDK' indicates that the user has selected the compiler. 
+call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" +set DISTUTILS_USE_SDK=1 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -261,16 +270,16 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party_GPU/ /s/q - rmdir %cache_dir%\third_party/ /s/q + rmdir %cache_dir%\third_party_GPU /s/q + rmdir %cache_dir%\third_party /s/q ) if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party_GPU/ /s/q - rmdir %cache_dir%\third_party/ /s/q + rmdir %cache_dir%\third_party_GPU /s/q + rmdir %cache_dir%\third_party /s/q ) if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party_GPU/ /s/q - rmdir %cache_dir%\third_party/ /s/q + rmdir %cache_dir%\third_party_GPU /s/q + rmdir %cache_dir%\third_party /s/q ) ) @@ -294,14 +303,14 @@ if "%WITH_GPU%"=="ON" ( ) :cmake_impl -echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -T host=x64 -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ -DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ -DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% -cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -322,14 +331,16 @@ echo ======================================== echo Step 2. Buile Paddle ... 
echo ======================================== -for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*2/3 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*4/5 +echo "PARALLEL PROJECT COUNT is %PARALLEL_PROJECT_COUNT%" set build_times=1 :build_tp echo Build third_party the %build_times% time: + if %GENERATOR% == "Ninja" ( ninja third_party ) else ( - msbuild /m /p:Configuration=Release /verbosity:quiet third_party.vcxproj + MSBuild /m /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:quiet third_party.vcxproj ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 @@ -352,9 +363,9 @@ if %GENERATOR% == "Ninja" ( ninja -j %PARALLEL_PROJECT_COUNT% ) else ( if "%WITH_CLCACHE%"=="OFF" ( - msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) else ( - msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln ) ) @@ -579,7 +590,7 @@ echo git fetch upstream $BRANCH # develop is not fetched>> check_change_of_ echo fi>> check_change_of_unittest.sh echo git checkout -b origin_pr >> check_change_of_unittest.sh echo git checkout -f $BRANCH >> check_change_of_unittest.sh -echo cmake .. -G %GENERATOR% -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +echo cmake .. -G %GENERATOR% -T host=x64 -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ -DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DPYTHON_EXECUTABLE=%PYTHON_EXECUTABLE% -DON_INFER=%ON_INFER% ^ -DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ -DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ @@ -696,7 +707,7 @@ echo ======================================== echo Clean up environment at the end ... 
echo ======================================== taskkill /f /im cmake.exe 2>NUL -taskkill /f /im msbuild.exe 2>NUL +taskkill /f /im MSBuild.exe 2>NUL taskkill /f /im git.exe 2>NUL taskkill /f /im cl.exe 2>NUL taskkill /f /im lib.exe 2>NUL diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 6f99c23ccd2..9a2ed349e5b 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -61,8 +61,8 @@ echo Current directory : %cd% call:rest_env -echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=OFF -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All set MSBUILDDISABLENODEREUSE=1 @@ -82,8 +82,8 @@ echo Current directory : %cd% call:rest_env -echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd -cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd +echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd +cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON 
-DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd set MSBUILDDISABLENODEREUSE=1 @@ -107,8 +107,8 @@ echo Current directory : %cd% call:rest_env -echo cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -cmake %dst_path%\..\Paddle -G "Visual Studio 14 2015 Win64" -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DCMAKE_BUILD_TYPE=Release -DWITH_PYTHON=OFF -DON_INFER=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All set MSBUILDDISABLENODEREUSE=1 diff --git a/paddle/scripts/windows_build/config.ini b/paddle/scripts/windows_build/config.ini index 32638d2873c..750d7af8c29 100644 --- a/paddle/scripts/windows_build/config.ini +++ b/paddle/scripts/windows_build/config.ini @@ -11,7 +11,7 @@ http_proxy=#please edit your proxy# https_proxy=#please edit your proxy# # Just for example, please set by your windows environment -vcvarsall_dir="C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" +vcvarsall_dir="C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" PYTHON3_PATH=C:\Python37 CUDA_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0" diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e0e845601cf..938547f363c 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -77,6 +77,7 @@ IF(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python/paddle/ COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel + COMMENT "Packing whl packages------>>>" DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES}) ELSE(WIN32) add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp -- GitLab From 4046f1303a1692624f7e0d988e04298e0d05c7ce Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Tue, 23 Mar 2021 16:30:43 +0800 Subject: [PATCH 062/486] add coalesce_tensor into white list when checking re-creation of parameters (#31800) --- python/paddle/fluid/framework.py | 6 ++- .../rnn/test_rnn_cudnn_params_packing.py | 53 +++++++++++++++++++ 2 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 036e8ab3044..db487128bbe 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -3031,7 +3031,11 @@ class Block(object): # In startup_program, "c_broadcast" and "c_sync_comm_stream" # are treated as 
initialization ops that cause error. # Think of "c_broadcast" and "c_sync_comm_stream" as a special case here. - if op.type in ["c_broadcast", "c_sync_comm_stream"]: + # NOTE: "coalesce_tensor" is a special case for rnn with cudnn support + if op.type in [ + "c_broadcast", "c_sync_comm_stream", + "coalesce_tensor" + ]: continue init_ops.append(op) return init_ops diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py new file mode 100644 index 00000000000..0712d5be23e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cudnn_params_packing.py @@ -0,0 +1,53 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from unittest import TestCase + + +def create_model(): + hidden_size = 32 + bilstm = paddle.nn.LSTM( + hidden_size, hidden_size, num_layers=1, direction='bidirectional') + return bilstm + + +class TestRNNProgramClone(TestCase): + def setUp(self): + paddle.enable_static() + + def test_rnn_with_cudnn_clone(self): + train_program = paddle.static.Program() + test_program = paddle.static.Program() + startup_prog = paddle.static.Program() + + # test a typical case in static graph usage: create two nearly + # identical program with a shared startup program to share their + # parameters + # + # when creating a parameter, the name is checked. If there is already + # a parameter with the same name, which is the output of a operator + # (i.e. its creator), its re-creation is skipped. + # + # but if that parameter has been the output of more than one operator, + # an exception is raised. For special cases, white list is added. + # flattening rnn's parameters for the need to call cudnn kernel is such + # a case. 
+ with paddle.static.program_guard(train_program, startup_prog): + with paddle.fluid.unique_name.guard(): + bilstm = create_model() + + with paddle.fluid.program_guard(test_program, startup_prog): + with paddle.fluid.unique_name.guard(): + bilstm = create_model() -- GitLab From 3f66e7deab7c2ddec30ffe015bc4597af48f68ae Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 23 Mar 2021 19:55:16 +0800 Subject: [PATCH 063/486] add cmath header for bfloat (#31792) --- paddle/fluid/platform/bfloat16.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index d1257f853e0..6cb4901f1dd 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -16,6 +16,7 @@ #include +#include #include #include #include -- GitLab From 1eb927f9355b275819507da4b65358bed482470b Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 23 Mar 2021 22:23:01 +0800 Subject: [PATCH 064/486] Restore the third-party library cache for windows (#31811) --- CMakeLists.txt | 1 - paddle/scripts/paddle_build.bat | 6 +----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 676c94591ee..765d8fc1578 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,7 +61,6 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zm1000 /fp:fast") if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index c5bb7ea472b..2edb062ac80 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -30,10 +30,6 @@ taskkill /f /im op_function_generator.exe wmic process where name="op_function_generator.exe" call terminate taskkill /f /im python.exe 2>NUL -:: TODO: Temporarily,REMOVE after VS2017 is stable. -set WITH_TPCACHE=OFF -rmdir %cache_dir%\third_party_GPU /s/q -rmdir %cache_dir%\third_party /s/q rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" @@ -85,7 +81,7 @@ git show-ref --verify --quiet refs/heads/last_pr if %ERRORLEVEL% EQU 0 ( git diff HEAD last_pr --stat --name-only git diff HEAD last_pr --stat --name-only | findstr "setup.py.in" - if %ERRORLEVEL% EQU 0 ( + if !ERRORLEVEL! 
EQU 0 ( rmdir build /s/q ) git branch -D last_pr -- GitLab From 270699e6478d1314b4f723bc603856d54f0bf59a Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Wed, 24 Mar 2021 10:46:12 +0800 Subject: [PATCH 065/486] [ROCM] fix test_matmul_v2_op (#31802) --- paddle/fluid/operators/dot_op.h | 2 +- python/paddle/fluid/tests/unittests/test_matmul_v2_op.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 0b0b7f69b9d..1b607922eda 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -160,7 +160,7 @@ struct DotGradFunction> { const Tensor* tensor_dout, Tensor* tensor_dx, Tensor* tensor_dy, const paddle::framework::ExecutionContext& ctx) { -#ifdef __NVCC__ +#if defined(__NVCC__) || defined(__HIPCC__) if (1 == tensor_dout->dims().size()) { auto dout = framework::EigenVector::Flatten(*tensor_dout); diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 761d318d7b8..efcc0e4cfe3 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -67,7 +67,7 @@ class TestMatMulV2Op(OpTest): self.trans_y = False def init_kernel_type(self): - self.dtype = "float64" + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" def setUp(self): self.init_kernel_type() @@ -91,7 +91,10 @@ class TestMatMulV2Op(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') + if core.is_compiled_with_rocm(): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) + else: + self.check_grad(['X', 'Y'], 'Out') class TestMatMuklOp2(TestMatMulV2Op): -- GitLab From 68497e7b39a13939f1a466f56874fc5aa984878a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 24 Mar 2021 14:26:51 +0800 Subject: [PATCH 066/486] change trainable to stop_gradient in optimizer (#31823) --- python/paddle/optimizer/adam.py | 2 +- python/paddle/optimizer/adamax.py | 2 +- python/paddle/optimizer/optimizer.py | 19 ++++++++++--------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index b0c05cf8de7..0cafbda893d 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -351,7 +351,7 @@ class Adam(Optimizer): """ params_grads = [] for param in self._parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: grad_var = param._grad_ivar() diff --git a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index bd65fc19c32..4a6c2278a46 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -184,7 +184,7 @@ class Adamax(Optimizer): """ assert isinstance(block, framework.Block) for param, grad in parameters_and_grads: - if grad is None or param.trainable is False: + if grad is None or param.stop_gradient is True: continue with param.block.program._optimized_guard( [param, grad]), name_scope('adamax'): diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 212dad7c77c..b37d1726064 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -542,7 +542,7 @@ class Optimizer(object): def _update_param_device_map(self, parameters_and_grads, target_block): for param_and_grad in parameters_and_grads: - if param_and_grad[0].trainable is True: + 
if param_and_grad[0].stop_gradient is False: param_name = param_and_grad[0].name ops = target_block.ops device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( @@ -598,14 +598,14 @@ class Optimizer(object): self._update_param_device_map(parameters_and_grads, target_block) self._create_accumulators( target_block, - [p[0] for p in parameters_and_grads if p[0].trainable]) + [p[0] for p in parameters_and_grads if not p[0].stop_gradient]) self._create_global_learning_rate() if framework.in_dygraph_mode(): for param_and_grad in parameters_and_grads: if param_and_grad[1] is None: continue - if param_and_grad[0].trainable is True: + if param_and_grad[0].stop_gradient is False: self._append_optimize_op(target_block, param_and_grad) else: for param_and_grad in parameters_and_grads: @@ -613,7 +613,7 @@ class Optimizer(object): continue with param_and_grad[0].block.program._optimized_guard( param_and_grad), name_scope("optimizer"): - if param_and_grad[0].trainable is True: + if param_and_grad[0].stop_gradient is False: device = self._get_device_for_param(param_and_grad[0] .name) with device_guard(device): @@ -689,7 +689,7 @@ class Optimizer(object): params_grads = [] for param in parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: # create gradient tensor @@ -789,8 +789,9 @@ class Optimizer(object): def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() - param_no_trainable = set( - [param.name for param in parameters if param.trainable is False]) + param_no_trainable = set([ + param.name for param in parameters if param.stop_gradient is True + ]) # If the parameter is no trainable, it should not have a gradient. 
no_grad_set.update(param_no_trainable) @@ -825,7 +826,7 @@ class Optimizer(object): """ for p in self._parameter_list: - if p.trainable: + if not p.stop_gradient: p.clear_gradient() @imperative_base.no_grad @@ -920,7 +921,7 @@ class Optimizer(object): """ params_grads = [] for param in self._parameter_list: - if not param.trainable: + if param.stop_gradient: continue if param._grad_ivar() is not None: grad_var = param._grad_ivar() -- GitLab From 84a551380efa7feffc496112a1b746ab7d0617d1 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Wed, 24 Mar 2021 14:40:14 +0800 Subject: [PATCH 067/486] [dygraph qat] Refine saving output scale to infer program (#31784) * Refine saving output scale to infer program --- .../slim/quantization/imperative/qat.py | 229 ++++++++++-------- .../slim/quantization/imperative/utils.py | 34 ++- .../slim/tests/test_imperative_out_scale.py | 23 +- 3 files changed, 166 insertions(+), 120 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 68b4cfdc661..ea2e8e073b5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -251,8 +251,8 @@ class ImperativeQuantizeInputs(object): super(ImperativeQuantizeInputs, self).__init__() self._quantizable_layer_type = tuple( - utils.supported_quant_layers_map[layer] - if layer in utils.supported_quant_layers_map else layer + utils.quant_input_layers_map[layer] + if layer in utils.quant_input_layers_map else layer for layer in quantizable_layer_type) for layer in self._quantizable_layer_type: assert not isinstance(layer, str), \ @@ -324,12 +324,11 @@ class ImperativeQuantizeInputs(object): target = name[last_idx:idx] quant_layer = self._get_quantized_layer(layer) - setattr(quant_layer, "layer_name", layer.full_name()) setattr(obj, target, quant_layer) def _get_quantized_layer(self, layer): quant_layer_name = None - for key, value in utils.supported_quant_layers_map.items(): + for key, value in utils.quant_input_layers_map.items(): if isinstance(layer, value): quant_layer_name = 'Quantized' + key break @@ -372,6 +371,9 @@ class ImperativeCalcOutputScale(object): """ assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." + + # Calculate the target ops's output scale, and don't consider + # the skip_quant attr for _, layer in model.named_sublayers(): if self._is_target_layer(layer): self._init_scale_params(layer) @@ -411,24 +413,21 @@ class ImperativeCalcOutputScale(object): assert isinstance(layer, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." 
- # remove handles and collect output scales + self._gather_output_scale(layer) + with dygraph.guard(): layer.eval() for handle in self._register_hook_handle_list: handle.remove() - for _, sub_layer in layer.named_sublayers(): - if self._is_target_layer(sub_layer): - if hasattr(sub_layer, "layer_name"): - layer_name = sub_layer.layer_name - else: - layer_name = sub_layer.full_name() - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) - - # save the quantized model that doesn't have output scales paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) + if len(self._out_scale_dict) == 0: + warnings.warn("Warning: No Layer of the model while to be " \ + "saved contains the out_threshold attribute, so the " \ + "generated inference model would not contain the " \ + "out_threshold.") + return + # load static model is_dynamic_mode = False if paddle.in_dynamic_mode(): @@ -443,79 +442,26 @@ class ImperativeCalcOutputScale(object): basename = os.path.basename(path) model_filename = basename + INFER_MODEL_SUFFIX params_filename = basename + INFER_PARAMS_SUFFIX - [inference_program, feed_target_names, fetch_targets] = ( + + [infer_program, feed_target_names, fetch_targets] = ( load_inference_model( dirname=dirname, executor=exe, model_filename=model_filename, params_filename=params_filename)) + # TODO(jc): analyse whether the dygraph model has + # several blocks before applying qat + assert infer_program.num_blocks == 1, \ + "Quantization aware training (QAT) requires the program " \ + "only has a block for now. When the model has if-else or " \ + "while, the program will have several blocks." + # set output scales to the static model - check_behind_op = False - op_count = 0 - ops_list = [key for key, _ in self._out_scale_dict.items()] - if len(ops_list) == 0: - warnings.warn( - "Warning: No Layer of the model while to be saved contains " - "the out_threshold attribute, so the generated inference " - "model would not contain the out_threshold.") - else: - # Because the Layer in dygraph may correspond to multiple ops - # in static program after being saved. To ensure correctness, - # the outscale collected for output of dygraph Layer can only - # be set to the last op in the corresponding ops in static program. 
- # - # We can judge the execution order of the ops which corresponding - # to dygraph Layer by check_behind_op - forward_op = None - for block in inference_program.blocks: - for op in block.ops: - if op.type in utils.op_real_in_out_name: - if op_count > len(ops_list): - warnings.warn( - "The number of Layer which has " - "out_threshold attribute should be bigger than " - "the op in inference model") - break - if check_behind_op: - check_behind_op = False - if op.type == "elementwise_add": - if self._is_op_matched(ops_list[op_count], op, - block): - op._set_attr("out_threshold", - self._out_scale_dict[ops_list[ - op_count]]) - op_count += 1 - forward_op = None - continue - else: - if forward_op is None: - raise ValueError( - "forward_op should not be None") - if self._is_op_matched(ops_list[op_count], - forward_op, block): - forward_op._set_attr( - "out_threshold", self._out_scale_dict[ - ops_list[op_count]]) - op_count += 1 - forward_op = None - - if op.type in ["conv2d", "depthwise_conv2d", "matmul"]: - check_behind_op = True - forward_op = op - continue - if op_count >= len(ops_list): - warnings.warn( - "The number of Layer which has out_threshold attribute should be bigger than the op in inference model" - ) - break - if self._is_op_matched(ops_list[op_count], op, block): - op._set_attr( - "out_threshold", - self._out_scale_dict[ops_list[op_count]]) - op_count += 1 - - self._set_skip_quant_attr(inference_program) + self._save_output_scale(infer_program) + + # process skip quant + self._set_skip_quant_attr(infer_program) # save the final quantized model that has output scales save_inference_model( @@ -523,16 +469,75 @@ class ImperativeCalcOutputScale(object): feeded_var_names=feed_target_names, target_vars=fetch_targets, executor=exe, - main_program=inference_program.clone(), + main_program=infer_program.clone(), model_filename=model_filename, params_filename=params_filename) if is_dynamic_mode: paddle.disable_static() + def _gather_output_scale(self, layer): + """ + Gather all output scales to self._out_scale_dict + """ + with dygraph.guard(): + layer.eval() + for _, sub_layer in layer.named_sublayers(): + if self._is_target_layer(sub_layer): + layer_name = sub_layer.full_name() + if hasattr(sub_layer, "_quant_out_scale"): + self._out_scale_dict[layer_name] = float( + sub_layer._quant_out_scale) + + def _save_output_scale(self, infer_program): + """ + Save all output scales to the corresponding ops in static + inference program. + + Because the Layer in dygraph may correspond to multiple ops + in static program after being saved. To ensure correctness, + the outscale collected for output of dygraph Layer can only + be set to the last op in the corresponding ops in static program. + """ + assert infer_program.num_blocks == 1, \ + "The inference program should only have a block." 
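The rule described just above is easiest to see on a toy op list: a dygraph Conv2D or Linear layer that carries a bias is exported as a weight op followed by an elementwise_add, so the collected output scale must land on that trailing add rather than on the weight op itself. A small Python sketch of that matching step, using a purely illustrative op sequence and scale value:

    weight_op_types = ["conv2d", "depthwise_conv2d", "matmul"]
    static_ops = ["feed", "conv2d", "elementwise_add", "relu", "fetch"]
    scale = 0.25   # hypothetical out_threshold gathered from the dygraph layer

    for i, op_type in enumerate(static_ops):
        if op_type in weight_op_types:
            next_is_add = i + 1 < len(static_ops) and static_ops[i + 1] == "elementwise_add"
            target = i + 1 if next_is_add else i
            print("set out_threshold=%s on op %d (%s)" % (scale, target, static_ops[target]))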
+ + global_block = infer_program.global_block() + target_ops = global_block.ops + + scale_idx = 0 + op_idx = 0 + attr_name = "out_threshold" + + for scale_name, scale_value in self._out_scale_dict.items(): + while True: + if op_idx >= len(target_ops): + break + + op = target_ops[op_idx] + if not self._is_scale_op_matched(scale_name, op, global_block): + op_idx += 1 + else: + if op.type in utils.weight_op_types \ + and op_idx + 1 < len(target_ops) \ + and target_ops[op_idx+1].type == "elementwise_add": + target_ops[op_idx + 1]._set_attr(attr_name, scale_value) + op_idx += 2 + else: + op._set_attr(attr_name, scale_value) + op_idx += 1 + scale_idx += 1 + break + + if scale_idx != len(self._out_scale_dict): + _logger.warning("Warning: the model have %s output scales, "\ + "but it only saves %s output scales." \ + % (len(self._out_scale_dict), scale_idx)) + def _is_target_layer(self, layer): - return isinstance(layer, utils.out_scale_layers_list) \ - or 'quantized_' in layer.full_name() + return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ + or ('quantized_' in layer.full_name() and \ + 'quantized_noweight' not in layer.full_name()) def _init_scale_params(self, layer, name=None): """ @@ -570,27 +575,39 @@ class ImperativeCalcOutputScale(object): layer._quant_out_accum = _create_param(layer, name, "accum", dtype) layer._quant_out_accum.stop_gradient = True - # Judge whether the op in program matches the Layer in dynamic model - def _is_op_matched(self, layer_name, op, block): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in [ - core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32 - ]: - return False - - # Because the naming styles of static and dynamic graph are different, - # in order to avoid mistakes, we unify the name here. - op_type = output_var_names[0].split(".")[0] - op_type = op_type.rsplit("_", 1)[0] - if op_type == 'depthwise_conv2d': - op_type = 'conv2d' - if 'prelu' in op_type: - op_type = op_type.replace('prelu', 'p_re_lu') - if 'relu' in op_type: - op_type = op_type.replace('relu', 're_lu') - return op_type in layer_name + def _is_scale_op_matched(self, scale_name, op, block): + """ + Based on the op name and attrs to judge whether the op in + program matches the scale_name. We must know the corresponding + name between dgraph and static model. 
+ """ + fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32] + if op.type in quantization_pass._op_real_in_out_name.keys(): + output_var_names = quantization_pass._get_op_output_var_names(op) + for output_var_name in output_var_names: + output_var_tensor = block.var(output_var_name) + if output_var_tensor.dtype not in fp_type: + return False + + # corresponding_map: [name, op_types, function] + # Note that, the items have priority in corresponding_map + corresponding_map = [ + ['conv2d_tranpose', ['conv2d_transpose', \ + 'depthwise_conv2d_transpose'], None], + ['conv2d', ['conv2d', 'depthwise_conv2d'], None], + ['linear', ['matmul'], None], + ['re_lu6', ['relu6'], None], + ['p_re_lu', ['prelu'], None], + ['leaky_re_lu', ['leaky_relu'], None], + ['re_lu', ['relu'], None], + ] + + for item in corresponding_map: + if item[0] in scale_name: + return (op.type in item[1]) and \ + (len(item) == 2 or item[2] is None or item[2](op)) + + return op.type in scale_name def _set_skip_quant_attr(self, program): block = program.global_block() diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 3bf655265c6..090f6cda389 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -30,7 +30,7 @@ op_real_in_out_name = { "swish": [["X"], ["Out"]], } -supported_quant_layers_map = { +quant_input_layers_map = { 'Conv2D': paddle.nn.Conv2D, 'Linear': paddle.nn.Linear, 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, @@ -58,8 +58,30 @@ fake_quantize_dequantize_types = [ "fake_quantize_dequantize_moving_average_abs_max" ] -out_scale_layers_list = ( - paddle.nn.Conv2D, paddle.nn.Linear, paddle.nn.MaxPool2D, - paddle.nn.BatchNorm, paddle.nn.BatchNorm2D, paddle.nn.SyncBatchNorm, - paddle.nn.LeakyReLU, paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, - paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Tanh, paddle.nn.Swish) +quant_output_layers_map = { + 'Conv2D': paddle.nn.Conv2D, + 'Conv2DTranspose': paddle.nn.Conv2DTranspose, + 'Linear': paddle.nn.Linear, + 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, + 'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D, + 'AvgPool2D': paddle.nn.AvgPool2D, + 'MaxPool2D': paddle.nn.MaxPool2D, + 'BatchNorm': paddle.nn.BatchNorm, + 'BatchNorm2D': paddle.nn.BatchNorm2D, + 'SyncBatchNorm': paddle.nn.SyncBatchNorm, + 'ELU': paddle.nn.ELU, + 'GELU': paddle.nn.GELU, + 'LeakyReLU': paddle.nn.LeakyReLU, + 'PReLU': paddle.nn.PReLU, + 'ReLU': paddle.nn.ReLU, + 'ReLU6': paddle.nn.ReLU6, + 'Sigmoid': paddle.nn.Sigmoid, + 'Softmax': paddle.nn.Softmax, + 'Tanh': paddle.nn.Tanh, + 'Swish': paddle.nn.Swish, +} + +weight_op_types = [ + "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", + "depthwise_conv2d_transpose" +] diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index ed29375d22b..600174e503f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -33,7 +33,6 @@ from paddle.fluid.dygraph.container import Sequential from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.nn.layer import ReLU, LeakyReLU, Sigmoid, Softmax, PReLU from paddle.nn import Linear, Conv2D, Softmax, BatchNorm2D, MaxPool2D -from paddle.fluid.dygraph.nn import Pool2D from 
paddle.fluid.log_helper import get_logger from paddle.fluid.dygraph import nn @@ -131,8 +130,8 @@ class ImperativeLenet(fluid.dygraph.Layer): bias_attr=False), BatchNorm2D(6), ReLU(), - Pool2D( - pool_size=2, pool_type='max', pool_stride=2), + MaxPool2D( + kernel_size=2, stride=2), Conv2D( in_channels=6, out_channels=16, @@ -357,7 +356,6 @@ class TestImperativeOutSclae(unittest.TestCase): "diff({}) at {}, dynamic loss = {}, static loss = {}". format(diff, i, loss_d, loss_s)) break - self.assertTrue( np.allclose( np.array(dynamic_loss_rec), @@ -398,10 +396,15 @@ class TestImperativeOutSclae(unittest.TestCase): if dynamic_ops[i].has_attr("out_threshold"): op_count += 1 self.assertTrue(dynamic_ops[i].type == static_ops[i].type) + if dynamic_ops[i].attr("out_threshold") != static_ops[i].attr( + "out_threshold"): + _logger.info(dynamic_ops[i].attr("out_threshold")) + _logger.info(static_ops[i].attr("out_threshold")) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) - self.assertTrue(op_count == 13) + _logger.info("op_cout: {}".format(op_count)) + self.assertTrue(op_count == 14) class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): @@ -470,7 +473,9 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): self.assertTrue(dynamic_ops[i].type == static_ops[i].type) self.assertTrue(dynamic_ops[i].attr("out_threshold") == static_ops[i].attr("out_threshold")) - self.assertTrue(op_count == 13) + + _logger.info("op_cout: {}".format(op_count)) + self.assertTrue(op_count == 14) class TestSaveQuantizedModel_Warning(unittest.TestCase): @@ -490,8 +495,10 @@ class TestSaveQuantizedModel_Warning(unittest.TestCase): shape=[None, 1, 28, 28], dtype='float32') ]) - warning_message = "Warning: No Layer of the model while to be saved contains the out_threshold attribute, " \ - "so the generated inference model would not contain the out_threshold." + warning_message = "Warning: No Layer of the model while to be " \ + "saved contains the out_threshold attribute, so the " \ + "generated inference model would not contain the " \ + "out_threshold." num = get_vaild_warning_num(warning_message, w) assert num == 1 -- GitLab From f2cfc0f46d8b47f743320b8037d6f309a097d294 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 24 Mar 2021 15:24:46 +0800 Subject: [PATCH 068/486] [CustomOp]Avoid raising warning while import paddle (#31804) --- python/paddle/utils/cpp_extension/cpp_extension.py | 6 +++--- python/paddle/utils/cpp_extension/extension_utils.py | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index d84ae67fff8..ea4c85e20db 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -400,14 +400,14 @@ class BuildExtension(build_ext, object): # ncvv compile CUDA source if is_cuda_file(src): if core.is_compiled_with_rocm(): - assert ROCM_HOME is not None + assert ROCM_HOME is not None, "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it." hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc') self.compiler.set_executable('compiler_so', hipcc_cmd) # {'nvcc': {}, 'cxx: {}} if isinstance(cflags, dict): cflags = cflags['hipcc'] else: - assert CUDA_HOME is not None + assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." 
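One practical consequence of dropping the import-time warnings in extension_utils.py is that a missing toolkit is only reported when a CUDA source is actually compiled, which is when the assert above fires. A rough sketch of the expected workflow, with purely illustrative paths and source names:

    import os
    os.environ["CUDA_HOME"] = "/usr/local/cuda"   # hypothetical toolkit location

    from paddle.utils.cpp_extension import load
    # nvcc is then resolved as $CUDA_HOME/bin/nvcc when the .cu source is built
    # custom_ops = load(name="custom_relu_jit", sources=["relu_op.cc", "relu_op.cu"])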
nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') self.compiler.set_executable('compiler_so', nvcc_cmd) # {'nvcc': {}, 'cxx: {}} @@ -470,7 +470,7 @@ class BuildExtension(build_ext, object): src = src_list[0] obj = obj_list[0] if is_cuda_file(src): - assert CUDA_HOME is not None + assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') if isinstance(self.cflags, dict): cflags = self.cflags['nvcc'] diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 1ff42a7bcbc..7d6bcc4d564 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -461,9 +461,6 @@ def find_cuda_home(): if cuda_home and not os.path.exists( cuda_home) and core.is_compiled_with_cuda(): cuda_home = None - warnings.warn( - "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." - ) return cuda_home @@ -494,9 +491,6 @@ def find_rocm_home(): if rocm_home and not os.path.exists( rocm_home) and core.is_compiled_with_rocm(): rocm_home = None - warnings.warn( - "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it." - ) return rocm_home -- GitLab From e5f7a834d4200ad9d7e8b748d2d96fc7faeb0e63 Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Wed, 24 Mar 2021 08:41:47 +0100 Subject: [PATCH 069/486] fix cache key in concat oneDNN kernel (#31820) * fix cache key in concat oneDNN kernel * key simplified --- .../operators/mkldnn/concat_mkldnn_op.cc | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 4beb7ad0178..df1b5af121d 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -71,6 +71,15 @@ static const std::vector ReduceMultiInput( return reduced; } +static const std::vector GetDimsForKey( + const std::vector& inputs) { + auto dims_key = paddle::framework::vectorize(inputs[0]->dims()); + for (auto it = std::next(inputs.begin()); it != inputs.end(); ++it) { + dims_key.push_back((*it)->dims()[0]); + } + return dims_key; +} + template class ConcatPrimitiveFactory { public: @@ -134,6 +143,8 @@ template class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { + // If any of the multiple inputs of concat has an input size of 0, the + // actual size of the multi_input will change auto multi_input = ReduceMultiInput(ctx.MultiInput("X")); EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); @@ -156,12 +167,9 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::ToMKLDNNDataType(multi_input[0]->type()); ConcatPrimitiveFactory prim_creator; - // If one of the multiple inputs of concat has an input size of 0, the - // actual size of the multi_input will change - std::string key = platform::CreateKey( - dev_ctx, paddle::framework::vectorize(multi_input[0]->dims()), - multi_input.size(), ctx.OutputName("Out"), dt, - platform::ThreadIDasStr()); + std::string key = + platform::CreateKey(dev_ctx, GetDimsForKey(multi_input), + multi_input.size(), ctx.OutputName("Out"), dt); key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); const std::string key_prim = key + "@concat_p"; -- GitLab From 
649868ffb262bdba89741eca93e7c7cb8632b9e2 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 24 Mar 2021 16:37:16 +0800 Subject: [PATCH 070/486] [Dy2stat] Fix the bug that loop_body_func may return single element (#31806) Our old `loop_body` function may return single element when `loop_vars` just contains only 1 element, which can cause bug. The key point of this PR is forcing `loop_body` functions always return tuple. --- .../dygraph_to_static/loop_transformer.py | 2 +- .../fluid/dygraph/dygraph_to_static/utils.py | 12 ++++++-- .../dygraph_to_static/test_for_enumerate.py | 29 +++++++++++++++++-- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index b7ef000938a..bd89a79c805 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -594,7 +594,7 @@ class LoopTransformer(gast.NodeTransformer): # append return values for loop body body_stmts.append( gast.Return(value=generate_name_node( - loop_var_names, ctx=gast.Load()))) + loop_var_names, ctx=gast.Load(), gen_tuple_if_single=True))) body_func_node = gast.FunctionDef( name=unique_name.generate(FOR_BODY_PREFIX), args=gast.arguments( diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 1071fc1350b..624ca085ac6 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -381,9 +381,15 @@ def get_attribute_full_name(node): return astor.to_source(gast.gast_to_ast(node)).strip() -def generate_name_node(name_ids, ctx=gast.Load()): +def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False): """ - Generate list or gast.Tuple of ast.Name for Return statement. + If name_ids is list or tuple or set with multiple strings, this function + generates gast.Tuple of gast.Name. + If the name_ids is single string or contains only 1 string, this function + returns gast.Name if gen_tuple_if_single==False else returns gast.Tuple + with only one gast.Name + + This function is used at several gast.Return statements. """ if isinstance(name_ids, six.string_types): name_ids = [name_ids] @@ -395,7 +401,7 @@ def generate_name_node(name_ids, ctx=gast.Load()): id=name_id, ctx=ctx, annotation=None, type_comment=None) for name_id in name_ids ] - if len(gast_names) == 1: + if len(gast_names) == 1 and not gen_tuple_if_single: name_node = gast_names[0] else: name_node = gast.Tuple(elts=gast_names, ctx=ctx) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py index c28997c5c1c..517cff39a27 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_for_enumerate.py @@ -233,6 +233,7 @@ def for_iter_var_idx(x_array): return z +# 17. for a,b,c in z: (a, b, c) is a tuple @paddle.jit.to_static def for_tuple_as_iter_var(x_array): x = paddle.to_tensor(x_array) @@ -250,6 +251,7 @@ def for_tuple_as_iter_var(x_array): return a_result, b_result, c_result +# 18. 
for t in enumerate(collection): t is tuple of (idx, element) @paddle.jit.to_static def for_tuple_as_enumerate_iter(x_array): x = paddle.to_tensor(x_array) @@ -263,6 +265,7 @@ def for_tuple_as_enumerate_iter(x_array): return a_result +# 19. for i, (a, b, c, d, e) in enumerate(collection): (a, b, c, d, e) is a tuple @paddle.jit.to_static def for_tuple_as_enumerate_value(x_array): x = paddle.to_tensor(x_array) @@ -284,6 +287,23 @@ def for_tuple_as_enumerate_value(x_array): return a_result +# 20. test for function in a class +class ForwardContainsForLayer(paddle.nn.Layer): + def __init__(self): + super(ForwardContainsForLayer, self).__init__() + self.high = 5 + self.low = 3 + + @paddle.jit.to_static + def forward(self, x): + # just for test case, x is useless in this method + y = paddle.zeros([10, 2, 3]) + z = [] + for i in range(self.high - self.low): + z.append(y[i].clone()) + return z + + class TestTransformBase(unittest.TestCase): def setUp(self): self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( @@ -313,11 +333,11 @@ class TestTransformBase(unittest.TestCase): class TestTransform(TestTransformBase): def transformed_result_compare(self): dy_outs = self.get_dygraph_output() - if not isinstance(dy_outs, tuple): + if not isinstance(dy_outs, (tuple, list)): dy_outs = (dy_outs, ) st_outs = self.get_static_output() - if not isinstance(st_outs, tuple): + if not isinstance(st_outs, (tuple, list)): st_outs = (st_outs, ) for x, y in zip(dy_outs, st_outs): @@ -446,5 +466,10 @@ class TestForTupleAsEnumerateValue(TestForIterVarNumpy): self.dygraph_func = for_tuple_as_enumerate_value +class TestForwardContainsForLayer(TestForIterVarNumpy): + def set_test_func(self): + self.dygraph_func = ForwardContainsForLayer() + + if __name__ == '__main__': unittest.main() -- GitLab From 5d89ec36dc36c3b09a3972db326a2d41c4a330a5 Mon Sep 17 00:00:00 2001 From: parap1uie-s Date: Wed, 24 Mar 2021 17:25:00 +0800 Subject: [PATCH 071/486] Update pooling.py (#31829) Fix default argument of nn.MaxPool3D() --- python/paddle/nn/layer/pooling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 0f3c4449a3f..5830af3a182 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -589,8 +589,8 @@ class MaxPool3D(layers.Layer): def __init__(self, kernel_size, - stride, - padding, + stride=None, + padding=0, return_mask=False, ceil_mode=False, data_format="NCDHW", -- GitLab From e7f28d6c0db54eb9c9a810612300b526687e56a6 Mon Sep 17 00:00:00 2001 From: winter-wang <78149749+winter-wang@users.noreply.github.com> Date: Wed, 24 Mar 2021 18:19:51 +0800 Subject: [PATCH 072/486] fix runtime crash when rnn model inference, test=develop (#31833) --- .../analysis/passes/memory_optimize_pass.cc | 1 + paddle/fluid/operators/recurrent_op.cc | 25 +++++++++---------- python/paddle/nn/functional/norm.py | 9 ++++--- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 5e6960c4c7e..fdfd2c60af0 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -103,6 +103,7 @@ void MemoryOptimizePass::CollectVarMemorySize( "merge_lod_tensor", "equal", "sequence_pool", + "recurrent", "lod_reset"}; for (auto* tmp : node->inputs) { CHECK(tmp->IsOp()); diff --git 
a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 9766008963b..92e5e4a0cd1 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -210,9 +210,10 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare( - *program, block->ID(), Attr>( - kSkipEagerDeletionVars) /*skip_ref_cnt_vars*/); + auto ctx = executor.Prepare(*program, block->ID(), + Attr>( + kSkipEagerDeletionVars), /*skip_ref_cnt_vars*/ + true); static std::mutex mutex; std::lock_guard lock(mutex); @@ -255,16 +256,6 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, // Link inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output executor.CreateVariables(ctx->prog_, &cur_scope, ctx->block_id_); - if (i > 0) { - LinkTensorWithCallback(scope, Outputs(kOutputs), cur_scope, - Outputs(kOutputs), - [&](const framework::LoDTensor &src_tensor, - framework::LoDTensor *dst_tensor) { - framework::Tensor src_slice = - src_tensor.Slice(seq_offset, seq_offset + 1); - dst_tensor->ShareDataWith(src_slice); - }); - } // Linked now, execute! executor.RunPreparedContext(ctx.get(), &cur_scope, @@ -284,6 +275,14 @@ void RecurrentOp::RunImpl(const framework::Scope &scope, // early. framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out); }); + } else { + LinkTensorWithCallback( + cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs), + [&](const framework::LoDTensor &src_tensor, + framework::LoDTensor *dst_tensor) { + auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); + framework::TensorCopy(src_tensor, place, dev_ctx, &dst_out); + }); } scopes.ForwardNext(); diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 03ba78e12f6..54824233f70 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -188,10 +188,10 @@ def batch_norm(x, if in_dygraph_mode(): # for dygraph need tuple - attrs = ("momentum", momentum, "epsilon", epsilon, "data_layout", - data_format, "use_mkldnn", False, "fuse_with_relu", False, - "use_global_stats", use_global_stats, "trainable_statistics", - trainable_statistics) + attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", + not training, "data_layout", data_format, "use_mkldnn", False, + "fuse_with_relu", False, "use_global_stats", use_global_stats, + "trainable_statistics", trainable_statistics) batch_norm_out, _, _, _, _, _ = core.ops.batch_norm( x, weight, bias, running_mean, running_var, mean_out, variance_out, *attrs) @@ -205,6 +205,7 @@ def batch_norm(x, attrs = { "momentum": momentum, "epsilon": epsilon, + "is_test": not training, "data_layout": data_format, "use_mkldnn": False, "fuse_with_relu": False, -- GitLab From 6472d62093c49e76cfcc5fc93224a4be4b1f063b Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 25 Mar 2021 08:57:24 +0800 Subject: [PATCH 073/486] Revert "add relu forward kernel and backward kernel (#31613)" (#31853) --- paddle/fluid/operators/activation_op.cu | 284 +----------------------- 1 file changed, 1 insertion(+), 283 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 29498da0f02..2033081af22 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,276 +10,8 @@ See the License for the specific language governing 
permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_cuda_utils.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using float16 = paddle::platform::float16; - -template -struct CudaVecType { - using type = T; - static constexpr int vecsize = 1; -}; - -template <> -struct CudaVecType { - using type = __half2; - static constexpr int vecsize = 2; -}; - -template <> -struct CudaVecType { - using type = float4; - static constexpr int vecsize = 4; -}; - -template -class BaseGPUFunctor { - public: - using ELEMENT_TYPE = T; -}; - -/* ========================================================================== */ - -/* =========================== relu forward ============================ */ -template -class ReluGPUFuctor : public BaseGPUFunctor { - private: - T zero_; - - public: - ReluGPUFuctor() { zero_ = static_cast(0.0f); } - - // for relu forward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? x : zero_; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFuctor::Compute(const CudaVecType::type* in) { -// relu forward : out = max(in, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); -#else - const float2 xx = __half22float2(*in); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), - (xx.y > 0.0f) * static_cast(xx.y)); -#endif -} -/* ========================================================================== */ - -/* =========================== relu backward ============================ - */ - -template -class ReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - - public: - ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } - - // for relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { - // relu backward : dx = out > 0 ? dout : 0; - return out > zero_ ? dout : zero_; - } - - static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - return __ldg(out) > zero_ ? 
__ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { - // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 || CUDA_VERSION >= 300 - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); -#else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), - (xx.y > 0.0f) * static_cast(yy.y)); -#endif -} - -/* ========================================================================== */ - -template -__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, - T* dx, int num, Functor functor) { - using VecType = typename CudaVecType::type; - constexpr int vecsize = CudaVecType::vecsize; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in_forward = reinterpret_cast(forward_data); - const VecType* in_dout = reinterpret_cast(dout); - VecType* out = reinterpret_cast(dx); - - for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); - } - - while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); - --tail; - } -} - -template -__global__ void ActivationkernelVec(const T* src, T* dst, int num, - Functor functor) { - constexpr int vecsize = CudaVecType::vecsize; - using VecType = typename CudaVecType::type; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in = reinterpret_cast(src); - VecType* out = reinterpret_cast(dst); - - for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); - } - - while (idx == loop && tail) { - dst[num - tail] = functor.ComputeRemainder(src[num - tail]); - --tail; - } -} - -template -class ActivationGPUKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = nullptr; - framework::Tensor* out = nullptr; - ExtractActivationTensor(context, &in_x, &out); - auto& dev_ctx = context.template device_context(); - - int num = in_x->numel(); - const T* input_data = in_x->data(); - T* output_data = out->mutable_data(dev_ctx.GetPlace(), - static_cast(num * sizeof(T))); - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((num / vecsize + block - 1) / block, 1); - ActivationkernelVec<<>>(input_data, output_data, - num, functor); - } -}; - -template -class ActivationGradGPUKernel - : public framework::OpKernel { - public: - using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor *x, *out, *d_out; - 
framework::Tensor* d_x = nullptr; - x = out = d_out = nullptr; - ExtractActivationGradTensor(context, &x, &out, &d_out, - &d_x); - int numel = d_out->numel(); - auto& dev_ctx = context.template device_context(); - auto* dx_data = d_x->mutable_data( - dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); - auto* dout_data = d_out->data(); - - auto* forward_data = dout_data; - if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { - // Only need forward output Out - forward_data = out->data(); - } else if (static_cast(Functor::FwdDeps()) == - static_cast(kDepX)) { - // Only need forward input X - forward_data = x->data(); - } - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((numel / vecsize + block - 1) / block, 1); - ActivationGradKernelVec<<>>( - forward_data, dout_data, dx_data, numel, functor); - } -}; - -} // namespace operators -} // namespace paddle - namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -328,21 +60,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); +REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, -- GitLab From 511e204e620f3c6e3df2018746c52c5bf2386a59 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 25 Mar 2021 11:24:01 +0800 Subject: [PATCH 074/486] LRScheduler.get_lr should not update lr in LinearWarmup (#31843) --- .../fluid/tests/unittests/test_lr_scheduler.py | 12 ++++++++++++ python/paddle/optimizer/lr.py | 5 ++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 8c6383cd6ef..04a0d47e47c 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -537,6 +537,18 @@ class TestLRScheduler(unittest.TestCase): self._test_dygraph(python_func, paddle_api, kwarg, place) paddle.enable_static() + def test_linear_warmp(self): + natural_lr = paddle.optimizer.lr.NaturalExpDecay( + learning_rate=0.5, gamma=0.1) + natural_lr_warmup = paddle.optimizer.lr.LinearWarmup( + learning_rate=natural_lr, warmup_steps=10, start_lr=0.0, end_lr=0.1) + for idx in range(30): + if idx >= 10: + self.assertEqual(natural_lr_warmup.get_lr(), + natural_lr.get_lr()) + natural_lr.step() + natural_lr_warmup.step() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 5085911ce92..484b4fb7246 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -786,9 +786,8 @@ class LinearWarmup(LRScheduler): self.last_epoch) / float(self.warmup_steps) + self.start_lr else: if isinstance(self.learning_rate, LRScheduler): - lr_value = self.learning_rate() - self.learning_rate.step() - return lr_value + self.learning_rate.step(self.last_epoch - self.warmup_steps) + return self.learning_rate() return self.learning_rate -- GitLab From 27f2d8df8e48847f62e31e627ee25ac2102f27fc Mon Sep 17 
00:00:00 2001 From: Chen Weihang Date: Thu, 25 Mar 2021 11:36:16 +0800 Subject: [PATCH 075/486] Polish two error messages (#31852) * polish two error messages * polish details --- paddle/fluid/operators/detection/polygon_box_transform_op.cu | 3 ++- paddle/fluid/operators/matmul_op.cc | 2 +- paddle/fluid/operators/nll_loss_op.h | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 337a76f9f97..5977a434a60 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -45,7 +45,8 @@ class PolygonBoxTransformOpCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ( platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); + platform::errors::InvalidArgument( + "The polygon_box_transform operator needs to be executed on GPU.")); auto* in = ctx.Input("Input"); auto in_dims = in->dims(); const T* in_data = in->data(); diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 9b64e99c944..c12aecc9ba5 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -587,7 +587,7 @@ class MatMulOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(mat_dim_x.width_, mat_dim_y.height_, platform::errors::InvalidArgument( "Input X's width should be equal to the Y's height, " - "but received X's shape: [%s]," + "but received X's shape: [%s], " "Y's shape: [%s].", dim_x, dim_y)); #endif diff --git a/paddle/fluid/operators/nll_loss_op.h b/paddle/fluid/operators/nll_loss_op.h index e93d5792205..be6f4422d4a 100644 --- a/paddle/fluid/operators/nll_loss_op.h +++ b/paddle/fluid/operators/nll_loss_op.h @@ -36,7 +36,10 @@ static void nll_loss_1D(T* out_data, T* total_weight_data, const T* x_data, } PADDLE_ENFORCE_EQ(cur_label >= 0 && cur_label < n_classes, true, platform::errors::InvalidArgument( - "label should not be out of bounds.")); + "Label value is out of range. " + "Expected label value in range of [0, %d), but " + "received value is %d.", + n_classes, cur_label)); const auto cur_weight = weight_data ? weight_data[cur_label] : static_cast(1); -- GitLab From bf09dcb346c9aa4c20fbfaf520ab781d4f640346 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Thu, 25 Mar 2021 14:08:22 +0800 Subject: [PATCH 076/486] add GPU tensor notice & update default_collate_fn/default_convert_fn. 
test=develop (#31763) --- python/paddle/fluid/dataloader/collate.py | 47 +++++++++++++++++------ python/paddle/fluid/reader.py | 6 +++ 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/python/paddle/fluid/dataloader/collate.py b/python/paddle/fluid/dataloader/collate.py index ddc010d0428..8e90b308b39 100644 --- a/python/paddle/fluid/dataloader/collate.py +++ b/python/paddle/fluid/dataloader/collate.py @@ -27,24 +27,31 @@ except: def default_collate_fn(batch): """ Default batch collating function for :code:`paddle.io.DataLoader`, - batch should be a list of samples, and each sample should be a list - of fields as follows: + get input data as a list of sample datas, each element in list + if the data of a sample, and sample data should composed of list, + dictionary, string, number, numpy array and paddle.Tensor, this + function will parse input data recursively and stack number, + numpy array and paddle.Tensor datas as batch datas. e.g. for + following input data: + + [{'image': np.array(shape=[3, 224, 224]), 'label': 1}, + {'image': np.array(shape=[3, 224, 224]), 'label': 3}, + {'image': np.array(shape=[3, 224, 224]), 'label': 4}, + {'image': np.array(shape=[3, 224, 224]), 'label': 5},] - [[filed1, filed2, ...], [filed1, filed2, ...], ...] - This default collate function zipped each filed together and stack - each filed as the batch field as follows: + This default collate function zipped each number and numpy array + field together and stack each field as the batch field as follows: + + {'image': np.array(shape=[4, 3, 224, 224]), 'label': np.array([1, 3, 4, 5])} - [batch_filed1, batch_filed2, ...] Args: - batch(list of list of numpy array|paddle.Tensor): the batch data, each fields - should be a numpy array, each sample should be a list of - fileds, and batch should be a list of sample. + batch(list of sample data): batch should be a list of sample data. Returns: - a list of numpy array|Paddle.Tensor: collated batch of input batch data, - fields data type as same as fields in each sample. + Batched data: batched each number, numpy array and paddle.Tensor + in input data. """ sample = batch[0] if isinstance(sample, np.ndarray): @@ -75,6 +82,24 @@ def default_collate_fn(batch): def default_convert_fn(batch): + """ + Default batch converting function for :code:`paddle.io.DataLoader`. + get input data as a list of sample datas, each element in list + if the data of a sample, and sample data should composed of list, + dictionary, string, number, numpy array and paddle.Tensor. + + .. note:: + This function is default :attr:`collate_fn` in **Distable + automatic batching** mode, for **Distable automatic batching** + mode, please ses :attr:`paddle.io.DataLoader` + + Args: + batch(list of sample data): batch should be a list of sample data. + + Returns: + Batched data: batched each number, numpy array and paddle.Tensor + in input data. + """ if isinstance(batch, (paddle.Tensor, np.ndarray)): return batch elif isinstance(batch, (str, bytes)): diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index be196b73edd..9f2b2127aa7 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -165,6 +165,12 @@ class DataLoader(object): For :code:`batch_sampler` please see :code:`paddle.io.BatchSampler` + .. note:: + GPU tensor operation is not supported in subprocess currently, + please don't use GPU tensor operations in pipeline which will + be performed in subprocess, such as dataset transforms, collte_fn, + etc. 
Numpy array and CPU tensor operation is supported. + **Disable automatic batching** In certain cases such as some NLP tasks, instead of automatic batching, -- GitLab From f58cb01864151e27ff45d9fc99b61b72cce3295e Mon Sep 17 00:00:00 2001 From: Chengmo Date: Thu, 25 Mar 2021 17:37:09 +0800 Subject: [PATCH 077/486] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91fix=20dat?= =?UTF-8?q?aset=20zip=20py3=20bug=20(#31441)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix zip py3 bug --- .../fleet/data_generator/data_generator.py | 26 +++++++----- .../tests/unittests/test_data_generator.py | 40 +++++++++++++++++++ 2 files changed, 56 insertions(+), 10 deletions(-) diff --git a/python/paddle/distributed/fleet/data_generator/data_generator.py b/python/paddle/distributed/fleet/data_generator/data_generator.py index 669d2ea24a0..9d743fc38bf 100644 --- a/python/paddle/distributed/fleet/data_generator/data_generator.py +++ b/python/paddle/distributed/fleet/data_generator/data_generator.py @@ -32,11 +32,11 @@ class DataGenerator(object): ''' Set batch size of current DataGenerator This is necessary only if a user wants to define generator_batch - + Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -52,7 +52,7 @@ class DataGenerator(object): yield ("words", s[1].extend([s[1][0]])) mydata = MyData() mydata.set_batch(128) - + ''' self.batch_size_ = batch_size @@ -63,7 +63,7 @@ class DataGenerator(object): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -100,9 +100,9 @@ class DataGenerator(object): generated. Example: - + .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -161,7 +161,7 @@ class DataGenerator(object): The data format is list or tuple: [(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...) - + For example: [("words", [1926, 08, 17]), ("label", [1])] or (("words", [1926, 08, 17]), ("label", [1])) @@ -174,7 +174,7 @@ class DataGenerator(object): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -206,7 +206,7 @@ class DataGenerator(object): Example: .. code-block:: python - + import paddle.distributed.fleet.data_generator as dg class MyData(dg.DataGenerator): @@ -259,6 +259,9 @@ class MultiSlotStringDataGenerator(DataGenerator): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. ''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" @@ -289,7 +292,7 @@ class MultiSlotDataGenerator(DataGenerator): >>> [ids_num id1 id2 ...] ... The proto_info will be in this format: >>> [(name, type), ...] - + For example, if the input is like this: >>> [("words", [1926, 08, 17]), ("label", [1])] >>> or (("words", [1926, 08, 17]), ("label", [1])) @@ -304,6 +307,9 @@ class MultiSlotDataGenerator(DataGenerator): Returns: Return a string data that can be read directly by the MultiSlotDataFeed. 
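The Python 3 behaviour this patch guards against is easy to reproduce: zip() returns a lazy iterator rather than a list, so a generator that yields zip(feature_name, data) would fail the list/tuple check unless the result is materialised first. A minimal sketch with illustrative slot names and values:

    feature_name = ["words", "label"]
    data = [[1, 2, 3, 4], [0]]

    line = zip(feature_name, data)
    print(isinstance(line, (list, tuple)))   # False on Python 3
    line = list(line)                        # [('words', [1, 2, 3, 4]), ('label', [0])]
    print(isinstance(line, (list, tuple)))   # True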
''' + if sys.version > '3' and isinstance(line, zip): + line = list(line) + if not isinstance(line, list) and not isinstance(line, tuple): raise ValueError( "the output of process() must be in list or tuple type" diff --git a/python/paddle/fluid/tests/unittests/test_data_generator.py b/python/paddle/fluid/tests/unittests/test_data_generator.py index 6381cb36402..69d8e01fd46 100644 --- a/python/paddle/fluid/tests/unittests/test_data_generator.py +++ b/python/paddle/fluid/tests/unittests/test_data_generator.py @@ -95,6 +95,32 @@ class MyMultiSlotDataGenerator_error_5(fleet.MultiSlotDataGenerator): return data_iter +class MyMultiSlotStringDataGenerator_zip(fleet.MultiSlotStringDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [["1", "2", "3", "4"], ["0"]] + yield zip(feature_name, data) + + return data_iter + + +class MyMultiSlotDataGenerator_zip(fleet.MultiSlotDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(40): + if i == 1: + yield None + feature_name = ["words", "label"] + data = [[1, 2, 3, 4], [0]] + yield zip(feature_name, data) + + return data_iter + + class TestMultiSlotDataGenerator(unittest.TestCase): def test_MultiSlotDataGenerator_basic(self): my_ms_dg = MyMultiSlotDataGenerator() @@ -149,5 +175,19 @@ class TestMultiSlotDataGenerator_error_5(unittest.TestCase): my_ms_dg.run_from_memory() +class TestMultiSlotStringDataGeneratorZip(unittest.TestCase): + def test_MultiSlotStringDataGenerator_zip(self): + my_ms_dg = MyMultiSlotStringDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + +class TestMultiSlotDataGeneratorZip(unittest.TestCase): + def test_MultiSlotDataGenerator_zip(self): + my_ms_dg = MyMultiSlotDataGenerator_zip() + my_ms_dg.set_batch(1) + my_ms_dg.run_from_memory() + + if __name__ == '__main__': unittest.main() -- GitLab From e804f08559d96a87b8c7eb50120eef68402e4313 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 26 Mar 2021 13:43:48 +0800 Subject: [PATCH 078/486] delete include framework.pb.h (#31859) * delete include framework.pb.h * fix error --- paddle/fluid/framework/custom_operator.cc | 1 - paddle/fluid/framework/executor_gc_helper.cc | 1 - paddle/fluid/framework/ir/graph_pattern_detector.h | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass.cc | 1 - paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc | 1 - paddle/fluid/framework/op_info.h | 1 - paddle/fluid/framework/op_proto_maker.h | 1 - paddle/fluid/framework/op_version_registry.h | 1 - paddle/fluid/framework/operator.h | 1 - paddle/fluid/framework/program_desc.h | 1 - paddle/fluid/framework/reader.h | 1 - paddle/fluid/framework/tensor_util.h | 1 - paddle/fluid/framework/var_type.h | 1 - paddle/fluid/framework/var_type_traits.h | 1 - paddle/fluid/framework/variable_helper.h | 1 - paddle/fluid/imperative/gradient_accumulator.cc | 1 - paddle/fluid/inference/analysis/analysis_pass.h | 1 - paddle/fluid/inference/analysis/helper.cc | 1 - paddle/fluid/inference/analysis/helper.h | 1 - paddle/fluid/inference/engine.h | 1 - paddle/fluid/operators/cast_op.h | 1 - paddle/fluid/operators/distributed_ops/recv_save_op.cc | 1 - .../operators/fused/fused_embedding_eltwise_layernorm_op.cu | 1 - paddle/fluid/operators/inplace_abn_op.cc | 1 - paddle/fluid/operators/one_hot_op.cc | 1 - paddle/fluid/operators/one_hot_op_xpu.cc | 1 - paddle/fluid/operators/one_hot_v2_op.cc | 1 - paddle/fluid/operators/one_hot_v2_op_xpu.cc | 1 
- paddle/fluid/operators/reader/create_py_reader_op.cc | 1 - paddle/fluid/operators/reader/read_op.cc | 1 - paddle/fluid/operators/save_combine_op.h | 1 - paddle/fluid/operators/save_op.h | 1 - paddle/fluid/pybind/pybind.cc | 1 - 33 files changed, 33 deletions(-) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 1ebb8998c85..97d58df6dc5 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/extension/include/ext_tensor.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/custom_tensor_utils.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index c8bc7357904..c06a3d4a183 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -18,7 +18,6 @@ #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 2e518c1d4df..b6c1074d90d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -28,7 +28,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/inference/analysis/dot.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 69edc3d87f9..18d2e9817eb 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -14,7 +14,6 @@ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc index 5fd47b21733..5fe71fbc214 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass_tester.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/layer_norm_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_test_util.h" #include "paddle/fluid/framework/naive_executor.h" diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index af657232e91..ddd84bfd81a 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 912e82f60ef..506c3eb1e0a 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index b9ec5507612..5ae8f255d63 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e9ecf9b5a83..bf27a8e37e0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -27,7 +27,6 @@ limitations under the License. */ #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" diff --git a/paddle/fluid/framework/program_desc.h b/paddle/fluid/framework/program_desc.h index cfef80b8d37..4ceb0c5c824 100644 --- a/paddle/fluid/framework/program_desc.h +++ b/paddle/fluid/framework/program_desc.h @@ -20,7 +20,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/platform/macros.h" diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index a4207deb7e8..e7c23eab1fa 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -20,7 +20,6 @@ #include #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 8a127e0ed59..fd0f98784ce 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index 8affeda67b3..2e35f9b845a 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index b0d8f43a90f..fc754cbaf17 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -21,7 +21,6 @@ #include #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 6e65bc2c932..4cdfba29249 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -15,7 +15,6 @@ limitations under the License. */ #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/variable.h" namespace paddle { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index deb504a1b65..b9df88b1f1e 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -18,7 +18,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/imperative/layer.h" diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h index d5a972fab3b..14a1c3eea34 100644 --- a/paddle/fluid/inference/analysis/analysis_pass.h +++ b/paddle/fluid/inference/analysis/analysis_pass.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/inference/analysis/helper.h" diff --git a/paddle/fluid/inference/analysis/helper.cc b/paddle/fluid/inference/analysis/helper.cc index 368ef2e5583..ede0402f816 100644 --- a/paddle/fluid/inference/analysis/helper.cc +++ b/paddle/fluid/inference/analysis/helper.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/helper.h" -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index ab494993514..cace420d87c 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -25,7 +25,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/enforce.h" diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index 1a13ba51038..e29162cf5b2 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -15,7 +15,6 @@ limitations under the License. 
*/ #pragma once #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 8fa0416049f..cd60c7707cb 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/transform.h" diff --git a/paddle/fluid/operators/distributed_ops/recv_save_op.cc b/paddle/fluid/operators/distributed_ops/recv_save_op.cc index d194fcda36a..d6da818e1df 100644 --- a/paddle/fluid/operators/distributed_ops/recv_save_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_save_op.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 9711cc8d811..14a6608836a 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -14,7 +14,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index 652c071be6b..8234d63d681 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -16,7 +16,6 @@ #include #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/batch_norm_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc index 9c321832f84..64323e588c6 100644 --- a/paddle/fluid/operators/one_hot_op.cc +++ b/paddle/fluid/operators/one_hot_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_op_xpu.cc b/paddle/fluid/operators/one_hot_op_xpu.cc index 14ecd11d114..3e214aa8bf8 100644 --- a/paddle/fluid/operators/one_hot_op_xpu.cc +++ b/paddle/fluid/operators/one_hot_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/one_hot_v2_op.cc b/paddle/fluid/operators/one_hot_v2_op.cc index 29fe6f10c72..c42db1e6f44 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cc +++ b/paddle/fluid/operators/one_hot_v2_op.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/one_hot_v2_op.h" #include #include -#include "paddle/fluid/framework/framework.pb.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/one_hot_v2_op_xpu.cc b/paddle/fluid/operators/one_hot_v2_op_xpu.cc index 6fec597db17..e24be3bead6 100644 --- a/paddle/fluid/operators/one_hot_v2_op_xpu.cc +++ 
b/paddle/fluid/operators/one_hot_v2_op_xpu.cc @@ -16,7 +16,6 @@ #include #include -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/one_hot_op.h" namespace paddle { diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index c04bdb2f109..a7d177f326e 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/operators/reader/py_reader.h" #include "paddle/fluid/operators/reader/reader_op_registry.h" diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 9086291e17d..38894495b4c 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 0246c42d433..939768693a2 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device_context.h" diff --git a/paddle/fluid/operators/save_op.h b/paddle/fluid/operators/save_op.h index fbde722a425..e44a5c77bd8 100644 --- a/paddle/fluid/operators/save_op.h +++ b/paddle/fluid/operators/save_op.h @@ -19,7 +19,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c8ca3bf2c8f..e1ff69e7485 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" -- GitLab From 70b67f1029a8ddfa68cf2a6f0d5631b95ff591bd Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 26 Mar 2021 13:45:31 +0800 Subject: [PATCH 079/486] fix go api bug. 
(#31857) --- go/README_cn.md | 1 + go/demo/mobilenet.go | 2 +- go/paddle/common.go | 2 +- go/paddle/config.go | 2 +- go/paddle/predictor.go | 4 ++-- go/paddle/tensor.go | 4 ++-- paddle/fluid/inference/capi/pd_predictor.cc | 9 ++++++--- 7 files changed, 14 insertions(+), 10 deletions(-) diff --git a/go/README_cn.md b/go/README_cn.md index a184ecbb8de..040540e939b 100644 --- a/go/README_cn.md +++ b/go/README_cn.md @@ -50,6 +50,7 @@ output_data := value.Interface().([][]float32) 运行 ```bash +go mod init github.com/paddlepaddle export LD_LIBRARY_PATH=`pwd`/paddle_c/paddle/lib:$LD_LIBRARY_PATH go run ./demo/mobilenet.go ``` diff --git a/go/demo/mobilenet.go b/go/demo/mobilenet.go index 1b42fe8049a..c1ca2e967f7 100644 --- a/go/demo/mobilenet.go +++ b/go/demo/mobilenet.go @@ -13,7 +13,7 @@ // limitations under the License. package main -import "../paddle" +import "github.com/paddlepaddle/paddle" import "strings" import "io/ioutil" import "strconv" diff --git a/go/paddle/common.go b/go/paddle/common.go index 4bf94765931..cbbde6a45f5 100644 --- a/go/paddle/common.go +++ b/go/paddle/common.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include import "C" diff --git a/go/paddle/config.go b/go/paddle/config.go index 89f7d7e63ff..68a31230997 100644 --- a/go/paddle/config.go +++ b/go/paddle/config.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include diff --git a/go/paddle/predictor.go b/go/paddle/predictor.go index 59bad908e6a..5f2b2c81a60 100644 --- a/go/paddle/predictor.go +++ b/go/paddle/predictor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include "paddle_c_api.h" import "C" @@ -88,7 +88,7 @@ func (predictor *Predictor) GetInputNames() []string { } func (predictor *Predictor) GetOutputNames() []string { - names := make([]string, predictor.GetInputNum()) + names := make([]string, predictor.GetOutputNum()) for i := 0; i < len(names); i++ { names[i] = predictor.GetOutputName(i) } diff --git a/go/paddle/tensor.go b/go/paddle/tensor.go index e6e2c53fef1..6fbcf039f88 100644 --- a/go/paddle/tensor.go +++ b/go/paddle/tensor.go @@ -15,7 +15,7 @@ package paddle // #cgo CFLAGS: -I${SRCDIR}/../paddle_c/paddle/include -// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_fluid_c +// #cgo LDFLAGS: -L${SRCDIR}/../paddle_c/paddle/lib -lpaddle_inference_c // #include // #include // #include @@ -209,7 +209,7 @@ func DecodeTensor(r *bytes.Reader, shape []int32, t reflect.Type, ptr reflect.Va value := reflect.Indirect(ptr) value.Set(reflect.MakeSlice(t, int(shape[0]), int(shape[0]))) if len(shape) == 1 && value.Len() > 0 { - switch value.Index(1).Kind() { + switch value.Index(0).Kind() { case reflect.Uint8, reflect.Int32, reflect.Int64, reflect.Float32: binary.Read(r, Endian(), value.Interface()) return diff --git a/paddle/fluid/inference/capi/pd_predictor.cc b/paddle/fluid/inference/capi/pd_predictor.cc index c1bf4c974fa..c4e195b6ec8 100644 --- a/paddle/fluid/inference/capi/pd_predictor.cc +++ 
b/paddle/fluid/inference/capi/pd_predictor.cc
@@ -207,13 +207,16 @@ int PD_GetOutputNum(const PD_Predictor* predictor) {
 }

 const char* PD_GetInputName(const PD_Predictor* predictor, int n) {
-  static std::vector<std::string> names = predictor->predictor->GetInputNames();
+  static std::vector<std::string> names;
+  names.resize(predictor->predictor->GetInputNames().size());
+  names[n] = predictor->predictor->GetInputNames()[n];
   return names[n].c_str();
 }

 const char* PD_GetOutputName(const PD_Predictor* predictor, int n) {
-  static std::vector<std::string> names =
-      predictor->predictor->GetOutputNames();
+  static std::vector<std::string> names;
+  names.resize(predictor->predictor->GetOutputNames().size());
+  names[n] = predictor->predictor->GetOutputNames()[n];
   return names[n].c_str();
 }

--
GitLab

From 01aa252624a639552116a0c46188ca7f5c43a1ee Mon Sep 17 00:00:00 2001
From: zlsh80826
Date: Fri, 26 Mar 2021 15:58:50 +0800
Subject: [PATCH 080/486] [Paddle-TRT] multiclass nms (#31742)

* add multiclass_nms
* add multiclass_nms unittest
* add default enable_tensorrt_oss option
* refine multiclass nms unittest and add serialization/dynamic test
* change super to InferencePassTest for python2 compatibility
* refine multiclass nms unittest
* move out dynamic shape test due to ci timelimit
---
 .../fluid/inference/api/analysis_predictor.cc | 2 +-
 .../inference/tensorrt/convert/CMakeLists.txt | 2 +-
 .../tensorrt/convert/multiclass_nms_op.cc | 133 ++++++++++++++++
 paddle/fluid/inference/tensorrt/op_teller.cc | 34 ++++-
 .../ir/inference/inference_pass_test.py | 3 +
 .../inference/test_trt_multiclass_nms_op.py | 144 ++++++++++++++++++
 6 files changed, 315 insertions(+), 3 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 8f2b217a2fd..0007582e2c7 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -1192,7 +1192,7 @@ USE_TRT_CONVERTER(scale);
 USE_TRT_CONVERTER(stack);
 USE_TRT_CONVERTER(clip);
 USE_TRT_CONVERTER(gather);
-
+USE_TRT_CONVERTER(multiclass_nms);
 USE_TRT_CONVERTER(nearest_interp);
 #endif

diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index b0d0229ec05..be7fa0548d9 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -6,7 +6,7 @@ nv_library(tensorrt_converter
 shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc
 emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc
-
+ multiclass_nms_op.cc
 nearest_interp_op.cc
 DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry)

diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
new file mode 100644
index 00000000000..b0d67a5bf90
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace proto {
+class OpDesc;
+}  // namespace proto
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class MultiClassNMSOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(3) << "convert a fluid multiclassNMS op to tensorrt plugin";
+
+    // for now, only work for static shape and regular tensor
+    framework::OpDesc op_desc(op, nullptr);
+
+    std::string bboxes = op_desc.Input("BBoxes").front();
+    std::string scores = op_desc.Input("Scores").front();
+    std::string output_name = op_desc.Output("Out").front();
+
+    auto* bboxes_tensor = engine_->GetITensor(bboxes);
+    auto* scores_tensor = engine_->GetITensor(scores);
+
+    int background_label =
+        BOOST_GET_CONST(int, op_desc.GetAttr("background_label"));
+    float score_threshold =
+        BOOST_GET_CONST(float, op_desc.GetAttr("score_threshold"));
+    int nms_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("nms_top_k"));
+    float nms_threshold =
+        BOOST_GET_CONST(float, op_desc.GetAttr("nms_threshold"));
+    int keep_top_k = BOOST_GET_CONST(int, op_desc.GetAttr("keep_top_k"));
+    bool normalized = BOOST_GET_CONST(bool, op_desc.GetAttr("normalized"));
+    int num_classes = scores_tensor->getDimensions().d[0];
+
+    auto bboxes_dims = bboxes_tensor->getDimensions();
+    nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]);
+    auto* bboxes_expand_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor);
+    bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims);
+
+    nvinfer1::Permutation permutation{1, 0};
+    auto* scores_transpose_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor);
+    scores_transpose_layer->setFirstTranspose(permutation);
+
+    std::vector<nvinfer1::ITensor*> batch_nms_inputs;
+    batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0));
+    batch_nms_inputs.push_back(scores_transpose_layer->getOutput(0));
+
+    constexpr bool shareLocation = true;
+    constexpr bool clip_boxes = false;
+
+    const std::vector<nvinfer1::PluginField> fields{
+        {"shareLocation", &shareLocation, nvinfer1::PluginFieldType::kINT32, 1},
+        {"backgroundLabelId", &background_label,
+         nvinfer1::PluginFieldType::kINT32, 1},
+        {"numClasses", &num_classes, nvinfer1::PluginFieldType::kINT32, 1},
+        {"topK", &nms_top_k, nvinfer1::PluginFieldType::kINT32, 1},
+        {"keepTopK", &keep_top_k, nvinfer1::PluginFieldType::kINT32, 1},
+        {"scoreThreshold", &score_threshold,
+         nvinfer1::PluginFieldType::kFLOAT32, 1},
+        {"iouThreshold", &nms_threshold, nvinfer1::PluginFieldType::kFLOAT32,
+         1},
+        {"isNormalized", &normalized, nvinfer1::PluginFieldType::kINT32, 1},
+        {"clipBoxes", &clip_boxes, nvinfer1::PluginFieldType::kINT32, 1},
+    };
+
+    nvinfer1::PluginFieldCollection* plugin_collections =
+        static_cast<nvinfer1::PluginFieldCollection*>(
+            malloc(sizeof(*plugin_collections) +
+                   fields.size() * sizeof(nvinfer1::PluginField)));
+    plugin_collections->nbFields = static_cast<int>(fields.size());
+    plugin_collections->fields =
fields.data(); + + auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + auto batch_nms_plugin = + creator->createPlugin("BatchNMSPlugin", plugin_collections); + free(plugin_collections); + + auto batch_nms_layer = engine_->network()->addPluginV2( + batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + auto nmsed_boxes = batch_nms_layer->getOutput(1); + auto nmsed_scores = batch_nms_layer->getOutput(2); + auto nmsed_classes = batch_nms_layer->getOutput(3); + + auto nmsed_scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + auto nmsed_classes_reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + std::vector concat_inputs; + concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); + concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); + concat_inputs.push_back(nmsed_boxes); + + auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( + engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); + nms_concat_layer->setAxis(1); + + RreplenishLayerAndOutput(nms_concat_layer, "multiclass_nms", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(multiclass_nms, MultiClassNMSOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 11752d71a45..82f58254fe8 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,7 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", - + "multiclass_nms", "nearest_interp", }; }; @@ -195,6 +195,38 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, // current not support axis from input, use default 0 if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + + if (op_type == "multiclass_nms") { + if (with_dynamic_shape) return false; + auto* block = desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() != 3) { + VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + "but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + bool has_attrs = + (desc.HasAttr("background_label") && + desc.HasAttr("score_threshold") && desc.HasAttr("nms_top_k") && + desc.HasAttr("keep_top_k") && desc.HasAttr("normalized")); + if (has_attrs == false) return false; + + auto nms_top_k = BOOST_GET_CONST(int, desc.GetAttr("nms_top_k")); + if (nms_top_k < 0) return false; + + auto keep_top_k = BOOST_GET_CONST(int, desc.GetAttr("keep_top_k")); + if (keep_top_k < 0) return false; + + auto registry = GetPluginRegistry(); + if (registry == nullptr) return false; + } + if (op_type == "fc" || op_type == "mul") { const int x_num_col_dims = desc.HasAttr("x_num_col_dims") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 993493a3ccf..010086bfbbc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ 
-46,6 +46,7 @@ class InferencePassTest(unittest.TestCase): self.enable_mkldnn = False self.enable_mkldnn_bfloat16 = False self.enable_trt = False + self.enable_tensorrt_oss = True self.trt_parameters = None self.dynamic_shape_params = None self.enable_lite = False @@ -133,6 +134,8 @@ class InferencePassTest(unittest.TestCase): self.dynamic_shape_params.max_input_shape, self.dynamic_shape_params.optim_input_shape, self.dynamic_shape_params.disable_trt_plugin_fp16) + if self.enable_tensorrt_oss: + config.enable_tensorrt_oss() elif use_mkldnn: config.enable_mkldnn() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py new file mode 100644 index 00000000000..3ca69859859 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_multiclass_nms_op.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTMultiClassNMSTest(InferencePassTest): + def setUp(self): + self.enable_trt = True + self.enable_tensorrt_oss = True + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.bs = 1 + self.background_label = -1 + self.score_threshold = .5 + self.nms_top_k = 8 + self.nms_threshold = .3 + self.keep_top_k = 8 + self.normalized = False + self.num_classes = 8 + self.num_boxes = 8 + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + def build(self): + with fluid.program_guard(self.main_program, self.startup_program): + boxes = fluid.data( + name='bboxes', shape=[-1, self.num_boxes, 4], dtype='float32') + scores = fluid.data( + name='scores', + shape=[-1, self.num_classes, self.num_boxes], + dtype='float32') + multiclass_nms_out = fluid.layers.multiclass_nms( + bboxes=boxes, + scores=scores, + background_label=self.background_label, + score_threshold=self.score_threshold, + nms_top_k=self.nms_top_k, + nms_threshold=self.nms_threshold, + keep_top_k=self.keep_top_k, + normalized=self.normalized) + mutliclass_nms_out = multiclass_nms_out + 1. 
+ multiclass_nms_out = fluid.layers.reshape( + multiclass_nms_out, [self.bs, 1, self.keep_top_k, 6], + name='reshape') + out = fluid.layers.batch_norm(multiclass_nms_out, is_test=True) + + boxes_data = np.arange(self.num_boxes * 4).reshape( + [self.bs, self.num_boxes, 4]).astype('float32') + scores_data = np.arange(1 * self.num_classes * self.num_boxes).reshape( + [self.bs, self.num_classes, self.num_boxes]).astype('float32') + self.feeds = { + 'bboxes': boxes_data, + 'scores': scores_data, + } + self.fetch_list = [out] + + def run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + dynamic_shape_opt = [ + None, InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + ] + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.build() + self.check_output() + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + max_shape = { + 'bboxes': [self.bs, self.num_boxes, 4], + 'scores': [self.bs, self.num_classes, self.num_boxes], + } + opt_shape = max_shape + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'bboxes': [1, 1, 4], + 'scores': [1, 1, 1] + }, max_shape, opt_shape, False) + self.run_test() + + def test_background(self): + self.background = 7 + self.run_test() + + def test_disable_oss(self): + self.diable_tensorrt_oss = False + self.run_test() + + +if __name__ == "__main__": + unittest.main() -- GitLab From c3974d0e2a6353f3a134e8925aeb15cac7f0e48b Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 26 Mar 2021 18:11:51 +0800 Subject: [PATCH 081/486] [3D-parallel] Reformat pipeline parallel (#31786) * update, test=develop --- paddle/fluid/framework/section_worker.cc | 20 +- .../fleet/meta_optimizers/common.py | 41 +- .../meta_optimizers/pipeline_optimizer.py | 308 +++--- .../contrib/mixed_precision/fp16_utils.py | 10 +- python/paddle/fluid/device_worker.py | 2 +- python/paddle/fluid/executor.py | 23 +- python/paddle/fluid/optimizer.py | 954 +++++++++++------- .../fluid/tests/unittests/pipeline_mnist.py | 27 +- 8 files changed, 816 insertions(+), 569 deletions(-) diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 90a371e4747..e740771e5ca 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -39,13 +39,13 @@ void SectionWorker::RunForward( int op_role = op->Attr(std::string("op_role")); // We run op with op_role = kLRSched only for the first microbatch // to avoid increasing the @LR_DECAY_STEP@ multiple times. 
-      bool run_first_mbatch = op_role == static_cast<int>(OpRole::kForward) ||
-                              op_role == (static_cast<int>(OpRole::kForward) |
-                                          static_cast<int>(OpRole::kLoss)) ||
-                              op_role == static_cast<int>(OpRole::kLRSched);
-      bool run_others = op_role == static_cast<int>(OpRole::kForward) ||
-                        op_role == (static_cast<int>(OpRole::kForward) |
-                                    static_cast<int>(OpRole::kLoss));
+      bool run_first_mbatch = (op_role == static_cast<int>(OpRole::kForward)) ||
+                              (op_role == (static_cast<int>(OpRole::kForward) |
+                                           static_cast<int>(OpRole::kLoss))) ||
+                              (op_role == static_cast<int>(OpRole::kLRSched));
+      bool run_others = (op_role == static_cast<int>(OpRole::kForward)) ||
+                        (op_role == (static_cast<int>(OpRole::kForward) |
+                                     static_cast<int>(OpRole::kLoss)));
       if ((micro_id == 0 && run_first_mbatch) || (micro_id != 0 && run_others)) {
         VLOG(3) << "Forward: running op " << op->Type() << " for micro-batch "
                 << micro_id;
@@ -64,9 +64,9 @@ void SectionWorker::RunBackward(
         &unused_vars_) {
   for (auto &op : ops_) {
     int op_role = op->Attr<int>(std::string("op_role"));
-    if (op_role == static_cast<int>(OpRole::kBackward) ||
-        op_role == (static_cast<int>(OpRole::kBackward) |
-                    static_cast<int>(OpRole::kLoss))) {
+    if ((op_role == static_cast<int>(OpRole::kBackward)) ||
+        (op_role == (static_cast<int>(OpRole::kBackward) |
+                     static_cast<int>(OpRole::kLoss)))) {
       VLOG(3) << "Backward: running op " << op->Type() << " for micro-batch "
               << micro_id;
       op->Run(*microbatch_scopes_[micro_id], place_);

diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py
index 00d58cbd997..c3d27bcc4ea 100644
--- a/python/paddle/distributed/fleet/meta_optimizers/common.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -47,7 +47,7 @@ def is_optimizer_op(op):


 class CollectiveHelper(object):
-    def __init__(self, role_maker, nrings=1, wait_port='6174'):
+    def __init__(self, role_maker, nrings=1, wait_port=True):
         self.nrings = nrings
         self.wait_port = wait_port
         self.role_maker = role_maker
@@ -65,14 +65,48 @@ class CollectiveHelper(object):
                 self.role_maker._worker_index(), ring_id, self.wait_port)
         self._broadcast_params()

-    def _init_communicator(self, program, current_endpoint, endpoints, rank,
-                           ring_id, wait_port):
+    def _init_communicator(self,
+                           program,
+                           current_endpoint,
+                           endpoints,
+                           rank,
+                           ring_id,
+                           wait_port,
+                           global_ring_id=None,
+                           sync=True):
         nranks = len(endpoints)
         other_endpoints = endpoints[:]
         other_endpoints.remove(current_endpoint)
         if rank == 0 and wait_port:
             wait_server_ready(other_endpoints)

+        def _add_sync_by_allreduce(block):
+            sync_var = block.create_var(
+                name=unique_name.generate('sync_var'),
+                dtype=core.VarDesc.VarType.INT32,
+                persistable=False,
+                stop_gradient=True)
+            block.append_op(
+                type='fill_constant',
+                inputs={},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'shape': [1],
+                    'dtype': sync_var.dtype,
+                    'value': 1,
+                    'force_cpu': False,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+            block.append_op(
+                type='c_allreduce_sum',
+                inputs={'X': [sync_var]},
+                outputs={'Out': [sync_var]},
+                attrs={
+                    'ring_id': global_ring_id,
+                    'use_calc_stream': True,
+                    OP_ROLE_KEY: OpRole.Forward
+                })
+
         block = program.global_block()
         if core.is_compiled_with_cuda():
             comm_id_var = block.create_var(
) + if sync: _add_sync_by_allreduce(block) def _wait(self, current_endpoint, endpoints): assert (self.wait_port) diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 9535c9ef53c..6f435bb86ba 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -19,130 +19,21 @@ from paddle.fluid import core, unique_name from ..base.private_helper_function import wait_server_ready from paddle.fluid.optimizer import PipelineOptimizer as PO from .meta_optimizer_base import MetaOptimizerBase -from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_update_op, is_loss_grad_op, is_backward_op, is_optimizer_op - - -def _get_node_num(endpoints): - ss = set() - for ep in endpoints: - ip = ep.split(":")[0].strip() - if ip not in ss: - ss.add(ip) - return len(ss) - - -class PipelineHelper(object): - def __init__(self, role_maker, wait_port='6174'): - self.wait_port = wait_port - self.role_maker = role_maker - - def update_startup_program(self, - startup_program=None, - inner_parallelism=None): - self.startup_program = startup_program - - nranks = self.role_maker._worker_num() - rank = self.role_maker._worker_index() - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[rank] - node_num = _get_node_num(endpoints) - assert nranks % node_num == 0 - - # Create ring 0 for all gpus in the same pipeline - if inner_parallelism > 1: - pipeline_rank = rank % inner_parallelism - pipeline_id = rank // inner_parallelism - start_index = pipeline_id * inner_parallelism - pipeline_endpoints = endpoints[start_index:start_index + - inner_parallelism] - self._init_communicator(self.startup_program, current_endpoint, - pipeline_endpoints, pipeline_rank, 0, - self.wait_port) - - pipeline_num = len(endpoints) // inner_parallelism - if pipeline_num == 1: return - # Create rings for gpus with the same pipeline id for data parallel - eps = [] - pipeline_rank = rank % inner_parallelism - ring_id = pipeline_rank + 1 - for i in range(pipeline_num): - eps.append(endpoints[i * inner_parallelism + pipeline_rank]) - # rank in a ring of gpus with the same pipeline id for data parallel - dp_rank = rank // inner_parallelism - self._init_communicator(self.startup_program, current_endpoint, eps, - dp_rank, ring_id, self.wait_port) - self._broadcast_params(ring_id) - - def _init_communicator(self, program, current_endpoint, endpoints, rank, - ring_id, wait_port): - nranks = len(endpoints) - other_endpoints = endpoints[:] - other_endpoints.remove(current_endpoint) - if rank == 0 and wait_port: - wait_server_ready(other_endpoints) - - block = program.global_block() - nccl_id_var = block.create_var( - name=unique_name.generate('nccl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op( - type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints, - OP_ROLE_KEY: OpRole.Forward, - }) - block.append_op( - type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward, - }) - - def _broadcast_params(self, ring_id): - block = self.startup_program.global_block() - for var_name in block.vars: - if "nccl_id" in var_name: continue - param = block.var(var_name) - if not 
param.persistable: - continue - - block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': ring_id, - 'root': 0, - OP_ROLE_KEY: OpRole.Forward - }) - - block.append_op( - type='c_sync_comm_stream', - inputs={'X': param}, - outputs={'Out': param}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Forward}) +from .common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY, CollectiveHelper, is_loss_grad_op, is_backward_op, is_optimizer_op class PipelineOptimizer(MetaOptimizerBase): def __init__(self, optimizer): super(PipelineOptimizer, self).__init__(optimizer) self.inner_opt = optimizer - # we do not allow meta optimizer to be inner optimizer currently self.meta_optimizers_white_list = [ "RecomputeOptimizer", "AMPOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self.global_ring_id = 1 + self.dp_ring_id = 2 + self.start_pipeline_ring_id = 20 # Just a magic number def _set_basic_info(self, loss, role_maker, user_defined_optimizer, user_defined_strategy): @@ -165,7 +56,11 @@ class PipelineOptimizer(MetaOptimizerBase): def _disable_strategy(self, dist_strategy): dist_strategy.pipeline = False - dist_strategy.pipeline_configs = {} + dist_strategy.pipeline_configs = { + "micro_batch_size": 1, + "accumulate_steps": 1, + "schedule_mode": "1F1B", + } def _enable_strategy(self, dist_strategy, context): dist_strategy.pipeline = True @@ -175,61 +70,134 @@ class PipelineOptimizer(MetaOptimizerBase): "schedule_mode": "1F1B", } + def _broadcast_params(self, ring_id): + block = self.startup_program.global_block() + param = None + for param in block.iter_parameters(): + if param.is_distributed: + continue + + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + + if not param: return # no parameter on this device + block.append_op( + type='c_sync_comm_stream', + inputs={'X': param}, + outputs={'Out': param}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + def _get_process_group_info(self): + # global ring info + self.global_endpoints = self.endpoints + self.global_rank = self.rank + self.global_nranks = self.nranks + + # data parallel ring info + if self.pipeline_num > 1: + self.dp_rank = self.rank // self.inner_parallelism + self.dp_nranks = self.nranks // self.inner_parallelism + start_index = self.rank % self.inner_parallelism + self.dp_endpoints = [ + self.endpoints[start_index + i * self.inner_parallelism] + for i in range(self.pipeline_num) + ] + + def _init_process_group(self, pipeline_pair, pipeline_ring_map): + self._get_process_group_info() + collective_helper = CollectiveHelper(self.role_maker, wait_port=False) + # Create global ring for all gpus (ring_id = 0) + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.global_endpoints, + self.global_rank, self.global_ring_id, True, self.global_ring_id, + True) + # Create pipeline rings + if self.inner_parallelism > 1: + pipeline_id = self.rank // self.inner_parallelism + start_index = pipeline_id * self.inner_parallelism + for pair in pipeline_pair: + pair_key = pair[0] * 1000 + pair[1] + ring_id = pipeline_ring_map[pair_key] + assert ring_id >= self.start_pipeline_ring_id + first_node = pair[0] + start_index + second_node = pair[1] + start_index + if self.rank != first_node and self.rank != second_node: + continue + pipeline_endpoints = [ + self.endpoints[first_node], self.endpoints[second_node] + ] + 
pipeline_rank = 0 if self.rank == first_node else 1 + pipeline_nranks = 2 + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, + pipeline_endpoints, pipeline_rank, ring_id, False, + self.global_ring_id, True) + + # Create dp rings + if self.pipeline_num > 1: + collective_helper._init_communicator( + self.startup_program, self.current_endpoint, self.dp_endpoints, + self.dp_rank, self.dp_ring_id, True, self.global_ring_id, True) + self._broadcast_params(self.dp_ring_id) + def minimize_impl(self, loss, startup_program=None, parameter_list=None, no_grad_set=None): - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker._worker_index()] - self.wrapped_opt = PO(self.inner_opt, - num_microbatches=self.num_microbatches) - node_num = _get_node_num(endpoints) - gpus_per_node = len(endpoints) // node_num - self.startup_program = startup_program - if startup_program is None: - self.startup_program = fluid.default_startup_program() - + self.endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.endpoints[self.role_maker._worker_index()] self.rank = self.role_maker._worker_index() self.nranks = self.role_maker._worker_num() - assert self.nranks % node_num == 0 - loss.block.program._pipeline_opt = dict() - loss.block.program._pipeline_opt['local_rank'] = self.rank - loss.block.program._pipeline_opt[ - 'micro_batch_size'] = self.micro_batch_size - loss.block.program._pipeline_opt['schedule_mode'] = self.schedule_mode - optimize_ops, params_grads, prog_list = self.wrapped_opt.minimize( + self.wrapped_opt = PO(self.inner_opt, + num_microbatches=self.num_microbatches) + orig_startup_program = startup_program if startup_program else fluid.default_startup_program( + ) + block = loss.block + program = block.program + + program._pipeline_opt = dict() + program._pipeline_opt['local_rank'] = self.rank + program._pipeline_opt['global_ring_id'] = self.global_ring_id + program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id + program._pipeline_opt['micro_batch_size'] = self.micro_batch_size + program._pipeline_opt['schedule_mode'] = self.schedule_mode + optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) - assert prog_list - - self.main_program_list = prog_list - self.main_program = loss.block.program - self.inner_parallelism = loss.block.program._pipeline_opt[ - 'inner_parallelism'] + self.startup_program = orig_startup_program._pipeline_opt[ + 'startup_program'] + self.inner_parallelism = program._pipeline_opt['inner_parallelism'] assert self.nranks % self.inner_parallelism == 0 + assert prog_list + self.pipeline_num = len(self.endpoints) // self.inner_parallelism - pipeline_helper = PipelineHelper(self.role_maker) - pipeline_helper.update_startup_program( - self.startup_program._pipeline_opt["startup_program"], - self.inner_parallelism) + self._init_process_group(pp_pair, ring_map) - pipeline_num = self.nranks // self.inner_parallelism - self._transpile_main_program(loss, pipeline_num, self.inner_parallelism) + self.main_program_list = prog_list + self.main_program = program + if self.pipeline_num > 1: + self._transpile_main_program(loss) return optimize_ops, params_grads - def _transpile_main_program(self, loss, pipeline_num, inner_parallelism): - if pipeline_num <= 1: return - self._insert_loss_grad_ops(loss, pipeline_num) - for ring_id in range(1, inner_parallelism + 1): - self._insert_allreduce_ops(ring_id) + 
def _transpile_main_program(self, loss): + self._insert_loss_grad_ops(loss, self.pipeline_num) + self._insert_allreduce_ops(self.dp_ring_id) def _insert_loss_grad_ops(self, loss, pipeline_num): """ In order to keep the learning rate consistent in different numbers of training workers, we scale the loss grad by the number of workers """ - block = self.main_program_list[-1]['program'].global_block() + block = self.main_program_list[-1].global_block() for idx, op in reversed(list(enumerate(block.ops))): if is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] @@ -244,57 +212,53 @@ class PipelineOptimizer(MetaOptimizerBase): }) def _insert_allreduce_ops(self, ring_id): - block = self.main_program_list[ring_id - 1]['program'].global_block() + block = self.main_program._pipeline_opt['section_program'].global_block( + ) origin_block = self.main_program.global_block() grad = None processed_param_name = set() + first_optimize_op_idx = None + add_sync_calc_stream = False for idx, op in reversed(list(enumerate(block.ops))): + if is_backward_op(op) and not first_optimize_op_idx: + first_optimize_op_idx = idx + 1 + # no optimize phase + if first_optimize_op_idx == len(block.ops): return if is_backward_op(op) and \ OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] if len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = idx + offset = 0 for i in range(0, len(op_role_var), 2): param_name = op_role_var[i] param = block.vars[op_role_var[i]] if param_name in processed_param_name: continue processed_param_name.add(param_name) - grad = block.vars[op_role_var[i + 1]] + grad_name = op_role_var[i + 1] + if not 'MERGED' in grad_name: grad_name += '@MERGED' + grad = block.vars[grad_name] origin_param = origin_block.vars[op_role_var[i]] if origin_param.is_distributed: continue - if offset == idx: - offset += 1 + if not add_sync_calc_stream: + add_sync_calc_stream = True block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_sync_calc_stream', inputs={'X': grad}, outputs={'Out': grad}, - attrs={OP_ROLE_KEY: OpRole.Backward}) + attrs={OP_ROLE_KEY: OpRole.Optimize}) offset += 1 block._insert_op( - offset, + first_optimize_op_idx + offset, type='c_allreduce_sum', inputs={'X': grad}, outputs={'Out': grad}, attrs={ 'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize }) - - if grad is None: - return - - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - block._insert_op( - idx, - type='c_sync_comm_stream', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward}) - break diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index f9c3a613c40..67e83a2ec46 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -123,7 +123,8 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): outputs={"Out": out_var}, attrs={ "in_dtype": in_var.dtype, - "out_dtype": out_var.dtype + "out_dtype": out_var.dtype, + "op_device": op.attr("op_device") }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) @@ -171,8 +172,11 @@ def _insert_cast_post_op(block, op, idx, src_dtype, dest_dtype, target_name, type="cast", inputs={"X": target_var}, outputs={"Out": cast_var}, - attrs={"in_dtype": target_var.dtype, - "out_dtype": cast_var.dtype}) + attrs={ + "in_dtype": 
target_var.dtype, + "out_dtype": cast_var.dtype, + "op_device": op.attr("op_device") + }) num_cast_ops += 1 op_var_rename_map[block.idx][target_var.name] = cast_var.name diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index b923f36af8d..0f98af57723 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -427,7 +427,7 @@ class Section(DeviceWorker): section_param.schedule_mode = schedule_mode cfg = section_param.section_config program = pipeline_opt["section_program"] - cfg.program_desc.ParseFromString(program["program"]._get_desc() + cfg.program_desc.ParseFromString(program._get_desc() .serialize_to_string()) # TODO: why does not work # cfg.program_desc.CopyFrom(program.program._get_desc()) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 9b0b04a6ea7..da326ec074c 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1458,7 +1458,7 @@ class Executor(object): dataset._prepare_to_run() real_fetch_list = [] if program._pipeline_opt: - real_program = program._pipeline_opt["section_program"]['program'] + real_program = program._pipeline_opt["section_program"] for fetch_var in fetch_list: if isinstance(fetch_var, Variable): fetch_var_name = fetch_var.name @@ -1467,13 +1467,20 @@ class Executor(object): if fetch_var_name in real_program.global_block().vars: real_fetch_list.append(fetch_var) - program._pipeline_opt["section_program"][ - 'program'] = self._add_feed_fetch_ops( - program=program._pipeline_opt["section_program"]['program'], - feed=[], - fetch_list=real_fetch_list, - feed_var_name='feed', - fetch_var_name='fetch') + program._pipeline_opt["section_program"] = self._add_feed_fetch_ops( + program=program._pipeline_opt["section_program"], + feed=[], + fetch_list=real_fetch_list, + feed_var_name='feed', + fetch_var_name='fetch') + main_block = program._pipeline_opt["section_program"].block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) fetch_list = None scope, trainer = self._prepare_trainer( diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 9c724cbfdd4..2aa918bf806 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3784,6 +3784,12 @@ class PipelineOptimizer(object): "Optimizer, but the given type is {}.".format( type(optimizer))) self._optimizer = optimizer + + # Get the original optimizer defined by users, such as SGD + self._origin_optimizer = self._optimizer + while hasattr(self._origin_optimizer, "inner_opt"): + self._origin_optimizer = self._origin_optimizer.inner_opt + assert num_microbatches >= 1, ( "num_microbatches must be a positive value.") self._num_microbatches = num_microbatches @@ -3797,13 +3803,98 @@ class PipelineOptimizer(object): self._op_role_var_key = op_maker.kOpRoleVarAttrName() self._op_device_key = op_maker.kOpDeviceAttrName() self._param_device_map = None + self._pipeline_pair = [] + self._pp_ring_map = dict() + self._global_ring_id = None + + # insert allreduce op to sync global information for global + # gradient clip and amp + def _insert_allreduce_op(self, op_idx, block): + """ + Insert allreduce op to sync global information for global + gradient clip and amp. 
+ """ + op = block.ops[op_idx] + out_name = op.desc.output_arg_names()[0] + out_var = block.var(out_name) + offset = 0 + if op.type == "reduce_any": + # cast the bool var to int32 to use allreduce_max op + temp_var_name = unique_name.generate(out_name + "_cast_int32") + temp_var = block.create_var( + name=temp_var_name, shape=[1], dtype="int32") + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': out_var}, + outputs={'Out': temp_var}, + attrs={ + 'in_dtype': out_var.dtype, + 'out_dtype': temp_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + offset += 1 + block._insert_op( + op_idx + 1 + offset, + type='c_allreduce_max' + if op.type == "reduce_any" else 'c_allreduce_sum', + inputs={'X': temp_var if op.type == "reduce_any" else out_var}, + outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, + attrs={ + 'ring_id': self._global_ring_id, + self._op_role_key: self._op_role.Optimize, + 'use_calc_stream': True + }) + offset += 1 + if op.type == "reduce_any": + block._insert_op( + op_idx + 1 + offset, + type='cast', + inputs={'X': temp_var}, + outputs={'Out': out_var}, + attrs={ + 'in_dtype': temp_var.dtype, + 'out_dtype': out_var.dtype, + self._op_role_key: self._op_role.Optimize + }) + return offset def _create_vars(self, block, ori_block): - # Create vars for block, copied from main_program's global block + # Create vars for block, copied from ori_block used_var_set = set() - for op_idx in range(block.desc.op_size()): - op_desc = block.desc.op(op_idx) - vars = op_desc.input_arg_names() + op_desc.output_arg_names() + added_op_num = 0 + op_idx = 0 + op_size = block.desc.op_size() + while op_idx < op_size + added_op_num: + # Whether to insert allreduce_sum or allreduce_max op. + # For amp and global gradient clip strategies, we should + # get the global information, so allreduce op is needed. 
+ should_insert = False + op = block.ops[op_idx] + # For op process vars on all devices, remove its input + # vars not in this block + reserved_x = [] + if op.type == 'reduce_any' and self._is_optimize_op(op): + should_insert = True + elif op.type == 'concat' and self._is_optimize_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + elif op.type == 'update_loss_scaling': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + elif op.type == 'sum' and self._is_gradient_clip_op(op): + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + should_insert = True + + vars = op.desc.input_arg_names() + op.desc.output_arg_names() for var in vars: # a var whose name contains "blocking_queue" # only exists in startup program @@ -3813,27 +3904,39 @@ class PipelineOptimizer(object): if block._find_var_recursive(str(var)): continue source_var = ori_block._var_recursive(str(var)) if source_var.type == core.VarDesc.VarType.READER: - block.create_var( + dest_var = block.create_var( name=var, type=core.VarDesc.VarType.READER, persistable=source_var.persistable) else: - block._clone_variable(source_var, False) + dest_var = block._clone_variable(source_var, False) + dest_var.stop_gradient = source_var.stop_gradient + # When use with sharding, allreduce_sum and allreduce_max + # used for global gradient clip and amp will be added by sharding. + op_idx += 1 + if self.use_sharding or not should_insert: continue + inserted_ops = self._insert_allreduce_op(op_idx - 1, block) + added_op_num += inserted_ops + op_idx += inserted_ops + block._sync_with_cpp() def _is_loss_grad_op(self, op): - if self._op_role_key not in op.attr_names: - return False - op_role = int(op.all_attrs()[self._op_role_key]) + assert self._op_role_key in op.attr_names + op_role = int(op.attr(self._op_role_key)) return op_role & int(self._op_role.Backward) and op_role & int( self._op_role.Loss) def _is_backward_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Backward) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Backward)) + + def _is_loss_op(self, op): + assert self._op_role_key in op.attr_names + return int(op.attr(self._op_role_key)) == int(self._op_role.Loss) def _is_optimize_op(self, op): - return self._op_role_key in op.attr_names and int(op.all_attrs()[ - self._op_role_key]) & int(self._op_role.Optimize) + return self._op_role_key in op.attr_names and ( + int(op.attr(self._op_role_key)) & int(self._op_role.Optimize)) def _is_update_op(self, op): return 'Param' in op.input_names and 'Grad' in op.input_names and ( @@ -3842,50 +3945,40 @@ class PipelineOptimizer(object): def _split_program(self, main_program, devices): """ Split a program into sections according to devices that ops run on. - The ops of the role LRSched are copied to all sections. + The op whose op_device attr is "gpu:all" is copied to all sections. 
Args: main_program (Program): the main program devices: all used devices """ - programs = [] # Map from device to its corresponding section program info - device_program_map = dict() - for device in devices: - p = {'program': Program()} - device_program_map[device] = p + device_program_map = defaultdict(Program) block = main_program.block(0) for op in block.ops: device = op.attr(self._op_device_key) - op_role = op.attr(self._op_role_key) - if int(op_role) & int(self._op_role.LRSched): - # Copy ops of the role LRSched to all sections. - for device in device_program_map.keys(): - program = device_program_map[device] - op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() - ap_op.copy_from(op_desc) - # ap_op._set_attr(self._op_device_key, "") - elif op.type == "create_py_reader" or op.type == "read" or op.type == "create_double_buffer_reader": - # Copy read related ops to all section to make them exit after each epoch. - for device in device_program_map.keys(): + # Copy ops whose op_device set to "gpu:all" to all sections. + if device == "gpu:all": + for device in devices: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") else: program = device_program_map[device] op_desc = op.desc - ap_op = program["program"].block(0).desc.append_op() + ap_op = program.global_block().desc.append_op() ap_op.copy_from(op_desc) + ap_op._set_attr(self._op_device_key, "") + program_list = [] for key in devices: program = device_program_map[key] - program['program']._sync_with_cpp() - programs.append(program) + program._sync_with_cpp() + program_list.append(program) - return programs + return program_list def _get_op_device_for_startup_program(self, var_name): """ @@ -3894,21 +3987,22 @@ class PipelineOptimizer(object): get the real op_device attribute of the fill_constant as the device where the corresponding parameters on. """ - assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name + assert "beta1_pow_acc" in var_name or "beta2_pow_acc" in var_name, \ + 'For accumulators for Adam, the name must contain beta1_pow_acc ' \ + 'or beta2_pow_acc.' 
param_name = var_name[0:var_name.index('_beta')] device = self._param_device_map[param_name] return device - def _split_startup_program(self, startup_program, local_rank): - block = startup_program.block(0) + def _split_startup_program(self, startup_program, device_id): + block = startup_program.global_block() new_startup_program = Program() for op in block.ops: device = op.attr(self._op_device_key) if device == "cpu": assert op.type == "fill_constant", ( - "For ops in startup " - "program that with the op_device attribute of cpu, " - "they must be fill_constant.") + "For ops in startup program with the op_device attribute " + "of cpu, they must be of type fill_constant.") output_var = op.output_arg_names[0] device = self._get_op_device_for_startup_program(output_var) @@ -3917,14 +4011,13 @@ class PipelineOptimizer(object): else: # LR related ops device = None - if device and device_index != local_rank: continue + if device and device_index != device_id: continue op_desc = op.desc - ap_op = new_startup_program.block(0).desc.append_op() + ap_op = new_startup_program.global_block().desc.append_op() ap_op.copy_from(op_desc) ap_op._set_attr(self._op_device_key, "") new_startup_program._sync_with_cpp() - self._create_vars( - new_startup_program.block(0), startup_program.global_block()) + self._create_vars(new_startup_program.global_block(), block) return new_startup_program def _find_post_op(self, ops, cur_op, var_name): @@ -3937,6 +4030,11 @@ class PipelineOptimizer(object): var_name as output. var_name (string): Variable name. """ + # To skip the cast op added by amp which has no op_device set + if '.cast_fp32' in var_name: + var_name = var_name.replace('.cast_fp32', '') + elif '.cast_fp16' in var_name: + var_name = var_name.replace('.cast_fp16', '') post_op = [] before = True for op in ops: @@ -3965,7 +4063,8 @@ class PipelineOptimizer(object): """ prev_op = [] for op in ops: - if op.type == 'send_v2' or op.type == 'recv_v2': + if op.type == 'send_v2' or op.type == 'recv_v2' \ + or op.type == 'c_broadcast': continue if op == cur_op: break @@ -3980,11 +4079,8 @@ class PipelineOptimizer(object): return None def _rename_arg(self, op, old_name, new_name): - op_desc = op.desc - if isinstance(op_desc, tuple): - op_desc = op_desc[0] - op_desc._rename_input(old_name, new_name) - op_desc._rename_output(old_name, new_name) + op._rename_input(old_name, new_name) + op._rename_output(old_name, new_name) def _create_var(self, block, ref_var, name): """ @@ -3998,99 +4094,12 @@ class PipelineOptimizer(object): dtype=ref_var.dtype, type=ref_var.type, lod_level=ref_var.lod_level, - persistable=False, - is_data=False, + persistable=ref_var.persistable, + is_data=ref_var.is_data, need_check_feed=ref_var.desc.need_check_feed()) + new_var.stop_gradient = ref_var.stop_gradient return new_var - def _get_data_var_info(self, block): - """ - Get info of all vars whose is_data attribute are true. 
- """ - # map of data vars to devices that that data on - data_devices_map = dict() - for op in block.ops: - dev_spec = op.attr(self._op_device_key) - for var_name in op.input_arg_names: - if "blocking_queue" in var_name: continue - var = block.var(var_name) - if not var.is_data: - continue - if not var_name in data_devices_map: - data_devices_map[var_name] = [] - if not dev_spec in data_devices_map[var_name]: - data_devices_map[var_name].append(dev_spec) - return data_devices_map - - def _insert_sendrecv_for_data_var(self, main_block, programs, startup, - devices): - """ - Insert send and recv ops for data var that on other devices. - - Args: - main_block (Block): Global block for main program - programs (dict): Dictionary for section params - startup (Program): Startup program - devices (list): List of devices in the format (dev:dev_index) - """ - main_program = main_block.program - data_devices_map = self._get_data_var_info(main_block) - - first_prog = programs[0]['program'] - first_block = first_prog.block(0) - insert_index = 0 - for op in first_block.ops: - insert_index += 1 - if op.type == "read": - break - first_dev_spec = devices[0] - first_dev_index = int(first_dev_spec.split(':')[1]) - for var_name in data_devices_map.keys(): - for device in data_devices_map[var_name]: - if device == first_dev_spec: continue - main_var = main_block.var(var_name) - assert main_var.is_data - if not var_name in first_block.vars: - self._create_var(first_block, main_var, var_name) - dev_index = int(device.split(':')[1]) - first_block._insert_op( - index=insert_index, - type='send_v2', - inputs={'X': first_block.var(var_name)}, - attrs={ - self._op_device_key: first_dev_spec, - self._op_role_key: self._op_role.Forward, - 'use_calc_stream': True, - 'peer': dev_index, - }) - # Get the device that that data on - assert device in devices - prog_index = devices.index(device) - prog = programs[prog_index]['program'] - block = prog.block(0) - index = 0 - for op in block.ops: - index += 1 - if op.type == "read": - break - source_var = main_program.block(0).var(var_name) - new_var = self._create_var(block, source_var, var_name) - new_var_shape = list(new_var.shape) - new_var_shape[0] = self.micro_batch_size if new_var_shape[ - 0] < 0 else new_var_shape[0] - block._insert_op( - index=index, - type='recv_v2', - outputs={'Out': [new_var]}, - attrs={ - 'out_shape': new_var_shape, - 'dtype': new_var.dtype, - self._op_device_key: device, - self._op_role_key: self._op_role.Forward, - 'peer': first_dev_index, - 'use_calc_stream': True, - }) - def _strip_grad_suffix(self, name): """ Strip the grad suffix from the given variable name @@ -4104,95 +4113,161 @@ class PipelineOptimizer(object): """ return name + core.grad_var_suffix() - def _add_opdevice_attr_for_regularization_clip(self, block): + def _get_op_device_attr(self, op): """ - Add op_device attribute for regulization and clip ops. + Get the op_device attribute of a op. """ - for op in block.ops: - # role for regularization and clip ops is optimize - if int(op.attr(self._op_role_key)) != int(self._op_role.Optimize): - continue - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - continue - assert self._op_role_var_key in op.attr_names - op_role_var = op.all_attrs()[self._op_role_var_key] - assert len(op_role_var) == 2 + device = op.attr(self._op_device_key) \ + if op.has_attr(self._op_device_key) else None + if device: + assert device[0:3] == 'gpu', "Now, only gpu devices are " \ + "supported in pipeline parallemism." 
+ return device + + def _add_op_device_attr_for_op(self, op, idx, block): + """ + Add op_device attrribute for ops that have not that attribute set. + We use "gpu:all" to represent the op should be put on all + sub-programs, such as lr-related ops. Note that: "gpu:all" + is only used by pipeline as an indicator. + """ + lrsched_role = int(self._op_role.LRSched) + if op.attr(self._op_role_key) == lrsched_role: + # For LRSched ops, we should put them on all sub-programs to + # make sure each sub-program update the lr correctly + op._set_attr(self._op_device_key, "gpu:all") + elif (op.type == "cast" or + op.type == "scale") and self._is_backward_op(op): + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) + elif op.type == "memcpy" and not self._is_optimize_op(op): + assert len(op.input_arg_names) == 1 and len( + op.output_arg_names) == 1 + input_name = op.input_arg_names[0] + output_name = op.output_arg_names[0] + if '@Fetch' in output_name: + post_op = self._find_post_op(block.ops, op, output_name) + op._set_attr(self._op_device_key, + post_op.attr(self._op_device_key)) + else: + prev_op = self._find_real_prev_op(block.ops, op, + op.desc.input("X")[0]) + op._set_attr(self._op_device_key, + prev_op.attr(self._op_device_key)) + elif self._is_loss_op(op): + # For loss * loss_scaling op added by AMP + offset = 1 + while (not block.ops[idx + offset].has_attr(self._op_device_key) or + not block.ops[idx + offset].attr(self._op_device_key)): + offset += 1 + device = block.ops[idx + offset].attr(self._op_device_key) + assert device, "Please put you program within device_guard scope." + for i in range(offset): + block.ops[idx + i]._set_attr(self._op_device_key, device) + elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale": + op_role_var = op.attr(self._op_role_var_key) param_name = op_role_var[0] device = self._param_device_map[param_name] op._set_attr(self._op_device_key, device) - - def _add_default_opdevice_attr(self, block): + elif self._is_optimize_op(op) and op.type == "cast": + # For fp16-->fp32 cast added by AMP + grad_name = op.output('Out') + assert len(grad_name) == 1 + param_name = grad_name[0].strip(core.grad_var_suffix()) + device = self._param_device_map[param_name] + op._set_attr(self._op_device_key, device) + elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): + # For gradient clip and regularization ops, we set their op_device + # attribute to the device where their corresponding parameters on. + assert self._op_role_var_key in op.attr_names, "gradient_clip " \ + "and regularization ops must have op_role_var attribute." + op_role_var = op.attr(self._op_role_var_key) + assert len(op_role_var) == 2, "op_role_var for gradient_clip " \ + "regularization ops must have two elements." 
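For context, the op_device values consumed by these branches originate from user-level device_guard annotations, the same API used in the pipeline_mnist test further below; a minimal two-stage sketch, with layer sizes chosen only for illustration:

import paddle
import paddle.fluid as fluid

paddle.enable_static()
# Every op created inside a device_guard scope gets the corresponding
# op_device attribute on its OpDesc, which the passes here then read.
with fluid.device_guard("gpu:0"):
    image = fluid.layers.data(name='image', shape=[784], dtype='float32')
    hidden = fluid.layers.fc(input=image, size=128, act='relu')
with fluid.device_guard("gpu:1"):
    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')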
+ param_name = op_role_var[0] + device = self._param_device_map[param_name] + # For sum op added by global gradient clip, it must be + # put on all devices + if (op.type == 'sum' or op.type == 'sqrt' or + op.type == 'fill_constant' or + op.type == 'elementwise_max' or + op.type == 'elementwise_div'): + device = "gpu:all" + op._set_attr(self._op_device_key, device) + else: + other_known_ops = [ + 'update_loss_scaling', 'reduce_any', 'concat', 'sum' + ] + assert op.type in other_known_ops, "For other ops without " \ + "op_device set, they must be one of {}, but it " \ + "is {}".format(other_known_ops, op.type) + assert self._is_optimize_op(op) + op._set_attr(self._op_device_key, "gpu:all") + + def _add_op_device_attr(self, block): """ - 1. Add default op_device attribute for lr-related ops. - The default value is the one that of the first place. - 2. Add default op_device attribute for sum ops added during - backward. For these ops, we set the op_device attribute - as the one of its post op, i.e, which op has the output of the - sum op as an input. + Add op_device attrribute for ops in block that have + not that attribute set. """ - first_devcie = "" - - # Get the device spec of the first place. - # device_spec: 'cpu' for cpu device and 'gpu:id' for gpu device, - # e.g. 'gpu:0', 'gpu:1', etc. - for op in block.ops: - if op.has_attr(self._op_device_key) and ( - op.attr(self._op_device_key) != ""): - first_device = op.attr(self._op_device_key) - break - assert first_device - first_device_type = first_device.split(":")[0] - assert first_device_type == "gpu" - - # set op_device attr for lr-related ops - lrsched_role = int(self._op_role.LRSched) - for op in block.ops: - if not op.has_attr(self._op_device_key) or ( - op.attr(self._op_device_key) == ""): - if op.type == "sum": - # For sum ops that compute the sum of @RENAMED@ vars - for name in op.desc.input_arg_names(): - assert '@RENAME@' in name - assert len(op.desc.output_arg_names()) == 1 - out_name = op.desc.output_arg_names()[0] - post_op = self._find_post_op(block.ops, op, out_name) - device = post_op.attr(self._op_device_key) - assert device - op._set_attr(self._op_device_key, device) - continue - - assert op.attr(self._op_role_key) == lrsched_role, ( - "Op whose op_device attr has not been set for pipeline" - " must be of the role LRSched.") - op._set_attr(self._op_device_key, first_device) + for idx, op in enumerate(list(block.ops)): + if (op.type == "create_py_reader" or op.type == "read" or + op.type == "create_double_buffer_reader"): + # Copy read related ops to all section to make them exit + # after each epoch. + # We use "gpu:all" to represent the op should be put on all + # sub-programs, such as lr-related ops. Note that: "gpu:all" + # is only used by pipeline as an indicator. + op._set_attr(self._op_device_key, "gpu:all") + continue + # op_device attribute has been set + if self._get_op_device_attr(op): continue + self._add_op_device_attr_for_op(op, idx, block) def _check_validation(self, block): """ - Check whether ops in a block are all validate (i.e., the - op_device attribute has been set). - Then, return all device specifications in order. + Check whether ops in a block have both the op_device and the + op_role attributes set. + Then, return all devices in order. 
""" - device_specs = [] + device_list = [] + # Section worker only supports the following op_role + valid_op_role_value = [ + int(self._op_role.LRSched), + int(self._op_role.Forward), + int(self._op_role.Backward), + int(self._op_role.Loss), + int(self._op_role.Optimize), + int(self._op_role.Backward) | int(self._op_role.Loss), + ] for op in block.ops: - type = op.type - if not op._has_kernel(type): + if not op._has_kernel(op.type): assert op.type == "conditional_block" and ( op.attr(self._op_role_key) == int(self._op_role.LRSched)), ( "Now, the only supported op without kernel is " "conditional_block, and its op role must be LRSched.") + assert op.has_attr(self._op_role_key), ( + "op ({}) has no {} attribute.".format(op.type, + self._op_role_key)) + assert int(op.attr(self._op_role_key)) in valid_op_role_value, \ + "op_role {} for op {} must be one of {}".format( + op.attr(self._op_role_key), + op.type, + valid_op_role_value) assert op.has_attr(self._op_device_key), ( "op ({}) has no {} attribute.".format(op.type, self._op_device_key)) - dev_spec = op.attr(self._op_device_key) - assert dev_spec, ("op_device attribute for op " - "{} has not been set.".format(op.type)) - dev_type = dev_spec.split(':')[0] + + device = op.attr(self._op_device_key) + assert device, ("op_device attribute for op " + "{} has not been set.".format(op.type)) + if device == "gpu:all": continue + dev_type = device.split(':')[0] assert dev_type == "gpu", ("Now only gpu devices are supported " "for pipeline parallelism.") - if not dev_spec in device_specs: - device_specs.append(dev_spec) - return device_specs + if not device in device_list: + device_list.append(device) + return device_list def _insert_sendrecv_ops_for_boundaries(self, block): """ @@ -4201,148 +4276,267 @@ class PipelineOptimizer(object): """ extra_index = 0 - # A map from var to device spec where op takes it as input, + # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. - var_devspec = dict() + var_dev_map = dict() for index, op in enumerate(list(block.ops)): - # skips lr-related ops and vars, as we will process them later. - if int(op.attr(self._op_role_key)) & int(self._op_role.LRSched): - continue - # skips update ops and vars, as we will process them later. - if self._is_update_op(op): continue - - cur_device_spec = op.attr(self._op_device_key) + cur_device = op.attr(self._op_device_key) + if cur_device == "gpu:all": continue for var_name in op.input_arg_names: # i.e., lod_tensor_blocking_queue created by DataLoader, # which only exists in startup program. 
- if not var_name in block.vars: continue var = block.var(var_name) # skip data, because we will process it later if var.is_data: continue + prev_device = None + if var_name in self._param_device_map: + prev_device = self._param_device_map[var_name] prev_op = self._find_real_prev_op(block.ops, op, var_name) - if prev_op is None: - continue - prev_device_spec = prev_op.attr(self._op_device_key) + if not prev_device: + prev_device = prev_op.attr(self._op_device_key) \ + if prev_op else None + if not prev_device or prev_device == 'gpu:all': continue - if prev_device_spec != cur_device_spec: - if var_name not in var_devspec: - var_devspec[var_name] = [] - if cur_device_spec in var_devspec[var_name]: continue - var_devspec[var_name].append(cur_device_spec) + if prev_device != cur_device: + if var_name not in var_dev_map: var_dev_map[var_name] = [] + if cur_device in var_dev_map[var_name]: continue + var_dev_map[var_name].append(cur_device) op_role = op.all_attrs()[self._op_role_key] var = block.vars[var_name] - prev_device_index = int(prev_device_spec.split(':')[1]) - cur_device_index = int(cur_device_spec.split(':')[1]) - block._insert_op( - index=index + extra_index, - type='send_v2', - inputs={'X': var}, - attrs={ - self._op_device_key: prev_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': cur_device_index, - }) - extra_index += 1 - var_shape = list(var.shape) - var_shape[0] = self.micro_batch_size if var_shape[ - 0] < 0 else var_shape[0] - block._insert_op( - index=index + extra_index, - type='recv_v2', - outputs={'Out': [var]}, - attrs={ - 'out_shape': var_shape, - 'dtype': var.dtype, - self._op_device_key: cur_device_spec, - self._op_role_key: op_role, - 'use_calc_stream': True, - 'peer': prev_device_index, - }) - extra_index += 1 - - def _clear_gradients(self, main_block, dev_spec): - """ - Clear gradients at the begining of each run of a minibatch. 
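The loop above only inserts a send/recv pair when a variable actually crosses a stage boundary; a compact sketch of that rule, with illustrative device strings:

def needs_send_recv(prev_device, cur_device):
    # no transfer for vars produced on the same stage, placed on all stages
    # ("gpu:all"), or with no recorded producer (e.g. data vars handled elsewhere)
    if not prev_device or prev_device == "gpu:all":
        return False
    return prev_device != cur_device

assert needs_send_recv("gpu:0", "gpu:1")
assert not needs_send_recv("gpu:1", "gpu:1")
assert not needs_send_recv("gpu:all", "gpu:1")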
- """ - for param_name in self._param_device_map: - device = self._param_device_map[param_name] - if device != dev_spec: continue - grad_name = self._append_grad_suffix(param_name) - if not main_block.has_var(grad_name): continue - grad_var = main_block.vars[grad_name] - grad_var.persistable = True - main_block._insert_op( - index=0, - type='fill_constant', - inputs={}, - outputs={'Out': [grad_var]}, - attrs={ - 'shape': grad_var.shape, - 'dtype': grad_var.dtype, - 'value': float(0), - self._op_device_key: device, - # a trick to run this op once per mini-batch - self._op_role_key: self._op_role.Optimize.LRSched, - }) + prev_device_index = int(prev_device.split(':')[1]) + cur_device_index = int(cur_device.split(':')[1]) + pair = (prev_device_index, cur_device_index) + pair_key = prev_device_index * 1000 + cur_device_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] + if self.schedule_mode == 'F-then-B': # F-then-B + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 1, + 'ring_id': ring_id + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var.shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + elif self.schedule_mode == '1F1B': # 1F1B + block._insert_op( + index=index + extra_index, + type='c_sync_calc_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='send_v2', + inputs={'X': var}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: op_role, + 'use_calc_stream': False, + 'ring_id': ring_id, + 'peer': 1, + }) + extra_index += 1 + block._insert_op( + index=index + extra_index, + type='c_sync_comm_stream', + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={ + self._op_device_key: prev_device, + self._op_role_key: self._op_role.Backward, + 'ring_id': ring_id, + }) + extra_index += 1 + var_shape = list(var.shape) + var_shape[0] = self.micro_batch_size if var_shape[ + 0] < 0 else var_shape[0] + block._insert_op( + index=index + extra_index, + type='recv_v2', + outputs={'Out': [var]}, + attrs={ + 'out_shape': var_shape, + 'dtype': var.dtype, + self._op_device_key: cur_device, + self._op_role_key: op_role, + 'use_calc_stream': True, + 'peer': 0, + 'ring_id': ring_id + }) + extra_index += 1 + else: + raise ValueError( + "Now only 'F-then-B' and '1F1B' are supported." + "The given value is {}.".format(self.schedule_mode)) - def _accumulate_gradients(self, block): + def _insert_loss_scale(self, block): """ - Accumulate the gradients generated in microbatch to the one in mini-batch. - We also scale the loss corresponding to number of micro-batches as well. + Scale the loss corresponding to number of micro-batches. 
""" + if self._num_microbatches == 1: return for index, op in reversed(tuple(enumerate(list(block.ops)))): - offset = index - device = op.attr(self._op_device_key) - - # Backward pass if self._is_loss_grad_op(op): loss_grad_var = block.vars[op.output_arg_names[0]] - scale_factor = self._num_microbatches block._insert_op( index=index + 1, type='scale', inputs={'X': loss_grad_var}, outputs={'Out': loss_grad_var}, attrs={ - 'scale': 1.0 / scale_factor, - self._op_device_key: device, + 'scale': 1.0 / self._num_microbatches, self._op_role_key: self._op_role.Backward }) break - if self._is_backward_op(op) and ( - self._op_role_var_key in op.attr_names): - op_role_var = op.all_attrs()[self._op_role_var_key] - if len(op_role_var) == 0: + def _rename_gradient_var_name(self, block): + for index, op in enumerate(block.ops): + if not self._is_optimize_op(op): continue + input_names = op.input_arg_names + output_names = op.output_arg_names + in_out_names = input_names + output_names + if op.type == 'cast': continue + # append "MERGED" to the names of parameter gradients, + # and mofify the op_role_var attribute (by rename_arg func). + for name in in_out_names: + if not core.grad_var_suffix() in name: continue + param_name = name.strip(core.grad_var_suffix()) + new_grad_name = name + "@MERGED" + self._rename_arg(op, name, new_grad_name) + + def _accumulate_gradients(self, block, pp_allreduce_in_optimize=False): + """ + Create a new merged gradient for each parameter and accumulate the + corresponding gradient to it. + """ + merged_gradient_names = [] + first_opt_op_idx = None + + for index, op in reversed(tuple(enumerate(list(block.ops)))): + # remove the cast op of fp16 grad to fp32 grad + if self._is_optimize_op(op) and op.type == 'cast': + in_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + if out_name.strip('@GRAD') in self._param_device_map: + assert in_name.replace('.cast_fp16', '') == out_name + block._remove_op(index) continue + + if self._is_backward_op(op) and not first_opt_op_idx: + first_opt_op_idx = index + 1 + # no optimize phase + if first_opt_op_idx == len(block.ops): return + if block.ops[first_opt_op_idx].type == "c_sync_comm_stream": + first_opt_op_idx += 1 + + if self._is_backward_op(op) and ( + self._op_role_var_key in op.attr_names): + op_role_var = op.attr(self._op_role_var_key) + if len(op_role_var) == 0: continue assert len(op_role_var) % 2 == 0 - offset = index for i in range(0, len(op_role_var), 2): - grad_name = op_role_var[i + 1] - grad_var = block.vars[grad_name] - new_grad_var_name = unique_name.generate(grad_name) - new_var = self._create_var(block, grad_var, - new_grad_var_name) - self._rename_arg(op, grad_name, new_grad_var_name) + offset = 0 + param_name = op_role_var[i] + if not block.has_var(param_name): continue + if '@BroadCast' in param_name: continue + param_grad_name = param_name + core.grad_var_suffix() + merged_param_grad_name = param_grad_name + '@MERGED' + if not block.has_var(merged_param_grad_name): + self._create_var(block, block.vars[param_name], + merged_param_grad_name) + assert block.has_var(merged_param_grad_name) + param_grad_var = block.var(param_grad_name) + merged_param_grad_var = block.var(merged_param_grad_name) + merged_param_grad_var.persistable = True block._insert_op( - index=offset + 1, - type='sum', - inputs={'X': [grad_var, new_var]}, - outputs={'Out': grad_var}, + index=first_opt_op_idx + offset, + type='fill_constant', + inputs={}, + outputs={'Out': [merged_param_grad_var]}, attrs={ - self._op_device_key: device, - 
self._op_role_key: self._op_role.Backward, - self._op_role_var_key: op_role_var + 'shape': merged_param_grad_var.shape, + 'dtype': merged_param_grad_var.dtype, + 'value': float(0), + # a trick to run this op once per mini-batch + self._op_role_key: self._op_role.Optimize.LRSched, }) offset += 1 + grad_name = op_role_var[i + 1] + grad_var = block.vars[grad_name] + if not 'cast_fp16' in grad_name: + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={'X': [grad_var, merged_param_grad_var]}, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + else: + # cast gradient to fp32 to accumulate to merged gradient + cast_grad_var_name = param_grad_name + '@TMP' + cast_grad_var = self._create_var(block, param_grad_var, + cast_grad_var_name) + cast_grad_var.persistable = False + block._insert_op( + index=first_opt_op_idx + offset, + type='cast', + inputs={'X': grad_var}, + outputs={'Out': cast_grad_var}, + attrs={ + 'in_dtype': grad_var.dtype, + 'out_dtype': cast_grad_var.dtype, + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + block._insert_op( + index=first_opt_op_idx + offset, + type='sum', + inputs={ + 'X': [merged_param_grad_var, cast_grad_var] + }, + outputs={'Out': merged_param_grad_var}, + attrs={ + self._op_role_key: self._op_role.Backward, + }) + offset += 1 + merged_gradient_names.append(merged_param_grad_name) + return merged_gradient_names def _add_sub_blocks(self, main_block, program_list): main_program = main_block.program - for prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: for op in prog.block(0).ops: if not op.has_attr('sub_block'): continue @@ -4372,8 +4566,7 @@ class PipelineOptimizer(object): # var_info = {var_name: [program1, program2...]}, # persistable var only var_info = dict() - for prog_info in program_list: - prog = prog_info['program'] + for prog in program_list: block = prog.block(0) for var_name in block.vars: if var_name == "double_buffer_0": continue @@ -4395,7 +4588,7 @@ class PipelineOptimizer(object): block = prog.block(0) for op in block.ops: if op.type == "recv_v2" or op.type == "create_py_reader" or \ - op.type == "read": + op.type == "read" or op.type == "update_loss_scaling": continue # We have processed lr related vars if op.attr(self._op_role_key) == int( @@ -4423,6 +4616,15 @@ class PipelineOptimizer(object): read_block = prog.block(0) read_device = self._get_device_info(read_block) read_dev_index = int(read_device.split(':')[1]) + pair = (write_dev_index, read_dev_index) + pair_key = write_dev_index * 1000 + read_dev_index + if pair not in self._pipeline_pair: + self._pipeline_pair.append(pair) + self._pp_ring_map[pair_key] = self.ring_id + ring_id = self.ring_id + self.ring_id += 1 + else: + ring_id = self._pp_ring_map[pair_key] write_block._insert_op( index=0, @@ -4430,11 +4632,12 @@ class PipelineOptimizer(object): inputs={'X': write_block.var(var_name), }, attrs={ self._op_device_key: write_device, - 'use_calc_stream': True, + 'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, 'peer': read_dev_index, + 'ring_id': ring_id }) read_block._insert_op( index=0, @@ -4444,12 +4647,33 @@ class PipelineOptimizer(object): 'out_shape': read_block.var(var_name).shape, 'dtype': read_block.var(var_name).dtype, self._op_device_key: read_device, - 'use_calc_stream': True, + 
'use_calc_stream': False, # A trick to make the role LRSched to avoid copy every # microbatch self._op_role_key: self._op_role.LRSched, - 'peer': write_dev_index + 'peer': write_dev_index, + 'ring_id': ring_id }) + read_block._insert_op( + index=1, + type='c_sync_comm_stream', + inputs={'X': [read_block.var(var_name)]}, + outputs={'Out': [read_block.var(var_name)]}, + attrs={ + self._op_device_key: read_device, + # A trick to make the role LRSched to avoid copy every + # microbatch + self._op_role_key: self._op_role.LRSched, + 'ring_id': ring_id + }) + + def _is_gradient_clip_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/gradient_clip") + + def _is_regularization_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/regularization") def minimize(self, loss, @@ -4457,23 +4681,34 @@ class PipelineOptimizer(object): parameter_list=None, no_grad_set=None): main_block = loss.block + self.origin_main_block = main_block if startup_program is None: startup_program = default_startup_program() optimize_ops, params_grads = self._optimizer.minimize( loss, startup_program, parameter_list, no_grad_set) - self._param_device_map = self._optimizer._param_device_map + self._param_device_map = self._origin_optimizer._param_device_map + assert main_block.program._pipeline_opt \ + and 'local_rank' in main_block.program._pipeline_opt, \ + 'Please use pipeline with fleet.' + local_rank = main_block.program._pipeline_opt['local_rank'] + self._global_ring_id = main_block.program._pipeline_opt[ + 'global_ring_id'] + schedule_mode = 0 + if 'schedule_mode' in main_block.program._pipeline_opt: + schedule_mode = main_block.program._pipeline_opt['schedule_mode'] + self.schedule_mode = schedule_mode + # micro batch size self.micro_batch_size = main_block.program._pipeline_opt[ 'micro_batch_size'] - # Step1: add default op_device attribute for regulization and clip ops - self._add_opdevice_attr_for_regularization_clip(main_block) - - # Step2: add default op_device attribute for ops whose op_device - # attribute have not been set yet. Then check all ops have the - # op_device attribute. - self._add_default_opdevice_attr(main_block) + self.use_sharding = False + if 'use_sharding' in main_block.program._pipeline_opt: + self.use_sharding = main_block.program._pipeline_opt['use_sharding'] + self.ring_id = main_block.program._pipeline_opt['ring_id'] - device_specs = self._check_validation(main_block) + # Step1: add default op_device attribute for ops. 
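Taken together, the 1/num_microbatches scaling of the loss gradient and the fill_constant/sum accumulation above make each merged gradient equal to the mean over micro-batches; a small arithmetic sketch with illustrative numbers:

num_microbatches = 4
micro_grads = [1.0, 2.0, 3.0, 4.0]       # per-micro-batch gradients, illustrative

merged_grad = 0.0                        # fill_constant(0.) once per mini-batch (LRSched role)
for g in micro_grads:
    merged_grad += g / num_microbatches  # loss grad is pre-scaled by 1/num_microbatches

assert merged_grad == sum(micro_grads) / num_microbatches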
+ self._add_op_device_attr(main_block) + device_list = self._check_validation(main_block) def device_cmp(device1, device2): dev1_id = int(device1.split(':')[1]) @@ -4485,70 +4720,59 @@ class PipelineOptimizer(object): else: return 0 - sorted_device_spec = sorted(device_specs, key=cmp_to_key(device_cmp)) - assert sorted_device_spec == device_specs, ( - "With pipeline " - "parallelism, you must use gpu devices one after another " - "in the order of their ids.") - - # Step3: add send and recv ops between section boundaries + sorted_device_list = sorted(device_list, key=cmp_to_key(device_cmp)) + assert sorted_device_list == device_list, ( + "With pipeline parallelism, you must use gpu devices one after " + "another in the order of their ids.") + # Step2: add send and recv ops between section boundaries self._insert_sendrecv_ops_for_boundaries(main_block) - # Step4: split program into sections and add pairs of + # Step3: split program into sections and add pairs of # send and recv ops for data var. main_program = main_block.program - program_list = self._split_program(main_program, device_specs) + program_list = self._split_program(main_program, device_list) for p in program_list: - self._create_vars(p["program"].block(0), - main_program.global_block()) - self._insert_sendrecv_for_data_var(main_block, program_list, - startup_program, device_specs) + self._create_vars(p.global_block(), main_block) - # Step5: Special Case: process persistable vars that exist in + # Step4: Special Case: process persistable vars that exist in # multiple sections self._process_persistable_vars_in_multi_sections( main_program, startup_program, program_list) - # Step6: Add sub blocks for section programs + # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) - assert (main_program._pipeline_opt and - isinstance(main_program._pipeline_opt, dict) and - 'local_rank' in main_program._pipeline_opt), \ - "You must use pipeline with fleet" - local_rank = main_program._pipeline_opt['local_rank'] % len( - device_specs) - self.schedule_mode = main_program._pipeline_opt['schedule_mode'] - + local_rank = main_program._pipeline_opt['local_rank'] % len(device_list) place_list = [] - for dev_spec in device_specs: - dev_index = dev_spec.split(":")[1] - place_list.append(core.CUDAPlace(local_rank)) + for dev in device_list: + dev_index = int(dev.split(":")[1]) + place_list.append(core.CUDAPlace(dev_index % 8)) - # Step7: Split startup program + # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, local_rank) - # Step8: clear gradients before each mini-batch and - # accumulate gradients during backward - self._clear_gradients( - program_list[local_rank]['program'].global_block(), - dev_spec=device_specs[local_rank]) - self._accumulate_gradients(program_list[local_rank]['program'] - .global_block()) - startup_program._pipeline_opt = { "startup_program": new_startup_program, } + real_block = program_list[local_rank].global_block() + self._insert_loss_scale(real_block) + if not self.use_sharding: + # Step7: clear gradients before each mini-batch and + # accumulate gradients during backward + self._rename_gradient_var_name(real_block) + real_block._sync_with_cpp() + self._accumulate_gradients(real_block) + real_block._sync_with_cpp() place_id = int(os.getenv("FLAGS_selected_gpus", "0")) main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", "pipeline_stage": local_rank, - "num_pipeline_stages": len(device_specs), + 
"num_pipeline_stages": len(device_list), "schedule_mode": self.schedule_mode, - "inner_parallelism": len(device_specs), + "inner_parallelism": len(device_list), "section_program": program_list[local_rank], "place": place_list[local_rank], "place_id": place_id, @@ -4556,7 +4780,7 @@ class PipelineOptimizer(object): "num_microbatches": self._num_microbatches, "start_cpu_core_id": self._start_cpu_core_id, } - return optimize_ops, params_grads, program_list + return optimize_ops, params_grads, program_list, self._pipeline_pair, self._pp_ring_map class RecomputeOptimizer(Optimizer): diff --git a/python/paddle/fluid/tests/unittests/pipeline_mnist.py b/python/paddle/fluid/tests/unittests/pipeline_mnist.py index f433af24813..8c3a66f933f 100644 --- a/python/paddle/fluid/tests/unittests/pipeline_mnist.py +++ b/python/paddle/fluid/tests/unittests/pipeline_mnist.py @@ -66,12 +66,21 @@ def cnn_model(data): param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 - predict = fluid.layers.fc( - input=conv_pool_2, - size=SIZE, - act="softmax", - param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.Constant(value=0.01))) + with fluid.device_guard("gpu:1"): + predict = fluid.layers.fc( + input=conv_pool_2, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + # To cover @RENAMED@GRADIENT + predict2 = fluid.layers.fc( + input=conv_pool_1, + size=SIZE, + act="softmax", + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Constant(value=0.01))) + predict += predict2 return predict @@ -108,7 +117,10 @@ class TestDistMnist2x2(TestDistRunnerBase): bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] lr_val = fluid.layers.piecewise_decay(boundaries=bd, values=lr) - opt = fluid.optimizer.Momentum(learning_rate=lr_val, momentum=0.9) + opt = fluid.optimizer.Momentum( + learning_rate=lr_val, + momentum=0.9, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) acc_steps = 2 # accumulated steps for pipeline if dist_strategy: @@ -120,6 +132,7 @@ class TestDistMnist2x2(TestDistRunnerBase): fleet.init(is_collective=True) strategy = fleet.DistributedStrategy() strategy.pipeline = True + strategy.amp = True strategy.pipeline_configs = { 'micro_batch_size': batch_size, 'schedule_mode': '1F1B', -- GitLab From b47478efc2caf9247fd73245a3b87154ec3e81a1 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 26 Mar 2021 19:15:56 +0800 Subject: [PATCH 082/486] [dygraph qat] Use layer to calculate output scale (#31861) * Use layer to calculate output scale * add backward for moving_average_abs_max_scale and save output scales to op's attr --- paddle/fluid/operators/fake_quantize_op.cc | 69 +++-- paddle/fluid/operators/fake_quantize_op.cu | 4 +- paddle/fluid/operators/fake_quantize_op.h | 16 +- paddle/fluid/pybind/op_function_generator.cc | 6 +- .../slim/quantization/imperative/qat.py | 268 +++++------------- .../slim/quantization/imperative/quant_nn.py | 111 ++++---- .../slim/quantization/imperative/utils.py | 47 +-- .../slim/tests/test_imperative_out_scale.py | 25 -- .../tests/unittests/test_fake_quantize_op.py | 5 +- 9 files changed, 222 insertions(+), 329 deletions(-) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index abfc88e5155..45443867188 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ 
b/paddle/fluid/operators/fake_quantize_op.cc @@ -649,13 +649,18 @@ class MovingAverageAbsMaxScaleOp : public framework::OperatorWithKernel { "MovingAverageAbsMaxScale"); OP_INOUT_CHECK(ctx->HasOutput("OutScale"), "Output", "OutScale", "MovingAverageAbsMaxScale"); + if (ctx->HasOutput("OutState")) { ctx->SetOutputDim("OutState", {1}); } if (ctx->HasOutput("OutAccum")) { ctx->SetOutputDim("OutAccum", {1}); } - ctx->SetOutputDim("OutScale", {1}); + if (ctx->HasOutput("Out")) { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } } protected: @@ -673,6 +678,9 @@ class MovingAverageAbsMaxScaleOpMaker AddInput("X", "(Tensor) Input is float data type."); AddInput("InAccum", "Last accum.").AsDispensable(); AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", + "(Tensor) Output tensor is just equivalent to the input tensor.") + .AsDispensable(); AddOutput("OutScale", " Current scale"); AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); @@ -693,7 +701,7 @@ $$Out = X$$ } }; -class FakeQuantDequantGradOp : public framework::OperatorWithKernel { +class StrightThroughEstimatorGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -701,9 +709,9 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { auto out_grad_name = framework::GradVarName("Out"); auto x_grad_name = framework::GradVarName("X"); OP_INOUT_CHECK(ctx->HasInput(out_grad_name), "Input", out_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); OP_INOUT_CHECK(ctx->HasOutput(x_grad_name), "Output", x_grad_name, - "FakeQuantDequantGradOp"); + "StrightThroughEstimatorGradOp"); ctx->SetOutputDim(x_grad_name, ctx->GetInputDim(out_grad_name)); } @@ -717,13 +725,13 @@ class FakeQuantDequantGradOp : public framework::OperatorWithKernel { }; template -class FakeQuantDequantGradMaker : public framework::SingleGradOpMaker { +class StrightThroughEstimatorMaker : public framework::SingleGradOpMaker { public: using framework::SingleGradOpMaker::SingleGradOpMaker; protected: void Apply(GradOpPtr grad_op) const override { - grad_op->SetType("fake_quantize_dequantize_grad"); + grad_op->SetType("stright_throuth_estimator_grad"); grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); grad_op->SetAttrMap(this->Attrs()); @@ -744,11 +752,11 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_abs_max, ops::FakeQuantizeAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_abs_max, - ops::FakeQuantOrWithDequantAbsMaxOp, - ops::FakeQuantOrWithDequantAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_abs_max, ops::FakeQuantOrWithDequantAbsMaxOp, + ops::FakeQuantOrWithDequantAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_abs_max, ops::FakeQuantizeDequantizeAbsMaxKernel); @@ -769,11 +777,12 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, ops::FakeQuantizeMovingAverageAbsMaxKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_moving_average_abs_max, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, - ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - 
ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_quantize_dequantize_moving_average_abs_max, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOp, + ops::FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); @@ -789,20 +798,22 @@ REGISTER_OP_CPU_KERNEL(fake_channel_wise_quantize_abs_max, REGISTER_OPERATOR( moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleOp, ops::MovingAverageAbsMaxScaleOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL(moving_average_abs_max_scale, ops::MovingAverageAbsMaxScaleKernel); -REGISTER_OPERATOR(fake_quantize_dequantize_grad, ops::FakeQuantDequantGradOp); -REGISTER_OP_CPU_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OPERATOR(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradOp); +REGISTER_OP_CPU_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); -REGISTER_OPERATOR(fake_channel_wise_quantize_dequantize_abs_max, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, - ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, - ops::FakeQuantDequantGradMaker, - ops::FakeQuantDequantGradMaker); +REGISTER_OPERATOR( + fake_channel_wise_quantize_dequantize_abs_max, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOp, + ops::FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker, + ops::StrightThroughEstimatorMaker, + ops::StrightThroughEstimatorMaker); REGISTER_OP_CPU_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); @@ -820,4 +831,8 @@ REGISTER_OP_VERSION(moving_average_abs_max_scale) "Out", "Delete output in order to make the inference model not " "save moving_average_abs_max_scale operator. 
This will " - "make the quantitative model be correctly applied in inference.")); + "make the quantitative model be correctly applied in inference.")) + .AddCheckpoint( + R"ROC(Incompatible upgrade of output [Out])ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "Out", "In order to support dygraph qat, add output again.")); diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 92127f9aebd..78052179f6b 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -543,8 +543,8 @@ REGISTER_OP_CUDA_KERNEL(moving_average_abs_max_scale, REGISTER_OP_CUDA_KERNEL( fake_quantize_dequantize_moving_average_abs_max, ops::FakeQuantizeDequantizeMovingAverageAbsMaxKernel); -REGISTER_OP_CUDA_KERNEL(fake_quantize_dequantize_grad, - ops::FakeQuantDequantGradKernel); +REGISTER_OP_CUDA_KERNEL(stright_throuth_estimator_grad, + ops::StrightThroughEstimatorGradKernel); REGISTER_OP_CUDA_KERNEL( fake_channel_wise_quantize_dequantize_abs_max, ops::FakeChannelWiseQuantizeDequantizeAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 94a75f930be..11a2d2de8bc 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -314,6 +314,12 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { auto* in = context.Input("X"); auto& dev_ctx = context.template device_context(); + if (context.HasOutput("Out")) { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + framework::TensorCopy(*in, context.GetPlace(), dev_ctx, out); + } + bool is_test = context.Attr("is_test"); // testing if (is_test) { @@ -344,17 +350,17 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { }; template -class FakeQuantDequantGradKernel : public framework::OpKernel { +class StrightThroughEstimatorGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* d_out = context.Input(framework::GradVarName("Out")); auto x_grad_name = framework::GradVarName("X"); auto* d_x = context.Output(x_grad_name); - PADDLE_ENFORCE_NOT_NULL( - d_x, platform::errors::PreconditionNotMet( - "FakeQuantDequantGradOp doesn't have the output named %s.", - x_grad_name)); + PADDLE_ENFORCE_NOT_NULL(d_x, platform::errors::PreconditionNotMet( + "StrightThroughEstimatorGradKernel " + "doesn't have the output named %s.", + x_grad_name)); // Initialize dx as same as d_out d_x->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index b1c42d91df5..69856fa4fa1 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -84,7 +84,8 @@ std::map> op_outs_map = { {"matrix_nms", {"Out", "Index", "RoisNum"}}, {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, - {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut"}}, @@ -137,7 +138,8 @@ std::map> op_passing_outs_map = { {"check_finite_and_unscale", {"Out", "FoundInfinite"}}, {"update_loss_scaling", {"Out", "LossScaling", "OutGoodSteps", "OutBadSteps"}}, - 
{"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"moving_average_abs_max_scale", + {"Out", "OutScale", "OutAccum", "OutState"}}, {"lamb", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, {"rnn", {"DropoutState"}}, diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index ea2e8e073b5..f4620ff0001 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -21,14 +21,14 @@ import warnings import paddle from paddle.fluid import dygraph, core, framework, unique_name -from paddle.fluid.executor import Executor +from paddle.fluid.executor import Executor, global_scope from paddle.fluid.param_attr import ParamAttr from paddle.fluid.initializer import Constant from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.io import load_inference_model, save_inference_model from paddle.fluid.log_helper import get_logger -from . import quant_nn from .. import quantization_pass +from . import quant_nn from . import utils __all__ = ['ImperativeQuantAware'] @@ -201,7 +201,7 @@ class ImperativeQuantAware(object): self._quantize_inputs = ImperativeQuantizeInputs(**kwargs) - self._calc_output_scale = ImperativeCalcOutputScale() + self._quantize_outputs = ImperativeQuantizeOutputs() def quantize(self, model): """ @@ -219,11 +219,11 @@ class ImperativeQuantAware(object): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." self._quantize_inputs.apply(model) - self._calc_output_scale.apply(model) + self._quantize_outputs.apply(model) def save_quantized_model(self, layer, path, input_spec=None, **config): - self._calc_output_scale.save_quantized_model(layer, path, input_spec, - **config) + self._quantize_outputs.save_quantized_model(layer, path, input_spec, + **config) class ImperativeQuantizeInputs(object): @@ -323,10 +323,10 @@ class ImperativeQuantizeInputs(object): idx += 1 target = name[last_idx:idx] - quant_layer = self._get_quantized_layer(layer) + quant_layer = self._get_input_quantized_layer(layer) setattr(obj, target, quant_layer) - def _get_quantized_layer(self, layer): + def _get_input_quantized_layer(self, layer): quant_layer_name = None for key, value in utils.quant_input_layers_map.items(): if isinstance(layer, value): @@ -343,24 +343,26 @@ class ImperativeQuantizeInputs(object): return quant_nn.__dict__[quant_layer_name](layer, **self._kwargs) -class ImperativeCalcOutputScale(object): +class ImperativeQuantizeOutputs(object): + """ + Calculate the output scales for some layers. + """ + def __init__(self, moving_rate=0.9): """ - Add the logic of calculating and setting output scales of some layers. + The constructor for ImperativeQuantizeOutputs. Args: moving_rate(float): The decay coefficient of moving average. The default value is 0.9. """ - super(ImperativeCalcOutputScale, self).__init__() + super(ImperativeQuantizeOutputs, self).__init__() self._moving_rate = moving_rate - self._register_hook_handle_list = [] - self._out_scale_dict = collections.OrderedDict() def apply(self, model): """ - Insert the `moving_average_abs_max_scale` op to calculate output - scale of specific layers in model. + Insert the `moving_average_abs_max_scale` layers to calculate the + output scales for specific layers in the dygraph model. 
Args: model(fluid.dygraph.Layer): The target model which would be @@ -372,14 +374,25 @@ class ImperativeCalcOutputScale(object): assert isinstance(model, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - # Calculate the target ops's output scale, and don't consider - # the skip_quant attr - for _, layer in model.named_sublayers(): - if self._is_target_layer(layer): - self._init_scale_params(layer) - hook_handle = layer.register_forward_post_hook( - self._calc_output_scale_hook) - self._register_hook_handle_list.append(hook_handle) + for name, layer in model.named_sublayers(): + if not self._is_target_layer(layer): + continue + + # TODO(jc): optimize this module + last_idx = 0 + idx = 0 + obj = model + while idx < len(name): + if (name[idx] == '.'): + if hasattr(obj, name[last_idx:idx]): + obj = getattr(obj, name[last_idx:idx]) + last_idx = idx + 1 + idx += 1 + target = name[last_idx:idx] + + quant_layer = quant_nn.__dict__["QuantizedOutputLayer"]( + layer, self._moving_rate) + setattr(obj, target, quant_layer) def save_quantized_model(self, layer, path, input_spec=None, **config): """ @@ -409,33 +422,18 @@ class ImperativeCalcOutputScale(object): Returns: None """ - assert isinstance(layer, dygraph.Layer), \ "The model must be the instance of dygraph.Layer." - self._gather_output_scale(layer) - - with dygraph.guard(): - layer.eval() - for handle in self._register_hook_handle_list: - handle.remove() paddle.jit.save(layer=layer, path=path, input_spec=input_spec, **config) - if len(self._out_scale_dict) == 0: - warnings.warn("Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold.") - return - - # load static model is_dynamic_mode = False if paddle.in_dynamic_mode(): is_dynamic_mode = True paddle.enable_static() - place = core.CUDAPlace(0) if core.is_compiled_with_cuda() \ - else core.CPUPlace() + place = core.CPUPlace() + scope = global_scope() exe = Executor(place) dirname = os.path.dirname(path) @@ -450,20 +448,10 @@ class ImperativeCalcOutputScale(object): model_filename=model_filename, params_filename=params_filename)) - # TODO(jc): analyse whether the dygraph model has - # several blocks before applying qat - assert infer_program.num_blocks == 1, \ - "Quantization aware training (QAT) requires the program " \ - "only has a block for now. When the model has if-else or " \ - "while, the program will have several blocks." + self._save_output_scale(infer_program, scope) - # set output scales to the static model - self._save_output_scale(infer_program) - - # process skip quant self._set_skip_quant_attr(infer_program) - # save the final quantized model that has output scales save_inference_model( dirname=dirname, feeded_var_names=feed_target_names, @@ -476,144 +464,42 @@ class ImperativeCalcOutputScale(object): if is_dynamic_mode: paddle.disable_static() - def _gather_output_scale(self, layer): - """ - Gather all output scales to self._out_scale_dict - """ - with dygraph.guard(): - layer.eval() - for _, sub_layer in layer.named_sublayers(): - if self._is_target_layer(sub_layer): - layer_name = sub_layer.full_name() - if hasattr(sub_layer, "_quant_out_scale"): - self._out_scale_dict[layer_name] = float( - sub_layer._quant_out_scale) - - def _save_output_scale(self, infer_program): + def _is_target_layer(self, layer): """ - Save all output scales to the corresponding ops in static - inference program. 
- - Because the Layer in dygraph may correspond to multiple ops - in static program after being saved. To ensure correctness, - the outscale collected for output of dygraph Layer can only - be set to the last op in the corresponding ops in static program. + Whether the layer needs to calculate output scales. """ - assert infer_program.num_blocks == 1, \ - "The inference program should only have a block." - - global_block = infer_program.global_block() - target_ops = global_block.ops - - scale_idx = 0 - op_idx = 0 - attr_name = "out_threshold" - - for scale_name, scale_value in self._out_scale_dict.items(): - while True: - if op_idx >= len(target_ops): - break - - op = target_ops[op_idx] - if not self._is_scale_op_matched(scale_name, op, global_block): - op_idx += 1 - else: - if op.type in utils.weight_op_types \ - and op_idx + 1 < len(target_ops) \ - and target_ops[op_idx+1].type == "elementwise_add": - target_ops[op_idx + 1]._set_attr(attr_name, scale_value) - op_idx += 2 - else: - op._set_attr(attr_name, scale_value) - op_idx += 1 - scale_idx += 1 - break - - if scale_idx != len(self._out_scale_dict): - _logger.warning("Warning: the model have %s output scales, "\ - "but it only saves %s output scales." \ - % (len(self._out_scale_dict), scale_idx)) - - def _is_target_layer(self, layer): return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ - or ('quantized_' in layer.full_name() and \ + or ('quantized' in layer.full_name() and \ 'quantized_noweight' not in layer.full_name()) - def _init_scale_params(self, layer, name=None): + def _save_output_scale(self, program, scope): """ - Init the scale params for calculating output scales and save them in the - target layer. - After the users define the dygraph model, the hooks for calculating output - scales will not execute immediately. If the users load parameters form - checkpoint and save the quantized inference model immediately, the inference - model would not be saved successfully. Beacuse the dygraph_to_static requires - that the parameters created in __init__, but the uniqueness of hook make it - impossible to create parameters in __init__. To avoid this mistake, we define - the scale parameters in the beginning instead of hook. + Save all output scales to the corresponding ops in static + inference program and delete 'moving_average_abs_max_scale' ops. 
""" + for block in program.blocks: + for op in block.ops: + if op.type == "moving_average_abs_max_scale": + in_var_name = op.input('X')[0] + out_var_name = op.output('Out')[0] + out_scale_name = op.output('OutScale')[0] - def _create_param(in_layer, first_name, last_name, dtype): - prefix = '{}.{}'.format(first_name, last_name) \ - if first_name else 'outscale.{}'.format(last_name) - attr = ParamAttr( - name=unique_name.generate(prefix), - initializer=Constant(1), - trainable=False) - param = in_layer.create_parameter(shape=[1], attr=attr, dtype=dtype) - return param - - dtype = layer._dtype if layer._dtype is not None else "float32" - if dtype not in ["float32", "float64"]: - return - - layer._quant_out_scale = _create_param(layer, name, "scale", dtype) - layer._quant_out_scale.stop_gradient = True - - layer._quant_out_state = _create_param(layer, name, "state", dtype) - layer._quant_out_state.stop_gradient = True + out_scale = utils.load_variable_data(scope, out_scale_name) + previous_op = utils.find_previous_op(block, in_var_name) + previous_op._set_attr("out_threshold", float(out_scale)) - layer._quant_out_accum = _create_param(layer, name, "accum", dtype) - layer._quant_out_accum.stop_gradient = True + next_ops = utils.find_next_ops(block, out_var_name) + for next_op in next_ops: + next_op._rename_input(out_var_name, in_var_name) - def _is_scale_op_matched(self, scale_name, op, block): + def _set_skip_quant_attr(self, program): """ - Based on the op name and attrs to judge whether the op in - program matches the scale_name. We must know the corresponding - name between dgraph and static model. + Label the skip quantized ops. """ - fp_type = [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32] - if op.type in quantization_pass._op_real_in_out_name.keys(): - output_var_names = quantization_pass._get_op_output_var_names(op) - for output_var_name in output_var_names: - output_var_tensor = block.var(output_var_name) - if output_var_tensor.dtype not in fp_type: - return False - - # corresponding_map: [name, op_types, function] - # Note that, the items have priority in corresponding_map - corresponding_map = [ - ['conv2d_tranpose', ['conv2d_transpose', \ - 'depthwise_conv2d_transpose'], None], - ['conv2d', ['conv2d', 'depthwise_conv2d'], None], - ['linear', ['matmul'], None], - ['re_lu6', ['relu6'], None], - ['p_re_lu', ['prelu'], None], - ['leaky_re_lu', ['leaky_relu'], None], - ['re_lu', ['relu'], None], - ] - - for item in corresponding_map: - if item[0] in scale_name: - return (op.type in item[1]) and \ - (len(item) == 2 or item[2] is None or item[2](op)) - - return op.type in scale_name - - def _set_skip_quant_attr(self, program): - block = program.global_block() - for op in block.ops: - if self._is_skip_quant_op(block, op): - op._set_attr("skip_quant", True) + for block in program.blocks: + for op in block.ops: + if self._is_skip_quant_op(block, op): + op._set_attr("skip_quant", True) def _is_skip_quant_op(self, block, in_op): """ @@ -621,33 +507,11 @@ class ImperativeCalcOutputScale(object): 1. the type of input op should be conv2d, depthwise_conv2d or matmul 2. 
the previous ops of the input op are not fake_quantize_dequantize ops """ - - def _find_previous_op(block, var_name): - for op in block.ops: - if var_name in op.output_arg_names: - return op - target_op_types = ["conv2d", "depthwise_conv2d", "matmul"] if in_op.type not in target_op_types: return False - previous_ops = [_find_previous_op(block, arg_name) \ + previous_ops = [utils.find_previous_op(block, arg_name) \ for arg_name in in_op.input_arg_names] - return any(op is not None and op.type not in utils.fake_quantize_dequantize_types \ - for op in previous_ops ) - - def _calc_output_scale_hook(self, layer, input, output): - """ - Create the MovingAverageAbsMaxScale layer for the target layer if needed. - Execute MovingAverageAbsMaxScale layer to calculate the output scale. - """ - assert isinstance(output, (core.VarBase, framework.Variable)), \ - "Multiple outputs are not currently supported in ImperativeOutScale." - - fp_types = [core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64] - if output.dtype in fp_types: - if not hasattr(layer, "_out_scale"): - self._out_scale = quant_nn.MovingAverageAbsMaxScale( - layer, output.name, self._moving_rate, output.dtype) - # TODO (jc): consider the ops that have several outputs - self._out_scale(output) + return any(op is not None and op.type not in \ + utils.fake_quantize_dequantize_types for op in previous_ops) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py index 3c4fb323bc5..f6fef0689d4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/quant_nn.py @@ -507,59 +507,42 @@ class QuantizedNoweightLayer(layers.Layer): class MovingAverageAbsMaxScale(layers.Layer): - def __init__(self, layer=None, name=None, moving_rate=0.9, dtype='float32'): + def __init__(self, name=None, moving_rate=0.9, dtype='float32'): r""" - MovingAverageMaxScale layer is used to calculating the output quantization scale of Layer. - Its computational formula is described as below: + MovingAverageMaxScale layer is used to calculating the output quantization + scale of Layer. 
Its computational formula is described as below: :math:`scale = (moving\_rate*accum+max(abs(x)))/(moving\_rate*state+1)` :math:`Out = X` """ super(MovingAverageAbsMaxScale, self).__init__() self._moving_rate = moving_rate - self._dtype = dtype - self._layer = layer - if self._layer is None or not hasattr(self._layer, "_quant_out_scale"): - scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' - scale_name = unique_name.generate(scale_prefix) - scale_attr = ParamAttr( - name=scale_name, initializer=Constant(1), trainable=False) - self._scale = self.create_parameter( - shape=[1], attr=scale_attr, dtype=self._dtype) - self._scale.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_scale", self._scale) - else: - self._scale = self._layer._quant_out_scale + scale_prefix = '{}.scale'.format(name) if name else 'outscale.scale' + scale_name = unique_name.generate(scale_prefix) + scale_attr = ParamAttr( + name=scale_name, initializer=Constant(1), trainable=False) + self._scale = self.create_parameter( + shape=[1], attr=scale_attr, dtype=dtype) + self._scale.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_state"): - state_prefix = "{}.state".format(name) if name else 'outscale.state' - state_attr = ParamAttr( - name=unique_name.generate(state_prefix), - initializer=Constant(1), - trainable=False) - self._state = self.create_parameter( - shape=[1], attr=state_attr, dtype=self._dtype) - self._state.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_state", self._state) - else: - self._state = self._layer._quant_out_state + state_prefix = "{}.state".format(name) if name else 'outscale.state' + state_attr = ParamAttr( + name=unique_name.generate(state_prefix), + initializer=Constant(1), + trainable=False) + self._state = self.create_parameter( + shape=[1], attr=state_attr, dtype=dtype) + self._state.stop_gradient = True - if self._layer is None or not hasattr(self._layer, "_quant_out_accum"): - accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' - accum_attr = ParamAttr( - name=unique_name.generate(accum_prefix), - initializer=Constant(1), - trainable=False) - self._accum = self.create_parameter( - shape=[1], attr=accum_attr, dtype=self._dtype) - self._accum.stop_gradient = True - if self._layer is not None: - setattr(self._layer, "_quant_out_accum", self._accum) - else: - self._accum = self._layer._quant_out_accum + accum_prefix = "{}.accum".format(name) if name else 'outscale.accum' + accum_attr = ParamAttr( + name=unique_name.generate(accum_prefix), + initializer=Constant(1), + trainable=False) + self._accum = self.create_parameter( + shape=[1], attr=accum_attr, dtype=dtype) + self._accum.stop_gradient = True def forward(self, input): if in_dygraph_mode(): @@ -567,18 +550,30 @@ class MovingAverageAbsMaxScale(layers.Layer): not self.training) state = self._state if self.training else None accum = self._accum if self.training else None + quant_out = _varbase_creator( + type=input.type, + name="{}.tmp".format(input.name), + shape=input.shape, + dtype=input.dtype, + persistable=False) - self._scale, _, _ = core.ops.moving_average_abs_max_scale( - input, accum, state, self._scale, state, accum, *attrs) - return self._scale + out, _, _, _ = core.ops.moving_average_abs_max_scale( + input, accum, state, quant_out, self._scale, state, accum, + *attrs) + return out check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'MovingAverageAbsMaxScale') attrs = 
{'moving_rate': self._moving_rate, 'is_test': not self.training} - inputs = {"X": [input]} - outputs = {"OutScale": [self._scale]} + quant_out = self._helper.create_variable( + name="{}.tmp".format(input.name), + dtype=input.dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + outputs = {"Out": [quant_out], "OutScale": [self._scale]} if self.training: inputs['InState'] = [self._state] @@ -592,4 +587,22 @@ class MovingAverageAbsMaxScale(layers.Layer): outputs=outputs, attrs=attrs) - return self._scale + return quant_out + + +class QuantizedOutputLayer(layers.Layer): + def __init__(self, layer=None, moving_rate=0.9, dtype='float32'): + r""" + Add MovingAverageMaxScale layer to the behind of the input layer. + """ + super(QuantizedOutputLayer, self).__init__() + self._layer = layer + self._moving_average_abs_max_scale = \ + MovingAverageAbsMaxScale(layer.full_name(), moving_rate, dtype) + + def forward(self, input): + if isinstance(input, list): + assert len(input) == 1, \ + "The QuantizedOutputLayer should only have one input." + out = self._layer(input) + return self._moving_average_abs_max_scale(out) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index 090f6cda389..f45eb8c97f4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -13,22 +13,7 @@ # limitations under the License. import paddle - -op_real_in_out_name = { - "conv2d": [["Input", "Filter"], ["Output"]], - "depthwise_conv2d": [["Input", "Filter"], ["Output"]], - "pool2d": [["X"], ["Out"]], - "elementwise_add": [["X", "Y"], ["Out"]], - "softmax": [["X"], ["Out"]], - "relu": [["X"], ["Out"]], - "relu6": [["X"], ["Out"]], - "leaky_relu": [["X"], ["Out"]], - "prelu": [["X"], ["Out"]], - "tanh": [["X"], ["Out"]], - "batch_norm": [["X"], ["Y"]], - "sigmoid": [["X"], ["Out"]], - "swish": [["X"], ["Out"]], -} +import numpy as np quant_input_layers_map = { 'Conv2D': paddle.nn.Conv2D, @@ -85,3 +70,33 @@ weight_op_types = [ "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", "depthwise_conv2d_transpose" ] + + +def load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + var_node = scope.find_var(var_name) + assert var_node is not None, \ + "Can not find " + var_name + " in the scope." + return np.array(var_node.get_tensor()) + + +def find_previous_op(block, var_name): + """ + Find the previous op for the input variable. + """ + for op in block.ops: + if var_name in op.output_arg_names: + return op + + +def find_next_ops(block, var_name): + """ + Find all followed ops for the input variable. 
+ """ + res_ops = [] + for op in block.ops: + if var_name in op.input_arg_names: + res_ops.append(op) + return res_ops diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 600174e503f..8d6ce76ef0f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -478,30 +478,5 @@ class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): self.assertTrue(op_count == 14) -class TestSaveQuantizedModel_Warning(unittest.TestCase): - def test_warning(self): - path = "./dynamic_outscale_infer_model_with_warnings/lenet" - imperative_out_scale = ImperativeQuantAware() - with fluid.dygraph.guard(): - lenet = ImperativeLenet() - - with warnings.catch_warnings(record=True) as w: - warnings.simplefilter("always") - imperative_out_scale.save_quantized_model( - layer=lenet, - path=path, - input_spec=[ - paddle.static.InputSpec( - shape=[None, 1, 28, 28], dtype='float32') - ]) - - warning_message = "Warning: No Layer of the model while to be " \ - "saved contains the out_threshold attribute, so the " \ - "generated inference model would not contain the " \ - "out_threshold." - num = get_vaild_warning_num(warning_message, w) - assert num == 1 - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 01f0abe0f21..1d7bfc9f696 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -166,12 +166,14 @@ class TestMovingAverageAbsMaxScaleOp(OpTest): accum[0] = 1 state = np.zeros(1).astype("float32") state[0] = 1 + x = np.random.random((8, 16, 7, 7)).astype("float32") self.inputs = { - 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'X': x, 'InAccum': accum, 'InState': state, } + out = x out_accum = np.zeros(1).astype("float32") out_state = np.zeros(1).astype("float32") out_scale = np.zeros(1).astype("float32") @@ -180,6 +182,7 @@ class TestMovingAverageAbsMaxScaleOp(OpTest): out_state[0] = self.attrs['moving_rate'] * state[0] + 1 out_scale = out_accum / out_state self.outputs = { + 'Out': out, 'OutAccum': out_accum, 'OutState': out_state, 'OutScale': out_scale, -- GitLab From bfb5cf5567a604fded177d90d639f7337015e3fa Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 2021 10:16:54 +0800 Subject: [PATCH 083/486] [Paddle-TRT] trt affine channel converter (#31628) * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * trt affine channel converter * add trt affine channel base test * add trt affine channel NHWC * remove asterisk for python2 compatibility * fix rebase * move LodTensor to Tensor * add dbg info * affine channel converter only support NCHW * scale,bias are parameters, use create_parameters api * reduce test input size to not exceed the timelimit of ci * refine affine channel unittest and add serialization/dynamic test * change super to InferencePassTest for python2 compatibility * change super to InferencePassTest for python2 compatibility * fix affine channel fp16 serialize setting --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/affine_channel_op.cc | 94 ++++++++++++ 
paddle/fluid/inference/tensorrt/op_teller.cc | 10 +- .../inference/test_trt_affine_channel_op.py | 141 ++++++++++++++++++ 5 files changed, 246 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0007582e2c7..76bf5948a2b 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); #endif diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index be7fa0548d9..6af76bd11cd 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) diff --git a/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc new file mode 100644 index 00000000000..813342c0848 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/affine_channel_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Affine Channel Op + */ +class AffineChannelOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid affine_channel op to tensorrt scale nd layer"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string scale_name = op_desc.Input("Scale").front(); + std::string bias_name = op_desc.Input("Bias").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input_tensor = engine_->GetITensor(input_name); + auto idim = input_tensor->getDimensions(); + + auto* scale_v = scope.FindVar(scale_name); + auto* scale_t = scale_v->GetMutable(); + float* scale_ptr = engine_->GetWeightCPUData(scale_name, scale_t, false); + + auto* bias_v = scope.FindVar(bias_name); + auto* bias_t = bias_v->GetMutable(); + float* bias_ptr = engine_->GetWeightCPUData(bias_name, bias_t, false); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + + PADDLE_ENFORCE_EQ( + data_layout, framework::DataLayout::kNCHW, + platform::errors::InvalidArgument( + "TensorRT affine channel converter can only convert NCHW format. " + "Other format should be run in fluid mode. Report a bug on github " + "issue if you see this line.")); + + // tensorrt scalend layer only support spatial dims >= 2, + // so nhwc is not availabe (spatial dims == 0) + const int channel_axis = engine_->with_dynamic_shape(); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight bias_weights{nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), + (size_t)idim.d[channel_axis]}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, + 0}; + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *input_tensor, + nvinfer1::ScaleMode::kCHANNEL, + bias_weights.get(), scale_weights.get(), + power_weights.get(), channel_axis); + + RreplenishLayerAndOutput(layer, "affine_channel", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(affine_channel, AffineChannelOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 82f58254fe8..eb429405d18 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "affine_channel", "multiclass_nms", "nearest_interp", }; @@ -196,6 +197,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "affine_channel") { + if (!desc.HasAttr("data_layout")) return false; + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW) return false; + } + if (op_type == "multiclass_nms") { if 
(with_dynamic_shape) return false; auto* block = desc.Block(); @@ -238,6 +246,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -254,7 +263,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } - if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py new file mode 100644 index 00000000000..8bbba7c8b55 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_affine_channel_op.py @@ -0,0 +1,141 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTAffineChannelTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.channel = 8 + self.height = 16 + self.width = 16 + self.data_layout = 'NCHW' + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + # set min_graph_size to 2, + # because affine channel doesn't support nhwc format + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, 2, self.precision, self.serialize, False) + + with fluid.program_guard(self.main_program, self.startup_program): + if self.data_layout == 'NCHW': + shape = [-1, self.channel, self.height, self.width] + else: + shape = [-1, self.height, self.width, self.channel] + + data = fluid.data(name='in', shape=shape, dtype='float32') + # set scale, bias by constant + scale = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(2.)) + bias = fluid.layers.create_parameter( + shape=[self.channel], + dtype='float32', + default_initializer=fluid.initializer.Constant(.5)) + affine_channel_out = fluid.layers.affine_channel( + data, scale=scale, bias=bias, data_layout=self.data_layout) + out = fluid.layers.batch_norm(affine_channel_out, is_test=True) + + shape[0] = self.bs + self.feeds = {'in': np.random.random(shape).astype('float32'), } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def 
run_test(self): + self.build() + self.check_output() + + def run_test_all(self): + precision_opt = [ + AnalysisConfig.Precision.Float32, AnalysisConfig.Precision.Half + ] + serialize_opt = [False, True] + + if self.data_layout == 'NCHW': + min_shape = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + max_shape = [self.bs, self.channel, self.height * 2, self.width * 2] + opt_shape = [self.bs, self.channel, self.height, self.width] + + if self.data_layout == 'NHWC': + min_shape = [ + self.bs, self.height // 2, self.width // 2, self.channel + ] + max_shape = [self.bs, self.height * 2, self.width * 2, self.channel] + opt_shape = [self.bs, self.height, self.width, self.channel] + + dynamic_shape_profile = InferencePassTest.DynamicShapeParam({ + 'in': min_shape + }, {'in': max_shape}, {'in': opt_shape}, False) + dynamic_shape_opt = [None, dynamic_shape_profile] + + for precision, serialize, dynamic_shape in itertools.product( + precision_opt, serialize_opt, dynamic_shape_opt): + self.precision = precision + self.serialize = serialize + self.dynamic_shape_params = dynamic_shape + self.run_test() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'in': [self.bs, self.channel, self.height // 2, self.width // 2] + }, {'in': [self.bs, self.channel, self.height * 2, self.width * 2] + }, {'in': [self.bs, self.channel, self.height, self.width]}, False) + self.run_test() + + def test_nchw_all(self): + self.run_test_all() + + def test_nhwc(self): + self.data_layout = 'NHWC' + self.run_test_all() + + +if __name__ == "__main__": + unittest.main() -- GitLab From e3a38d790a0f275fd9332b5ce0ad152f74257b61 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Mon, 29 Mar 2021 14:16:56 +0800 Subject: [PATCH 084/486] [Paddle-TRT] roi_align_plugin (#31732) * add roi_align_plugin * add roi align unit_test * add roi align serialization * remove roi align static plugin because of batch dim issue * refine roi align unittest and add fp16/serialization * add trt roi align condition to op_teller * refine error message * remove unnecessary reshape layer --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/roi_align_op.cc | 86 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 24 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/roi_align_op_plugin.cu | 380 ++++++++++++++++++ .../tensorrt/plugin/roi_align_op_plugin.h | 112 ++++++ .../ir/inference/test_trt_roi_align_op.py | 119 ++++++ 8 files changed, 724 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/roi_align_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 76bf5948a2b..7bb092d0e3c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(roi_align); 
USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 6af76bd11cd..bc7b7355ea1 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc nearest_interp_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc new file mode 100644 index 00000000000..1329608aecd --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Roi Align Op + */ +class RoiAlignOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid roi align op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("X").front(); + std::string rois_name = op_desc.Input("ROIs").front(); + std::string output_name = op_desc.Output("Out").front(); + + const auto pooled_height = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_height")); + const auto pooled_width = + BOOST_GET_CONST(int, op_desc.GetAttr("pooled_width")); + const auto spatial_scale = + BOOST_GET_CONST(float, op_desc.GetAttr("spatial_scale")); + const auto sampling_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("sampling_ratio")); + + const auto input_tensor = engine_->GetITensor(input_name); + const auto rois_tensor = engine_->GetITensor(rois_name); + + const nvinfer1::DataType data_type_ = engine_->WithFp16() + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + + std::vector inputs{input_tensor, rois_tensor}; + nvinfer1::ILayer* layer = nullptr; + + PADDLE_ENFORCE_EQ( + engine_->with_dynamic_shape(), true, + platform::errors::InvalidArgument( + "TRT roi align plugin only accept the dynamic shape, because that " + "the roi_align will change the batch size.")); + + auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( + data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); + auto roi_align_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *roi_align_plugin); + layer = roi_align_layer; + + std::vector output_names{output_name}; + RreplenishLayerAndOutput(layer, "roi_align", output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(roi_align, RoiAlignOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index eb429405d18..7c1b2e8001e 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "roi_align", "affine_channel", "multiclass_nms", "nearest_interp", @@ -263,6 +264,29 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) return false; + + std::vector attrs{"pooled_height", "pooled_width", + "spatial_scale", "sampling_ratio"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + + const auto pooled_height = + BOOST_GET_CONST(int, desc.GetAttr("pooled_height")); + if (pooled_height <= 0) return false; + + const auto pooled_width = + BOOST_GET_CONST(int, desc.GetAttr("pooled_width")); + if (pooled_width <= 0) return false; + + const auto spatial_scale = + BOOST_GET_CONST(float, desc.GetAttr("spatial_scale")); + if (spatial_scale <= 0.f) return false; + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 7ee16a598d2..4107f9ef674 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu new file mode 100644 index 00000000000..42c0df41a1b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -0,0 +1,380 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +template +__inline__ __device__ T BilinearInterpolate(const T* input_data, + const int height, const int width, + T y, T x) { + if (y < -1.f || y > height || x < -1.f || x > width) return 0; + y = y <= 0.f ? 0.f : y; + x = x <= 0.f ? 0.f : x; + int y_low = static_cast(y); + int x_low = static_cast(x); + int y_high; + int x_high; + if (y_low >= height - 1) { + y_high = y_low = height - 1; + y = static_cast(y_low); + } else { + y_high = y_low + 1; + } + if (x_low >= width - 1) { + x_high = x_low = width - 1; + x = static_cast(x_low); + } else { + x_high = x_low + 1; + } + T ly = y - y_low, lx = x - x_low; + T hy = 1.f - ly, hx = 1.f - lx; + T v1 = input_data[y_low * width + x_low]; + T v2 = input_data[y_low * width + x_high]; + T v3 = input_data[y_high * width + x_low]; + T v4 = input_data[y_high * width + x_high]; + T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx; + T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + +template +__global__ void GPUROIAlignOpt(const int nthreads, + const T* __restrict__ input_data, + const T* __restrict__ input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, const int pooled_width, + const int sampling_ratio, const int num_rois, + OutT* __restrict__ output_data) { + const int batch = blockIdx.x; + const int channel = blockIdx.y; + const T* offset_input_data = + input_data + (batch * channels + channel) * height * width; + extern __shared__ T s_input_data[]; + if (USE_SMEM) { + for (int idx = threadIdx.x; idx < height * width; idx += blockDim.x) { + s_input_data[idx] = offset_input_data[idx]; + } + __syncthreads(); + } + for (int idx = threadIdx.x; idx < num_rois * pooled_height * pooled_width; + idx += blockDim.x) { + const int pw = idx % pooled_width; + const int ph = (idx / pooled_width) % pooled_height; + const int roi_idx = (idx / pooled_width / pooled_height) % num_rois; + const int n = batch * num_rois + roi_idx; + const float4 rois_offset = reinterpret_cast(input_rois)[n]; + const T roi_xmin = rois_offset.x * spatial_scale; + const T roi_ymin = rois_offset.y * spatial_scale; + const T roi_xmax = rois_offset.z * spatial_scale; + const T roi_ymax = rois_offset.w * spatial_scale; + const T roi_width = max(roi_xmax - roi_xmin, static_cast(1.f)); + const T roi_height = max(roi_ymax - roi_ymin, static_cast(1.f)); + const T bin_size_h = roi_height / static_cast(pooled_height); + const T bin_size_w = roi_width / static_cast(pooled_width); + const int roi_bin_grid_h = (sampling_ratio > 0) + ? sampling_ratio + : ceil(roi_height / pooled_height); + const int roi_bin_grid_w = + (sampling_ratio > 0) ? 
sampling_ratio : ceil(roi_width / pooled_width); + const T count = roi_bin_grid_h * roi_bin_grid_w; + + T output_val = 0.f; + for (int iy = 0; iy < roi_bin_grid_h; ++iy) { + const T y = roi_ymin + ph * bin_size_h + + static_cast(iy + .5f) * bin_size_h / + static_cast(roi_bin_grid_h); + for (int ix = 0; ix < roi_bin_grid_w; ++ix) { + const T x = roi_xmin + pw * bin_size_w + + static_cast(ix + .5f) * bin_size_w / + static_cast(roi_bin_grid_w); + if (USE_SMEM) { + T val = BilinearInterpolate(s_input_data, height, width, y, x); + output_val += val; + } else { + T val = + BilinearInterpolate(offset_input_data, height, width, y, x); + output_val += val; + } + } + } + output_val /= count; + const int out_offset = + batch * num_rois * channels * pooled_height * pooled_width + + roi_idx * channels * pooled_height * pooled_width + + channel * pooled_height * pooled_width + ph * pooled_width + pw; + output_data[out_offset] = static_cast(output_val); + } +} + +#if IS_TRT_VERSION_GE(6000) +RoiAlignPluginDynamic::RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, + float spatial_scale, + int sampling_ratio) + : data_type_(data_type), + pooled_height_(pooled_height), + pooled_width_(pooled_width), + spatial_scale_(spatial_scale), + sampling_ratio_(sampling_ratio) { + bool data_type_is_valid = data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF; + PADDLE_ENFORCE_EQ(data_type_is_valid, true, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts kFLOAT(%d) or " + "kHALF(%d) data type, but the received data type = %d", + static_cast(nvinfer1::DataType::kFLOAT), + static_cast(nvinfer1::DataType::kHALF), + static_cast(data_type_))); + + PADDLE_ENFORCE_GT(pooled_height_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_height " + "greater than %d, but the received pooled_height = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(pooled_width_, 0, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts pooled_width greater " + "than %d, but the received pooled_width = %d", + 0, pooled_height_)); + + PADDLE_ENFORCE_GT(spatial_scale_, 0.f, + platform::errors::InvalidArgument( + "TRT RoiAlign plugin only accepts spatial_scale " + "greater than %f, but the received spatial_scale = %f", + 0, spatial_scale_)); + + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +RoiAlignPluginDynamic::RoiAlignPluginDynamic(void const* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &pooled_height_); + DeserializeValue(&data, &length, &pooled_width_); + DeserializeValue(&data, &length, &spatial_scale_); + DeserializeValue(&data, &length, &sampling_ratio_); + int smem_per_block = -1; + int device = -1; + cudaGetDevice(&device); + PADDLE_ENFORCE_GE( + device, 0, + platform::errors::InvalidArgument( + "The cuda device ID should be greater than %d, but device ID is %d", + 0, device)); + cudaDeviceGetAttribute(&smem_per_block, cudaDevAttrMaxSharedMemoryPerBlock, + device); + smem_per_block_ = smem_per_block; +} + +nvinfer1::IPluginV2DynamicExt* RoiAlignPluginDynamic::clone() const { + auto* plugin = + new 
RoiAlignPluginDynamic(data_type_, pooled_height_, pooled_width_, + spatial_scale_, sampling_ratio_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs RoiAlignPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[1].d[0]; // roi + ret.d[1] = inputs[0].d[1]; // X + ret.d[2] = exprBuilder.constant(pooled_height_); + ret.d[3] = exprBuilder.constant(pooled_width_); + return ret; +} + +bool RoiAlignPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + if (inOut[pos].format != nvinfer1::TensorFormat::kLINEAR) { + return false; + } + if (pos < 2) { // input + return inOut[pos].type == nvinfer1::DataType::kFLOAT; + } + return inOut[pos].type == data_type_; +} + +void RoiAlignPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t RoiAlignPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int RoiAlignPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto in_dims = inputDesc[0].dims; + auto rois_dims = inputDesc[1].dims; + auto out_dims = outputDesc[0].dims; + + int rois_num = rois_dims.d[0]; + if (rois_num == 0) return cudaGetLastError() != cudaSuccess; + + int batch = in_dims.d[0]; + int channels = in_dims.d[1]; + int height = in_dims.d[2]; + int width = in_dims.d[3]; + + int output_size = + out_dims.d[0] * out_dims.d[1] * out_dims.d[2] * out_dims.d[3]; + + const dim3 blocks(batch, channels); + const int threads = 512; + + if (smem_per_block_ < width * height * sizeof(T)) { + GPUROIAlignOpt<<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } else { + GPUROIAlignOpt< + T, OutT, true><<>>( + output_size, static_cast(inputs[0]), + static_cast(inputs[1]), spatial_scale_, channels, height, + width, pooled_height_, pooled_width_, sampling_ratio_, rois_num / batch, + static_cast(outputs[0])); + } + + return cudaGetLastError() != cudaSuccess; +} + +int RoiAlignPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, void* workspace, + cudaStream_t stream) { + PADDLE_ENFORCE_EQ(outputDesc[0].type, data_type_, + platform::errors::InvalidArgument( + "TRT RoiAlignPluginDynamic expects outputDesc[0].type " + "equal to data_type_")); + + if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); + } + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, + workspace, stream); +} + +nvinfer1::DataType RoiAlignPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* RoiAlignPluginDynamic::getPluginType() const { + return "roi_align_plugin_dynamic"; +} + +int RoiAlignPluginDynamic::getNbOutputs() const { 
return 1; } + +int RoiAlignPluginDynamic::initialize() { return 0; } + +void RoiAlignPluginDynamic::terminate() {} + +size_t RoiAlignPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(pooled_height_); + serialize_size += SerializedSize(pooled_width_); + serialize_size += SerializedSize(spatial_scale_); + serialize_size += SerializedSize(sampling_ratio_); + return serialize_size; +} + +void RoiAlignPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, pooled_height_); + SerializeValue(&buffer, pooled_width_); + SerializeValue(&buffer, spatial_scale_); + SerializeValue(&buffer, sampling_ratio_); +} + +void RoiAlignPluginDynamic::destroy() {} + +RoiAlignPluginDynamicCreator::RoiAlignPluginDynamicCreator() {} + +void RoiAlignPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* RoiAlignPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* RoiAlignPluginDynamicCreator::getPluginName() const { + return "roi_align_plugin_dynamic"; +} + +const char* RoiAlignPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +RoiAlignPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; +} + +nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new RoiAlignPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h new file mode 100644 index 00000000000..bba7d0d5a99 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.h @@ -0,0 +1,112 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class RoiAlignPluginDynamic : public DynamicPluginTensorRT { + public: + explicit RoiAlignPluginDynamic(const nvinfer1::DataType data_type, + const int pooled_height, + const int pooled_width, float spatial_scale, + int sampling_ratio); + RoiAlignPluginDynamic(void const* data, size_t length); + ~RoiAlignPluginDynamic() = default; + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + + nvinfer1::DataType data_type_; + int pooled_height_; + int pooled_width_; + float spatial_scale_; + int sampling_ratio_; + int smem_per_block_; + std::string namespace_; +}; + +class RoiAlignPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + RoiAlignPluginDynamicCreator(); + ~RoiAlignPluginDynamicCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(RoiAlignPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py new file mode 100644 index 00000000000..fa276dd342b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_roi_align_op.py @@ -0,0 +1,119 @@ +# 
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTRoiAlignTest(InferencePassTest): + def setUp(self): + self.bs = 2 + self.num_rois = 4 + self.channel = 16 + self.height = 32 + self.width = 32 + self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + + def build(self): + self.trt_parameters = TRTRoiAlignTest.TensorRTParam( + 1 << 30, self.bs * self.num_rois, 1, self.precision, self.serialize, + False) + with fluid.program_guard(self.main_program, self.startup_program): + data_shape = [-1, self.channel, self.height, self.width] + data = fluid.data(name='data', shape=data_shape, dtype='float32') + rois = fluid.data( + name='rois', shape=[-1, 4], dtype='float32', lod_level=1) + roi_align_out = fluid.layers.roi_align(data, rois) + out = fluid.layers.batch_norm(roi_align_out, is_test=True) + + rois_lod = fluid.create_lod_tensor( + np.random.random([self.bs * self.num_rois, 4]).astype('float32'), + [[self.num_rois, self.num_rois]], fluid.CPUPlace()) + + data_shape[0] = self.bs + self.feeds = { + 'data': np.random.random(data_shape).astype('float32'), + 'rois': rois_lod, + } + self.fetch_list = [out] + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + def set_dynamic(self): + min_shape_spec = dict() + max_shape_spec = dict() + opt_shape_spec = dict() + min_shape_spec['data'] = [ + self.bs, self.channel, self.height // 2, self.width // 2 + ] + min_shape_spec['rois'] = [1, 4] + max_shape_spec[ + 'data'] = [self.bs, self.channel, self.height * 2, self.width * 2] + max_shape_spec['rois'] = [self.bs * self.num_rois, 4] + opt_shape_spec[ + 'data'] = [self.bs, self.channel, self.height, self.width] + opt_shape_spec['rois'] = [self.bs * self.num_rois, 4] + + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam( + min_shape_spec, max_shape_spec, opt_shape_spec, False) + + def run_test(self): + self.build() + self.check_output() + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16(self): + self.set_dynamic() + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_dynamic_serialize(self): + self.set_dynamic() + self.serialize = True + self.run_test() + + +if __name__ 
== "__main__": + unittest.main() -- GitLab From 51eb29de18adcf8c20272218f105eb1c2135cc09 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Mon, 29 Mar 2021 14:17:54 +0800 Subject: [PATCH 085/486] [CustomOP] Add shape related constructor for Tensor (#31681) * give shape related contructor and reshape warning * change line num to fit ut * change ut to fit * remove useless code * call resize directly in constructor --- paddle/fluid/extension/include/ext_tensor.h | 3 +++ paddle/fluid/extension/src/ext_tensor.cc | 21 ++++++++++++++++++- paddle/fluid/framework/custom_tensor_utils.h | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 3 +-- .../custom_op/test_custom_relu_op_jit.py | 4 ++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/extension/include/ext_tensor.h b/paddle/fluid/extension/include/ext_tensor.h index be492a6d553..52606b2a7f5 100644 --- a/paddle/fluid/extension/include/ext_tensor.h +++ b/paddle/fluid/extension/include/ext_tensor.h @@ -52,6 +52,9 @@ class PD_DLL_DECL Tensor { /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. explicit Tensor(const PlaceType& place); + /// \brief Construct a Tensor on target Place with shape for CustomOp. + /// Generally it's only used for user to create Tensor. + Tensor(const PlaceType& place, const std::vector& shape); /// \brief Reset the shape of the tensor. /// Generally it's only used for the input tensor. /// Reshape must be called before calling diff --git a/paddle/fluid/extension/src/ext_tensor.cc b/paddle/fluid/extension/src/ext_tensor.cc index 0cae8f4af7b..e9705e2101c 100644 --- a/paddle/fluid/extension/src/ext_tensor.cc +++ b/paddle/fluid/extension/src/ext_tensor.cc @@ -102,13 +102,32 @@ void GpuCopy(T *src, T *dst, PlaceType src_plc, PlaceType dst_plc, void Tensor::reshape(const std::vector &shape) { GET_CASTED_TENSOR - tensor->Resize(framework::make_ddim(shape)); + auto new_dim = framework::make_ddim(shape); + if (tensor->numel() != framework::product(new_dim)) { + LOG(WARNING) << "Custom Op: Calling reshape to a new shape which is bigger " + "or smaller" + << "than original shape will not change your tensor's memory " + "Please call" + << "paddle::Tensor::mutable_data() after to reallocate " + "your tensor's size." + << std::endl; + } + tensor->Resize(new_dim); } Tensor::Tensor(const PlaceType &place) : tensor_(std::make_shared()), place_(place), stream_(StreamWrapper()) {} + +Tensor::Tensor(const PlaceType &place, const std::vector &shape) + : tensor_(std::make_shared()), + place_(place), + stream_(StreamWrapper()) { + GET_CASTED_TENSOR + tensor->Resize(framework::make_ddim(shape)); +} + template T *Tensor::mutable_data(const PlaceType &place) { place_ = place; diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h index fad1e3ee349..809a6b965aa 100644 --- a/paddle/fluid/framework/custom_tensor_utils.h +++ b/paddle/fluid/framework/custom_tensor_utils.h @@ -37,7 +37,7 @@ class CustomTensorUtils { /// \brief Share data FROM another tensor. /// Use this to pass tensor from op to op /// \return void. 
- static void ShareDataFrom(const void* src, const Tensor& dst); + static void ShareDataFrom(const void* src, const paddle::Tensor& dst); static framework::proto::VarType::Type ConvertEnumDTypeToInnerDType( const paddle::DataType& dtype) { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index c0b30a1cb55..c075d27f7b1 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -38,9 +38,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU); + auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); - out.reshape(x.shape()); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 23733d20841..641630b0f44 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -103,11 +103,11 @@ class TestJITLoad(unittest.TestCase): in str(e)) if IS_WINDOWS: self.assertTrue( - r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:48" + r"python\paddle\fluid\tests\custom_op\custom_relu_op.cc:47" in str(e)) else: self.assertTrue( - "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:48" + "python/paddle/fluid/tests/custom_op/custom_relu_op.cc:47" in str(e)) self.assertTrue(caught_exception) -- GitLab From 61805d8f0aa304f4226e5793b97da97552a43282 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Mon, 29 Mar 2021 17:11:26 +0800 Subject: [PATCH 086/486] fix cmake model path (#31866) * fix cmake model path * update cmake * fix unittest * fix unittest --- paddle/fluid/inference/tests/api/CMakeLists.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 92f9c20a369..75628adbe8a 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -530,7 +530,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) set(TRT_MODEL_QUANT_RESNET_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model") - if (NOT EXISTS ${TRT_MODEL_QUANT_RESNET_DIR}/small_quant_model.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/small_quant_model.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "small_quant_model.tgz") endif() inference_analysis_test(trt_quant_int8_test SRCS trt_quant_int8_test.cc @@ -538,7 +538,7 @@ if(WITH_GPU AND TENSORRT_FOUND) ARGS --infer_model=${TRT_MODEL_QUANT_RESNET_DIR}) set(TRT_MODEL_QUANT_YOLOV3_DIR "${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware") - if (NOT EXISTS ${TRT_MODEL_QUANT_YOLOV3_DIR}/yolov3_r50_quant_aware.tgz) + if (NOT EXISTS ${INFERENCE_DEMO_INSTALL_DIR}/yolov3_r50_quant_aware.tgz) inference_download_and_uncompress(${INFERENCE_DEMO_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "yolov3_r50_quant_aware.tgz") endif() inference_analysis_test(trt_quant_int8_yolov3_r50_test SRCS trt_quant_int8_yolov3_r50_test.cc @@ -576,8 +576,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) - 
set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") endif() @@ -585,8 +584,7 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) - set(TEST_TRT_ERNIE_UNSER_FP16_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_fp16_unserialized/") - if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_FP16_MODEL}/ernie_model_4_unserialized.tgz) + if (NOT EXISTS ${TEST_TRT_ERNIE_MODEL}/ernie_model_4_fp16_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_fp16_unserialized.tgz") endif() -- GitLab From 123949eb48378262c888bf2e5aa3f2127e6bf32f Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 29 Mar 2021 17:41:31 +0800 Subject: [PATCH 087/486] [ROCM] added a cudnn switch of conv2d for rocm platform (#31836) --- paddle/fluid/platform/flags.cc | 12 +++++++ .../pybind/global_value_getter_setter.cc | 4 ++- python/paddle/fluid/__init__.py | 1 + python/paddle/fluid/layers/nn.py | 4 +++ .../fluid/tests/unittests/test_conv2d_op.py | 36 +++++++++++++++++++ python/paddle/nn/layer/conv.py | 5 +++ 6 files changed, 61 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 1a55562f2b8..fa77c0be037 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -564,3 +564,15 @@ DEFINE_string(tracer_mkldnn_ops_on, "", */ DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); + +/** + * CUDNN related FLAG + * Name: conv2d_disable_cudnn + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: Disable cudnn in conv2d. 
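+ *       (Editor's sketch, not part of the original commit: this patch also registers
+ *       the flag in python/paddle/fluid/__init__.py, so it can be enabled through the
+ *       environment variable FLAGS_conv2d_disable_cudnn=true, or toggled at runtime
+ *       with paddle.fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) as the
+ *       TestConv2DEnviron unit test added below does.)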
+ */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d"); +#endif diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index 6074d191ad2..e8ba16398d2 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -72,6 +72,7 @@ DECLARE_uint64(conv_workspace_size_limit); DECLARE_bool(cudnn_batchnorm_spatial_persistent); DECLARE_bool(cudnn_deterministic); DECLARE_bool(cudnn_exhaustive_search); +DECLARE_bool(conv2d_disable_cudnn); // data processing DECLARE_bool(enable_cublas_tensor_op_math); // device management @@ -367,7 +368,8 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_cuda_pinned_memory_to_use, FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, - FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce); + FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, + FLAGS_conv2d_disable_cudnn); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index b24da29d0f5..ae341868785 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -230,6 +230,7 @@ def __bootstrap__(): 'gpu_allocator_retry_time', 'local_exe_sub_scope_limit', 'gpu_memory_limit_mb', + 'conv2d_disable_cudnn', ] core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 00d1db19fc2..6bc69ffd5cd 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1603,6 +1603,10 @@ def conv2d(input, pre_bias = helper.create_variable_for_type_inference(dtype) + if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + use_cudnn = False + helper.append_op( type=l_type, inputs={ diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 29c35d28d4d..83bba0b0ca1 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -1465,5 +1465,41 @@ class TestConv2DAPI_Error(unittest.TestCase): self.assertRaises(ValueError, run_7) +# --------- test environment variable ------ +@unittest.skipIf( + not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), + "core is not compiled with CUDA or ROCM") +class TestConv2DEnviron(unittest.TestCase): + def run_conv2d_api(self): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + + x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.) 
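+ # (Editor's note) The nn.Conv2D layer below reads FLAGS_conv2d_disable_cudnn in its
+ # constructor (see the python/paddle/nn/layer/conv.py hunk later in this patch), so
+ # both the static-graph conv2d call above and this dygraph layer honour the flag
+ # toggled by test_environ.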
+ conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + y_var = conv(x_var) + + def test_environ(self): + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_conv2d_api() + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_conv2d_api() + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 389920b9238..d65b874c8ba 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -25,6 +25,7 @@ __all__ = [ import numpy as np +from ...fluid import get_flags from ...fluid import core from ...device import get_cudnn_version from ...fluid.dygraph import layers @@ -644,6 +645,10 @@ class Conv2D(_ConvNd): bias_attr=bias_attr, data_format=data_format) + if (core.is_compiled_with_cuda() and get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + self._use_cudnn = False + def forward(self, x): if self._padding_mode != 'zeros': x = F.pad(x, -- GitLab From 525c32e33c8023472cb8178990bbc9c2ec3f1e3c Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Mon, 29 Mar 2021 19:47:55 +0800 Subject: [PATCH 088/486] =?UTF-8?q?Fix=20bug=20of=20set=5Fvalue=20op?= =?UTF-8?q?=EF=BC=9ADecerease=20axes=20to=20do=20right=20broadcast=20(#318?= =?UTF-8?q?75)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/fluid/operators/set_value_op.cc | 11 ++- paddle/fluid/operators/set_value_op.h | 72 ++++++++++++++++--- python/paddle/fluid/framework.py | 11 ++- .../tests/unittests/test_set_value_op.py | 14 ++++ 4 files changed, 95 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 94d34c648d1..105d61015fc 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -124,6 +124,9 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { AddAttr>( "steps", "(list) Stride step from the start to the end.") .SetDefault({}); + AddAttr>("decrease_axes", + "(list) The axes to decrease.") + .SetDefault({}); AddAttr>("bool_values", "Store the bool values.") .SetDefault({}); @@ -185,4 +188,10 @@ Upgrade set_value, add 3 inputs [StartsTensorList, EndsTensorList, StepsTensorLi "Ending indices of corresponding axis in `axes`.", std::vector{}) .NewAttr("steps", "Stride step from the start to the end.", - std::vector{})); + std::vector{})) + .AddCheckpoint( + R"ROC( +Upgrade set_value, add 1 attribute [decrease_axes]. 
+ )ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "decrease_axes", "The axes to decrease.", std::vector{})); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 325a2b0b865..eca51147f81 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -106,10 +106,10 @@ inline void CheckAndUpdateSlice(const framework::DDim in_dims, } inline framework::DDim GetSliceDims(const framework::DDim in_dims, - const std::vector axes, - const std::vector starts, - const std::vector ends, - const std::vector steps) { + const std::vector& axes, + const std::vector& starts, + const std::vector& ends, + const std::vector& steps) { framework::DDim slice_dims(in_dims); for (size_t i = 0; i < axes.size(); ++i) { @@ -127,6 +127,38 @@ inline framework::DDim GetSliceDims(const framework::DDim in_dims, return slice_dims; } +inline framework::DDim GetDecreasedDims( + const framework::DDim slice_dims, + const std::vector& decrease_axes) { + // Get dims after decreasing axes. + framework::DDim decreased_dims(slice_dims); + if (decrease_axes.size() > 0) { + for (size_t i = 0; i < decrease_axes.size(); ++i) { + int64_t axis = decrease_axes[i]; + PADDLE_ENFORCE_EQ( + decreased_dims[axis], 1, + platform::errors::InvalidArgument("decrease dim should be 1")); + decreased_dims[axis] = 0; + } + + std::vector new_shape; + for (int i = 0; i < decreased_dims.size(); ++i) { + if (decreased_dims[i] != 0) { + new_shape.push_back(decreased_dims[i]); + } + } + + // NOTE(liym27): Paddle does not support that the rank of Tensor is 0, and + // uses [1] instead. + if (new_shape.size() == 0) { + new_shape.push_back(1); + } + + decreased_dims = framework::make_ddim(new_shape); + } + return decreased_dims; +} + template class SetValueKernel : public framework::OpKernel { public: @@ -179,6 +211,7 @@ class SetValueKernel : public framework::OpKernel { auto ends = ctx.Attr>("ends"); auto steps = ctx.Attr>("steps"); auto shape = ctx.Attr>("shape"); + auto decrease_axes = ctx.Attr>("decrease_axes"); auto dtype = in->type(); if (!starts_tensor_list.empty()) { @@ -194,6 +227,7 @@ class SetValueKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSlice(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, steps); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); auto place = ctx.GetPlace(); auto& eigen_place = @@ -212,13 +246,13 @@ class SetValueKernel : public framework::OpKernel { // set_value is what we want. TensorCopy(*in, place, out); - Tensor slice_t(dtype), pad_t(dtype); - slice_t.mutable_data(slice_dims, place); - pad_t.mutable_data(in_dims, place); + Tensor slice_tensor(dtype), pad_tensor(dtype); + slice_tensor.mutable_data(slice_dims, place); + pad_tensor.mutable_data(in_dims, place); - auto pad_e = framework::EigenTensor::From(pad_t, in_dims); + auto pad_e = framework::EigenTensor::From(pad_tensor, in_dims); auto out_e = framework::EigenTensor::From(*out); - auto slice_e = framework::EigenTensor::From(slice_t, slice_dims); + auto slice_e = framework::EigenTensor::From(slice_tensor, slice_dims); // Step 1: Set the value of out at `_index` to zero slice_e.device(eigen_place) = slice_e.constant(T(0)); @@ -244,11 +278,26 @@ class SetValueKernel : public framework::OpKernel { // Step 2: Set a tensor with the same shape as out tensor. 
And its data at // '_index' is the same as value_tensor, and data out of '_index' to zero + // - Step 2.1 Set slice tensor with value + + // NOTE(liym27): [ Why resize slice_tensor here? ] + // A: When do broadcasting on slice_tensor and value_tensor, the shape of + // slice_tensor should be decreased dims. + // e.g. + // x[:,0] = value_tensor + // x's shape = [3, 4], value_tensor's shape = [3] + // We get slice_dims = [3, 1], decrease_slice_dims = [3] + // If do broadcasting on Tensor with shape [3, 1] and [3], the result's + // shape is [3, 3], which cross the border; + // If do broadcasting on Tensor with shape [3] and [3], the result's shape + // is [3], which is right. + + slice_tensor.Resize(decrease_slice_dims); if (value_tensor != nullptr) { // ElementwiseComputeEx can do broadcasting ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, value_tensor, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { Tensor value_t(dtype); auto value_dims = framework::make_ddim(shape); @@ -257,8 +306,9 @@ class SetValueKernel : public framework::OpKernel { CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); ElementwiseComputeEx, DeviceContext, T>( - ctx, &slice_t, &value_t, -1, SubFunctor(), &slice_t); + ctx, &slice_tensor, &value_t, -1, SubFunctor(), &slice_tensor); } + slice_tensor.Resize(slice_dims); // - Step 2.2 Pad slice tensor with 0 pad_e.device(eigen_place) = pad_e.constant(T(0)); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index db487128bbe..18162059e99 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1863,6 +1863,7 @@ class Variable(object): if not isinstance(item, tuple): item = [item] + decrease_axes = [] axes = [] starts = [] ends = [] @@ -1933,15 +1934,23 @@ class Variable(object): if end is None: end = max_integer if step > 0 else (0 - max_integer) else: + decrease_axes.append(dim) start = slice_item end = slice_item + 1 if slice_item != -1 else max_integer step = 1 + axes.append(dim) starts.append(start) ends.append(end) steps.append(step) - attrs = {'axes': axes, 'starts': starts, 'ends': ends, 'steps': steps} + attrs = { + 'axes': axes, + 'starts': starts, + 'ends': ends, + 'steps': steps, + 'decrease_axes': decrease_axes + } from .layers import utils if utils._contain_var(starts): diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 23dac41f64a..1239a2249cc 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -671,6 +671,20 @@ class TestSetValueValueShape4(TestSetValueApi): self.data[0] = self.value +class TestSetValueValueShape5(TestSetValueApi): + def set_value(self): + self.value = np.array([3, 3, 3]).astype(self.dtype) + + def set_shape(self): + self.shape = [3, 4] + + def _call_setitem(self, x): + x[:, 0] = paddle.assign(self.value) # x is Paddle.Tensor + + def _get_answer(self): + self.data[:, 0] = self.value + + # 4. 
Test error class TestError(TestSetValueBase): def _value_type_error(self): -- GitLab From b48841ba2e7335eaa435a54436ed580d4aef001c Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 29 Mar 2021 19:53:12 +0800 Subject: [PATCH 089/486] modify API nn.Bilinear's doc (#31889) * modify API nn.Bilinear's doc, test=develop * modify API nn.Bilinear's doc, test=develop --- python/paddle/nn/layer/common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index d0f97625bcb..60c846f9f76 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -578,7 +578,7 @@ class Bilinear(layers.Layer): .. math:: - out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,size-1 + out_{i} = x1 * W_{i} * {x2^\mathrm{T}}, i=0,1,...,outfeatures-1 out = out + b @@ -586,7 +586,7 @@ class Bilinear(layers.Layer): - :math:`x1`: the first input contains in1_features elements, shape is [batch_size, in1_features]. - :math:`x2`: the second input contains in2_features elements, shape is [batch_size, in2_features]. - :math:`W_{i}`: the i-th learned weight, shape is [in1_features, in2_features], and learned weight's shape is [out_features, in1_features, in2_features]. - - :math:`out_{i}`: the i-th element of out, shape is [batch_size, out_features]. + - :math:`out_{i}`: the i-th element of out, shape is [batch_size], and out's shape is [batch_size, out_features]. - :math:`b`: the learned bias, shape is [1, out_features]. - :math:`x2^\mathrm{T}`: the transpose of :math:`x2`. -- GitLab From 8829a309fe056dfecd472f19050c390fd049fead Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 29 Mar 2021 20:11:26 +0800 Subject: [PATCH 090/486] Delete cudnn6 code (#31835) --- paddle/fluid/operators/conv_cudnn_op_cache.h | 5 --- paddle/fluid/operators/cudnn_lstm_cache.h | 10 +----- paddle/fluid/operators/cudnn_rnn_cache.h | 7 ---- paddle/fluid/platform/cudnn_helper.h | 38 -------------------- 4 files changed, 1 insertion(+), 59 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op_cache.h b/paddle/fluid/operators/conv_cudnn_op_cache.h index ddddb7f8641..23a471cfa00 100644 --- a/paddle/fluid/operators/conv_cudnn_op_cache.h +++ b/paddle/fluid/operators/conv_cudnn_op_cache.h @@ -40,11 +40,6 @@ static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT; static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; -#else -// cuDNN v5 has no CUDNN_CONVOLUTION_FWD_ALGO_COUNT etc. -static constexpr size_t kNUM_CUDNN_FWD_ALGS = 7; -static constexpr size_t kNUM_CUDNN_BWD_FILTER_ALGS = 4; -static constexpr size_t kNUM_CUDNN_BWD_DATA_ALGS = 5; #endif } // namespace operators diff --git a/paddle/fluid/operators/cudnn_lstm_cache.h b/paddle/fluid/operators/cudnn_lstm_cache.h index 3181e4b1d99..b7859237e73 100644 --- a/paddle/fluid/operators/cudnn_lstm_cache.h +++ b/paddle/fluid/operators/cudnn_lstm_cache.h @@ -85,20 +85,12 @@ class ScopedRNNBase { dropout_desc_.descriptor(handle, place, initialized_, dropout_prob_, dropout_state, seed_, state_size); -// ------------------- cudnn rnn descriptors --------------------- -#if CUDNN_VERSION >= 6000 + // ------------------- cudnn rnn descriptors --------------------- PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), CUDNN_LINEAR_INPUT, is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif #if CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { diff --git a/paddle/fluid/operators/cudnn_rnn_cache.h b/paddle/fluid/operators/cudnn_rnn_cache.h index 13a3e7d09b9..a6a23a91c76 100644 --- a/paddle/fluid/operators/cudnn_rnn_cache.h +++ b/paddle/fluid/operators/cudnn_rnn_cache.h @@ -168,18 +168,11 @@ struct CudnnRNNCache { PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); -#if CUDNN_VERSION >= 6000 PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, cudnn_type)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( - rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, - cudnn_type)); -#endif PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index af0df2efc5e..6c3c96b68c4 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -91,30 +91,6 @@ enum class ActivationMode { kBandPass, }; -#if CUDNN_VERSION < 6000 -#pragma message "CUDNN version under 6.0 is supported at best effort." -#pragma message "We strongly encourage you to move to 6.0 and above." -#pragma message "This message is intended to annoy you enough to update." -#pragma message \ - "please see https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/" - -inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { - switch (mode) { - case PoolingMode::kMaximumDeterministic: - return CUDNN_POOLING_MAX; - case PoolingMode::kAverageExclusive: - return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - case PoolingMode::kAverageInclusive: - return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - case PoolingMode::kMaximum: - return CUDNN_POOLING_MAX; - default: - PADDLE_THROW( - platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); - } -} -#else - inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { switch (mode) { case PoolingMode::kMaximumDeterministic: @@ -130,7 +106,6 @@ inline cudnnPoolingMode_t GetPoolingMode(const PoolingMode& mode) { platform::errors::Unimplemented("Unexpected CUDNN pooling mode.")); } } -#endif // CUDNN_VERSION < 6000 inline ActivationMode StringToActivationMode(const std::string& str) { if (str == "identity") { @@ -471,19 +446,6 @@ class ScopedConvolutionDescriptor { "of pads is %d, size of dilations is %d.", pads.size(), dilations.size())); -#if !CUDNN_VERSION_MIN(6, 0, 0) - // cudnn v5 does not support dilation conv, the argument is called upscale - // instead of dilations and it is must be one. 
- for (size_t i = 0; i < dilations.size(); ++i) { - PADDLE_ENFORCE_EQ(dilations[i], 1, - platform::errors::InvalidArgument( - "Dilations conv is not supported in this cuDNN " - "version(%d.%d.%d).", - CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100, - CUDNN_VERSION % 100)); - } -#endif - cudnnDataType_t compute_type = (type == CUDNN_DATA_DOUBLE) ? CUDNN_DATA_DOUBLE : CUDNN_DATA_FLOAT; PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetConvolutionNdDescriptor( -- GitLab From a71d72d921fc861051553c6d44b32bc9037706bc Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 29 Mar 2021 20:30:37 +0800 Subject: [PATCH 091/486] relu forward and backward with vectortype (#31869) --- paddle/fluid/operators/activation_op.cu | 286 +++++++++++++++++++++++- 1 file changed, 285 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 2033081af22..c6d2fbccd8e 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,8 +10,278 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using float16 = paddle::platform::float16; + +template +struct CudaVecType { + using type = T; + static constexpr int vecsize = 1; +}; + +template <> +struct CudaVecType { + using type = __half2; + static constexpr int vecsize = 2; +}; + +template <> +struct CudaVecType { + using type = float4; + static constexpr int vecsize = 4; +}; + +template +class BaseGPUFunctor { + public: + using ELEMENT_TYPE = T; +}; + +/* ========================================================================== */ + +/* =========================== relu forward ============================ */ +template +class ReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu forward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* x); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T x) { + return x > zero_ ? x : zero_; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* x) { +// relu forward : out = max(x, 0) +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(x) > zero_ ? __ldg(x) : zero_; +#else + return (*x) > zero_ ? 
(*x) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* xx) { + // relu forward : out = max(xx, 0) + return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), + (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGPUFunctor::Compute(const CudaVecType::type* in) { +// relu forward : out = max(in, 0) +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); +#else + const float2 xx = __half22float2(*in); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), + (xx.y > 0.0f) * static_cast(xx.y)); +#endif +} +/* ========================================================================== */ + +/* =========================== relu backward ============================ + */ + +template +class ReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + + public: + ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + // for relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type* out, + const typename CudaVecType::type* dout); + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { + // relu backward : dx = out > 0 ? dout : 0 + return out > zero_ ? dout : zero_; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? dout : 0; +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + return __ldg(out) > zero_ ? __ldg(dout) : zero_; +#else + return (*out) > zero_ ? (*dout) : zero_; +#endif +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { + // relu backward : dx = out > 0 ? dout : 0; + return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), + (out->z > zero_) * (dout->z), + (out->w > zero_) * (dout->w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type +ReluGradGPUFunctor::Compute(const CudaVecType::type* out, + const CudaVecType::type* dout) { +// relu backward : dx = out > 0 ? 
dout : 0; +#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const half2 kzero = __float2half2_rn(0.0f); + return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); +#else + const float2 xx = __half22float2(*out); + const float2 yy = __half22float2(*dout); + return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), + (xx.y > 0.0f) * static_cast(yy.y)); +#endif +} + +/* ========================================================================== */ + +template +__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, + T* dx, int num, Functor functor) { + using VecType = typename CudaVecType::type; + constexpr int vecsize = CudaVecType::vecsize; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in_forward = reinterpret_cast(forward_data); + const VecType* in_dout = reinterpret_cast(dout); + VecType* out = reinterpret_cast(dx); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in_forward + i), (in_dout + i)); + } + + while (idx == loop && tail) { + dx[num - tail] = + functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + --tail; + } +} + +template +__global__ void ActivationkernelVec(const T* src, T* dst, int num, + Functor functor) { + constexpr int vecsize = CudaVecType::vecsize; + using VecType = typename CudaVecType::type; + int idx = threadIdx.x + blockIdx.x * blockDim.x; + int stride = blockDim.x * gridDim.x; + int loop = num / vecsize; + int tail = num % vecsize; + const VecType* in = reinterpret_cast(src); + VecType* out = reinterpret_cast(dst); + + for (int i = idx; i < loop; i += stride) { + out[i] = functor.Compute((in + i)); + } + + while (idx == loop && tail) { + dst[num - tail] = functor.ComputeRemainder(src[num - tail]); + --tail; + } +} + +template +class ActivationGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = nullptr; + framework::Tensor* out = nullptr; + ExtractActivationTensor(context, &in_x, &out); + auto& dev_ctx = context.template device_context(); + + int num = in_x->numel(); + const T* input_data = in_x->data(); + T* output_data = out->mutable_data(dev_ctx.GetPlace(), + static_cast(num * sizeof(T))); + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((num / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationkernelVec<<>>( + input_data, output_data, num, functor); + } +}; + +template +class ActivationGradGPUKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor *x, *out, *d_out; + framework::Tensor* d_x = nullptr; + x = out = d_out = nullptr; + ExtractActivationGradTensor(context, &x, &out, &d_out, + &d_x); + int numel = d_out->numel(); + auto& dev_ctx = context.template device_context(); + auto* dx_data = d_x->mutable_data( + dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); + auto* dout_data = d_out->data(); + + auto* forward_data = dout_data; + if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { + // Only need forward output Out + forward_data = out->data(); + } else if (static_cast(Functor::FwdDeps()) == + static_cast(kDepX)) { + 
// Only need forward input X + forward_data = x->data(); + } + + int block = 512; +#ifdef __HIPCC__ + block = 256; +#endif + Functor functor; + constexpr int vecsize = CudaVecType::vecsize; + int grid = max((numel / vecsize + block - 1) / block, 1); + auto stream = context.cuda_device_context().stream(); + ActivationGradKernelVec<<>>( + forward_data, dout_data, dx_data, numel, functor); + } +}; + +} // namespace operators +} // namespace paddle + namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -60,7 +330,21 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, ReluCUDAFunctor, ReluGradFunctor); +REGISTER_OP_CUDA_KERNEL( + relu, ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>, + ops::ActivationGPUKernel>); + +REGISTER_OP_CUDA_KERNEL( + relu_grad, ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>, + ops::ActivationGradGPUKernel>); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, -- GitLab From 17030ff28b9a54bb57779e9b8448a6d222110ec5 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Tue, 30 Mar 2021 08:45:06 +0800 Subject: [PATCH 092/486] fix op benchmark ci error caused by missing test_pr branch, test=document_fix (#31920) --- tools/test_op_benchmark.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/test_op_benchmark.sh b/tools/test_op_benchmark.sh index f0937ca7dfa..95e9164bd1b 100644 --- a/tools/test_op_benchmark.sh +++ b/tools/test_op_benchmark.sh @@ -187,7 +187,7 @@ function run_op_benchmark_test { done # install tensorflow for testing accuary pip install tensorflow==2.3.0 tensorflow-probability - for branch_name in "develop" "test_pr" + for branch_name in "develop" "test" do git checkout $branch_name [ $? -ne 0 ] && LOG "[FATAL] Missing branch ${branch_name}." 
&& exit 7 -- GitLab From c4b60efabde5351681e8f7f724e4e0f9ecce6808 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Tue, 30 Mar 2021 09:52:54 +0800 Subject: [PATCH 093/486] Fix segment Fault from set_value (#31891) * Avoid raising warning while import paddle * fix segment fault of set_value * fix code style --- python/paddle/fluid/framework.py | 3 ++- .../fluid/tests/unittests/test_set_value_op.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 18162059e99..b87c2eb388a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2006,7 +2006,8 @@ class Variable(object): "paddle.Tensor to a paddle.Tensor, but received {}".format( type(value))) - self.block.append_op( + cur_block = default_main_program().current_block() + cur_block.append_op( type="set_value", inputs=inputs, outputs={'Out': self}, attrs=attrs) return self diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 1239a2249cc..808d77d4761 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -106,6 +106,23 @@ class TestSetValueItemSlice4(TestSetValueApi): self.data[0:, 1:2, :] = self.value +class TestSetValueItemSliceInWhile(TestSetValueApi): + def _call_setitem(self, x): + def cond(i, x): + return i < 1 + + def body(i, x): + x[i] = self.value + i = i + 1 + return i, x + + i = paddle.zeros(shape=(1, ), dtype='int32') + i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) + + def _get_answer(self): + self.data[0] = self.value + + # 1.2.2 step > 1 class TestSetValueItemSliceStep(TestSetValueApi): def set_shape(self): -- GitLab From 64ee255ffda2cc8187e3caf738f58c917b75939f Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 30 Mar 2021 10:13:49 +0800 Subject: [PATCH 094/486] [Paddle-TRT] yolobox (#31755) * yolobox converter and plugin * yolobox unittest * add dynamic shape restriction * fix git merge log --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/yolo_box_op.cc | 79 ++++ paddle/fluid/inference/tensorrt/op_teller.cc | 10 + .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/yolo_box_op_plugin.cu | 404 ++++++++++++++++++ .../tensorrt/plugin/yolo_box_op_plugin.h | 117 +++++ .../ir/inference/test_trt_yolo_box_op.py | 76 ++++ 8 files changed, 689 insertions(+) create mode 100644 paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7bb092d0e3c..21ef3b2312f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(yolo_box); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 
bc7b7355ea1..3f792300942 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + yolo_box_op.cc roi_align_op.cc affine_channel_op.cc multiclass_nms_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc new file mode 100644 index 00000000000..2d12eaf736b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class YoloBoxOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid yolo box op to tensorrt plugin"; + + framework::OpDesc op_desc(op, nullptr); + std::string X = op_desc.Input("X").front(); + std::string img_size = op_desc.Input("ImgSize").front(); + + auto* X_tensor = engine_->GetITensor(X); + auto* img_size_tensor = engine_->GetITensor(img_size); + + int class_num = BOOST_GET_CONST(int, op_desc.GetAttr("class_num")); + std::vector anchors = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + + int downsample_ratio = + BOOST_GET_CONST(int, op_desc.GetAttr("downsample_ratio")); + float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); + bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); + float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + + int type_id = static_cast(engine_->WithFp16()); + auto input_dim = X_tensor->getDimensions(); + auto* yolo_box_plugin = new plugin::YoloBoxPlugin( + type_id ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, + anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, + input_dim.d[1], input_dim.d[2]); + + std::vector yolo_box_inputs; + yolo_box_inputs.push_back(X_tensor); + yolo_box_inputs.push_back(img_size_tensor); + + auto* yolo_box_layer = engine_->network()->addPluginV2( + yolo_box_inputs.data(), yolo_box_inputs.size(), *yolo_box_plugin); + + std::vector output_names; + output_names.push_back(op_desc.Output("Boxes").front()); + output_names.push_back(op_desc.Output("Scores").front()); + + RreplenishLayerAndOutput(yolo_box_layer, "yolo_box", output_names, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(yolo_box, YoloBoxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 7c1b2e8001e..c95912a931e 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -111,6 +111,7 @@ struct SimpleOpTypeSetTeller : public Teller { "flatten2", "flatten", "gather", + "yolo_box", "roi_align", "affine_channel", "multiclass_nms", @@ -198,6 +199,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false; } + if (op_type == "yolo_box") { + if (with_dynamic_shape) return false; + bool has_attrs = + (desc.HasAttr("class_num") && desc.HasAttr("anchors") && + desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && + desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); + return has_attrs; + } + if (op_type == "affine_channel") { if (!desc.HasAttr("data_layout")) return false; auto data_layout = framework::StringToDataLayout( diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 4107f9ef674..b4e948edd8a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + yolo_box_op_plugin.cu roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu new file mode 100644 index 00000000000..e1b4c898d21 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -0,0 +1,404 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
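+// (Editor's summary, not part of the original commit) This plugin is the TensorRT
+// counterpart of paddle/fluid/operators/detection/yolo_box_op.h: KeYoloBoxFw decodes
+// each anchor's raw network output into an (optionally clipped) [x1, y1, x2, y2] box
+// plus per-class scores, zeroing out boxes whose objectness score falls below
+// conf_thresh.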
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" +#include "paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h" +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, + const int class_num, const float conf_thresh, + const int downsample_ratio, const bool clip_bbox, + const float scale_x_y, const int input_h, + const int input_w) + : data_type_(data_type), + class_num_(class_num), + conf_thresh_(conf_thresh), + downsample_ratio_(downsample_ratio), + clip_bbox_(clip_bbox), + scale_x_y_(scale_x_y), + input_h_(input_h), + input_w_(input_w) { + anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); + assert(data_type_ == nvinfer1::DataType::kFLOAT || + data_type_ == nvinfer1::DataType::kHALF); + assert(class_num_ > 0); + assert(input_h_ > 0); + assert(input_w_ > 0); + + cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); + cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), + cudaMemcpyHostToDevice); +} + +YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchors_); + DeserializeValue(&data, &length, &class_num_); + DeserializeValue(&data, &length, &conf_thresh_); + DeserializeValue(&data, &length, &downsample_ratio_); + DeserializeValue(&data, &length, &clip_bbox_); + DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &input_h_); + DeserializeValue(&data, &length, &input_w_); +} + +YoloBoxPlugin::~YoloBoxPlugin() { + if (anchors_device_ != nullptr) { + cudaFree(anchors_device_); + anchors_device_ = nullptr; + } +} + +const char* YoloBoxPlugin::getPluginType() const { return "yolo_box_plugin"; } + +const char* YoloBoxPlugin::getPluginVersion() const { return "1"; } + +int YoloBoxPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims YoloBoxPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nb_input_dims) { + const int anchor_num = anchors_.size() / 2; + const int box_num = inputs[0].d[1] * inputs[0].d[2] * anchor_num; + + assert(index <= 1); + + if (index == 0) { + return nvinfer1::Dims2(box_num, 4); + } + return nvinfer1::Dims2(box_num, class_num_); +} + +bool YoloBoxPlugin::supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const { + return ((type == data_type_ || type == nvinfer1::DataType::kINT32) && + format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t YoloBoxPlugin::getWorkspaceSize(int max_batch_size) const { return 0; } + +template +__device__ inline T sigmoid(T x) { + return 1. / (1. 
+ exp(-x)); +} + +template <> +__device__ inline float sigmoid(float x) { + return 1.f / (1.f + expf(-x)); +} + +template +__device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, + int i, int j, int an_idx, int grid_size_h, + int grid_size_w, int input_size_h, + int input_size_w, int index, int stride, + int img_height, int img_width, float scale, + float bias) { + box[0] = static_cast( + (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + grid_size_w); + box[1] = static_cast( + (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + img_height / grid_size_h); + box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * + anchors[2 * an_idx] * img_width / input_size_w); + box[3] = + static_cast(expf(static_cast(x[index + 3 * stride])) * + anchors[2 * an_idx + 1] * img_height / input_size_h); +} + +__device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +__device__ inline void CalcDetectionBox(T* boxes, const float* box, + const int box_idx, const int img_height, + const int img_width, bool clip_bbox) { + float tmp_box_0, tmp_box_1, tmp_box_2, tmp_box_3; + tmp_box_0 = box[0] - box[2] / 2; + tmp_box_1 = box[1] - box[3] / 2; + tmp_box_2 = box[0] + box[2] / 2; + tmp_box_3 = box[1] + box[3] / 2; + + if (clip_bbox) { + tmp_box_0 = max(tmp_box_0, 0.f); + tmp_box_1 = max(tmp_box_1, 0.f); + tmp_box_2 = min(tmp_box_2, static_cast(img_width - 1)); + tmp_box_3 = min(tmp_box_3, static_cast(img_height - 1)); + } + + boxes[box_idx + 0] = static_cast(tmp_box_0); + boxes[box_idx + 1] = static_cast(tmp_box_1); + boxes[box_idx + 2] = static_cast(tmp_box_2); + boxes[box_idx + 3] = static_cast(tmp_box_3); +} + +template +__device__ inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const float conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = static_cast( + conf * sigmoid(static_cast(input[label_idx + i * stride]))); + } +} + +template +__global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, + T* boxes, T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, + const int box_num, int input_size_h, + int input_size_w, bool clip_bbox, const float scale, + const float bias) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + float box[4]; + for (; tid < n * box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = (5 + class_num) * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + float conf = sigmoid(static_cast(input[obj_idx])); + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + + if (conf < conf_thresh) { + for (int i = 0; i < 4; ++i) { + box[i] = 0.f; + } + } else { + GetYoloBox(box, input, anchors, l, k, j, h, w, input_size_h, + input_size_w, box_idx, grid_num, img_height, img_width, + scale, bias); + } + + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); + + int label_idx = + GetEntryIndex(i, 
j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } +} + +template +int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int n = batch_size; + const int h = input_h_; + const int w = input_w_; + const int an_num = anchors_.size() / 2; + const int box_num = h * w * an_num; + int input_size_h = downsample_ratio_ * h; + int input_size_w = downsample_ratio_ * w; + + float bias = -0.5 * (scale_x_y_ - 1.); + constexpr int threads = 256; + + KeYoloBoxFw<<<(n * box_num + threads - 1) / threads, threads, 0, stream>>>( + reinterpret_cast(inputs[0]), + reinterpret_cast(inputs[1]), + reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), + conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + return cudaGetLastError() != cudaSuccess; +} + +int YoloBoxPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + if (data_type_ == nvinfer1::DataType::kFLOAT) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } else if (data_type_ == nvinfer1::DataType::kHALF) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); + } + assert("unsupported type."); +} + +int YoloBoxPlugin::initialize() { return 0; } + +void YoloBoxPlugin::terminate() {} + +size_t YoloBoxPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchors_); + serialize_size += SerializedSize(class_num_); + serialize_size += SerializedSize(conf_thresh_); + serialize_size += SerializedSize(downsample_ratio_); + serialize_size += SerializedSize(clip_bbox_); + serialize_size += SerializedSize(scale_x_y_); + serialize_size += SerializedSize(input_h_); + serialize_size += SerializedSize(input_w_); + return serialize_size; +} + +void YoloBoxPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchors_); + SerializeValue(&buffer, class_num_); + SerializeValue(&buffer, conf_thresh_); + SerializeValue(&buffer, downsample_ratio_); + SerializeValue(&buffer, clip_bbox_); + SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, input_h_); + SerializeValue(&buffer, input_w_); +} + +void YoloBoxPlugin::destroy() { + cudaFree(anchors_device_); + delete this; +} + +void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType YoloBoxPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool YoloBoxPlugin::isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const { + return false; +} + +bool YoloBoxPlugin::canBroadcastInputAcrossBatch(int input_index) const { + return false; +} + +void YoloBoxPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int 
max_batct_size) {} + +nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const { + return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, + downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, + input_w_); +} + +YoloBoxPluginCreator::YoloBoxPluginCreator() {} + +void YoloBoxPluginCreator::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* YoloBoxPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* YoloBoxPluginCreator::getPluginName() const { + return "yolo_box_plugin"; +} + +const char* YoloBoxPluginCreator::getPluginVersion() const { return "1"; } + +const nvinfer1::PluginFieldCollection* YoloBoxPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + + int type_id = -1; + std::vector anchors; + int class_num = -1; + float conf_thresh = 0.01; + int downsample_ratio = 32; + bool clip_bbox = true; + float scale_x_y = 1.; + int h = -1; + int w = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchors")) { + const int length = fc->fields[i].length; + const int* data = static_cast(fc->fields[i].data); + anchors.insert(anchors.end(), data, data + length); + } else if (field_name.compare("class_num")) { + class_num = *static_cast(fc->fields[i].data); + } else if (field_name.compare("conf_thresh")) { + conf_thresh = *static_cast(fc->fields[i].data); + } else if (field_name.compare("downsample_ratio")) { + downsample_ratio = *static_cast(fc->fields[i].data); + } else if (field_name.compare("clip_bbox")) { + clip_bbox = *static_cast(fc->fields[i].data); + } else if (field_name.compare("scale_x_y")) { + scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("h")) { + h = *static_cast(fc->fields[i].data); + } else if (field_name.compare("w")) { + w = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + + return new YoloBoxPlugin( + type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); +} + +nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new YoloBoxPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h new file mode 100644 index 00000000000..8ca21da7ae0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit YoloBoxPlugin(const nvinfer1::DataType data_type, + const std::vector& anchors, const int class_num, + const float conf_thresh, const int downsample_ratio, + const bool clip_bbox, const float scale_x_y, + const int input_h, const int input_w); + YoloBoxPlugin(const void* data, size_t length); + ~YoloBoxPlugin() override; + + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + nvinfer1::DataType data_type_; + std::vector anchors_; + int* anchors_device_; + int class_num_; + float conf_thresh_; + int downsample_ratio_; + bool clip_bbox_; + float scale_x_y_; + int input_h_; + int input_w_; + std::string namespace_; +}; + +class YoloBoxPluginCreator : public nvinfer1::IPluginCreator { + public: + YoloBoxPluginCreator(); + ~YoloBoxPluginCreator() override = default; + + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + 
nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + +REGISTER_TRT_PLUGIN_V2(YoloBoxPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py new file mode 100644 index 00000000000..cff8091cd93 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py @@ -0,0 +1,76 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTYoloBoxTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + image_shape = [self.bs, self.channel, self.height, self.width] + image = fluid.data(name='image', shape=image_shape, dtype='float32') + image_size = fluid.data( + name='image_size', shape=[self.bs, 2], dtype='int32') + boxes, scores = self.append_yolobox(image, image_size) + scores = fluid.layers.reshape(scores, (self.bs, -1)) + out = fluid.layers.batch_norm(scores, is_test=True) + + self.feeds = { + 'image': np.random.random(image_shape).astype('float32'), + 'image_size': np.random.randint( + 32, 64, size=(self.bs, 2)).astype('int32'), + } + self.enable_trt = True + self.trt_parameters = TRTYoloBoxTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out, boxes] + + def set_params(self): + self.bs = 4 + self.channel = 255 + self.height = 64 + self.width = 64 + self.class_num = 80 + self.anchors = [10, 13, 16, 30, 33, 23] + self.conf_thresh = .1 + self.downsample_ratio = 32 + + def append_yolobox(self, image, image_size): + return fluid.layers.yolo_box( + x=image, + img_size=image_size, + class_num=self.class_num, + anchors=self.anchors, + conf_thresh=self.conf_thresh, + downsample_ratio=self.downsample_ratio) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 8084b7594ba3c083d65b69737a8114e150d7541f Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Tue, 30 Mar 2021 10:15:32 +0800 Subject: [PATCH 095/486] fix batchnorm when inpu dims < 3 (#31933) * fix batchnorm when inpu dims < 3 * add unittest for batchnorm dims = 2 --- .../tensorrt/convert/batch_norm_op.cc | 42 
++++++++++++++++--- .../ir/inference/test_trt_scale_op.py | 28 +++++++++++++ 2 files changed, 65 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 26cd7b22d2b..a6484a13557 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -158,17 +158,49 @@ class BatchNormOpConverter : public OpConverter { TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::IScaleLayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Scale, *const_cast(X), - nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0; + nvinfer1::ILayer* layer = nullptr; + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + auto x_dim = X->getDimensions(); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims expand_shape; + expand_shape.nbDims = 3 + dynamic_shape_offset; + for (int i = 0; i < 3 + dynamic_shape_offset; i++) { + if (i < x_dim.nbDims) { + expand_shape.d[i] = x_dim.d[i] < 0 ? 0 : x_dim.d[i]; + } else { + expand_shape.d[i] = 1; + } + } + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + expand_layer->setReshapeDimensions(expand_shape); + X = expand_layer->getOutput(0); + } + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), + scale_weights.get(), power_weights.get()); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), std::move(combile_bias_tensor)); engine_->SetWeights(op_desc.Input("Scale").front(), std::move(combile_scale_tensor)); - RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); + if (x_dim.nbDims < 3 + dynamic_shape_offset) { + nvinfer1::Dims squeeze_shape; + squeeze_shape.nbDims = x_dim.nbDims; + for (int i = 0; i < squeeze_shape.nbDims; i++) { + squeeze_shape.d[i] = x_dim.d[i] < 0 ? 
0 : x_dim.d[i]; + } + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + squeeze_layer->setReshapeDimensions(squeeze_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "batchnorm_add_scale", {output_name}, + test_mode); } }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py index 67a1253b2cd..4530e04d8de 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_scale_op.py @@ -48,5 +48,33 @@ class TRTScaleTest(InferencePassTest): PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) +class TRTScaleShape2Test(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 512, 512], dtype="float32") + scale_out = self.append_scale(data) + out = fluid.layers.batch_norm(scale_out, is_test=True) + + self.feeds = { + "data": np.random.random([1, 512, 512]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TRTScaleShape2Test.TensorRTParam( + 1 << 30, 32, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def append_scale(self, data): + return fluid.layers.scale( + x=data, scale=2.0, bias=-1.0, bias_after_scale=False) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() -- GitLab From 73a6fa3ed0fe2bbbfe72c05f42faabccd3bbadb7 Mon Sep 17 00:00:00 2001 From: chajchaj <57249073+chajchaj@users.noreply.github.com> Date: Tue, 30 Mar 2021 10:33:10 +0800 Subject: [PATCH 096/486] add deprecated for softmax_with_cross_entropy (#31722) * add deprecated for softmax_with_cross_entropy, test=develop * test for deprecated in english doc, test=develop * test deprecated for softmax_with_cross_entropy in english doc, test=develop * fix readme and English doc for cross_entropy, test=develop * rm test for softmax_with_cross_entropy deprecated, test=develop * update readme for CrossEntropyLoss, test=develop * fix readme format, test=develop * fix readme format, test=develop * fix readme format for cross_entropy, test=develop * add softmax_switch and fix softlabel for cross_entropy, test=develop * 1)recovery softmax_with_cross_entropy in fluid 2) change softmax_switch to use_softmax 3) add example for softlabel for cross_entropy, test=develop * fix Example number for cross_entropy, test=develop * fix code format, test=develop * fix for CI-Coverage, test=develop * fix for CI-Coverage, test=develop * fix ci-coverage for Non-ASCII character '\xe2' in file, test=develop * fix ci-coverage for Non-ASCII character '\xe2' in nn.layer.loss.py, test=develop * update description for doc when use_softmax=Fasle, test=develop * fix some docs and code example for cross_entropy, test=develop * delete redundant description for soft_label parameter of cross_entropy, test=develop * fix some comment for test_cross_entropy_loss.py, test=develop --- .../unittests/test_cross_entropy_loss.py | 638 +++++++++++++++++- python/paddle/nn/functional/loss.py | 382 ++++++++--- python/paddle/nn/layer/loss.py | 273 ++++++-- 3 files changed, 1155 insertions(+), 138 deletions(-) diff --git 
a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 81e2160a556..1a5e4b28355 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -18,6 +18,8 @@ import paddle import paddle.fluid as fluid import numpy as np import unittest +from test_softmax_op import stable_softmax +from test_softmax_with_cross_entropy_op import cross_entropy def stable_softmax(x): @@ -42,6 +44,8 @@ def cross_entropy_loss_1d(input, C = input_shape[1] out = np.zeros_like(label).astype(np.float64) total_weight = 0 + ###1. compute softmax cross_entropy (with weight) + ### Note: only support hard labels. for i in range(N): cur_target = label[i] if cur_target == ignore_index: @@ -50,6 +54,8 @@ def cross_entropy_loss_1d(input, cur_weight = weight[cur_target] if weight is not None else 1 total_weight += cur_weight out[i] = -log_softmax_out[i][cur_target] * cur_weight + + ###2. deal with reduction if reduction == 'sum': return np.sum(out), np.array([total_weight]).astype('float64') elif reduction == 'mean': @@ -92,7 +98,620 @@ def cross_entropy_loss_2d(input, return out +def cross_entropy_soft(softmax, + label, + axis, + N, + weight=None, + reduction='mean', + ignore_index=-100): + #1.loss + loss = cross_entropy( + softmax, + label, + True, #soft_label, + axis, + ignore_index) + + if weight is None and reduction == 'none': + return loss + + #2.weight + weighted_loss = loss + total_weight = N #for weight is None + if weight is not None: + weighted_loss = np.zeros_like(loss).astype(np.float64) + total_weight = 0 + for i in range(N): + cur_soft_label = label[i] + cur_weight = np.dot(weight, cur_soft_label) + total_weight += cur_weight + weighted_loss[i] = loss[i] * cur_weight + + #3.reduce + if reduction == 'none': + return weighted_loss + + elif reduction == 'mean': + weighted_loss_sum = np.sum(weighted_loss) + weighted_loss_mean = weighted_loss_sum / total_weight + return weighted_loss_mean + + else: + weighted_loss_sum = np.sum(weighted_loss) + return weighted_loss_sum + + +def cross_entropy_soft_2d(softmax, + label, + axis, + N, + H, + W, + weight=None, + reduction='mean', + ignore_index=-100): + #1.loss + loss = cross_entropy( + softmax, + label, + True, #soft_label, + axis, + ignore_index) + + if weight is None and reduction == 'none': + return loss + + #2.weight + weighted_loss = loss + total_weight = N #for weight is None + if weight is not None: + weighted_loss = np.zeros_like(loss).astype(np.float64) + total_weight = 0 + for i in range(N): + for h in range(H): + for w in range(W): + cur_soft_label = label[i][h][w] + cur_weight = np.dot(weight, cur_soft_label) + total_weight += cur_weight + weighted_loss[i][h][w] = loss[i][h][w] * cur_weight + + #3.reduce + if reduction == 'none': + return weighted_loss + + elif reduction == 'mean': + weighted_loss_sum = np.sum(weighted_loss) + weighted_loss_mean = weighted_loss_sum / total_weight + return weighted_loss_mean + + else: + weighted_loss_sum = np.sum(weighted_loss) + return weighted_loss_sum + + class CrossEntropyLoss(unittest.TestCase): + + ###test for deprecated softmax_with_cross_entropy + def test_softmax_with_cross_entropy(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 4 + self.C = 3 + self.shape = [self.N, self.C] + self.use_softmax = True + self.reduction = 'none' + 
self.weight = None + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + expected = cross_entropy_soft( + softmax, + self.labels, + self.axis, + self.N, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + + paddle.disable_static() + paddle_loss_swce = paddle.nn.functional.softmax_with_cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis) + + paddle_loss_ce = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight) + if self.weight is not None else None, + reduction=self.reduction) + + self.assertTrue(np.allclose(paddle_loss_swce.numpy(), expected)) + self.assertTrue(np.allclose(paddle_loss_ce.numpy(), expected)) + + ###soft_label test start + ###soft_label test 1 + def test_cross_entropy_loss_soft_1d(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 4 + self.C = 3 + self.shape = [self.N, self.C] + self.use_softmax = True + self.reduction = 'none' + self.weight = None + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + expected = cross_entropy_soft( + softmax, + self.labels, + self.axis, + self.N, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + + #2. dygraph + paddle.disable_static() + paddle_loss_none_weight = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight) + if self.weight is not None else None, + reduction=self.reduction) + dy_ret_value = paddle_loss_none_weight.numpy() + + #3. 
static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[self.N, self.C], dtype='float64') + label = fluid.data( + name='label', shape=[self.N, self.C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': self.logits, + 'label': self.labels, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test 2 + def test_cross_entropy_loss_soft_1d_weight(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 4 + self.C = 3 + self.shape = [self.N, self.C] + self.use_softmax = True + self.reduction = 'none' + self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype) + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + if self.soft_label: + self.labels = np.random.uniform(0.1, 1.0, + self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + else: + axis_dim = self.shape[self.axis] + self.shape[self.axis] = 1 + self.labels = np.random.randint( + 0, axis_dim, self.shape, dtype="int64") + + #1. numpy + expected = cross_entropy_soft( + softmax, + self.labels, + self.axis, + self.N, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + + #2. 
dygraph + paddle.disable_static() + paddle_loss_none_weight = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight), + reduction=self.reduction) + dy_ret_value = paddle_loss_none_weight.numpy() + + # 3.static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[self.N, self.C], dtype='float64') + label = fluid.data( + name='label', shape=[self.N, self.C], dtype='float64') + weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': self.logits, + 'label': self.labels, + "weight": self.weight + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test 3 + def test_cross_entropy_loss_soft_1d_mean(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 4 + self.C = 3 + self.shape = [self.N, self.C] + self.use_softmax = True + self.reduction = 'mean' + self.weight = None + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + #1. numpy + expected = cross_entropy_soft( + softmax, + self.labels, + self.axis, + self.N, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + + #2 dygraph + paddle.disable_static() + paddle_loss_mean = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=self.weight, + reduction=self.reduction) + dy_ret_value = paddle_loss_mean.numpy() + + #3. 
static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[self.N, self.C], dtype='float64') + label = fluid.data( + name='label', shape=[self.N, self.C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run( + prog, + feed={'input': self.logits, + 'label': self.labels}, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test 4 + def test_cross_entropy_loss_soft_1d_weight_mean(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 4 + self.C = 3 + self.shape = [self.N, self.C] + self.use_softmax = True + self.reduction = 'mean' + self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype) + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + #1. numpy + expected = cross_entropy_soft( + softmax, + self.labels, + self.axis, + self.N, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + paddle.disable_static() + + #2. dygraph + paddle_loss_none_weight = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight), + reduction=self.reduction) + dy_ret_value = paddle_loss_none_weight.numpy() + + #3. 
static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[self.N, self.C], dtype='float64') + label = fluid.data( + name='label', shape=[self.N, self.C], dtype='float64') + weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': self.logits, + 'label': self.labels, + "weight": self.weight + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test 5 + def test_cross_entropy_loss_soft_2d(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 3 + self.H = 2 + self.W = 2 + self.C = 5 + self.shape = [self.N, self.H, self.W, self.C] + self.use_softmax = True + self.reduction = 'none' + self.weight = None + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + #1. numpy + expected = cross_entropy_soft_2d( + softmax, + self.labels, + self.axis, + self.N, + self.H, + self.W, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + paddle.disable_static() + + #2. dygraph + paddle_loss_none_weight = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight) + if self.weight is not None else None, + reduction=self.reduction) + dy_ret_value = paddle_loss_none_weight.numpy() + + #3. 
static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', + shape=[self.N, self.H, self.W, self.C], + dtype='float64') + label = fluid.data( + name='label', + shape=[self.N, self.H, self.W, self.C], + dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': self.logits, + 'label': self.labels, + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test 6 + def test_cross_entropy_loss_soft_2d_weight_mean(self): + self.numeric_stable_mode = False + self.soft_label = True + self.dtype = np.float64 + self.axis = -1 + self.ignore_index = -100 #should not be changed + self.N = 3 + self.H = 2 + self.W = 2 + self.C = 5 + self.shape = [self.N, self.H, self.W, self.C] + self.use_softmax = True + self.reduction = 'mean' + self.weight = np.random.uniform(0.1, 1.0, self.C).astype(self.dtype) + self.logits = getattr( + self, "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + softmax = np.apply_along_axis(stable_softmax, self.axis, self.logits) + + self.labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype) + self.labels /= np.sum(self.labels, axis=self.axis, keepdims=True) + + #1. numpy + expected = cross_entropy_soft_2d( + softmax, + self.labels, + self.axis, + self.N, + self.H, + self.W, + weight=self.weight, + reduction=self.reduction, + ignore_index=self.ignore_index) + + paddle.set_device("cpu") + paddle.disable_static() + + #2. dygraph + paddle_loss_none_weight = paddle.nn.functional.cross_entropy( + fluid.dygraph.to_variable(self.logits), + fluid.dygraph.to_variable(self.labels), + soft_label=True, + axis=self.axis, + weight=fluid.dygraph.to_variable(self.weight), + reduction=self.reduction) + dy_ret_value = paddle_loss_none_weight.numpy() + + #3. 
static + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', + shape=[self.N, self.H, self.W, self.C], + dtype='float64') + label = fluid.data( + name='label', + shape=[self.N, self.H, self.W, self.C], + dtype='float64') + weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction=self.reduction, soft_label=True) + ret = cross_entropy_loss(input, label) + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': self.logits, + 'label': self.labels, + "weight": self.weight + }, + fetch_list=[ret]) + self.assertIsNotNone(static_ret) + paddle.disable_static() + + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + + ###soft_label test end + def test_cross_entropy_loss_1d_with_mean_ignore(self): input_np = np.random.random([2, 4]).astype(np.float64) label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) @@ -131,19 +750,21 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_mean_ignore(self): - input_np = np.random.random([2, 4]).astype(np.float64) - label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) - weight_np = np.random.random([4]).astype(np.float64) #shape:C + N = 100 + C = 200 + input_np = np.random.random([N, C]).astype(np.float64) + label_np = np.random.randint(0, C, size=(N)).astype(np.int64) + weight_np = np.random.random([C]).astype(np.float64) paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[2, 4], dtype='float64') - label = fluid.data(name='label', shape=[2], dtype='int64') + input = fluid.data(name='input', shape=[N, C], dtype='float64') + label = fluid.data(name='label', shape=[N], dtype='int64') weight = fluid.data( - name='weight', shape=[4], + name='weight', shape=[C], dtype='float64') #weight for each class cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, ignore_index=0) @@ -158,8 +779,6 @@ class CrossEntropyLoss(unittest.TestCase): }, fetch_list=[ret]) self.assertIsNotNone(static_ret) - expected = cross_entropy_loss_1d( - input_np, label_np, weight=weight_np)[0] with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( @@ -173,6 +792,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertIsNotNone(dy_ret_value) expected = cross_entropy_loss_1d( input_np, label_np, weight=weight_np, ignore_index=0)[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) self.assertTrue(np.allclose(static_ret, expected)) self.assertTrue(np.allclose(dy_ret_value, expected)) @@ -265,6 +885,7 @@ class CrossEntropyLoss(unittest.TestCase): input_np = np.random.random([100, 200]).astype(np.float64) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 weight_np = np.random.random([200]).astype(np.float64) #C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() @@ -274,6 +895,7 @@ class CrossEntropyLoss(unittest.TestCase): input = fluid.data(name='input', shape=[100, 
200], dtype='float64') label = fluid.data(name='label', shape=[100], dtype='int64') weight = fluid.data(name='weight', shape=[200], dtype='float64') + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='none') ret = cross_entropy_loss(input, label) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c223addc260..1dad1632e26 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -* # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -27,7 +28,7 @@ from ...fluid.layers import dice_loss #DEFINE_ALIAS from ...fluid.layers import log_loss #DEFINE_ALIAS from ...fluid.layers import npair_loss #DEFINE_ALIAS from ...fluid.layers import reshape -from ...fluid.layers import softmax_with_cross_entropy #DEFINE_ALIAS +from ...fluid.layers import softmax_with_cross_entropy as fluid_softmax_with_cross_entropy #DEFINE_ALIAS from ...fluid.layers import square_error_cost #DEFINE_ALIAS from ...fluid.layers import edit_distance #DEFINE_ALIAS @@ -36,6 +37,7 @@ from ...fluid.layer_helper import LayerHelper from ...fluid.framework import in_dygraph_mode from ...fluid.framework import _varbase_creator from ...fluid.framework import Variable +from paddle.utils import deprecated __all__ = [ 'binary_cross_entropy', @@ -682,7 +684,6 @@ def l1_loss(input, label, reduction='mean', name=None): import paddle - paddle.disable_static() input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) @@ -1112,6 +1113,19 @@ def ctc_loss(log_probs, return loss_out +@deprecated(since="2.0.0", update_to="paddle.nn.functional.cross_entropy") +def softmax_with_cross_entropy(logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1): + return fluid_softmax_with_cross_entropy(logits, label, soft_label, + ignore_index, numeric_stable_mode, + return_softmax, axis) + + def cross_entropy(input, label, weight=None, @@ -1119,87 +1133,248 @@ def cross_entropy(input, reduction='mean', soft_label=False, axis=-1, + use_softmax=True, name=None): r""" - This operator implements the cross entropy loss function with softmax. This function + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function - to provide a more numerically stable gradient. - Because this operator performs a softmax on logits internally, it expects - unscaled logits. This operator should not be used with the output of - softmax operator since that would produce incorrect results. + to provide a more numerically stable computing. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a - single label. + This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - The equation is as follows: + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of + parameters for details. 
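A minimal usage sketch of the relation documented above (illustrative values only, assuming the new ``use_softmax`` flag behaves as described): feeding unscaled logits with ``use_softmax=True`` should match feeding precomputed softmax outputs with ``use_softmax=False``.

    import paddle
    import paddle.nn.functional as F

    paddle.seed(2021)
    N, C = 4, 3
    logits = paddle.rand([N, C], dtype='float64')
    labels = paddle.randint(0, C, shape=[N], dtype='int64')

    # default path: softmax is fused into the loss, so pass unscaled logits
    loss_fused = F.cross_entropy(logits, labels, use_softmax=True, reduction='mean')

    # equivalent path: apply softmax first, then disable the fused softmax
    probs = F.softmax(logits, axis=-1)
    loss_split = F.cross_entropy(probs, labels, use_softmax=False, reduction='mean')

    print(paddle.allclose(loss_fused, loss_split))  # should print True
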
- 1) Hard label (one-hot label, so every sample has exactly one class) + This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + mean the probability of the actual label, 0.6, 0.8, 0.2, etc. - .. math:: + The calculation of this operator includes the following two steps. - loss_j = -\\text{logits}_{label_j} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + - **1.softmax cross entropy** - 2) Soft label (each sample can have a distribution over all classes) + 1. Hard label (each sample can only be assigned into one category) - .. math:: + 1.1. when use_softmax=True - loss_j = -\\sum_{i=0}^{K}\\text{label}_i - \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} - \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + .. math:: + \\loss_j=-\text{logits}_{label_j}+\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right) , j = 1,...,N - - It is useful when training a classification problem with ``C`` classes. + where, N is the number of samples and C is the number of categories. + + 1.2. when use_softmax=False + + .. math:: + \\loss_j=-\log\left({P}_{label_j}\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories, P is input(the output of softmax). + + + 2. Soft label (each sample is assigned to multiple categories with a certain probability, and the probability sum is 1). + + 2.1. when use_softmax=True + + .. math:: + \\loss_j=-\sum_{i=0}^{C}\text{label}_i\left(\text{logits}_i-\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right)\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories. + + 2.2. when use_softmax=False + + .. math:: + \\loss_j=-\sum_{j=0}^{C}\left({label}_j*\log\left({P}_{label_j}\right)\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories, P is input(the output of softmax). + + + + + - **2. Weight and reduction processing** + + 1. Weight + + If the ``weight`` parameter is ``None`` , go to the next step directly. + + If the ``weight`` parameter is not ``None`` , the cross entropy of each sample is weighted by weight + according to soft_label = False or True as follows. + + 1.1. Hard labels (soft_label = False) + + .. math:: + \\loss_j=loss_j*weight[label_j] + 1.2. Soft labels (soft_label = True) + + .. math:: + \\loss_j=loss_j*\sum_{i}\left(weight[label_i]*logits_i\right) + + 2. reduction + + 2.1 if the ``reduction`` parameter is ``none`` + + Return the previous result directly + + 2.2 if the ``reduction`` parameter is ``sum`` + + Return the sum of the previous results + + .. math:: + \\loss=\sum_{j}loss_j + + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. + + 2.3.1. If the ``weight`` parameter is ``None`` + + Return the average value of the previous results + + .. math:: + \\loss=\sum_{j}loss_j/N + + where, N is the number of samples and C is the number of categories. + + 2.3.2. If the 'weight' parameter is not 'None', the weighted average value of the previous result will be returned + + 1. Hard labels (soft_label = False) + + .. math:: + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + + 2. Soft labels (soft_label = True) + + .. math:: + \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right) + + Parameters: - input (Tensor): Input tensor, the data type is float32, float64. 
Shape is - (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, D1, D2,..., Dk, C), k >= 1. - label (Tensor): Label tensor, the data type is int64. Shape is (N), where each - value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is - (N, D1, D2,..., Dk), k >= 1. - weight (Tensor, optional):a manual rescaling weight given to each class. + + - **input** (Tensor) + + Input tensor, the data type is float32, float64. Shape is + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + + Note: + + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the + output of softmax operator, which will produce incorrect results. + + 2. when use_softmax=False, it expects the output of softmax operator. + + - **label** (Tensor) + + 1. If soft_label=False, the shape is + :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. + the data type is int32, int64, float32, float64, where each value is [0, C-1]. + + 2. If soft_label=True, the shape and data type should be same with ``input`` , + and the sum of the labels for each sample should be 1. + + - **weight** (Tensor, optional) + + a manual rescaling weight given to each class. If given, has to be a Tensor of size C and the data type is float32, float64. - Default is ``'None'``. - reduction (str, optional): Indicate how to average the loss by batch_size, + Default is ``'None'`` . + + - **ignore_index** (int64, optional) + + Specifies a target value that is ignored + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. + Default is ``-100`` . + + - **reduction** (str, optional) + + Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - ignore_index (int64, optional): Specifies a target value that is ignored - and does not contribute to the input gradient. Default is ``-100``. - soft_label (bool): indicate whether label is soft. Default False, meaning that - the label is hard. If soft_label=True, the label is soft. - axis (int, optional): The index of dimension to perform softmax calculations. It - should be in range :math:`[-1, rank - 1]`, while :math:`rank` - is the rank of input :attr:`logits`. Default: -1. + - **soft_label** (bool, optional) + + Indicate whether label is soft. + Default is ``False``. + + - **axis** (int, optional) + + The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the + number of dimensions of input :attr:`input`. + Default is ``-1`` . + + - **use_softmax** (bool, optional) + + Indicate whether compute softmax before cross_entropy. + Default is ``True``. + + - **name** (str,optional) + + The name of the operator. Default is ``None`` . + For more information, please refer to :ref:`api_guide_Name` . Returns: - Tensor.The tensor storing the cross_entropy_loss of input and label. + Tensor. Return the softmax cross_entropy loss of ``input`` and ``label``. + The data type is the same as input. - Examples: - .. code-block:: python + If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. 
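A short sketch checking the ``weight`` and ``reduction='mean'`` interaction documented above (a hedged example with hard labels, no ignored index, and made-up values): per the formulas given earlier, the mean is taken over the summed per-sample weights rather than over the batch size N.

    import paddle
    import paddle.nn.functional as F

    paddle.seed(99999)
    N, C = 6, 4
    logits = paddle.rand([N, C], dtype='float64')
    labels = paddle.randint(0, C, shape=[N], dtype='int64')
    weight = paddle.rand([C], dtype='float64')

    per_sample = F.cross_entropy(logits, labels, weight=weight, reduction='none')
    mean_loss = F.cross_entropy(logits, labels, weight=weight, reduction='mean')

    # denominator is sum_j weight[label_j], matching the weighted-mean formula above
    manual = paddle.sum(per_sample) / paddle.sum(paddle.gather(weight, labels))
    print(paddle.allclose(mean_loss, manual))  # should print True
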
- import paddle - import numpy as np + If :attr:`reduction` is ``'none'``: - input_data = np.random.random([5, 100]).astype("float64") - label_data = np.random.randint(0, 100, size=(5)).astype(np.int64) - weight_data = np.random.random([100]).astype("float64") + 1. If soft_label = False, the dimension of return value is the same with ``label`` . - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) - weight = paddle.to_tensor(weight_data) + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + + + Example1(hard labels): + + .. code-block:: python + + import paddle + paddle.seed(99999) + N=100 + C=200 + reduction='mean' + input = paddle.rand([N, C], dtype='float64') + label = paddle.randint(0, C, shape=[N], dtype='int64') + weight = paddle.rand([C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction=reduction) + dy_ret = cross_entropy_loss( + input, + label) + print(dy_ret.numpy()) #[5.41993642] + + + Example2(soft labels): + + .. code-block:: python + + import paddle + paddle.seed(99999) + axis = -1 + ignore_index = -100 + N = 4 + C = 3 + shape = [N, C] + reduction='mean' + weight = None + logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) + labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) + labels /= paddle.sum(labels, axis=axis, keepdim=True) + paddle_loss_mean = paddle.nn.functional.cross_entropy( + logits, + labels, + soft_label=True, + axis=axis, + weight=weight, + reduction=reduction) + print(paddle_loss_mean.numpy()) #[1.12908343] - loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight) - print(loss) - # [4.28546723] """ if reduction not in ['sum', 'mean', 'none']: @@ -1207,6 +1382,12 @@ def cross_entropy(input, "The value of 'reduction' in softmax_cross_entropy" "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." % reduction) + if ignore_index > 0 and soft_label == True: + raise ValueError( + "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy" + "should be '-100', but received %s, which is not allowed." % + ignore_index) + input_dims = len(list(input.shape)) label_dims = len(list(label.shape)) if input_dims - 1 != label_dims and input_dims != label_dims: @@ -1216,27 +1397,46 @@ def cross_entropy(input, if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) if in_dygraph_mode(): - out = softmax_with_cross_entropy( - input, - label, - soft_label=soft_label, - ignore_index=ignore_index, - axis=axis) + _, out = core.ops.softmax_with_cross_entropy( + input, label, 'soft_label', soft_label, 'ignore_index', + ignore_index, 'numeric_stable_mode', True, 'axis', axis, + 'use_softmax', use_softmax) + if weight is not None: - weight_gather = core.ops.gather_nd( - weight, label) #trans weight from class to sample, shape:N - input_shape = list(label.shape) - weight_gather_reshape = reshape(weight_gather, shape=input_shape) - out = core.ops.elementwise_mul(out, weight_gather_reshape) + + #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases. + if soft_label == True: + # chajchaj: + # weight's shape is C, where C is class num. + # for 1d case: label's shape is [N,C], weight_gather's shape is N. + # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. 
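                # To make the weight gather below concrete (illustrative numbers only):
                # with weight = [w0, w1, w2] of shape [C] and one sample's soft label
                # [0.2, 0.3, 0.5] of shape [C], the matmul yields 0.2*w0 + 0.3*w1 + 0.5*w2,
                # i.e. each sample's weight is its soft label's probability-weighted mix
                # of the class weights, which reduces to weight[label_j] for a one-hot label.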
+ weight_gather = paddle.matmul( + x=paddle.cast(label, weight.dtype), + y=weight, + transpose_x=False, + transpose_y=True) + out_shape = list(out.shape) + weight_gather_reshape = reshape(weight_gather, shape=out_shape) + out = paddle.cast(out, weight_gather_reshape.dtype) + + out = core.ops.elementwise_mul(out, weight_gather_reshape) + + else: + weight_gather = core.ops.gather_nd(weight, label) + input_shape = list(label.shape) + weight_gather_reshape = reshape( + weight_gather, shape=input_shape) + out = paddle.cast(out, weight_gather_reshape.dtype) + out = core.ops.elementwise_mul(out, weight_gather_reshape) if reduction == "sum": - # because of softmax_with_cross_entropy op's inner logic, + # because of fluid_softmax_with_cross_entropy op's inner logic, # in the out tensor of this op, the loss of sample with class_index==ignore_index is 0 # so, reduce_sum all directly is ok return core.ops.reduce_sum(out, 'reduce_all', True) elif reduction == "mean": #1. if weight==none, - # numerator: reduce_sum all loss directly is ok causeof softmax_with_cross_entropy's inner logic + # numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic # denominator: count sample num with class_index!=ignore_index #2. else # numerator: loss's weighted sum @@ -1247,7 +1447,7 @@ def cross_entropy(input, #mask[i]=0, if label[i]==ignore_index #mask[i]=1, otherwise mask = (label != ignore_index) - if (weight is None): + if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) count = core.ops.reduce_sum(mask, 'reduce_all', True) ret = out_sum / count @@ -1277,20 +1477,48 @@ def cross_entropy(input, fluid.data_feeder.check_variable_and_dtype( label, 'label', ['int32', 'int64', 'float32', 'float64'], 'softmax_cross_entropy') - out = softmax_with_cross_entropy( - input, - label, - soft_label=soft_label, - ignore_index=ignore_index, - axis=axis) + attrs = { + 'soft_label': soft_label, + 'ignore_index': ignore_index, + 'numeric_stable_mode': True, + 'axis': axis, + 'use_softmax': use_softmax + } + helper = LayerHelper('softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=input.dtype) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': input, + 'Label': label}, + outputs={'Softmax': softmax, + 'Loss': out}, + attrs=attrs) + if weight is not None: fluid.data_feeder.check_variable_and_dtype( weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy') weight_name = name if reduction == 'none' else None - weight_gather = paddle.gather_nd( - weight, label) #trans weight from class to sample, shape:N - input_shape = list(label.shape) - weight_gather_reshape = reshape(weight_gather, shape=input_shape) + if soft_label == True: + # chajchaj: + #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases. + # weight's shape is C, where C is class num. + # for 1d case: label's shape is [N,C], weight_gather's shape is N. + # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. 
+ weight_gather = paddle.matmul( + x=paddle.cast(label, weight.dtype), + y=weight, + transpose_x=False, + transpose_y=True) + + out_shape = list(out.shape) + weight_gather_reshape = reshape(weight_gather, shape=out_shape) + out = paddle.cast(out, weight_gather_reshape.dtype) + else: + weight_gather = paddle.gather_nd( + weight, label) #trans weight from class to sample, shape:N + input_shape = list(label.shape) + weight_gather_reshape = reshape(weight_gather, shape=input_shape) out = paddle.multiply(out, weight_gather_reshape, name=weight_name) if reduction == "sum": diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index ac1cb5a8187..ad046b90417 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -* # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -108,7 +109,6 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer): .. code-block:: python import paddle - paddle.disable_static() logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") bce_logit_loss = paddle.nn.BCEWithLogitsLoss() @@ -142,85 +142,249 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer): class CrossEntropyLoss(fluid.dygraph.Layer): r""" - This operator implements the cross entropy loss function with softmax. This function + By default, this operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function - to provide a more numerically stable gradient. + to provide a more numerically stable computing. - Because this operator performs a softmax on logits internally, it expects - unscaled logits. This operator should not be used with the output of - softmax operator since that would produce incorrect results. + This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - When the attribute :attr:`soft_label` is set :attr:`False`, this operators - expects mutually exclusive hard labels, each sample in a batch is in exactly - one class with a probability of 1.0. Each sample in the batch will have a - single label. + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of + parameters for details. - The equation is as follows: + This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + mean the probability of the actual label, 0.6, 0.8, 0.2, etc. - 1) Hard label (one-hot label, so every sample has exactly one class) + The calculation of this operator includes the following two steps. - .. math:: + - **I.softmax cross entropy** - loss_j = -\\text{logits}_{label_j} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K + 1. Hard label (each sample can only be assigned into one category) - 2) Soft label (each sample can have a distribution over all classes) + 1.1. when use_softmax=True - .. math:: + .. 
math:: + \\loss_j=-\text{logits}_{label_j}+\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right) , j = 1,...,N - loss_j = -\\sum_{i=0}^{K}\\text{label}_i - \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} - \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + where, N is the number of samples and C is the number of categories. + + 1.2. when use_softmax=False + + .. math:: + \\loss_j=-\log\left({P}_{label_j}\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories, P is input(the output of softmax). + + + 2. Soft label (each sample is assigned to multiple categories with a certain probability, and the probability sum is 1). + + 2.1. when use_softmax=True + + .. math:: + \\loss_j=-\sum_{i=0}^{C}\text{label}_i\left(\text{logits}_i-\log\left(\sum_{i=0}^{C}\exp(\text{logits}_i)\right)\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories. + + 2.2. when use_softmax=False + + .. math:: + \\loss_j=-\sum_{j=0}^{C}\left({label}_j*\log\left({P}_{label_j}\right)\right) , j = 1,...,N + + where, N is the number of samples and C is the number of categories, P is input(the output of softmax). + + + + - **II.Weight and reduction processing** + + 1. Weight + + If the ``weight`` parameter is ``None`` , go to the next step directly. + + If the ``weight`` parameter is not ``None`` , the cross entropy of each sample is weighted by weight + according to soft_label = False or True as follows. + + 1.1. Hard labels (soft_label = False) + + .. math:: + \\loss_j=loss_j*weight[label_j] - - It is useful when training a classification problem with ``C`` classes. + 1.2. Soft labels (soft_label = True) + .. math:: + \\loss_j=loss_j*\sum_{i}\left(weight[label_i]*logits_i\right) + + 2. reduction + + 2.1 if the ``reduction`` parameter is ``none`` + + Return the previous result directly + + 2.2 if the ``reduction`` parameter is ``sum`` + + Return the sum of the previous results + + .. math:: + \\loss=\sum_{j}loss_j + + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. + + 2.3.1. If the ``weight`` parameter is ``None`` + + Return the average value of the previous results + + .. math:: + \\loss=\sum_{j}loss_j/N + + where, N is the number of samples and C is the number of categories. + + 2.3.2. If the 'weight' parameter is not 'None', the weighted average value of the previous result will be returned + + 1. Hard labels (soft_label = False) + + .. math:: + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + + 2. Soft labels (soft_label = True) + + .. math:: + \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right) + + Parameters: - input (Tensor): Input tensor, the data type is float32, float64. Shape is - (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor, the data type is int64. Shape is (N), where each - value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is - (N, D1, D2,..., Dk), k >= 1. - weight (Tensor, optional): Weight tensor, a manual rescaling weight given - to each class and the shape is (C). It has the same dimensions as class - number and the data type is float32, float64. Default is ``'None'``. - reduction (str, optional): Indicate how to average the loss by batch_size, + + - **weight** (Tensor, optional) + + a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. 
+ Default is ``'None'`` . + + - **ignore_index** (int64, optional) + + Specifies a target value that is ignored + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. + Default is ``-100`` . + + - **reduction** (str, optional) + + Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - ignore_index (int64, optional): Specifies a target value that is ignored - and does not contribute to the input gradient. Default is ``-100``. - soft_label (bool): indicate whether label is soft. Default False, meaning that - the label is hard. If soft_label=True, the label is soft. - axis (int, optional): The index of dimension to perform softmax calculations. It - should be in range :math:`[-1, rank - 1]`, while :math:`rank` - is the rank of input :attr:`logits`. Default: -1. + - **soft_label** (bool, optional) - Returns: - Tensor. The tensor storing the cross_entropy_loss of input and label. + Indicate whether label is soft. + If soft_label=False, the label is hard. If soft_label=True, the label is soft. + Default is ``False``. + - **axis** (int, optional) + + The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number + of dimensions of input :attr:`input`. + Default is ``-1`` . + + - **use_softmax** (bool, optional) + + Indicate whether compute softmax before cross_entropy. + Default is ``True``. + + - **name** (str,optional) + + The name of the operator. Default is ``None`` . + For more information, please refer to :ref:`api_guide_Name` . + + + Shape: + + - **input** (Tensor) + + Input tensor, the data type is float32, float64. Shape is + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + + Note: + + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the + output of softmax operator, which will produce incorrect results. + + 2. when use_softmax=False, it expects the output of softmax operator. + + + - **label** (Tensor) + + 1. If soft_label=False,the shape is + :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. + the data type is int32, int64, float32, float64, where each value is [0, C-1]. + + 2. If soft_label=True, the shape and data type should be same with ``input`` , + and the sum of the labels for each sample should be 1. + + - **output** (Tensor) + + Return the softmax cross_entropy loss of ``input`` and ``label``. + + The data type is the same as input. + + If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. + + If :attr:`reduction` is ``'none'``: + + 1. If soft_label = False, the dimension of return value is the same with ``label`` . + + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + + Example1(hard labels): - Examples: .. 
code-block:: python import paddle - import numpy as np + paddle.seed(99999) + N=100 + C=200 + reduction='mean' + input = paddle.rand([N, C], dtype='float64') + label = paddle.randint(0, C, shape=[N], dtype='int64') + weight = paddle.rand([C], dtype='float64') + + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction=reduction) + dy_ret = cross_entropy_loss( + input, + label) + print(dy_ret.numpy()) #[5.41993642] + + + Example2(soft labels): + + .. code-block:: python + + import paddle + paddle.seed(99999) + axis = -1 + ignore_index = -100 + N = 4 + C = 3 + shape = [N, C] + reduction='mean' + weight = None + logits = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) + labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) + labels /= paddle.sum(labels, axis=axis, keepdim=True) + paddle_loss_mean = paddle.nn.functional.cross_entropy( + logits, + labels, + soft_label=True, + axis=axis, + weight=weight, + reduction=reduction) + print(paddle_loss_mean.numpy()) #[1.12908343] - input_data = paddle.uniform([5, 100], dtype="float64") - label_data = np.random.randint(0, 100, size=(5)).astype(np.int64) - weight_data = np.random.random([100]).astype("float64") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) - weight = paddle.to_tensor(weight_data) - ce_loss = paddle.nn.CrossEntropyLoss(weight=weight, reduction='mean') - output = ce_loss(input, label) - print(output) - # [4.84496039] """ def __init__(self, @@ -229,6 +393,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer): reduction='mean', soft_label=False, axis=-1, + use_softmax=True, name=None): super(CrossEntropyLoss, self).__init__() self.weight = weight @@ -236,6 +401,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer): self.ignore_index = ignore_index self.soft_label = soft_label self.axis = axis + self.use_softmax = use_softmax self.name = name def forward(self, input, label): @@ -247,6 +413,7 @@ class CrossEntropyLoss(fluid.dygraph.Layer): reduction=self.reduction, soft_label=self.soft_label, axis=self.axis, + use_softmax=self.use_softmax, name=self.name) return ret -- GitLab From fe2848686b6b14822caca1adee80107346d4426f Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Tue, 30 Mar 2021 10:44:02 +0800 Subject: [PATCH 097/486] add exclusive for test_conv2d_op, test=develop (#31936) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index b5c554a58cb..0c292d355dd 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -450,6 +450,7 @@ py_test_modules(test_imperative_static_runner_mnist MODULES test_imperative_stat FLAGS_cudnn_deterministic=1) py_test_modules(test_imperative_static_runner_while MODULES test_imperative_static_runner_while ENVS FLAGS_cudnn_deterministic=1) +set_tests_properties(test_conv2d_op PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE") if(WITH_DISTRIBUTE) # FIXME(typhoonzero): add these tests back list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") -- GitLab From 04a49b097eb8d8956ee5672268caba5024eb628a Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 30 Mar 2021 12:16:05 +0800 Subject: [PATCH 098/486] [Custom OP]Remove old custom OP and reduce whl package volume (#31813) * Remove old custom OP to reduce whl package volume * [Custom OP]Remove old custom OP to reduce whl package volume --- 
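With the legacy load path removed by this patch, custom operators are built and loaded only through the 2.0 cpp_extension workflow (PD_BUILD_OP on the C++ side, `paddle.utils.cpp_extension` on the Python side). A minimal sketch of that workflow is shown below for reference; the source file names and the `custom_relu` op name are placeholders, not files touched by this diff.

    # Illustrative sketch only -- assumes custom_relu_op.cc/.cu define an op
    # named `custom_relu` via the 2.0 PD_BUILD_OP mechanism.
    import paddle
    from paddle.utils.cpp_extension import load

    # JIT-compile the sources and import the resulting module.
    custom_ops = load(
        name='custom_jit_ops',
        sources=['custom_relu_op.cc', 'custom_relu_op.cu'],  # placeholder sources
        verbose=True)

    x = paddle.randn([4, 8], dtype='float32')
    out = custom_ops.custom_relu(x)  # available only if the sources register custom_relu
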
paddle/fluid/framework/CMakeLists.txt | 37 +----- paddle/fluid/framework/c/c_api.cc | 53 -------- paddle/fluid/framework/c/c_api.h | 55 -------- paddle/fluid/framework/load_op_lib.h | 120 ------------------ paddle/fluid/pybind/pybind.cc | 2 - python/paddle/fluid/framework.py | 28 ---- python/paddle/fluid/tests/CMakeLists.txt | 3 +- .../fluid/tests/custom_op/CMakeLists.txt | 41 +----- .../paddle/fluid/tests/custom_op/relu_op.cc | 115 ----------------- .../paddle/fluid/tests/custom_op/relu_op.cu | 87 ------------- .../paddle/fluid/tests/custom_op/relu_op3.cc | 115 ----------------- .../paddle/fluid/tests/custom_op/relu_op3.cu | 87 ------------- .../fluid/tests/custom_op/setup_build.py | 37 ------ .../fluid/tests/custom_op/setup_install.py | 29 ----- .../fluid/tests/custom_op/test_custom_op.py | 120 ------------------ .../fluid/tests/custom_op/test_jit_load.py | 51 -------- .../fluid/tests/custom_op/test_setup_build.py | 69 ---------- .../tests/custom_op/test_setup_install.py | 65 ---------- python/paddle/incubate/__init__.py | 1 - python/paddle/utils/__init__.py | 3 +- .../utils/cpp_extension/cpp_extension.py | 2 +- .../utils/cpp_extension/extension_utils.py | 35 +---- python/setup.py.in | 53 +++----- 23 files changed, 26 insertions(+), 1182 deletions(-) delete mode 100644 paddle/fluid/framework/c/c_api.cc delete mode 100644 paddle/fluid/framework/c/c_api.h delete mode 100644 paddle/fluid/framework/load_op_lib.h delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op.cc delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op.cu delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op3.cc delete mode 100644 python/paddle/fluid/tests/custom_op/relu_op3.cu delete mode 100644 python/paddle/fluid/tests/custom_op/setup_build.py delete mode 100644 python/paddle/fluid/tests/custom_op/setup_install.py delete mode 100644 python/paddle/fluid/tests/custom_op/test_custom_op.py delete mode 100644 python/paddle/fluid/tests/custom_op/test_jit_load.py delete mode 100644 python/paddle/fluid/tests/custom_op/test_setup_build.py delete mode 100644 python/paddle/fluid/tests/custom_op/test_setup_install.py diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 1fa4ce9b573..2842f230ca9 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -360,46 +360,11 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) -# Old custom op extension mechanism related, will be removed in 2.1.0 -cc_library(paddle_framework_shared - SHARED SRCS executor.cc operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc - DEPS ${FLUID_FRAMEWORK_MODULES}) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_framework_shared PROPERTIES OUTPUT_NAME paddle_framework) -target_link_libraries(paddle_framework_shared ${os_dependency_modules}) - -if (LINUX) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.so - CACHE INTERNAL "Fluid framework lib") -endif() - -if (WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_framework_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(FLUID_FRAMEWORK_IMPORT_LIB - ${paddle_framework_lib_path}/paddle_framework.lib - CACHE INTERNAL "Fluid framework lib") - 
set(FLUID_FRAMEWORK_SHARED_LIB - ${paddle_framework_lib_path}/paddle_framework.dll - CACHE INTERNAL "Fluid framework dll") -endif() - -if(APPLE) - set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib - CACHE INTERNAL "Fluid framework lib") -endif() if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() -# New custom op extension mechanism related +##### 2.0 New custom op extension mechanism related ##### # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) diff --git a/paddle/fluid/framework/c/c_api.cc b/paddle/fluid/framework/c/c_api.cc deleted file mode 100644 index 5e73c5cc23a..00000000000 --- a/paddle/fluid/framework/c/c_api.cc +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/c/c_api.h" - -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -extern "C" { - -paddle::framework::OpInfoMap &PD_GetOpInfoMap() { - return paddle::framework::OpInfoMap::Instance(); -} - -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool) { - paddle::platform::DeviceContextPool::SetPool(pool); -} - -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - auto &op_info = PD_GetOpInfoMap().Get(op_desc.Type()); - std::vector ret; - if (op_info.grad_op_maker_) { - auto grad_op_descs = - op_info.grad_op_maker_(op_desc, no_grad_set, grad_to_var, grad_block); - size_t op_num = grad_op_descs.size(); - ret.resize(op_num); - for (size_t i = 0; i < op_num; ++i) { - PADDLE_ENFORCE_EQ( - grad_op_descs[i]->Proto()->SerializePartialToString(&ret[i]), true, - paddle::platform::errors::Unavailable( - "Cannot serialize operator desc message.")); - } - } - return ret; -} - -} // end extern "C" diff --git a/paddle/fluid/framework/c/c_api.h b/paddle/fluid/framework/c/c_api.h deleted file mode 100644 index a9ec402f381..00000000000 --- a/paddle/fluid/framework/c/c_api.h +++ /dev/null @@ -1,55 +0,0 @@ -/* copyright (c) 2019 paddlepaddle authors. all rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class OpInfoMap; -} // namespace framework -namespace platform { -class DeviceContextPool; -} // namespace platform -} // namespace paddle - -#ifdef __cplusplus -extern "C" { -#endif - -// C-API to get global OpInfo map. -paddle::framework::OpInfoMap &PD_GetOpInfoMap(); - -// C-API to init global DeviceContextPool from outside. -void PD_InitDevicesPool(paddle::platform::DeviceContextPool *pool); - -// C-API to serialize the grad op protocol message to a binary string. -std::vector PD_GetGradOpDescStrs( - const paddle::framework::OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block); - -#ifdef __cplusplus -} -#endif diff --git a/paddle/fluid/framework/load_op_lib.h b/paddle/fluid/framework/load_op_lib.h deleted file mode 100644 index 16cffe119d6..00000000000 --- a/paddle/fluid/framework/load_op_lib.h +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/op_desc.h" -#include "paddle/fluid/platform/dynload/dynamic_loader.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { - -template -T *DynLoad(void *handle, std::string name) { - T *func = reinterpret_cast(dlsym(handle, name.c_str())); -#if !defined(_WIN32) - auto errorno = dlerror(); -#else - auto errorno = GetLastError(); -#endif // !_WIN32 - PADDLE_ENFORCE_NOT_NULL( - func, - platform::errors::NotFound( - "Failed to load dynamic operator library, error code(%s).", errorno)); - return func; -} - -void LoadOpLib(const std::string &dso_name) { - void *handle = paddle::platform::dynload::GetOpDsoHandle(dso_name); - - typedef OpInfoMap &get_op_info_t(); - get_op_info_t *get_op_info = - DynLoad(handle, "PD_GetOpInfoMap"); - auto &op_info = get_op_info(); - auto *dyn_info_map = op_info.mutable_map(); - - typedef std::vector grad_op_desc_maker_t( - const OpDesc &, const std::unordered_set &, - std::unordered_map *, - const std::vector &); - - grad_op_desc_maker_t *grad_op_desc_maker = - DynLoad(handle, "PD_GetGradOpDescStrs"); - - auto &info_map = OpInfoMap::Instance(); - for (const auto &n : *(dyn_info_map)) { - auto type = n.first; - if (type == "recurrent" || type == "recurrent_grad" || - type == "conditional_block" || type == "conditional_block_grad") { - continue; - } - PADDLE_ENFORCE_NE(info_map.Has(n.first), true, - platform::errors::AlreadyExists( - "Operator (%s) has been registered.", type)); - OpInfo info; - info.creator_ = n.second.creator_; - - // If get the protocol buffer from dynamic library directly, there - // will be deconstruction error - // ** Error in `python`: free(): invalid pointer: - // ... paddle::framework::proto::OpDesc::SharedDtor() - // It seems a bug in protobuf, see - // https://github.com/protocolbuffers/protobuf/issues/435 - // So, get the serialized binary string from dynamic library, - // then deserialize to protocol buffer. 
- info.grad_op_maker_ = [grad_op_desc_maker]( - const OpDesc &op_desc, - const std::unordered_set &no_grad_set, - std::unordered_map *grad_to_var, - const std::vector &grad_block) { - std::vector strs = - grad_op_desc_maker(op_desc, no_grad_set, grad_to_var, grad_block); - std::vector> ret; - for (auto &str : strs) { - proto::OpDesc proto_desc; - PADDLE_ENFORCE_EQ(proto_desc.ParseFromString(str), true, - platform::errors::InvalidArgument( - "Failed to parse OpDesc from string.")); - ret.emplace_back(new OpDesc(proto_desc, nullptr)); - } - return ret; - }; - info.proto_ = n.second.proto_; - info.checker_ = n.second.checker_; - info.infer_var_type_ = n.second.infer_var_type_; - info.infer_shape_ = n.second.infer_shape_; - info.infer_inplace_ = n.second.infer_inplace_; - info.infer_no_need_buffer_vars_ = n.second.infer_no_need_buffer_vars_; - info.use_default_grad_op_desc_maker_ = - n.second.use_default_grad_op_desc_maker_; - info.use_empty_grad_op_desc_maker_ = n.second.use_empty_grad_op_desc_maker_; - - info_map.Insert(type, info); - } - - typedef void init_device_t(platform::DeviceContextPool *); - init_device_t *init_dev = - DynLoad(handle, "PD_InitDevicesPool"); - init_dev(&(platform::DeviceContextPool::Instance())); -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index e1ff69e7485..d8ee80c0070 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -33,7 +33,6 @@ limitations under the License. */ #include "paddle/fluid/framework/io/fs.h" #include "paddle/fluid/framework/ir/coalesce_grad_tensor_pass.h" #include "paddle/fluid/framework/ir/pass_builder.h" -#include "paddle/fluid/framework/load_op_lib.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -1752,7 +1751,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); - m.def("load_op_library", framework::LoadOpLib); m.def("load_op_meta_info_and_register_op", framework::LoadOpMetaInfoAndRegisterOp); m.def("init_devices", []() { framework::InitDevices(); }); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b87c2eb388a..be795b9e59c 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -53,7 +53,6 @@ __all__ = [ 'is_compiled_with_cuda', 'is_compiled_with_xpu', 'Variable', - 'load_op_library', 'require_version', 'device_guard', 'set_flags', @@ -5771,33 +5770,6 @@ def _dygraph_place_guard(place): _set_dygraph_tracer_expected_place(tmp_place) -def load_op_library(lib_filename): - """ - :api_attr: Static Graph - - Load a dynamic library, including custom operators and kernels. - When library is loaded, ops and kernels registered in the library - will be available in PaddlePaddle main process. - Please note, the type of custom operators can't have the same type - with the existing operators in the framework. - - Args: - lib_filename (str): name of dynamic library. - - Returns: - list[str]: new registered custom op names. - - Examples: - .. 
code-block:: python - - import paddle.fluid as fluid - #fluid.load_op_library('custom_op.so') - - """ - core.load_op_library(lib_filename) - return OpProtoHolder.instance().update_op_proto() - - def switch_device(device): global _current_device pre_device = _current_device diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 899d6ae7f0e..1d404151415 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -9,7 +9,8 @@ endforeach() add_subdirectory(unittests) add_subdirectory(book) -# TODO: support New Custom OP on Mac +# 2.0 New custom OP can support Windows/Linux now +# TODO: support 2.0 New Custom OP on Mac if(NOT APPLE) add_subdirectory(custom_op) endif() diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index 36496ec499f..ceaf4bbdfeb 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -1,6 +1,6 @@ # New custom OP can support Windows/Linux now if(WITH_GPU) - # 'test_custom_relu_op_setup/jit' compile .cc and .cu file + # GPU custom op tests: compile both .cc and .cu file py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py) py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py) py_test(test_custom_relu_model SRCS test_custom_relu_model.py) @@ -11,8 +11,6 @@ if(WITH_GPU) set_tests_properties(test_custom_relu_model PROPERTIES TIMEOUT 180) endif() -py_test(test_sysconfig SRCS test_sysconfig.py) - # CPU custom op tests: only compile .cc file py_test(test_dispatch_jit SRCS test_dispatch_jit.py) py_test(test_multi_out_jit SRCS test_multi_out_jit.py) @@ -21,41 +19,6 @@ py_test(test_custom_concat SRCS test_custom_concat.py) py_test(test_custom_conj SRCS test_custom_conj.py) # other tests +py_test(test_sysconfig SRCS test_sysconfig.py) py_test(test_check_abi SRCS test_check_abi.py) cc_test(test_check_error SRCS test_check_error.cc DEPS gtest) - -if(NOT LINUX) - return() -endif() - -# Old custom OP only support Linux, only run on Linux -py_test(test_custom_op SRCS test_custom_op.py) -py_test(test_jit_load SRCS test_jit_load.py) -py_test(test_setup_install SRCS test_setup_install.py) -py_test(test_setup_build SRCS test_setup_build.py) - -set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180) -set_tests_properties(test_setup_install PROPERTIES TIMEOUT 250) -set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180) - - -if(WITH_ROCM) - hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared) -elseif(WITH_GPU) - nv_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared) -else() - cc_library(relu_op_shared SHARED SRCS relu_op.cc DEPS paddle_framework_shared) -endif() -set_target_properties(relu_op_shared PROPERTIES OUTPUT_NAME relu2_op) -target_link_libraries(relu_op_shared ${FLUID_FRAMEWORK_SHARED_LIB}) - -# remove the linked glog and gflags when compling relu_op_shared -# otherwise, there is running error: -# ERROR: something wrong with flag 'logtostderr' in file -# 'third_party/glog/src/extern_glog/src/logging.cc'. -# One possibility: file 'third_party/glog/src/extern_glog/src/logging.cc' -# is being linked both statically and dynamically into this executable. 
-get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES) -LIST(REMOVE_ITEM TARGET_LIBRARIES glog) -LIST(REMOVE_ITEM TARGET_LIBRARIES gflags) -set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES ${TARGET_LIBRARIES} ) diff --git a/python/paddle/fluid/tests/custom_op/relu_op.cc b/python/paddle/fluid/tests/custom_op/relu_op.cc deleted file mode 100644 index 837f5bab6be..00000000000 --- a/python/paddle/fluid/tests/custom_op/relu_op.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class Relu2Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Y", in_dims); - } -}; - -class Relu2OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input tensor."); - AddOutput("Y", "Output of relu_op"); - AddComment(R"DOC( -Relu2 Operator. -)DOC"); - } -}; - -class Relu2GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim(framework::GradVarName("Y")); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - } -}; - -template -class Relu2GradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr op) const override { - op->SetType("relu2_grad"); - op->SetInput("Y", this->Output("Y")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -using Tensor = framework::Tensor; - -template -class Relu2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("X"); - auto* out_t = ctx.Output("Y"); - auto x = in_t->data(); - auto y = out_t->mutable_data(ctx.GetPlace()); - for (int i = 0; i < in_t->numel(); ++i) { - y[i] = std::max(static_cast(0.), x[i]); - } - } -}; - -template -class Relu2GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy_t = ctx.Input(framework::GradVarName("Y")); - auto* y_t = ctx.Input("Y"); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - - auto dy = dy_t->data(); - auto y = y_t->data(); - auto dx = dx_t->mutable_data(ctx.GetPlace()); - - for (int i = 0; i < y_t->numel(); ++i) { - dx[i] = dy[i] * (y[i] > static_cast(0) ? 1. 
: 0.); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR(relu2, - ops::Relu2Op, - ops::Relu2OpMaker, - ops::Relu2GradMaker, - ops::Relu2GradMaker); -REGISTER_OPERATOR(relu2_grad, ops::Relu2GradOp); -REGISTER_OP_CPU_KERNEL(relu2, - ops::Relu2Kernel, - ops::Relu2Kernel); -REGISTER_OP_CPU_KERNEL(relu2_grad, - ops::Relu2GradKernel, - ops::Relu2GradKernel); diff --git a/python/paddle/fluid/tests/custom_op/relu_op.cu b/python/paddle/fluid/tests/custom_op/relu_op.cu deleted file mode 100644 index 53ad75e413d..00000000000 --- a/python/paddle/fluid/tests/custom_op/relu_op.cu +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeRelu2(const T* x, const int num, T* y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - y[i] = max(x[i], static_cast(0.)); - } -} - -template -class Relu2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("X"); - auto* out_t = ctx.Output("Y"); - auto x = in_t->data(); - auto y = out_t->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - int num = in_t->numel(); - int block = 512; - int grid = (num + block - 1) / block; - KeRelu2<<>>(x, num, y); - } -}; - -template -__global__ void KeRelu2Grad(const T* y, const T* dy, const int num, T* dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - dx[i] = dy[i] * (y[i] > 0 ? 1. 
: 0.); - } -} - -template -class Relu2GradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy_t = ctx.Input(framework::GradVarName("Y")); - auto* y_t = ctx.Input("Y"); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - - auto dy = dy_t->data(); - auto y = y_t->data(); - auto dx = dx_t->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - int num = dy_t->numel(); - int block = 512; - int grid = (num + block - 1) / block; - KeRelu2Grad<<>>(y, dy, num, dx); - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(relu2, - paddle::operators::Relu2CUDAKernel, - paddle::operators::Relu2CUDAKernel); - -REGISTER_OP_CUDA_KERNEL(relu2_grad, - paddle::operators::Relu2GradCUDAKernel, - paddle::operators::Relu2GradCUDAKernel); diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cc b/python/paddle/fluid/tests/custom_op/relu_op3.cc deleted file mode 100644 index ace9598c586..00000000000 --- a/python/paddle/fluid/tests/custom_op/relu_op3.cc +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -class Relu3Op : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Y", in_dims); - } -}; - -class Relu3OpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "The input tensor."); - AddOutput("Y", "Output of relu_op"); - AddComment(R"DOC( -Relu3 Operator. 
-)DOC"); - } -}; - -class Relu3GradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - auto in_dims = ctx->GetInputDim(framework::GradVarName("Y")); - ctx->SetOutputDim(framework::GradVarName("X"), in_dims); - } -}; - -template -class Relu3GradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr op) const override { - op->SetType("relu3_grad"); - op->SetInput("Y", this->Output("Y")); - op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); - op->SetAttrMap(this->Attrs()); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -using Tensor = framework::Tensor; - -template -class Relu3Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("X"); - auto* out_t = ctx.Output("Y"); - auto x = in_t->data(); - auto y = out_t->mutable_data(ctx.GetPlace()); - for (int i = 0; i < in_t->numel(); ++i) { - y[i] = std::max(static_cast(0.), x[i]); - } - } -}; - -template -class Relu3GradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy_t = ctx.Input(framework::GradVarName("Y")); - auto* y_t = ctx.Input("Y"); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - - auto dy = dy_t->data(); - auto y = y_t->data(); - auto dx = dx_t->mutable_data(ctx.GetPlace()); - - for (int i = 0; i < y_t->numel(); ++i) { - dx[i] = dy[i] * (y[i] > static_cast(0) ? 1. : 0.); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OPERATOR(relu3, - ops::Relu3Op, - ops::Relu3OpMaker, - ops::Relu3GradMaker, - ops::Relu3GradMaker); -REGISTER_OPERATOR(relu3_grad, ops::Relu3GradOp); -REGISTER_OP_CPU_KERNEL(relu3, - ops::Relu3Kernel, - ops::Relu3Kernel); -REGISTER_OP_CPU_KERNEL(relu3_grad, - ops::Relu3GradKernel, - ops::Relu3GradKernel); diff --git a/python/paddle/fluid/tests/custom_op/relu_op3.cu b/python/paddle/fluid/tests/custom_op/relu_op3.cu deleted file mode 100644 index 8a229cafebb..00000000000 --- a/python/paddle/fluid/tests/custom_op/relu_op3.cu +++ /dev/null @@ -1,87 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -__global__ void KeRelu3(const T* x, const int num, T* y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - y[i] = max(x[i], static_cast(0.)); - } -} - -template -class Relu3CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("X"); - auto* out_t = ctx.Output("Y"); - auto x = in_t->data(); - auto y = out_t->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - int num = in_t->numel(); - int block = 512; - int grid = (num + block - 1) / block; - KeRelu3<<>>(x, num, y); - } -}; - -template -__global__ void KeRelu3Grad(const T* y, const T* dy, const int num, T* dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { - dx[i] = dy[i] * (y[i] > 0 ? 1. : 0.); - } -} - -template -class Relu3GradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dy_t = ctx.Input(framework::GradVarName("Y")); - auto* y_t = ctx.Input("Y"); - auto* dx_t = ctx.Output(framework::GradVarName("X")); - - auto dy = dy_t->data(); - auto y = y_t->data(); - auto dx = dx_t->mutable_data(ctx.GetPlace()); - - auto& dev_ctx = ctx.template device_context(); - - int num = dy_t->numel(); - int block = 512; - int grid = (num + block - 1) / block; - KeRelu3Grad<<>>(y, dy, num, dx); - } -}; - -} // namespace operators -} // namespace paddle - -using CUDA = paddle::platform::CUDADeviceContext; -REGISTER_OP_CUDA_KERNEL(relu3, - paddle::operators::Relu3CUDAKernel, - paddle::operators::Relu3CUDAKernel); - -REGISTER_OP_CUDA_KERNEL(relu3_grad, - paddle::operators::Relu3GradCUDAKernel, - paddle::operators::Relu3GradCUDAKernel); diff --git a/python/paddle/fluid/tests/custom_op/setup_build.py b/python/paddle/fluid/tests/custom_op/setup_build.py deleted file mode 100644 index 16a74779307..00000000000 --- a/python/paddle/fluid/tests/custom_op/setup_build.py +++ /dev/null @@ -1,37 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os - -from utils import paddle_includes, extra_compile_args -from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup -from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - -# switch to old custom op method -use_new_custom_op_load_method(False) - -file_dir = os.path.dirname(os.path.abspath(__file__)) - -setup( - name='librelu2_op_from_setup', - ext_modules=[ - CUDAExtension( - sources=['relu_op3.cc', 'relu_op3.cu', 'relu_op.cc', - 'relu_op.cu'], # test for multi ops - include_dirs=paddle_includes, - extra_compile_args=extra_compile_args) - ], - cmdclass={ - 'build_ext': BuildExtension.with_options( - no_python_abi_suffix=True, output_dir=file_dir) # for unittest - }) diff --git a/python/paddle/fluid/tests/custom_op/setup_install.py b/python/paddle/fluid/tests/custom_op/setup_install.py deleted file mode 100644 index 18fbfbaf8b6..00000000000 --- a/python/paddle/fluid/tests/custom_op/setup_install.py +++ /dev/null @@ -1,29 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os - -from utils import paddle_includes, extra_compile_args -from paddle.utils.cpp_extension import CUDAExtension, setup -from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - -# switch to old custom op method -use_new_custom_op_load_method(False) - -setup( - name='custom_relu2', - ext_modules=CUDAExtension( # test for not specific name here. - sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc', - 'relu_op3.cu'], # test for multi ops - include_dirs=paddle_includes, - extra_compile_args=extra_compile_args)) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_op.py b/python/paddle/fluid/tests/custom_op/test_custom_op.py deleted file mode 100644 index 1c0db0be154..00000000000 --- a/python/paddle/fluid/tests/custom_op/test_custom_op.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import numpy as np -import unittest -import contextlib - -import paddle -import paddle.fluid as fluid -paddle.enable_static() - - -def load_so(so_name): - """ - Load .so file and parse custom op into OpInfoMap. 
- """ - file_dir = os.path.dirname(os.path.abspath(__file__)) - fluid.load_op_library(os.path.join(file_dir, so_name)) - - -from paddle.fluid.layer_helper import LayerHelper - - -def relu2(x, name=None): - helper = LayerHelper("relu2", **locals()) - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False) - helper.append_op(type="relu2", inputs={"X": x}, outputs={"Y": out}) - return out - - -@contextlib.contextmanager -def scope_prog_guard(): - prog = fluid.Program() - startup_prog = fluid.Program() - scope = fluid.core.Scope() - with fluid.scope_guard(scope): - with fluid.program_guard(prog, startup_prog): - yield - - -def linear_fc(data, label, use_custom_relu): - hidden = fluid.layers.fc(data, size=128) - hidden = relu2(hidden) if use_custom_relu else fluid.layers.relu(hidden) - hidden = fluid.layers.fc(hidden, size=128) - hidden = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=hidden, label=label) - loss = fluid.layers.mean(loss) - return loss - - -def custom_op_test(use_gpu=True, use_custom_relu=True): - with scope_prog_guard(): - np.random.seed(0) - fluid.default_startup_program().random_seed = 10 - fluid.default_main_program().random_seed = 10 - - data = fluid.layers.data( - name='data', shape=[1, 28, 28], dtype='float32') - label = fluid.layers.data(name='label', shape=[1], dtype='int64') - loss = linear_fc(data, label, use_custom_relu) - - optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9) - optimizer.minimize(loss) - - place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - compile_program = fluid.compiler.CompiledProgram( - fluid.default_main_program()).with_data_parallel( - loss_name=loss.name) - - reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=32) - feeder = fluid.DataFeeder(feed_list=[data, label], place=place) - - num = 4 - for i, data in enumerate(reader()): - outs, = exe.run(compile_program, - feed=feeder.feed(data), - fetch_list=[loss]) - if i == num: - break - return outs - - -class CustomOpTest(unittest.TestCase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(2) - - def test_cpu(self): - actual = custom_op_test(False, True) - expect = custom_op_test(False, False) - self.assertEqual(actual.all(), expect.all()) - - def test_gpu(self): - if not fluid.core.is_compiled_with_cuda(): - return - actual = custom_op_test(True, True) - expect = custom_op_test(True, False) - self.assertEqual(actual.all(), expect.all()) - - -if __name__ == '__main__': - load_so(so_name='librelu2_op.so') - unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_jit_load.py b/python/paddle/fluid/tests/custom_op/test_jit_load.py deleted file mode 100644 index 4e6d74b7d60..00000000000 --- a/python/paddle/fluid/tests/custom_op/test_jit_load.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import os -import unittest -import paddle -import numpy as np -from paddle.utils.cpp_extension import load -from utils import paddle_includes, extra_cc_args, extra_nvcc_args -from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - -# switch to old custom op method -use_new_custom_op_load_method(False) - -# Compile and load custom op Just-In-Time. -custom_module = load( - name='custom_relu2', - sources=['relu_op.cc', 'relu_op.cu', 'relu_op3.cc', 'relu_op3.cu'], - extra_include_paths=paddle_includes, # add for Coverage CI - extra_cxx_cflags=extra_cc_args, # test for cc flags - extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags - verbose=True # add for unittest -) - - -class TestJITLoad(unittest.TestCase): - def test_api(self): - raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') - gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32') - x = paddle.to_tensor(raw_data, dtype='float32') - # use custom api - out = custom_module.relu2(x) - out3 = custom_module.relu3(x) - - self.assertTrue(np.array_equal(out.numpy(), gt_data)) - self.assertTrue(np.array_equal(out3.numpy(), gt_data)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_setup_build.py b/python/paddle/fluid/tests/custom_op/test_setup_build.py deleted file mode 100644 index 1ef14c2e3aa..00000000000 --- a/python/paddle/fluid/tests/custom_op/test_setup_build.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import unittest -import numpy as np -from test_custom_op import CustomOpTest, load_so -import paddle -from paddle.utils.cpp_extension.extension_utils import run_cmd -from paddle.fluid.layer_helper import LayerHelper -from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - -# switch to old custom op method -use_new_custom_op_load_method(False) - - -def compile_so(): - """ - Compile .so file by running setup.py config. - """ - # build .so with setup.py - file_dir = os.path.dirname(os.path.abspath(__file__)) - cmd = 'cd {} && python setup_build.py build'.format(file_dir) - run_cmd(cmd) - - -# `setup.py build` only produce .so file containing multi operators. -# Python Interface should be added manually. 
`relu2` api is in `test_custom_op.py` -def relu3(x, name=None): - helper = LayerHelper("relu3", **locals()) - out = helper.create_variable( - type=x.type, name=name, dtype=x.dtype, persistable=False) - helper.append_op(type="relu3", inputs={"X": x}, outputs={"Y": out}) - return out - - -class TestCompileMultiOp(unittest.TestCase): - def setUp(self): - paddle.disable_static() - - def test_relu3(self): - raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') - x = paddle.to_tensor(raw_data, dtype='float32') - # use custom api - out = relu3(x) - - self.assertTrue( - np.array_equal(out.numpy(), - np.array([[0, 1, 0], [1, 0, 0]]).astype('float32'))) - - def tearDown(self): - paddle.enable_static() - - -if __name__ == '__main__': - compile_so() - load_so(so_name='librelu2_op_from_setup.so') - unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_setup_install.py b/python/paddle/fluid/tests/custom_op/test_setup_install.py deleted file mode 100644 index 1fd7b8a06f9..00000000000 --- a/python/paddle/fluid/tests/custom_op/test_setup_install.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import sys -import site -import unittest -import paddle -import subprocess -import numpy as np -from paddle.utils.cpp_extension.extension_utils import run_cmd -from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - -# switch to old custom op method -use_new_custom_op_load_method(False) - - -class TestSetUpInstall(unittest.TestCase): - def setUp(self): - cur_dir = os.path.dirname(os.path.abspath(__file__)) - # compile, install the custom op egg into site-packages under background - cmd = 'cd {} && python setup_install.py install'.format(cur_dir) - run_cmd(cmd) - - # NOTE(Aurelius84): Normally, it's no need to add following codes for users. - # But we simulate to pip install in current process, so interpreter don't snap - # sys.path has been updated. So we update it manually. - - # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3 - site_dir = site.getsitepackages()[0] - custom_egg_path = [ - x for x in os.listdir(site_dir) if 'custom_relu2' in x - ] - assert len(custom_egg_path) == 1, "Matched egg number is %d." 
% len( - custom_egg_path) - sys.path.append(os.path.join(site_dir, custom_egg_path[0])) - - def test_api(self): - # usage: import the package directly - import custom_relu2 - - raw_data = np.array([[-1, 1, 0], [1, -1, -1]]).astype('float32') - gt_data = np.array([[0, 1, 0], [1, 0, 0]]).astype('float32') - x = paddle.to_tensor(raw_data, dtype='float32') - # use custom api - out = custom_relu2.relu2(x) - out3 = custom_relu2.relu3(x) - - self.assertTrue(np.array_equal(out.numpy(), gt_data)) - self.assertTrue(np.array_equal(out3.numpy(), gt_data)) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index c422bacdf78..662515f0e52 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -14,7 +14,6 @@ from . import optimizer from ..fluid.contrib import reader -from ..fluid import load_op_library from ..fluid.layer_helper import LayerHelper __all__ = [] diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 1db1b66426c..d32fa4c88c4 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -20,7 +20,6 @@ from .lazy_import import try_import from .op_version import OpLastCheckpointChecker from .install_check import run_check from ..fluid.framework import unique_name -from ..fluid.framework import load_op_library from ..fluid.framework import require_version from . import download @@ -30,4 +29,4 @@ from . import cpp_extension __all__ = ['dump_config', 'deprecated', 'download', 'run_check'] #TODO: define new api under this directory -__all__ += ['unique_name', 'load_op_library', 'require_version'] +__all__ += ['unique_name', 'require_version'] diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index ea4c85e20db..606f5465e1b 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -26,7 +26,7 @@ from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from -from .extension_utils import use_new_custom_op_load_method, clean_object_if_change_cflags +from .extension_utils import clean_object_if_change_cflags from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 7d6bcc4d564..65655eaf48e 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -28,7 +28,6 @@ import subprocess from contextlib import contextmanager from setuptools.command import bdist_egg -from .. import load_op_library from ...fluid import core from ...fluid.framework import OpProtoHolder from ...sysconfig import get_include, get_lib @@ -86,7 +85,6 @@ information !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
''' -USING_NEW_CUSTOM_OP_LOAD_METHOD = True DEFAULT_OP_ATTR_NAMES = [ core.op_proto_and_checker_maker.kOpRoleAttrName(), @@ -97,18 +95,6 @@ DEFAULT_OP_ATTR_NAMES = [ ] -# NOTE(chenweihang): In order to be compatible with -# the two custom op define method, after removing -# old method, we can remove them together -def use_new_custom_op_load_method(*args): - global USING_NEW_CUSTOM_OP_LOAD_METHOD - if len(args) == 0: - return USING_NEW_CUSTOM_OP_LOAD_METHOD - else: - assert len(args) == 1 and isinstance(args[0], bool) - USING_NEW_CUSTOM_OP_LOAD_METHOD = args[0] - - @contextmanager def bootstrap_context(): """ @@ -122,10 +108,7 @@ def bootstrap_context(): def load_op_meta_info_and_register_op(lib_filename): - if USING_NEW_CUSTOM_OP_LOAD_METHOD: - core.load_op_meta_info_and_register_op(lib_filename) - else: - core.load_op_library(lib_filename) + core.load_op_meta_info_and_register_op(lib_filename) return OpProtoHolder.instance().update_op_proto() @@ -406,10 +389,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): # append link flags extra_link_args = kwargs.get('extra_link_args', []) - if use_new_custom_op_load_method(): - extra_link_args.append('-lpaddle_custom_op') - else: - extra_link_args.append('-lpaddle_framework') + extra_link_args.append('-lpaddle_custom_op') if use_cuda: extra_link_args.append('-lcudart') @@ -811,9 +791,7 @@ def _write_setup_file(name, import os from paddle.utils.cpp_extension import CppExtension, CUDAExtension, BuildExtension, setup from paddle.utils.cpp_extension import get_build_directory - from paddle.utils.cpp_extension.extension_utils import use_new_custom_op_load_method - use_new_custom_op_load_method({use_new_method}) setup( name='{name}', @@ -841,8 +819,7 @@ def _write_setup_file(name, extra_cxx_cflags=list2str(extra_cxx_cflags), extra_cuda_cflags=list2str(extra_cuda_cflags), extra_link_args=list2str(link_args), - build_dir=build_dir, - use_new_method=use_new_custom_op_load_method()) + build_dir=build_dir) log_v('write setup.py into {}'.format(file_path), verbose) with open(file_path, 'w') as f: @@ -898,11 +875,7 @@ def parse_op_name_from(sources): """ def regex(content): - if USING_NEW_CUSTOM_OP_LOAD_METHOD: - pattern = re.compile(r'PD_BUILD_OP\(([^,\)]+)\)') - else: - pattern = re.compile(r'REGISTER_OPERATOR\(([^,]+),') - + pattern = re.compile(r'PD_BUILD_OP\(([^,\)]+)\)') content = re.sub(r'\s|\t|\n', '', content) op_name = pattern.findall(content) op_name = set([re.sub('_grad', '', name) for name in op_name]) diff --git a/python/setup.py.in b/python/setup.py.in index 69a8bc771ae..5876ac19d46 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -347,11 +347,6 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '': shutil.copy(xpu_rt_lib, libs_path) package_data['paddle.libs']+=['libxpurt.so'] -### Old custom op extension mechanism related, will be removed in 2.1.0 ### -# copy libpaddle_framework.so to libs on linux -if sys.platform.startswith('linux'): - shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path) - package_data['paddle.libs'] += ['libpaddle_framework.so'] ### New custom op extension mechanism related ### # copy libpaddle_custom_op.so to libs on linux @@ -405,25 +400,8 @@ def find_files(pattern, root): headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/framework')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/imperative')) + - list(find_files('*.h', 
'@PADDLE_SOURCE_DIR@/paddle/fluid/memory')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/platform')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/string')) + - list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/platform')) + - list(find_files('*.pb.h', '${PADDLE_BINARY_DIR}/paddle/fluid/framework')) + - list(find_files('*.pb', '${cudaerror_INCLUDE_DIR}')) + # errorMessage.pb for errormessage - ['${EIGEN_INCLUDE_DIR}/Eigen/Core'] + # eigen - list(find_files('*', '${EIGEN_INCLUDE_DIR}/Eigen/src')) + # eigen - list(find_files('*', '${EIGEN_INCLUDE_DIR}/unsupported/Eigen')) + # eigen - list(find_files('*', '${GFLAGS_INSTALL_DIR}/include')) + # gflags - list(find_files('*', '${GLOG_INSTALL_DIR}/include')) + # glog - list(find_files('*', '${BOOST_INCLUDE_DIR}/boost')) + # boost - list(find_files('*', '${XXHASH_INSTALL_DIR}/include')) + # xxhash - list(find_files('*', '${PROTOBUF_INCLUDE_DIR}')) + # protobuf - list(find_files('*', '${DLPACK_INCLUDE_DIR}')) + # dlpack - list(find_files('*.h', '${THREADPOOL_INCLUDE_DIR}'))) # threadpool + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension')) + # extension + list(find_files('*', '${BOOST_INCLUDE_DIR}/boost'))) # boost if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -463,17 +441,18 @@ class InstallHeaders(Command): ('install_headers', 'install_dir'), ('force', 'force')) - def copy_data_type_headers(self, header): - if os.name == 'nt': - data_type_headers = ['platform\\complex64.h', 'platform\\complex128.h', 'platform\\float16.h'] - else: - data_type_headers = ['platform/complex64.h', 'platform/complex128.h', 'platform/float16.h'] - for dtype_header in data_type_headers: - if dtype_header in header: - install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") - if not os.path.exists(install_dir): - self.mkpath(install_dir) - return self.copy_file(header, install_dir) + def copy_data_type_headers(self): + # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` + # to `extension/incude`, + data_type_headers = (['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex64.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex128.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) + + install_dir = os.path.join(self.install_dir, "paddle/fluid/extension/include") + if not os.path.exists(install_dir): + self.mkpath(install_dir) + for header in data_type_headers: + self.copy_file(header, install_dir) def mkdir_and_copy_file(self, header): if 'pb.h' in header: @@ -481,9 +460,6 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - # For paddle data type headers, we also need to copy to `extension/incude`, - # used for new custom operator - self.copy_data_type_headers(header) else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', header) @@ -509,6 +485,7 @@ class InstallHeaders(Command): for header in hdrs: (out, _) = self.mkdir_and_copy_file(header) self.outfiles.append(out) + self.copy_data_type_headers() def get_inputs(self): return self.distribution.headers or [] -- GitLab From e50bc2c2a6fcfec748d5bb991588a2fdc2ab4caf Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Tue, 30 Mar 2021 14:14:26 +0800 Subject: [PATCH 099/486] Enhance cmake to support specifying CUDA_ARCH_NAME to Ampere. 
(#31923) --- cmake/cuda.cmake | 4 +++- cmake/cudnn.cmake | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c4d1384312e..e6770da6763 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -74,7 +74,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "All" "Manual") set(archs_name_default "Auto") list(APPEND archs_names "Auto") @@ -108,6 +108,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "70") elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") set(cuda_arch_bin "75") + elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere") + set(cuda_arch_bin "80") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index d8d8f634e76..c82847100ab 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -94,7 +94,7 @@ macro(find_cudnn_version cudnn_header_file) "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") message(STATUS "Current cuDNN header is ${cudnn_header_file} " - "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}. ") + "Current cuDNN version is v${CUDNN_MAJOR_VERSION}.${CUDNN_MINOR_VERSION}.${CUDNN_PATCHLEVEL_VERSION}. ") endif() endif() endmacro() -- GitLab From e1f931610ef4cf400c48a2403d184931f3d5e0a6 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Tue, 30 Mar 2021 14:24:48 +0800 Subject: [PATCH 100/486] Fix save/load error in imperative qat UT. (#31937) --- .../contrib/slim/tests/test_imperative_qat.py | 26 ++++++++++++++---- .../test_imperative_qat_addquantdequant.py | 27 +++++++++++++++---- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 96b3b67103b..99a23525409 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -17,6 +17,8 @@ from __future__ import print_function import os import numpy as np import random +import shutil +import time import unittest import logging import paddle @@ -157,6 +159,20 @@ class TestImperativeQat(unittest.TestCase): QAT = quantization-aware training """ + @classmethod + def setUpClass(cls): + timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + cls.root_path = os.path.join(os.getcwd(), "imperative_qat_" + timestamp) + cls.save_path = os.path.join(cls.root_path, "lenet") + cls.dynamic_root_path = os.path.join(os.getcwd(), + "dynamic_mnist_" + timestamp) + cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.root_path) + shutil.rmtree(cls.dynamic_root_path) + def test_qat_save(self): imperative_qat = ImperativeQuantAware( weight_quantize_type='abs_max', @@ -206,6 +222,8 @@ class TestImperativeQat(unittest.TestCase): "Train | At epoch {} step {}: loss = {:}, acc= {:}". 
format(epoch, batch_id, avg_loss.numpy(), acc.numpy())) + if batch_id == 500: # For shortening CI time + break lenet.eval() for batch_id, data in enumerate(test_reader()): @@ -242,11 +260,9 @@ class TestImperativeQat(unittest.TestCase): before_save = lenet(test_img) # save inference quantized model - path = "./qat_infer_model/lenet" - save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - path=path, + path=TestImperativeQat.save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -259,7 +275,7 @@ class TestImperativeQat(unittest.TestCase): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=save_dir, + dirname=TestImperativeQat.root_path, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX) @@ -351,7 +367,7 @@ class TestImperativeQat(unittest.TestCase): paddle.jit.save( layer=lenet, - path="./dynamic_mnist/model", + path=TestImperativeQat.dynamic_save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py index d76e4825e0d..f5b3e89ef41 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_addquantdequant.py @@ -17,6 +17,8 @@ from __future__ import print_function import os import numpy as np import random +import shutil +import time import unittest import logging import paddle @@ -185,6 +187,21 @@ class ImperativeLenet(fluid.dygraph.Layer): class TestImperativeAddQuantDequant(unittest.TestCase): + @classmethod + def setUpClass(cls): + timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + cls.root_path = os.path.join(os.getcwd(), + "imperative_qat_aqd_" + timestamp) + cls.save_path = os.path.join(cls.root_path, "lenet") + cls.dynamic_root_path = os.path.join(os.getcwd(), + "dynamic_mnist_aqd_" + timestamp) + cls.dynamic_save_path = os.path.join(cls.dynamic_root_path, "model") + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.root_path) + shutil.rmtree(cls.dynamic_root_path) + def test_qat_save(self): imperative_qat = ImperativeQuantAware( @@ -228,6 +245,8 @@ class TestImperativeAddQuantDequant(unittest.TestCase): "Train | At epoch {} step {}: loss = {:}, acc= {:}". 
format(epoch, batch_id, avg_loss.numpy(), acc.numpy())) + if batch_id == 500: # For shortening CI time + break lenet.eval() for batch_id, data in enumerate(test_reader()): @@ -264,11 +283,9 @@ class TestImperativeAddQuantDequant(unittest.TestCase): before_save = lenet(test_img) # save inference quantized model - path = "./qat_infer_model/lenet" - save_dir = "./qat_infer_model" paddle.jit.save( layer=lenet, - path=path, + path=TestImperativeAddQuantDequant.save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') @@ -280,7 +297,7 @@ class TestImperativeAddQuantDequant(unittest.TestCase): exe = fluid.Executor(place) [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( - dirname=save_dir, + dirname=TestImperativeAddQuantDequant.root_path, executor=exe, model_filename="lenet" + INFER_MODEL_SUFFIX, params_filename="lenet" + INFER_PARAMS_SUFFIX) @@ -378,7 +395,7 @@ class TestImperativeAddQuantDequant(unittest.TestCase): lenet.eval() paddle.jit.save( layer=lenet, - path="./dynamic_mnist/model", + path=TestImperativeAddQuantDequant.dynamic_save_path, input_spec=[ paddle.static.InputSpec( shape=[None, 1, 28, 28], dtype='float32') -- GitLab From 245252b86e9878373754db8c66fad35b38cd8e1a Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 30 Mar 2021 15:57:36 +0800 Subject: [PATCH 101/486] fix bug when dtype of to_tensor is core.VarType (#31931) --- python/paddle/fluid/tests/unittests/test_var_base.py | 5 +++++ python/paddle/tensor/creation.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index b0c9dda7a30..1fea1935473 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -76,6 +76,11 @@ class TestVarBase(unittest.TestCase): y = x.cuda(blocking=True) self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + # support 'dtype' is core.VarType + x = paddle.rand((2, 2)) + y = paddle.to_tensor([2, 2], dtype=x.dtype) + self.assertEqual(y.dtype, core.VarDesc.VarType.FP32) + # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) self.assertTrue(np.array_equal(x.numpy(), [1 + 2j])) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 056a0226723..69ee2962303 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -168,7 +168,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): data = data.astype(default_type) if dtype and convert_dtype(dtype) != data.dtype: - data = data.astype(dtype) + data = data.astype(convert_dtype(dtype)) return paddle.Tensor( value=data, -- GitLab From 14b7e3cf06ec7fc667a21488129274f250bcd235 Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Tue, 30 Mar 2021 15:59:48 +0800 Subject: [PATCH 102/486] [Paddle-TRT] TRT inference support for BERT/Transformer in paddle 2.0 api (#31744) * support multihead_matmul_fuse_pass_v3 * fix compile problems * embedding_eltwise_ln pass support lookup_table_v2 * suppoort matmul and matmul_v2 in qkv matmul --- .../embedding_eltwise_layernorm_fuse_pass.cc | 19 +- .../framework/ir/graph_pattern_detector.cc | 30 ++ .../framework/ir/graph_pattern_detector.h | 5 + .../ir/multihead_matmul_fuse_pass.cc | 468 ++++++++++++++++++ .../framework/ir/multihead_matmul_fuse_pass.h | 66 ++- .../inference/api/paddle_pass_builder.cc | 5 +- 6 
files changed, 585 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 84c6b03e76b..44069f61d93 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -34,15 +34,19 @@ namespace patterns { static PDNode* create_emb_vars(PDPattern* pattern, const std::string& name, const std::string& arg, bool is_persist = false) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = - pattern->NewNode(name)->assert_is_op_input("lookup_table", arg); + pattern->NewNode(name)->assert_is_ops_input(embedding_ops, arg); if (is_persist) return node->assert_is_persistable_var(); return node; } static PDNode* create_emb_out_vars(PDPattern* pattern, const std::string& name, const std::string& arg) { + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; PDNode* node = pattern->NewNode(name) - ->assert_is_only_output_of_op("lookup_table") + ->assert_is_only_output_of_ops(embedding_ops) ->assert_is_op_input("elementwise_add", arg) ->AsIntermediate(); return node; @@ -56,10 +60,12 @@ void Embedding2Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); auto* lookup_table2_w = create_emb_vars(pattern, lookup_table2_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table2 = - pattern->NewNode(lookup_table2_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table2_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "X"); auto* lookup_table2_out = @@ -80,8 +86,10 @@ void Embedding1Eltwise1Pattern::operator()() { create_emb_vars(pattern, lookup_table1_x_repr(), "Ids"); auto* lookup_table1_w = create_emb_vars(pattern, lookup_table1_w_repr(), "W", true); + std::unordered_set embedding_ops{"lookup_table", + "lookup_table_v2"}; auto* lookup_table1 = - pattern->NewNode(lookup_table1_repr())->assert_is_op("lookup_table"); + pattern->NewNode(lookup_table1_repr())->assert_is_ops(embedding_ops); auto* lookup_table1_out = create_emb_out_vars(pattern, lookup_table1_out_repr(), "Y"); auto* eltwise_add = @@ -347,4 +355,5 @@ REGISTER_PASS_CAPABILITY(embedding_eltwise_layernorm_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("lookup_table", 0) + .LE("lookup_table_v2", 1) .EQ("elementweise_add", 0)); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index deb182c0fbe..d74e8e5f65c 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -652,6 +652,36 @@ PDNode *PDNode::assert_is_ops_input( return this; } +PDNode *PDNode::assert_is_only_input_of_ops( + const std::unordered_set &op_types) { + assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->outputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->inputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + +PDNode *PDNode::assert_is_only_output_of_ops( + const std::unordered_set &op_types) { + 
assert_is_var(); + asserts_.emplace_back([=](Node *x) { + for (auto *op : x->inputs) { + if (op && op->IsOp() && op->Op() && op_types.count(op->Op()->Type()) && + op->outputs.size() == 1) { + return true; + } + } + return false; + }); + return this; +} + bool VarLinksToOp(Node *node, const std::string &op_type) { for (auto *out : node->outputs) { if (out->IsOp() && out->Op()->Type() == op_type) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index b6c1074d90d..cfac01ec9de 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -145,6 +145,11 @@ struct PDNode { const std::unordered_set& op_types, const std::string& argument, int nth); + PDNode* assert_is_only_input_of_ops( + const std::unordered_set& op_types); + PDNode* assert_is_only_output_of_ops( + const std::unordered_set& op_types); + PDNode* assert_has_n_inputs(size_t n); PDNode* assert_has_n_outputs(size_t n); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index e20c0667ec3..e8f4dbd2954 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -682,6 +682,447 @@ static int BuildFusionV2(Graph* graph, const std::string& name_scope, return fusion_count; } +PDNode* MultiHeadMatmulV3Pattern::operator()() { + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("matmul"); + + // First path with scale + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul0_out_var = + pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + + decltype(mul0) eltadd0; + decltype(mul0) eltadd0_b_var; + decltype(mul0) eltadd0_out_var; + + mul0_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + eltadd0 = pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd0_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + + auto* reshape2_0_out_var = + pattern->NewNode(reshape2_0_out_repr())->assert_is_op_output("reshape2"); + reshape2_0_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); + + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + 
->assert_is_op_output("elementwise_add"); + eltadd_qk_out_var->AsIntermediate()->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = + pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); + + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2"); + reshape2_qkv_out_var->assert_is_op_input("matmul"); + + // Second path to matmul + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul1_out_var = + pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + + decltype(mul1) eltadd1; + decltype(mul1) eltadd1_b_var; + decltype(mul1) eltadd1_out_var; + + mul1_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd1 = pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd1_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + + auto* reshape2_1_out_var = + pattern->NewNode(reshape2_1_out_repr())->assert_is_op_output("reshape2"); + reshape2_1_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_1_out_var->AsIntermediate()->assert_is_op_input( + "matmul"); // link to matmul qk + + // Third path to matmul + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul", "Y"); + auto* mul2_out_var = + pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + + decltype(mul2) eltadd2; + decltype(mul2) eltadd2_b_var; + decltype(mul2) eltadd2_out_var; + + mul2_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + eltadd2 = pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add"); + eltadd2_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + + auto* reshape2_2_out_var = + 
pattern->NewNode(reshape2_2_out_repr())->assert_is_op_output("reshape2"); + reshape2_2_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv + + // Q path + mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); + eltadd0->LinksFrom({mul0_out_var, eltadd0_b_var}).LinksTo({eltadd0_out_var}); + + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + // K path + mul1->LinksFrom({input0, mul1_w_var}).LinksTo({mul1_out_var}); + eltadd1->LinksFrom({mul1_out_var, eltadd1_b_var}).LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + // compute q*k + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + // V path + mul2->LinksFrom({input0, mul2_w_var}).LinksTo({mul2_out_var}); + eltadd2->LinksFrom({mul2_out_var, eltadd2_b_var}).LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + // compute q*k*v + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + + return transpose2_2_out_var; +} + +static int BuildFusionV3(Graph* graph, const std::string& name_scope, + Scope* scope) { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
+ MultiHeadMatmulV3Pattern multihead_pattern(pattern, name_scope); + + multihead_pattern(); + // Create New OpDesc + auto fuse_creater = [&]( + Node* input0, Node* mul0, Node* mul1, Node* mul2, Node* mul0_out, + Node* mul1_out, Node* mul2_out, Node* mul0_w, Node* mul1_w, Node* mul2_w, + Node* eltadd0_b, Node* eltadd1_b, Node* eltadd2_b, Node* eltadd_qk_b, + Node* reshape2, Node* reshape2_qkv_out, Node* matmul_qk) { + auto scale_attr = BOOST_GET_CONST(float, matmul_qk->Op()->GetAttr("alpha")); + + // mul (B * S * Hidden) x (Hidden * 3 * N * H) = (B * S * 3 * N * H) + // bias (B * S * 3 * N * H) + bias (3 * N * H) + // Transpose (B * S * 3 * N * H) -> (3 * B * N * S * H) + auto* wq_tensor = scope->FindVar(mul0_w->Name())->GetMutable(); + auto* wk_tensor = scope->FindVar(mul1_w->Name())->GetMutable(); + auto* wv_tensor = scope->FindVar(mul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = + framework::make_ddim({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + auto combined_bias_dims = framework::make_ddim({3, bq_tensor->dims()[0]}); + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = mul0_w->Var(); + combined_w_desc->SetShape({wq_tensor->dims()[0], 3, wq_tensor->dims()[1]}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, bq_tensor->dims()[0]}); + combined_bias_desc->SetPersistable(true); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + int dims_h = combined_w_dims[0], dims_w = combined_w_dims[2]; + // Combine the three fc weights together. 
+ for (int i = 0; i < dims_h; i++) { + for (int j = 0; j < 3; j++) { + for (int k = 0; k < dims_w; k++) { + int out_index = i * (3 * dims_w) + j * dims_w + k; + int in_index = i * dims_w + k; + tmp_combined_w_data[out_index] = w_vec[j][in_index]; + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = + wq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_w_data, tmp_combined_w_data, + sizeof(float) * wq_tensor->numel()); + + scope->EraseVars({mul1_w->Name(), mul2_w->Name()}); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, + sizeof(float) * bias_size); + memcpy(tmp_combined_bias_data + 2 * bias_size, bv_data, + sizeof(float) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, tmp_combined_bias_data, + sizeof(float) * bq_tensor->numel()); + + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + auto reshape_desc = reshape2->Op(); + int head_number = + BOOST_GET_CONST(std::vector, reshape_desc->GetAttr("shape")).at(2); + + OpDesc multihead_op_desc; + multihead_op_desc.SetType("multihead_matmul"); + + multihead_op_desc.SetInput("Input", {input0->Name()}); + multihead_op_desc.SetInput("W", {mul0_w->Name()}); + multihead_op_desc.SetInput("Bias", {eltadd0_b->Name()}); + multihead_op_desc.SetInput("BiasQK", {eltadd_qk_b->Name()}); + + multihead_op_desc.SetOutput("Out", {reshape2_qkv_out->Name()}); + multihead_op_desc.SetAttr("alpha", scale_attr); + multihead_op_desc.SetAttr("head_number", head_number); + + auto* multihead = graph->CreateOpNode(&multihead_op_desc); + + IR_NODE_LINK_TO(input0, multihead); + IR_NODE_LINK_TO(mul0_w, multihead); + IR_NODE_LINK_TO(eltadd0_b, multihead); + IR_NODE_LINK_TO(eltadd_qk_b, multihead); + + IR_NODE_LINK_TO(multihead, reshape2_qkv_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + // GET_IR_NODE_FROM_SUBGRAPH(dropout_out, dropout_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul0, mul0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_out, mul0_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul0_w, mul0_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0, reshape2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, reshape2_0_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0, transpose2_0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, transpose2_0_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul1, mul1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_out, mul1_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul1_w, mul1_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1, reshape2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_1_out, reshape2_1_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1, transpose2_1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_1_out, transpose2_1_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(mul2, mul2, multihead_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(mul2_out, mul2_out, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(mul2_w, mul2_w, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2, reshape2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_2_out, reshape2_2_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2, transpose2_2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_2_out, transpose2_2_out, + multihead_pattern); + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH(eltadd0, eltadd0, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_b, eltadd0_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd0_out, eltadd0_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd1, eltadd1, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_b, eltadd1_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd1_out, eltadd1_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd2, eltadd2, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_b, eltadd2_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd2_out, eltadd2_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk, matmul_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qk_out, matmul_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk, eltadd_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_b, eltadd_qk_b, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_qk_out, eltadd_qk_out, multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk, softmax_qk, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, softmax_qk_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv, matmul_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, matmul_qkv_out, + multihead_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv, reshape2_qkv, multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, reshape2_qkv_out, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, transpose2_qkv, + multihead_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, transpose2_qkv_out, + multihead_pattern); + + // If weights or biases in qkv's fc are shared by multiple multihead_matmul + // patterns, we do not support this kind of fusion, this pass will not take + // effect. + bool is_fc_params_shared = + mul0_w->outputs.size() > 1 || mul1_w->outputs.size() > 1 || + mul2_w->outputs.size() > 1 || eltadd0_b->outputs.size() > 1 || + eltadd1_b->outputs.size() > 1 || eltadd2_b->outputs.size() > 1; + if (is_fc_params_shared) { + return; + } + fuse_creater(input0, mul0, mul1, mul2, mul0_out, mul1_out, mul2_out, mul0_w, + mul1_w, mul2_w, eltadd0_b, eltadd1_b, eltadd2_b, eltadd_qk_b, + reshape2_0, reshape2_qkv_out, matmul_qk); + + std::unordered_set marked_nodes({eltadd0, + eltadd1, + eltadd2, + eltadd1_b, + eltadd2_b, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + mul0, + mul1, + mul2, + mul0_out, + mul1_out, + mul2_out, + mul1_w, + mul2_w, + reshape2_qkv}); + // Remove unneeded nodes. 
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + } // namespace patterns void MultiHeadMatmulFusePass::ApplyImpl(Graph* graph) const { @@ -706,6 +1147,21 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { AddStatis(fusion_count); } +void MultiHeadMatmulV3FusePass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multiheadMatmul pass, The scope should not be null.")); + + int fusion_count = patterns::BuildFusionV3(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -715,6 +1171,8 @@ REGISTER_PASS(multihead_matmul_fuse_pass, REGISTER_PASS(multihead_matmul_fuse_pass_v2, paddle::framework::ir::MultiHeadMatmulV2FusePass); +REGISTER_PASS(multihead_matmul_fuse_pass_v3, + paddle::framework::ir::MultiHeadMatmulV3FusePass); REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() @@ -725,3 +1183,13 @@ REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v2) .EQ("scale", 0) .LE("matmul", 1) .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(multihead_matmul_fuse_pass_v3) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h index f5327dc7108..c7f1336211d 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.h @@ -89,9 +89,63 @@ struct MultiHeadMatmulPattern : public PatternBase { PATTERN_DECL_NODE(matmul_qkv); PATTERN_DECL_NODE(matmul_qkv_out); }; + +struct MultiHeadMatmulV3Pattern : public PatternBase { + MultiHeadMatmulV3Pattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul_v3") {} + + PDNode* operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(mul0); + PATTERN_DECL_NODE(mul1); + PATTERN_DECL_NODE(mul2); + PATTERN_DECL_NODE(mul0_w); + PATTERN_DECL_NODE(mul1_w); + PATTERN_DECL_NODE(mul2_w); + PATTERN_DECL_NODE(mul0_out); + PATTERN_DECL_NODE(mul1_out); + PATTERN_DECL_NODE(mul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + 
PATTERN_DECL_NODE(transpose2_2_out); + PATTERN_DECL_NODE(transpose2_qkv_out); + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); +}; + } // namespace patterns -// The MulGRUFusePass and MulGRUFusePass will fuse to the same FusionGRU op. class MultiHeadMatmulFusePass : public FusePassBase { public: virtual ~MultiHeadMatmulFusePass() {} @@ -112,6 +166,16 @@ class MultiHeadMatmulV2FusePass : public FusePassBase { const std::string name_scope_{"multihead_matmul_fuse_v2"}; }; +class MultiHeadMatmulV3FusePass : public FusePassBase { + public: + virtual ~MultiHeadMatmulV3FusePass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"multihead_matmul_fuse_v3"}; +}; + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 61fcdb7a908..1d77ddaf73e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -86,6 +86,7 @@ const std::vector kTRTSubgraphPasses({ "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // "multihead_matmul_fuse_pass_v2", // + "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "conv_bn_fuse_pass", // "unsqueeze2_eltwise_fuse_pass", // @@ -235,8 +236,8 @@ void CpuPassStrategy::EnableMKLDNN() { "reshape_transpose_matmul_mkldnn_fuse_pass", // "matmul_transpose_reshape_fuse_pass", // // Disabled due to topology-dependent speed-up - //"fc_mkldnn_pass", - //"fc_act_mkldnn_fuse_pass", + // "fc_mkldnn_pass", + // "fc_act_mkldnn_fuse_pass", "batch_norm_act_fuse_pass", // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 -- GitLab From 6dca7a1de70a85b16e2fa8d7f1affd5c632ca10c Mon Sep 17 00:00:00 2001 From: jakpiase <62569058+jakpiase@users.noreply.github.com> Date: Tue, 30 Mar 2021 11:04:07 +0200 Subject: [PATCH 103/486] Added int8 kernel for oneDNN LSTM op (#31894) --- .../fluid/operators/fused/fusion_lstm_op.cc | 12 ++ .../fused/mkldnn/fusion_lstm_mkldnn_op.cc | 19 ++- .../mkldnn/test_fusion_lstm_int8_mkldnn_op.py | 153 ++++++++++++++++++ tools/static_mode_white_list.py | 1 + 4 files changed, 178 insertions(+), 7 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 3c82be2c4e4..6cca6b5a972 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -249,6 +249,18 @@ void FusionLSTMOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("Scale_data", + "Scale to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Shift_data", + "Shift to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault(0.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." 
+ "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); AddAttr("force_fp32_output", "(bool, default false) Force INT8 kernel output FP32, only " "used in MKL-DNN INT8") diff --git a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc index cf39968a900..1adbd5cd9e7 100644 --- a/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc +++ b/paddle/fluid/operators/fused/mkldnn/fusion_lstm_mkldnn_op.cc @@ -79,13 +79,11 @@ class LSTMMKLDNNHandler MKLDNNMemoryFormat::ldgo); auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::tnc); + auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), MKLDNNMemoryFormat::ldnc); - auto c0_md = MKLDNNMemDesc( - {L, D, N, OC}, MKLDNNGetDataType(), // Vanilla LSTM and LSTM - // with peepoles has c0 as - // fp32 - MKLDNNMemoryFormat::ldnc); + auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); // Create LSTM oneDNN primitive const auto direction = @@ -266,7 +264,7 @@ class LSTMMKLDNNHandler this->fwd_pd_->src_iter_c_desc(), this->engine_); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - dnnl::reorder(user_c0_memory, *memory_p, this->attr_) + dnnl::reorder(user_c0_memory, *memory_p) .execute(astream, user_c0_memory, *memory_p); this->dev_ctx_.SetBlob(c0_key, memory_p); @@ -360,6 +358,12 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { weight_h_memory_p = handler.template AcquireWeightHMemory( weight_h); + } else { + h0_memory_p = handler.template AcquireH0Memory(h0); + weight_x_memory_p = + handler.template AcquireWeightXMemory(weight_x); + weight_h_memory_p = + handler.template AcquireWeightHMemory(weight_h); } auto bias_memory_p = handler.AcquireBiasMemory(bias); @@ -406,4 +410,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace, ops::FusionLSTMMKLDNNKernel, - ops::FusionLSTMMKLDNNKernel); + ops::FusionLSTMMKLDNNKernel, + ops::FusionLSTMMKLDNNKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py new file mode 100644 index 00000000000..93dc45f2650 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_lstm_int8_mkldnn_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_lstm_op import fc, ACTIVATION, fusion_lstm + + +class TestFusionLSTMINT8MKLDNNOp(OpTest): + def set_confs(self): + pass + + def setUp(self): + self.op_type = "fusion_lstm" + self.lod = [[2, 3, 5, 4]] + self.IC = 3 + self.OC = 5 + self.is_reverse = False + self.has_initial_state = False + self.act_cell = 'tanh' + self.act_gate = 'sigmoid' + self.act_cand = 'tanh' + self.use_peepholes = False # LSTM u8 doesn't support peepholes + self.use_mkldnn = True + self.force_fp32_output = False + self.error_margin = 1e-5 + self.set_confs() + + # RNN dimensions + T = sum(self.lod[0]) + N = len(self.lod[0]) + + # Input data + x_f32 = np.random.rand(T, self.IC).astype('float32') * 2 - 1 + scale_data = 63.0 + shift_data = 64.0 + x_u8 = np.rint(x_f32 * scale_data + shift_data).astype(np.uint8) + + # WeightX/WeightH data + wx = np.random.rand(self.IC, 4 * self.OC).astype('float32') * 2 - 1 + wh = np.random.rand(self.OC, 4 * self.OC).astype('float32') * 2 - 1 + + # Calculating weight scales + # scales = 127 / max(abs(channel_wise(weightsX + weightsH))) + s8_max = 127.0 + + scale_weights = s8_max / np.max( + np.abs(np.concatenate( + [wx[:, :], wh[:, :]], axis=0)), axis=0) + + scale_weights = scale_weights.astype('float') + + if self.use_peepholes: + b = np.random.rand(1, 7 * self.OC).astype('float32') + else: + b = np.random.rand(1, 4 * self.OC).astype('float32') + w_b = np.copy(b[:, 0:4 * self.OC]) + w_c = b[:, 4 * self.OC:] if self.use_peepholes else None + + bx = np.random.normal(size=(1, 4 * self.OC)).astype('float32') + b[0, 0:4 * self.OC] += bx[0, :] + + if self.has_initial_state: + h0 = np.random.rand(N, self.OC).astype('float32') + c0 = np.random.rand(N, self.OC).astype('float32') + else: + h0 = np.zeros((N, self.OC)).astype('float32') + c0 = np.zeros((N, self.OC)).astype('float32') + + hidden_f32, c = fusion_lstm( + x_f32, self.lod, wx, bx, h0, c0, wh, w_b, w_c, self.is_reverse, + ACTIVATION[self.act_gate], ACTIVATION[self.act_cell], + ACTIVATION[self.act_cand]) + + self.inputs = { + 'X': (x_u8, self.lod), + 'WeightX': wx, + 'WeightH': wh, + 'Bias': b + } + + if self.has_initial_state: + self.inputs['H0'] = h0 + self.inputs['C0'] = c0 + + if self.force_fp32_output: + self.error_margin = 1e-1 + self.outputs = { + 'Hidden': (hidden_f32, self.lod), + 'Cell': (c, self.lod) + } + else: + self.error_margin = 2 + hidden_u8 = np.rint(hidden_f32 * scale_data + shift_data).astype( + np.uint8) + self.outputs = { + 'Hidden': (hidden_u8, self.lod), + 'Cell': (c, self.lod) + } + + self.attrs = { + 'gate_activation': self.act_gate, + 'cell_activation': self.act_cell, + 'candidate_activation': self.act_cand, + 'is_reverse': self.is_reverse, + 'use_peepholes': self.use_peepholes, + 'use_mkldnn': self.use_mkldnn, + 'force_fp32_output': self.force_fp32_output, + 'Scale_data': scale_data, + 'Shift_data': shift_data, + 'Scale_weights': scale_weights + } + + def test_check_output(self): + for use_seq in {True, False}: + self.attrs['use_seq'] = use_seq + self.check_output( + check_dygraph=False, + no_check_set=["Cell"], + atol=self.error_margin) + + +class TestFusionLSTMINT8MKLDNNOp2(TestFusionLSTMINT8MKLDNNOp): + def set_confs(self): + self.force_fp32_output = True + + +class TestFusionLSTMINT8MKLDNNOp4(TestFusionLSTMINT8MKLDNNOp): + def set_confs(self): + self.is_reverse = True + + +class TestFusionLSTMINT8MKLDNNOp5(TestFusionLSTMINT8MKLDNNOp): + def set_confs(self): + 
self.has_initial_state = True + + +if __name__ == "__main__": + from paddle import enable_static + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 6453eb48d70..ab5b6516b90 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -606,6 +606,7 @@ STATIC_MODE_TESTING_LIST = [ 'test_fusion_gru_bf16_mkldnn_op', 'test_fusion_gru_mkldnn_op', 'test_fusion_lstm_mkldnn_op', + 'test_fusion_lstm_int8_mkldnn_op', 'test_fusion_lstm_bf16_mkldnn_op', 'test_gaussian_random_mkldnn_op', 'test_lrn_mkldnn_op', -- GitLab From a37a7f67e17e072fe36dbe444a3e7fb36474e610 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Tue, 30 Mar 2021 19:41:23 +0800 Subject: [PATCH 104/486] modify CI recommend information (#31395) --- paddle/scripts/paddle_build.sh | 4 ++-- tools/test_op_benchmark.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7a360ac2296..7f184f18986 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1810,11 +1810,11 @@ function collect_ccache_hits() { function test_op_benchmark() { # The PR will pass quickly when get approval from specific person. - # Xreki 12538138, luotao1 6836917, GaoWei8 53294385 + # Xreki 12538138, luotao1 6836917, Avin0323 16167147 set +x approval_line=$(curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000) if [ "${approval_line}" != "" ]; then - APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 53294385 12538138 6836917) + APPROVALS=$(echo ${approval_line} | python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 16167147 12538138 6836917) echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "TRUE" ]; then echo "===================================" diff --git a/tools/test_op_benchmark.sh b/tools/test_op_benchmark.sh index 95e9164bd1b..4f7288eb125 100644 --- a/tools/test_op_benchmark.sh +++ b/tools/test_op_benchmark.sh @@ -263,7 +263,7 @@ function summary_problems { done if [ $exit_code -ne 0 ]; then LOG "[INFO] See https://github.com/PaddlePaddle/Paddle/wiki/PR-CI-OP-benchmark-Manual for details." - LOG "[INFO] Or you can apply for one RD (GaoWei8(Recommend), Xreki, luotao1) approval to pass this PR." + LOG "[INFO] Or you can apply for one RD (Avin0323(Recommend), Xreki, luotao1) approval to pass this PR." 
exit $exit_code fi } -- GitLab From 98e803e04f7057bd6bd1a6d3816b80054a7e354b Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Tue, 30 Mar 2021 20:20:48 +0800 Subject: [PATCH 105/486] map_matmul_to_mul_pass support 3dim (#31958) --- paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index a2443c86986..c36123f65f6 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -57,7 +57,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && x_rank == 2 && y_rank == 2; + flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; std::vector& next_ops = matmul_out->outputs; flag = flag && next_ops.size() == 1 && @@ -69,7 +69,7 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetInput("X", {matmul_in_x->Name()}); desc.SetInput("Y", {matmul_in_y->Name()}); desc.SetOutput("Out", {matmul_out->Name()}); - desc.SetAttr("x_num_col_dims", 1); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); desc.SetAttr("y_num_col_dims", 1); if (matmul_op->Op()->HasAttr("enable_int8")) { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); -- GitLab From 0fa6c8a35c61fdcff79d42ec509ff683e8ad9f78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Tue, 30 Mar 2021 20:35:44 +0800 Subject: [PATCH 106/486] fix a syntax error, test=develop (#31930) --- paddle/fluid/inference/tests/api/lite_mul_model_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 6d4bb70df6f..9211ea246a5 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -75,14 +75,15 @@ int test_predictor_zero_copy(const AnalysisConfig& config_in, } std::vector input({1}); - auto in_tensor{predictor->GetInputTensor(predictor->GetInputNames().front())}; + auto in_tensor = + predictor->GetInputTensor(predictor->GetInputNames().front()); in_tensor->Reshape({1, 1}); in_tensor->copy_from_cpu(input.data()); predictor->ZeroCopyRun(); - auto out_tensor{ - predictor->GetOutputTensor(predictor->GetOutputNames().front())}; + auto out_tensor = + predictor->GetOutputTensor(predictor->GetOutputNames().front()); std::vector data_o(10); out_tensor->copy_to_cpu(data_o.data()); -- GitLab From 57d4288ad4c45ca83e25f900f3aacd90626d3202 Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Tue, 30 Mar 2021 21:01:20 +0800 Subject: [PATCH 107/486] [dynamic setitem] Fix bug of dynamic setitem: Decerease axes to do right broadcast (#31960) --- paddle/fluid/pybind/imperative.cc | 14 ++++++---- .../tests/unittests/test_set_value_op.py | 28 ++++++++++++++++--- 2 files changed, 32 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 58ef1778630..eed3b3b7691 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -611,15 +611,17 @@ void BindImperative(py::module *m_ptr) { // TODO(liym27): Try not to call TensorToPyArray because it always // copys 
data to cpu place, which reduces performance. if (parse_index && value_is_tensor) { - std::vector axes, starts, ends, steps, decrease_axis, + std::vector axes, starts, ends, steps, decrease_axes, infer_flags; ParseIndexingSlice(self_tensor, index_ptr, &axes, &starts, &ends, - &steps, &decrease_axis, &infer_flags); + &steps, &decrease_axes, &infer_flags); - framework::AttributeMap attrs = {{"axes", axes}, - {"starts", starts}, - {"ends", ends}, - {"steps", steps}}; + framework::AttributeMap attrs = { + {"axes", axes}, + {"starts", starts}, + {"ends", ends}, + {"steps", steps}, + {"decrease_axes", decrease_axes}}; imperative::NameVarBaseMap ins = {{"Input", {self}}}; imperative::NameVarBaseMap outs = {{"Out", {self}}}; diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 808d77d4761..0885891cdbe 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -48,18 +48,37 @@ class TestSetValueBase(unittest.TestCase): class TestSetValueApi(TestSetValueBase): - def test_api(self): + def _run_static(self): + paddle.enable_static() with paddle.static.program_guard(self.program): x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) exe = paddle.static.Executor(paddle.CPUPlace()) out = exe.run(self.program, fetch_list=[x]) + paddle.disable_static() + return out + + def _run_dynamic(self): + paddle.disable_static() + x = paddle.ones(shape=self.shape, dtype=self.dtype) + self._call_setitem(x) + out = x.numpy() + paddle.enable_static() + return out + + def test_api(self): + static_out = self._run_static() + dynamic_out = self._run_dynamic() self._get_answer() + + error_msg = "\nIn {} mode: \nExpected res = \n{}, \n\nbut received : \n{}" self.assertTrue( - (self.data == out).all(), - msg="\nExpected res = \n{}, \n\nbut received : \n{}".format( - self.data, out)) + (self.data == static_out).all(), + msg=error_msg.format("static", self.data, static_out)) + self.assertTrue( + (self.data == dynamic_out).all(), + msg=error_msg.format("dynamic", self.data, dynamic_out)) # 1. 
Test different type of item: int, Python slice, Paddle Tensor @@ -748,6 +767,7 @@ class TestError(TestSetValueBase): exe.run(program) def test_error(self): + paddle.enable_static() with paddle.static.program_guard(self.program): self._value_type_error() self._dtype_error() -- GitLab From 95f808c878eb464651c3ccad5c69bebf9c223ed3 Mon Sep 17 00:00:00 2001 From: Jiawei Wang Date: Tue, 30 Mar 2021 21:20:52 +0800 Subject: [PATCH 108/486] fix stack op grad nullptr (#31962) --- paddle/fluid/operators/stack_op.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index 38ab60afd91..03d53245289 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -30,7 +30,7 @@ struct StackGradFunctor { int i = idx / (n_ * post_); int which_x = idx / post_ - i * n_; int x_index = i * post_ + idx % post_; - dx_[which_x][x_index] = dy_[idx]; + if (dx_[which_x] != nullptr) dx_[which_x][x_index] = dy_[idx]; } private: @@ -95,19 +95,21 @@ class StackGradKernel : public framework::OpKernel { auto dx = ctx.MultiOutput(framework::GradVarName("X")); int axis = ctx.Attr("axis"); if (axis < 0) axis += dy->dims().size(); - int n = dy->dims()[axis]; std::vector dx_datas(n); // NOLINT + for (int i = 0; i < n; i++) { - dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); + if (dx[i] == nullptr) { + dx_datas[i] = nullptr; + } else { + dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); + } } auto dy_data = dy->data(); - int pre = 1; for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; int total_num = dy->numel(); int post = total_num / (n * pre); - auto &dev_ctx = ctx.template device_context(); auto dx_data_arr = dx_datas.data(); StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); -- GitLab From ef8323d49eb0f98c8fc282207728ef543d3f94d8 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Wed, 31 Mar 2021 10:17:25 +0800 Subject: [PATCH 109/486] [ROCM] Add ROCm support for warpctc op (#31817) * bugfix for warpctc * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix warpctc commit id * fix WARPCTC_WITH_HIP invalid * Add logs to find out why can not dlopen libwarpctc.so * fix warpctc commit id * fix unit test test_warpctc_op * Optime failed log for dlopen * Optime failed log for dlopen * Delete extra changes * fix warpctc commit id * fix warpctc commit id * Add is_compiled_with_rocm for test_warpctc_op * fix warpctc commit id * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * Cancel optimize dlopen failed reason, move to next pr, due to it makes windows ci failed * fix code style problems --- cmake/external/warpctc.cmake | 7 ++++- paddle/fluid/operators/warpctc_op.h | 3 +- .../fluid/tests/unittests/test_warpctc_op.py | 29 ++++++++++++++++--- 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index b0ef575f643..ac28f7561f6 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,11 +14,15 @@ INCLUDE(ExternalProject) +IF(WITH_ROCM) + add_definitions(-DWARPCTC_WITH_HIP) +ENDIF() + SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) 
set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) -set(WARPCTC_TAG cd828e5b6c3b953b82af73f7f44cddc393a20efa) +set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE) @@ -57,6 +61,7 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS_DEBUG=$ -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} -DWITH_OMP=${USE_OMP} -DWITH_TORCH=OFF -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 7451cac63d0..e90eefd72d4 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -159,8 +159,7 @@ class WarpCTCFunctor { warpctc_version_ = platform::dynload::get_warpctc_version(); if (platform::is_gpu_place(ctx.GetPlace())) { -// HIP not support ctcOptions in third-party warpctc -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) options_.loc = CTC_GPU; options_.stream = reinterpret_cast( ctx.device_context()) diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 6310a76d8d0..53f3b3cf53d 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -20,6 +20,7 @@ import numpy as np from op_test import OpTest from test_softmax_op import stable_softmax import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid import Program, program_guard import paddle import paddle.nn.functional as F @@ -240,8 +241,18 @@ class TestWarpCTCOp(OpTest): def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient - self.check_grad( - ["Logits"], "Loss", max_relative_error=0.007, check_dygraph=False) + if core.is_compiled_with_rocm(): + self.check_grad( + ["Logits"], + "Loss", + max_relative_error=0.009, + check_dygraph=False) + else: + self.check_grad( + ["Logits"], + "Loss", + max_relative_error=0.007, + check_dygraph=False) class TestWarpCTCOpCase1(TestWarpCTCOp): @@ -335,8 +346,18 @@ class TestWarpCTCOpWithPadding(OpTest): def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient - self.check_grad( - ["Logits"], "Loss", max_relative_error=0.007, check_dygraph=False) + if core.is_compiled_with_rocm(): + self.check_grad( + ["Logits"], + "Loss", + max_relative_error=0.009, + check_dygraph=False) + else: + self.check_grad( + ["Logits"], + "Loss", + max_relative_error=0.007, + check_dygraph=False) class TestWarpCTCOpWithPaddingCase1(TestWarpCTCOpWithPadding): -- GitLab From 5394194e3ab5eb851fea5e5d50a4e49a1d596e8b Mon Sep 17 00:00:00 2001 From: Wenyu Date: Wed, 31 Mar 2021 10:40:51 +0800 Subject: [PATCH 110/486] support minus-int idx to LayerList (#31750) * support minus-int idx to LayerList * update layerlist test --- python/paddle/fluid/dygraph/container.py | 22 +++++++++++++++++-- .../test_imperative_container_layerlist.py | 12 ++++++++++ 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/container.py b/python/paddle/fluid/dygraph/container.py index dd04b107204..e80bc1245f9 100644 --- a/python/paddle/fluid/dygraph/container.py +++ b/python/paddle/fluid/dygraph/container.py @@ -213,13 +213,25 @@ class LayerList(Layer): for idx, layer in enumerate(sublayers): self.add_sublayer(str(idx), layer) + def _get_abs_idx(self, idx): + if isinstance(idx, int): + if not (-len(self) <= idx < len(self)): + raise IndexError( + 
'index {} is out of range, should be an integer in range [{}, {})'. + format(idx, -len(self), len(self))) + if idx < 0: + idx += len(self) + return idx + def __getitem__(self, idx): if isinstance(idx, slice): return self.__class__(list(self._sub_layers.values())[idx]) else: + idx = self._get_abs_idx(idx) return self._sub_layers[str(idx)] def __setitem__(self, idx, sublayer): + idx = self._get_abs_idx(idx) return setattr(self, str(idx), sublayer) def __delitem__(self, idx): @@ -227,6 +239,7 @@ class LayerList(Layer): for k in range(len(self._sub_layers))[idx]: delattr(self, str(k)) else: + idx = self._get_abs_idx(idx) delattr(self, str(idx)) str_indices = [str(i) for i in range(len(self._sub_layers))] self._sub_layers = OrderedDict( @@ -275,10 +288,15 @@ class LayerList(Layer): another = paddle.nn.Linear(10, 10) linears.insert(3, another) print(linears[3] is another) # True + another = paddle.nn.Linear(10, 10) + linears.insert(-1, another) + print(linears[-2] is another) # True """ assert isinstance(index, int) and \ - 0 <= index < len(self._sub_layers), \ - "index should be an integer in range [0, len(self))" + -len(self._sub_layers) <= index < len(self._sub_layers), \ + "index should be an integer in range [{}, {})".format(-len(self), len(self)) + + index = self._get_abs_idx(index) for i in range(len(self._sub_layers), index, -1): self._sub_layers[str(i)] = self._sub_layers[str(i - 1)] self._sub_layers[str(index)] = sublayer diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py index ef90dd04986..2e722b69c3e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerlist.py @@ -84,6 +84,18 @@ class TestImperativeContainer(unittest.TestCase): self.assertListEqual(res8.shape, [5, 3**3]) res8.backward() + model4 = MyLayer(layerlist[:3]) + model4.layerlist[-1] = fluid.dygraph.Linear(4, 5) + res9 = model4(x) + self.assertListEqual(res9.shape, [5, 5]) + del model4.layerlist[-1] + res10 = model4(x) + self.assertListEqual(res10.shape, [5, 4]) + model4.layerlist.insert(-1, fluid.dygraph.Linear(2, 2)) + res11 = model4(x) + self.assertListEqual(res11.shape, [5, 4]) + res11.backward() + def test_layer_list(self): self.layer_list(True) self.layer_list(False) -- GitLab From 52b05baca349d1bbfcbb6ed78b289d6c66dbec3e Mon Sep 17 00:00:00 2001 From: taixiurong Date: Wed, 31 Mar 2021 10:57:46 +0800 Subject: [PATCH 111/486] fix some bug in transformer training in xpu (#31918) --- cmake/external/xpu.cmake | 2 +- paddle/fluid/memory/memcpy.cc | 6 +- paddle/fluid/operators/cast_op_xpu.cc | 40 +++- paddle/fluid/operators/matmul_op_xpu.cc | 77 +++++-- paddle/fluid/operators/matmul_v2_op_xpu.cc | 62 ++++-- .../fluid/operators/optimizers/adam_op_xpu.cc | 22 +- paddle/fluid/operators/reshape_op.cc | 28 +-- .../softmax_with_cross_entropy_op_xpu.cc | 18 +- .../fluid/tests/unittests/test_matmul_op.py | 36 +++ .../tests/unittests/xpu/test_cast_op_xpu.py | 8 +- .../tests/unittests/xpu/test_matmul_op_xpu.py | 58 +++-- .../unittests/xpu/test_matmul_v2_op_xpu.py | 205 +++++++++--------- 12 files changed, 354 insertions(+), 208 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index b5a3f015474..16c69a7b503 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL 
"https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_02_27.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 7f871fab5a1..6f252e1bd0d 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -40,7 +40,7 @@ void Copy(platform::XPUPlace dst_place, platform::CPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -86,7 +86,7 @@ void Copy(platform::CPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")"; return; } int dev_id = -1; @@ -132,7 +132,7 @@ void Copy(platform::XPUPlace dst_place, platform::XPUPlace src_place, const void* src, size_t num) { if (num <= 0) { - VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; + VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")"; return; } int dev_id = -1; diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index bbd43274a00..ca15858cf67 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,8 +23,22 @@ limitations under the License. */ namespace paddle { namespace operators { +template +class XPUFPTypeTrait { + public: + using Type = T; +}; + +template <> +class XPUFPTypeTrait { + public: + using Type = float16; +}; + template class CastXPUKernel : public framework::OpKernel { + using XPUInTDType = typename XPUFPTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); @@ -34,27 +48,39 @@ class CastXPUKernel : public framework::OpKernel { auto out_type = static_cast( context.Attr("out_dtype")); auto* in_data = in->data(); + + // using XPUOutTDType = typename XPUFPTypeTrait::Type; auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; if (out_type == framework::proto::VarType::FP32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT32) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if (out_type == framework::proto::VarType::INT64) { auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data, - numel); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out_data, numel); } else if ((out_type == framework::proto::VarType::BOOL) && (in_type == framework::proto::VarType::FP32)) { auto* out_data = out->mutable_data(context.GetPlace()); r = xpu::cast_v2( dev_ctx.x_context(), (const float*)in_data, reinterpret_cast(out_data), 
numel); + } else if (out_type == framework::proto::VarType::FP16) { + auto* out_data = + out->mutable_data(context.GetPlace()); + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast(out_data), numel); + } else { PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", in_type, out_type)); @@ -75,5 +101,7 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( cast, ops::CastXPUKernel, ops::CastXPUKernel, + ops::CastXPUKernel, ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/matmul_op_xpu.cc b/paddle/fluid/operators/matmul_op_xpu.cc index f92cff2f6cd..6fa96aca4be 100644 --- a/paddle/fluid/operators/matmul_op_xpu.cc +++ b/paddle/fluid/operators/matmul_op_xpu.cc @@ -23,7 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { - using framework::Tensor; static framework::DDim RowMatrixFromVector(const framework::DDim &x_dim) { @@ -123,34 +122,47 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, mat_dim_b.height_ = mat_dim_b.height_ / mat_dim_b.batch_size_; } } - PADDLE_ENFORCE_EQ( - mat_dim_a.width_, mat_dim_b.height_, - platform::errors::InvalidArgument("Shape mistake in matmul_op, the " - "first tensor width must be same as " - "second tensor height, but received " - "width:%d, height:%d", - mat_dim_a.width_, mat_dim_b.height_)); + + if (mat_dim_a.width_ == mat_dim_b.height_) { + if (mat_dim_a.batch_size_ == 0 && mat_dim_b.batch_size_ == 1) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + if (mat_dim_a.batch_size_ == 1 && mat_dim_b.batch_size_ == 0) { + mat_dim_a.batch_size_ = mat_dim_b.batch_size_ = 0; + } + } + + PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_, + platform::errors::InvalidArgument( + "Shape mistake in matmul_op, the " + "first tensor width must be same as " + "second tensor height, but received " + "width:%d, height:%d x_dims = %s , y_dims = %s", + mat_dim_a.width_, mat_dim_b.height_, + x_dims.to_str().c_str(), y_dims.to_str().c_str())); PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, platform::errors::InvalidArgument( "Shape mistake in matmul_op, the two input" "tensor batch_size must be same, but received first " "tensor batch_size:%d, second " - "tensor batch_size:%d", - mat_dim_a.batch_size_, mat_dim_b.batch_size_)); + "tensor batch_size:%d, x_dims = %s , y_dims = %s", + mat_dim_a.batch_size_, mat_dim_b.batch_size_, + x_dims.to_str().c_str(), y_dims.to_str().c_str())); - T alpha = static_cast(ctx.Attr("alpha")); + float alpha = static_cast(ctx.Attr("alpha")); - float *data_c = out->data(); + T *data_c = out->data(); int m = mat_dim_a.height_; int n = mat_dim_b.width_; int k = mat_dim_a.width_; + int batch_size = mat_dim_a.batch_size_; + int ldx = mat_dim_a.trans_ ? m : k; int ldy = mat_dim_b.trans_ ? 
k : n; int ldout = n; - int batch_size = mat_dim_a.batch_size_; - - if (batch_size == 0) { - int r = xpu::fc_fusion( + if (batch_size <= 1) { + int r = 0; + r = xpu::fc_fusion( dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr, ldx, ldy, ldout, alpha, 0, nullptr, xpu::Activation_t::LINEAR); @@ -159,14 +171,32 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, "XPU fc_fusion kernel return wrong value[%d %s]", r, XPUAPIErrorMsg[r])); } else { - int r = xpu::fc_batched( - dev_ctx.x_context(), batch_size, mat_dim_a.trans_, mat_dim_b.trans_, m, - n, k, alpha, x->data(), mat_dim_a.stride_, y->data(), - mat_dim_b.stride_, 0.0, data_c, m * n, nullptr, nullptr); + // batch matmul + int r = xpu::fc_batched( + dev_ctx.x_context(), // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + alpha, // float alpha, + reinterpret_cast(x->data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y->data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( - "XPU fc_batched kernel return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); + "XPU fc_batched kernel return wrong value[%d %s] " + "x_dims = %s , y_dims = %s", + r, XPUAPIErrorMsg[r], x_dims.to_str().c_str(), + y_dims.to_str().c_str())); } } @@ -206,9 +236,8 @@ static framework::Tensor XPUFoldHeadAndLastDims( static_cast(in_dims[1]), static_cast(in_dims[2])}; std::vector axis_host = {1, 0, 2}; - int r = xpu::transpose(context.x_context(), input.data(), output.data(), - in_shape_host.data(), axis_host.data(), /*ndims=*/3); + in_shape_host, axis_host); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU transpose kernel return wrong value[%d %s]", r, diff --git a/paddle/fluid/operators/matmul_v2_op_xpu.cc b/paddle/fluid/operators/matmul_v2_op_xpu.cc index dbb1d7bfb0a..d992ef847db 100644 --- a/paddle/fluid/operators/matmul_v2_op_xpu.cc +++ b/paddle/fluid/operators/matmul_v2_op_xpu.cc @@ -57,32 +57,55 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, PADDLE_ENFORCE_EQ(mat_dim_a.width_, mat_dim_b.height_, platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s", - x_dims.to_str(), y_dims.to_str())); + "Shape mistake in matmul_v2_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), y_dims.to_str(), mat_dim_a.trans_, + mat_dim_b.trans_)); PADDLE_ENFORCE_EQ(mat_dim_a.batch_size_, mat_dim_b.batch_size_, platform::errors::InvalidArgument( - "Shape mistake in matmul_v2_op xdims = %s ydims = %s", - x_dims.to_str(), y_dims.to_str())); + "Shape mistake in matmul_v2_op xdims = %s ydims = %s " + "x_trans = %d y_trans = %d", + x_dims.to_str(), y_dims.to_str(), mat_dim_a.trans_, + mat_dim_b.trans_)); - float* data_c = out->data(); + T* data_c = out->data(); int m = mat_dim_a.height_; int n = mat_dim_b.width_; int k = mat_dim_a.width_; int batch_size = mat_dim_a.batch_size_; - - if (batch_size == 0) { - int r = xpu::fc( - dev_ctx.x_context(), x->data(), y->data(), data_c, m, n, k, - mat_dim_a.trans_, mat_dim_b.trans_, nullptr, nullptr, nullptr); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - 
"XPU fc_fusion kernel return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); + if (batch_size <= 1) { + int r = 0; + r = xpu::fc(dev_ctx.x_context(), x->data(), y->data(), + data_c, m, n, k, mat_dim_a.trans_, + mat_dim_b.trans_, nullptr, nullptr, nullptr); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External( + "XPU fc_fusion kernel return wrong value[%d %s] , m = %d, n = " + "%d, " + "k = %d, a_tr = %d, b_tr = %d", + r, XPUAPIErrorMsg[r], m, n, k, mat_dim_a.trans_, mat_dim_b.trans_)); } else { - int r = xpu::fc_batched( - dev_ctx.x_context(), batch_size, mat_dim_a.trans_, mat_dim_b.trans_, m, - n, k, 1.0, x->data(), mat_dim_a.stride_, y->data(), - mat_dim_b.stride_, 0.0, data_c, m * n, nullptr, nullptr); + // batch matmul + int r = xpu::fc_batched( + dev_ctx.x_context(), // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + 1.0, // float alpha, + reinterpret_cast(x->data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y->data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU fc_batched kernel return wrong value[%d %s]", r, @@ -125,7 +148,7 @@ static framework::Tensor XPUFoldHeadAndLastDims( std::vector axis_host = {1, 0, 2}; int r = xpu::transpose(context.x_context(), input.data(), output.data(), - in_shape_host.data(), axis_host.data(), /*ndims=*/3); + in_shape_host, axis_host); PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( "XPU transpose kernel return wrong value[%d %s]", r, @@ -189,6 +212,7 @@ class MatMulV2XPUGradKernel : public framework::OpKernel { auto* dx = context.Output(framework::GradVarName("X")); auto* dy = context.Output(framework::GradVarName("Y")); ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, transpose_x, transpose_y); + framework::DDim dx_dims; if (dx) { dx_dims = dx->dims(); diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 1740f2982b6..3baba424e8f 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -121,19 +121,25 @@ class AdamOpXPUKernel : public framework::OpKernel { } else { T cpu_beta1_pow_out_data; T cpu_beta2_pow_out_data; - xpu_memcpy(&cpu_beta1_pow_out_data, beta1_pow_ptr, sizeof(T), - XPU_DEVICE_TO_HOST); + memory::Copy(platform::CPUPlace(), &cpu_beta1_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta1_pow.place()), + beta1_pow_ptr, sizeof(T)); + cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; - xpu_memcpy(&cpu_beta2_pow_out_data, beta2_pow_ptr, sizeof(T), - XPU_DEVICE_TO_HOST); + memory::Copy(platform::CPUPlace(), &cpu_beta2_pow_out_data, + BOOST_GET_CONST(platform::XPUPlace, beta2_pow.place()), + beta2_pow_ptr, sizeof(T)); + cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); - xpu_memcpy(beta1_pow_out_p, &cpu_beta1_pow_out_data, sizeof(T), - XPU_HOST_TO_DEVICE); - xpu_memcpy(beta2_pow_out_p, &cpu_beta2_pow_out_data, sizeof(T), - XPU_HOST_TO_DEVICE); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta1_pow_out_p, platform::CPUPlace(), + 
&cpu_beta1_pow_out_data, sizeof(T)); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), + beta2_pow_out_p, platform::CPUPlace(), + &cpu_beta2_pow_out_data, sizeof(T)); } PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 94efa70e467..e119a21caa2 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -377,31 +377,9 @@ class ReshapeKernel { out->Resize(out_dims); out->mutable_data(ctx.GetPlace(), in->type()); - -#ifdef PADDLE_WITH_XPU - if (platform::is_xpu_place(ctx.GetPlace())) { - void *out_ptr = out->data(); - const void *in_ptr = in->data(); - if ((out_ptr != nullptr) && (in_ptr != nullptr) && - (paddle::framework::SizeOfType(in->type()) > 0)) { - auto &dev_ctx = - ctx.template device_context(); - int r = xpu::memcpy_device( - dev_ctx.x_context(), out_ptr, in_ptr, - in->numel() * paddle::framework::SizeOfType(in->type())); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External( - "XPU memcpy_device return wrong value[%d %s]", r, - XPUAPIErrorMsg[r])); - } - } else { -#endif - framework::TensorCopy( - *in, ctx.GetPlace(), - ctx.template device_context(), out); -#ifdef PADDLE_WITH_XPU - } -#endif + framework::TensorCopy( + *in, ctx.GetPlace(), + ctx.template device_context(), out); out->Resize(out_dims); } }; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index 346ed965d06..8635def2ecf 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -45,11 +45,25 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { const int n = SizeToAxis(axis, logits->dims()); const int d = SizeFromAxis(axis, logits->dims()); std::vector logits_dims = framework::vectorize(logits->dims()); + // softmax auto& dev_ctx = context.template device_context(); - int r = xpu::softmax(dev_ctx.x_context(), logits->data(), - softmax->data(), logits_dims, axis); + int r = XPU_SUCCESS; + Tensor clip_logits; + int len = logits->numel(); + T* clip_logits_data = + clip_logits.mutable_data(context.GetPlace(), len * sizeof(T)); + r = xpu::clip(dev_ctx.x_context(), logits->data(), clip_logits_data, + len, -1e30, 1e30); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("XPU kernel error. 
clip " + "execution not succeed, error code=%d", + r)); + + r = xpu::softmax(dev_ctx.x_context(), clip_logits_data, + softmax->data(), logits_dims, axis); PADDLE_ENFORCE_EQ( r, xpu::Error_t::SUCCESS, diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index 2d5f098a7fe..b936567d5b5 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -206,6 +206,42 @@ for dim_X in (1, 2, 3): api_test(dim_X, dim_Y, transose_x, transose_y) +# Test case more batch_size and N, M, K +def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y, + batch_size): + BATCH_SIZE = 2 + M = 3 + N = 4 + K = 5 + if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y): + K = 1 + if dim_X == 1: + if transpose_X: + shape_X = [M] + else: + shape_X = [K] + if dim_Y == 1: + if transpose_Y: + shape_Y = [N] + else: + shape_Y = [K] + if dim_X >= 2: + if transpose_X: + shape_X = [K, M] + else: + shape_X = [M, K] + if dim_X == 3: + shape_X = [BATCH_SIZE] + shape_X + if dim_Y >= 2: + if transpose_Y: + shape_Y = [N, K] + else: + shape_Y = [K, N] + if dim_Y == 3: + shape_Y = [BATCH_SIZE] + shape_Y + return shape_X, shape_Y + + # Test case n-dim def generate_compatible_shapes(dim, transpose_X, transpose_Y): M = 2 diff --git a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py index cb64cb90e8c..f1ba8828f2b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_cast_op_xpu.py @@ -51,10 +51,10 @@ class TestCastOp2(op_test.OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} - self.outputs = {'Out': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float16')} self.attrs = { 'in_dtype': int(core.VarDesc.VarType.FP32), - 'out_dtype': int(core.VarDesc.VarType.FP32) + 'out_dtype': int(core.VarDesc.VarType.FP16) } self.op_type = 'cast' @@ -68,10 +68,10 @@ class TestCastOp2(op_test.OpTest): class TestCastOp3(op_test.OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) - self.inputs = {'X': ipt.astype('float32')} + self.inputs = {'X': ipt.astype('float16')} self.outputs = {'Out': ipt.astype('float32')} self.attrs = { - 'in_dtype': int(core.VarDesc.VarType.FP32), + 'in_dtype': int(core.VarDesc.VarType.FP16), 'out_dtype': int(core.VarDesc.VarType.FP32) } self.op_type = 'cast' diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py index fa0feb02f43..54dc46cd0ec 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_op_xpu.py @@ -27,8 +27,12 @@ from paddle.fluid import Program, program_guard paddle.enable_static() -def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): +def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y, + batch_size): BATCH_SIZE = 2 + if batch_size != None: + BATCH_SIZE = batch_size + M = 3 N = 4 K = 5 @@ -58,6 +62,13 @@ def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y): shape_Y = [K, N] if dim_Y == 3: shape_Y = [BATCH_SIZE] + shape_Y + + if dim_Y == 3 and dim_X == 2: + if transpose_X == False: + shape_X[1] = shape_X[1] * BATCH_SIZE + else: + shape_X[0] = shape_X[0] * BATCH_SIZE + return shape_X, shape_Y @@ -77,11 +88,19 @@ def 
reference_matmul(X, Y, transpose_X=False, transpose_Y=False): if transpose_Y: if Y.ndim == 1: Y = Y.reshape((1, Y.size)) + elif Y.ndim == 2: + Y = Y.T else: dim = [i for i in range(len(Y.shape))] dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] Y = np.transpose(Y, tuple(dim)) + if X.ndim == 3 and Y.ndim == 2: + x_dims = X.shape + X = X.reshape((x_dims[0] * x_dims[1], x_dims[2])) + if Y.ndim == 3 and X.ndim == 2: + y_dims = Y.shape + Y = Y.reshape((y_dims[0] * y_dims[1], y_dims[2])) Out = np.matmul(X, Y) if not Out.shape: # We do not support 0-dimensional Tensors (scalars). So where @@ -203,11 +222,11 @@ def test_negative_dims_program(obj): # Generate program api cases for all negative possibilities -def api_test(dim_x, dim_y, trans_x, trans_y): +def api_test(dim_x, dim_y, trans_x, trans_y, batch_size): test_name = ('TestMatMulAPI_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( dim_x, dim_y, trans_x, trans_y)) shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, - trans_y) + trans_y, batch_size) globals()[test_name] = type(test_name, (unittest.TestCase, ), { 'shape_X': shape_x, 'shape_Y': shape_y, @@ -218,29 +237,35 @@ def api_test(dim_x, dim_y, trans_x, trans_y): # Generate operators cases for all possibilities -def inject_test(dim_x, dim_y, trans_x, trans_y): - test_name = ('TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format( - dim_x, dim_y, trans_x, trans_y)) +def inject_test(dim_x, dim_y, trans_x, trans_y, batch_size): + test_name = ( + 'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}_batch_{}'.format( + dim_x, dim_y, trans_x, trans_y, batch)) shape_x, shape_y = generate_compatible_shapes(dim_x, dim_y, trans_x, - trans_y) + trans_y, batch_size) globals()[test_name] = type(test_name, (Generator, XPUOpTest), { 'shape_X': shape_x, 'shape_Y': shape_y, 'transpose_X': trans_x, 'transpose_Y': trans_y, + 'op_type': "matmul" }) -for dim_X in (1, 2, 3): - for dim_Y in (1, 2, 3): - transose_x = False - transose_y = False - if dim_X == 3 and dim_Y == 3: - inject_test(dim_X, dim_Y, transose_x, transose_y) - api_test(dim_X, dim_Y, transose_x, transose_y) +xpu_support_dims_list = [[1, 1], [2, 2], [3, 3]] +batch_size = [2, 4, 5, 10, 50, 100, 300] +for dims in xpu_support_dims_list: + dim_X = dims[0] + dim_Y = dims[1] + for transose_x in (False, True): + for transose_y in (False, True): + for batch in batch_size: + inject_test(dim_X, dim_Y, transose_x, transose_y, batch) + # xpu not support all negative possibilities + # api_test(dim_X, dim_Y, False, False, 10) -# Test case n-dim + # Test case n-dim def generate_compatible_shapes(dim, transpose_X, transpose_Y): M = 2 N = 4 @@ -261,7 +286,7 @@ def generate_compatible_shapes(dim, transpose_X, transpose_Y): return shape_X, shape_Y -# # Test case n-dim +# Test case n-dim for dim in [4]: for transpose_X in [False, True]: for transpose_Y in [False, True]: @@ -275,6 +300,7 @@ for dim in [4]: 'shape_Y': shape_Y, 'transpose_X': transpose_X, 'transpose_Y': transpose_Y, + 'op_type': "matmul" }) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 531e9488d60..435026220c2 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -45,7 +45,6 @@ def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): dim = [i for i in range(len(Y.shape))] dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] Y = np.transpose(Y, 
tuple(dim)) - Out = np.matmul(X, Y) if not Out.shape: # We do not support 0-dimensional Tensors (scalars). So where @@ -98,16 +97,16 @@ class TestMatMulV2Op(XPUOpTest): self.check_grad_with_place(place, ['X', 'Y'], 'Out') -# class TestMatMuklOp2(TestMatMulV2Op): -# """ -# case 2 -# """ +class TestMatMuklOp2(TestMatMulV2Op): + """ + case 2 + """ -# def config(self): -# self.x_shape = (100, ) -# self.y_shape = (1, 3, 2, 100) -# self.trans_x = False -# self.trans_y = True + def config(self): + self.x_shape = (100) + self.y_shape = (100, 3) + self.trans_x = False + self.trans_y = False class TestMatMuklOp3(TestMatMulV2Op): @@ -122,16 +121,16 @@ class TestMatMuklOp3(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOp4(TestMatMulV2Op): -# """ -# case 4 -# """ +class TestMatMuklOp4(TestMatMulV2Op): + """ + case 4 + """ -# def config(self): -# self.x_shape = (100, ) -# self.y_shape = (1, 2, 100, 2) -# self.trans_x = False -# self.trans_y = False + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (1, 100) + self.trans_x = False + self.trans_y = False class TestMatMuklOp5(TestMatMulV2Op): @@ -146,27 +145,28 @@ class TestMatMuklOp5(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOp6(TestMatMulV2Op): -# """ -# case 6 -# """ +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ -# def config(self): -# self.x_shape = (1, 2, 102, 1) -# self.y_shape = (102, ) -# self.trans_x = True -# self.trans_y = False + def config(self): + self.x_shape = (1, 2, 102, 10) + self.y_shape = (2, 10, 111) + self.trans_x = False + self.trans_y = False -# class TestMatMuklOp7(TestMatMulV2Op): -# """ -# case 7 -# """ -# def config(self): -# self.x_shape = (1, 2, 1, 100) -# self.y_shape = (100, ) -# self.trans_x = False -# self.trans_y = False +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 100, 1) + self.y_shape = (2, 100, 12) + self.trans_x = True + self.trans_y = False class TestMatMuklOp8(TestMatMulV2Op): @@ -181,49 +181,52 @@ class TestMatMuklOp8(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOp9(TestMatMulV2Op): -# """ -# case 9 -# """ +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ -# def config(self): -# self.x_shape = (1, 1, 1, 100) -# self.y_shape = (2, 1, 2, 100) -# self.trans_x = False -# self.trans_y = True + def config(self): + self.x_shape = (100, 20, 100) + self.y_shape = (100, 100, 100) + self.trans_x = False + self.trans_y = True -# class TestMatMuklOp10(TestMatMulV2Op): -# """ -# case 10 -# """ -# def config(self): -# self.x_shape = (1, 1, 25, 4) -# self.y_shape = (1, 2, 4, 25) -# self.trans_x = False -# self.trans_y = False +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ -# class TestMatMuklOp11(TestMatMulV2Op): -# """ -# case 11 -# """ + def config(self): + self.x_shape = (100, 20, 100) + self.y_shape = (100, 20, 100) + self.trans_x = True + self.trans_y = False -# def config(self): -# self.x_shape = (2, 1, 2, 100) -# self.y_shape = (1, 1, 100, 2) -# self.trans_x = False -# self.trans_y = False -# class TestMatMuklOp12(TestMatMulV2Op): -# """ -# case 12 -# """ +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ -# def config(self): -# self.x_shape = (2, 1, 4, 25) -# self.y_shape = (1, 1, 4, 25) -# self.trans_x = True -# self.trans_y = False + def config(self): + self.x_shape = (2, 20, 100) + self.y_shape = (100, 30) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape 
= (1, 20, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False class TestMatMuklOp13(TestMatMulV2Op): @@ -238,38 +241,40 @@ class TestMatMuklOp13(TestMatMulV2Op): self.trans_y = False -# class TestMatMuklOp14(TestMatMulV2Op): -# """ -# case 14_1 -# """ +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ -# def config(self): -# self.x_shape = (3, 1, 6, 6) -# self.y_shape = (1, 2, 6, 9) -# self.trans_x = True -# self.trans_y = False + def config(self): + self.x_shape = (100, 2, 100, 10) + self.y_shape = (100, 2, 10, 90) + self.trans_x = False + self.trans_y = False -# class TestMatMuklOp15(TestMatMulV2Op): -# """ -# case 14_2 -# """ -# def config(self): -# self.x_shape = (3, 1, 6, 6) -# self.y_shape = (1, 2, 6, 9) -# self.trans_x = False -# self.trans_y = False +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ -# class TestMatMuklOp16(TestMatMulV2Op): -# """ -# case 16 : to check the gradient for special case -# """ + def config(self): + self.x_shape = (100, 2, 100, 10) + self.y_shape = (100, 2, 100, 10) + self.trans_x = False + self.trans_y = True -# def config(self): -# self.x_shape = (100) -# self.y_shape = (1, 2, 2, 100, 2) -# self.trans_x = False -# self.trans_y = False + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the big data + """ + + def config(self): + self.x_shape = (1000, 2, 100, 100) + self.y_shape = (1000, 2, 100, 900) + self.trans_x = False + self.trans_y = False class TestMatMuklOp17(TestMatMulV2Op): -- GitLab From 3a95a0bc261200f1823b8f568009d5670ce44933 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 31 Mar 2021 11:04:33 +0800 Subject: [PATCH 112/486] update cmake minimum version to 3.15 (#31807) * update cmake minimum version to 3.15, test=develop * fix compilation error on Windows, test=develop * fix compilation error on Windows, test=develop * fix compilation error on Windows, test=develop --- CMakeLists.txt | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 765d8fc1578..2d2f613eff5 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.15) +cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) @@ -38,11 +39,6 @@ endif() if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -# cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them. -if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15)) - message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. " - "You can use cmake 3.16 (recommended), 3.10, 3.11, 3.15 or 3.17. 
Please refer to the install document: https://cmake.org/install/") -endif() if(WITH_GPU AND NOT APPLE) enable_language(CUDA) -- GitLab From 393b3bd6b7adadedc21d801c68c5bd002047fdc3 Mon Sep 17 00:00:00 2001 From: Thunderbrook <52529258+Thunderbrook@users.noreply.github.com> Date: Wed, 31 Mar 2021 11:14:06 +0800 Subject: [PATCH 113/486] fix split core (#31892) * fix split core * format --- .../fluid/operators/math/concat_and_split.cu | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index a29997e5654..d62c1e42d3b 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -114,8 +114,8 @@ __global__ void ConcatKernel(const T** inputs_data, const int in_num, } template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int* out_cols, +__global__ void SplitKernel(const T* input_data, const int64_t in_row, + const int64_t in_col, const int64_t* out_cols, int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int curr_segment = 0; @@ -159,15 +159,15 @@ __device__ void SplitKernelDetail(const T* input_data, const int in_row, } template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, +__global__ void SplitKernel(const T* input_data, const int64_t in_row, + const int64_t in_col, const int64_t fixed_out_col, T** outputs_data) { SplitKernelDetail(input_data, in_row, in_col, fixed_out_col, outputs_data); } template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, +__global__ void SplitKernel(const T* input_data, const int64_t in_row, + const int64_t in_col, const int64_t fixed_out_col, T* outputs_addr0, T* outputs_addr1) { T* outputs_data[2]; outputs_data[0] = outputs_addr0; @@ -176,8 +176,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row, } template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, +__global__ void SplitKernel(const T* input_data, const int64_t in_row, + const int64_t in_col, const int64_t fixed_out_col, T* outputs_addr0, T* outputs_addr1, T* outputs_addr2) { T* outputs_data[3]; @@ -188,8 +188,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row, } template -__global__ void SplitKernel(const T* input_data, const int in_row, - const int in_col, const int fixed_out_col, +__global__ void SplitKernel(const T* input_data, const int64_t in_row, + const int64_t in_col, const int64_t fixed_out_col, T* outputs_addr0, T* outputs_addr1, T* outputs_addr2, T* outputs_addr3) { T* outputs_data[4]; @@ -201,8 +201,8 @@ __global__ void SplitKernel(const T* input_data, const int in_row, } static inline void GetBlockDims(const platform::CUDADeviceContext& context, - int num_rows, int num_cols, dim3* block_dims, - dim3* grid_dims) { + int64_t num_rows, int64_t num_cols, + dim3* block_dims, dim3* grid_dims) { // Set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; @@ -213,12 +213,12 @@ static inline void GetBlockDims(const platform::CUDADeviceContext& context, *block_dims = dim3(block_cols, block_rows, 1); int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + int64_t 
max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = std::min((num_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(num_rows / block_rows, 1)); + int grid_rows = std::min(max_blocks / grid_cols, + std::max(num_rows / block_rows, (int64_t)1)); *grid_dims = dim3(grid_cols, grid_rows, 1); } @@ -319,22 +319,22 @@ class SplitFunctor { int axis, std::vector* outputs) { // TODO(zcd): Add input data validity checking int o_num = outputs->size(); - int out_row = 1; + int64_t out_row = 1; auto dim_0 = ref_inputs[0]->dims(); for (int i = 0; i < axis; ++i) { out_row *= dim_0[i]; } - int out0_col = ref_inputs[0]->numel() / out_row; - int in_col = 0, in_row = out_row; + int64_t out0_col = ref_inputs[0]->numel() / out_row; + int64_t in_col = 0, in_row = out_row; bool has_same_shape = true; std::vector outputs_data(o_num); - std::vector outputs_cols(o_num + 1); + std::vector outputs_cols(o_num + 1); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { - int t_col = ref_inputs.at(i)->numel() / out_row; + int64_t t_col = ref_inputs.at(i)->numel() / out_row; if (has_same_shape) { if (t_col != out0_col) has_same_shape = false; } @@ -384,13 +384,13 @@ class SplitFunctor { auto tmp_dev_ins_col_data = memory::Alloc(context, - outputs_cols.size() * sizeof(int)); + outputs_cols.size() * sizeof(int64_t)); memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()), tmp_dev_ins_col_data->ptr(), platform::CPUPlace(), reinterpret_cast(outputs_cols.data()), - outputs_cols.size() * sizeof(int), context.stream()); - int* dev_outs_col_data = - reinterpret_cast(tmp_dev_ins_col_data->ptr()); + outputs_cols.size() * sizeof(int64_t), context.stream()); + int64_t* dev_outs_col_data = + reinterpret_cast(tmp_dev_ins_col_data->ptr()); SplitKernel<<>>( input.data(), in_row, in_col, dev_outs_col_data, -- GitLab From b09c1ce09af6d600cbf0f279a0b182a2c29f048d Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 31 Mar 2021 11:22:31 +0800 Subject: [PATCH 114/486] fix whl package push pypi (#31585) * fix whl package push pypi * add rst --- python/paddle/{README.md => README.rst} | 0 python/setup.py.in | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename python/paddle/{README.md => README.rst} (100%) diff --git a/python/paddle/README.md b/python/paddle/README.rst similarity index 100% rename from python/paddle/README.md rename to python/paddle/README.rst diff --git a/python/setup.py.in b/python/setup.py.in index 5876ac19d46..73c773bab49 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -511,10 +511,10 @@ else: # Log for PYPI if sys.version_info > (3,0): - with open("@PADDLE_BINARY_DIR@/python/paddle/README.md", "r", encoding='UTF-8') as f: + with open("@PADDLE_BINARY_DIR@/python/paddle/README.rst", "r", encoding='UTF-8') as f: long_description = f.read() else: - with open("@PADDLE_BINARY_DIR@/python/paddle/README.md", "r")as f: + with open("@PADDLE_BINARY_DIR@/python/paddle/README.rst", "r")as f: long_description = unicode(f.read(), 'UTF-8') with redirect_stdout(): -- GitLab From 587d99ae443c684faa25d1fd261eb81d37cb32e4 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 31 Mar 2021 11:53:54 +0800 Subject: [PATCH 115/486] update compilation with C++14 (#31815) * update compilation with C++14, test=develop * fix compilation error in eigen, test=develop --- cmake/cuda.cmake | 7 ++----- cmake/flags.cmake | 22 +++++++--------------- paddle/fluid/operators/jit/benchmark.cc | 5 ++++- 
paddle/fluid/operators/jit/test.cc | 5 ++++- 4 files changed, 17 insertions(+), 22 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index e6770da6763..05b55952074 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -208,14 +208,11 @@ select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${NVCC_FLAGS_EXTRA}") message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}") -# Set C++11 support +# Set C++14 support set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -if (NOT WIN32) # windows msvc2015 support c++11 natively. - # -std=c++11 -fPIC not recoginize by msvc, -Xcompiler will be added by cmake. - set(CMAKE_CUDA_STANDARD 11) -endif(NOT WIN32) +set(CMAKE_CUDA_STANDARD 14) # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w # So replace /W[1-4] with /W0 diff --git a/cmake/flags.cmake b/cmake/flags.cmake index e110524dd1a..a2ddad557c2 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -4,10 +4,10 @@ include(CheckCCompilerFlag) include(CheckCXXSymbolExists) include(CheckTypeSize) -function(CheckCompilerCXX11Flag) +function(CheckCompilerCXX14Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") + if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.4) + message(FATAL_ERROR "Unsupported GCC version. GCC >= 5.4 required.") elseif(${CMAKE_CXX_COMPILER_VERSION} VERSION_GREATER 8.2) message(WARNING "Found GCC ${CMAKE_CXX_COMPILER_VERSION} which is too high, recommended to use GCC 8.2") endif() @@ -20,23 +20,15 @@ function(CheckCompilerCXX11Flag) message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") endif() else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") + if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.4) + message(FATAL_ERROR "Unsupported Clang version. 
Clang >= 3.4 required.") endif() endif() endif() endfunction() -CheckCompilerCXX11Flag() -if (WITH_GPU) - if (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - endif() -else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -endif() +CheckCompilerCXX14Flag() +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14") # safe_set_flag # # Set a compile flag only if compiler is support diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 419c4d44b6d..a8e441a9671 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -330,7 +330,10 @@ void BenchKernelSgd() { for (int i = 0; i < n; ++i) { all.push_back(i); } - std::random_shuffle(all.begin(), all.end()); + std::random_device rnd; + int64_t seed_tmp = rnd(); + std::default_random_engine rng(seed_tmp); + std::shuffle(all.begin(), all.end(), rng); out.insert(out.begin(), all.begin(), all.begin() + n); return out; }; diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index cfddbf213ef..ff68565637c 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -861,7 +861,10 @@ void TestKernelSgd() { for (int i = 0; i < n; ++i) { all.push_back(i); } - std::random_shuffle(all.begin(), all.end()); + std::random_device rnd; + int64_t seed_tmp = rnd(); + std::default_random_engine rng(seed_tmp); + std::shuffle(all.begin(), all.end(), rng); out.insert(out.begin(), all.begin(), all.begin() + n); return out; }; -- GitLab From 495e7f9c848bb6d36b2ba64bf84fdebf5da3f71b Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Wed, 31 Mar 2021 14:32:45 +0800 Subject: [PATCH 116/486] Update eigen version to f612df27 (#31832) * update eigen version to f612df27, test=develop * fix compilation error, test=develop * remove patch command in eigen, test=develop * fix compilation error caused by call Eigen function with float16 and bfloat16, test=develop * fix unittest error, test=develop * fix unittest error caused by precision, test=develop * remove patch files used by old version eigen, test=develop --- cmake/external/eigen.cmake | 48 +- paddle/fluid/operators/activation_op.h | 4 +- paddle/fluid/platform/eigen_ext.h | 97 +- patches/eigen/BinaryFunctors.h | 509 ----- patches/eigen/Geometry_SSE.h | 189 -- patches/eigen/Half.h | 733 ------- patches/eigen/MathFunctions.h | 1938 ----------------- patches/eigen/Meta.h | 722 ------ patches/eigen/Tensor | 156 -- patches/eigen/TensorBlock.h | 1559 ------------- .../tests/unittests/test_activation_op.py | 6 +- 11 files changed, 73 insertions(+), 5888 deletions(-) delete mode 100644 patches/eigen/BinaryFunctors.h delete mode 100644 patches/eigen/Geometry_SSE.h delete mode 100644 patches/eigen/Half.h delete mode 100644 patches/eigen/MathFunctions.h delete mode 100755 patches/eigen/Meta.h delete mode 100644 patches/eigen/Tensor delete mode 100644 patches/eigen/TensorBlock.h diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5a755a816c3..f68db1eab3d 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -14,11 +14,11 @@ include(ExternalProject) -# update eigen to the commit id 4da2c6b1 on 03/19/2020 +# update eigen to the commit id f612df27 on 03/16/2021 set(EIGEN_PREFIX_DIR ${THIRD_PARTY_PATH}/eigen3) set(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3/src/extern_eigen3) set(EIGEN_REPOSITORY 
https://gitlab.com/libeigen/eigen.git) -set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) +set(EIGEN_TAG f612df273689a19d25b45ca4f8269463207c4fee) cache_third_party(extern_eigen3 REPOSITORY ${EIGEN_REPOSITORY} @@ -27,48 +27,6 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Half.h native_src) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/arch/CUDA/Half.h native_dst) - # For Windows - # which will cause a compilation error in Tensor:74: - # "can not open file 'unistd.h'" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Tensor native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/Tensor native_dst2) - # For VS2015 - # which will cause a compilation error in TensorBlock.h:1028: - # "syntax error" - # so use following patch to solve compilation error On Windows. - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorBlock.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h native_dst3) - set(EIGEN_PATCH_COMMAND copy ${native_src} ${native_dst} /Y && copy ${native_src2} ${native_dst2} /Y && copy ${native_src3} ${native_dst3} /Y) -elseif(LINUX) - # For gxx=4.8, __GXX_ABI_VERSION is less than 1004 - # which will cause a compilation error in Geometry_SSE.h:38: - # "no matching function for call to 'pmul(Eigen::internal::Packet4f&, __m128)" - # refer to: https://gitlab.com/libeigen/eigen/-/blob/4da2c6b1974827b1999bab652a3d4703e1992d26/Eigen/src/Core/arch/SSE/PacketMath.h#L33-60 - # add -fabi-version=4 could avoid above error, but will cause "double free corruption" when compile with gcc8 - # so use following patch to solve compilation error with different version of gcc. 
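The jit benchmark and unit-test hunks in the C++14 patch above replace std::random_shuffle, which is deprecated since C++14, with an explicitly seeded std::default_random_engine plus std::shuffle when building the random row ids for the SGD kernel. A rough Python analogue of that helper, for illustration only (the function name is invented here and the seed handling is simplified):

import random

def sample_row_ids(num_rows, count, seed=0):
    # build [0, num_rows), shuffle with an explicit seeded engine
    # (the C++14-safe equivalent of the removed std::random_shuffle),
    # then keep the first `count` ids so they are unique
    ids = list(range(num_rows))
    random.Random(seed).shuffle(ids)
    return ids[:count]

print(sample_row_ids(10, 4, seed=42))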
- file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Geometry_SSE.h native_src1) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Geometry/arch/Geometry_SSE.h native_dst1) - # The compiler fully support const expressions since c++14, - # but Eigen use some const expressions such as std::max and std::min, which are not supported in c++11 - # add patch to avoid compilation error in c++11 - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/MathFunctions.h native_src2) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/MathFunctions.h native_dst2) - if(WITH_ROCM) - # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src3) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst3) - # For HIPCC Eigen::internal::scalar_sum_op is not EIGEN_DEVICE_FUNC - # which will cause compiler error of using __host__ funciont in __host__ __device__ - file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/BinaryFunctors.h native_src4) - file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/functors/BinaryFunctors.h native_dst4) - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2} && cp ${native_src3} ${native_dst3} && cp ${native_src4} ${native_dst4}) - else() - set(EIGEN_PATCH_COMMAND cp ${native_src1} ${native_dst1} && cp ${native_src2} ${native_dst2}) - endif() endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) @@ -82,7 +40,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND ${EIGEN_PATCH_COMMAND} + PATCH_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index bc7def61b2e..fb5c4db91ec 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -400,7 +400,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2 > 0).template cast(); + out.device(d) = x * (temp1 + temp2).template cast(); } }; @@ -417,7 +417,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2 > 0).template cast(); + dx.device(d) = dout * (temp1 + temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } diff --git a/paddle/fluid/platform/eigen_ext.h b/paddle/fluid/platform/eigen_ext.h index a8ad729a31a..0db4cc71b1b 100644 --- a/paddle/fluid/platform/eigen_ext.h +++ b/paddle/fluid/platform/eigen_ext.h @@ -24,7 +24,6 @@ namespace Eigen { -using bfloat16 = paddle::platform::bfloat16; using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; using float16 = paddle::platform::float16; @@ -33,7 +32,8 @@ template struct NumTraits; template <> -struct NumTraits : GenericNumTraits { +struct NumTraits + : GenericNumTraits { enum { IsSigned = true, IsInteger = false, @@ -41,22 +41,22 @@ struct NumTraits : GenericNumTraits { RequireInitialization = false }; - HOSTDEVICE static inline bfloat16 epsilon() { + HOSTDEVICE static inline 
paddle::platform::bfloat16 epsilon() { return paddle::platform::raw_uint16_to_bfloat16(0x3400); } - HOSTDEVICE static inline bfloat16 dummy_precision() { - return bfloat16(1e-5f); + HOSTDEVICE static inline paddle::platform::bfloat16 dummy_precision() { + return paddle::platform::bfloat16(1e-5f); } - HOSTDEVICE static inline bfloat16 highest() { + HOSTDEVICE static inline paddle::platform::bfloat16 highest() { return paddle::platform::raw_uint16_to_bfloat16(0x7f7f); } - HOSTDEVICE static inline bfloat16 lowest() { + HOSTDEVICE static inline paddle::platform::bfloat16 lowest() { return paddle::platform::raw_uint16_to_bfloat16(0xff7f); } - HOSTDEVICE static inline bfloat16 infinity() { + HOSTDEVICE static inline paddle::platform::bfloat16 infinity() { return paddle::platform::raw_uint16_to_bfloat16(0x7f80); } - HOSTDEVICE static inline bfloat16 quiet_NaN() { + HOSTDEVICE static inline paddle::platform::bfloat16 quiet_NaN() { return paddle::platform::raw_uint16_to_bfloat16(0xffc1); } }; @@ -137,68 +137,91 @@ namespace numext { //////////// bfloat methods ///////////// template <> -HOSTDEVICE inline bool(isnan)(const bfloat16& a) { +HOSTDEVICE inline bool(isnan)(const paddle::platform::bfloat16& a) { return (paddle::platform::isnan)(a); } template <> -HOSTDEVICE inline bool(isinf)(const bfloat16& a) { +HOSTDEVICE inline bool(isinf)(const paddle::platform::bfloat16& a) { return (paddle::platform::isinf)(a); } template <> -HOSTDEVICE inline bool(isfinite)(const bfloat16& a) { +HOSTDEVICE inline bool(isfinite)(const paddle::platform::bfloat16& a) { return (paddle::platform::isfinite)(a); } template <> -HOSTDEVICE inline bfloat16 exp(const bfloat16& a) { - return bfloat16(::expf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 exp( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::expf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 erf(const bfloat16& a) { - return bfloat16(::erff(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 erf( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::erff(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 log(const bfloat16& a) { - return bfloat16(::logf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 log( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::logf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 tanh(const bfloat16& a) { - return bfloat16(::tanhf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 tanh( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::tanhf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 sqrt(const bfloat16& a) { - return bfloat16(::sqrtf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 sqrt( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::sqrtf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 ceil(const bfloat16& a) { - return bfloat16(::ceilf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 ceil( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::ceilf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 floor(const bfloat16& a) { - return bfloat16(::floorf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 floor( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::floorf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 round(const bfloat16& a) { - return 
bfloat16(::roundf(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 round( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::roundf(static_cast(a))); } template <> -HOSTDEVICE inline bfloat16 pow(const bfloat16& a, const bfloat16& b) { - return bfloat16(::powf(static_cast(a), static_cast(b))); +HOSTDEVICE inline paddle::platform::bfloat16 pow( + const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) { + return paddle::platform::bfloat16( + ::powf(static_cast(a), static_cast(b))); } template <> -HOSTDEVICE inline bfloat16 abs(const bfloat16& a) { - return bfloat16(::fabs(static_cast(a))); +HOSTDEVICE inline paddle::platform::bfloat16 abs( + const paddle::platform::bfloat16& a) { + return paddle::platform::bfloat16(::fabs(static_cast(a))); +} + +template <> +HOSTDEVICE inline paddle::platform::bfloat16 mini( + const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) { + return b < a ? b : a; +} + +template <> +HOSTDEVICE inline paddle::platform::bfloat16 maxi( + const paddle::platform::bfloat16& a, const paddle::platform::bfloat16& b) { + return a < b ? b : a; } //////////// complex64 methods ///////////// @@ -398,5 +421,15 @@ HOSTDEVICE inline float16 abs(const float16& a) { return float16(::fabs(static_cast(a))); } +template <> +HOSTDEVICE inline float16 mini(const float16& a, const float16& b) { + return b < a ? b : a; +} + +template <> +HOSTDEVICE inline float16 maxi(const float16& a, const float16& b) { + return a < b ? b : a; +} + } // namespace numext } // namespace Eigen diff --git a/patches/eigen/BinaryFunctors.h b/patches/eigen/BinaryFunctors.h deleted file mode 100644 index 54d0395507a..00000000000 --- a/patches/eigen/BinaryFunctors.h +++ /dev/null @@ -1,509 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// clang-format off - -#ifndef EIGEN_BINARY_FUNCTORS_H -#define EIGEN_BINARY_FUNCTORS_H - -namespace Eigen { - -namespace internal { - -//---------- associative binary functors ---------- - -template -struct binary_op_base -{ - typedef Arg1 first_argument_type; - typedef Arg2 second_argument_type; -}; - -/** \internal - * \brief Template functor to compute the sum of two scalars - * - * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum() - */ -template -struct scalar_sum_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op) -#else - scalar_sum_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a + b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::padd(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux(a); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, // rough estimate! 
- PacketAccess = is_same::value && packet_traits::HasAdd && packet_traits::HasAdd - // TODO vectorize mixed sum - }; -}; - -/** \internal - * \brief Template specialization to deprecate the summation of boolean expressions. - * This is required to solve Bug 426. - * \sa DenseBase::count(), DenseBase::any(), ArrayBase::cast(), MatrixBase::cast() - */ -template<> struct scalar_sum_op : scalar_sum_op { - EIGEN_DEPRECATED EIGEN_DEVICE_FUNC - scalar_sum_op() {} -}; - - -/** \internal - * \brief Template functor to compute the product of two scalars - * - * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux() - */ -template -struct scalar_product_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op) -#else - scalar_product_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmul(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_mul(a); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::MulCost + NumTraits::MulCost)/2, // rough estimate! - PacketAccess = is_same::value && packet_traits::HasMul && packet_traits::HasMul - // TODO vectorize mixed product - }; -}; - -/** \internal - * \brief Template functor to compute the conjugate product of two scalars - * - * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y) - */ -template -struct scalar_conj_product_op : binary_op_base -{ - - enum { - Conj = NumTraits::IsComplex - }; - - typedef typename ScalarBinaryOpTraits::ReturnType result_type; - - EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const - { return conj_helper().pmul(a,b); } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return conj_helper().pmul(a,b); } -}; -template -struct functor_traits > { - enum { - Cost = NumTraits::MulCost, - PacketAccess = internal::is_same::value && packet_traits::HasMul - }; -}; - -/** \internal - * \brief Template functor to compute the min of two scalars - * - * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff() - */ -template -struct scalar_min_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::mini(a, b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmin(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_min(a); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = internal::is_same::value && packet_traits::HasMin - }; -}; - -/** \internal - * \brief Template functor to compute the max of two scalars - * - * \sa 
class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff() - */ -template -struct scalar_max_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return numext::maxi(a, b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pmax(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const - { return internal::predux_max(a); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = internal::is_same::value && packet_traits::HasMax - }; -}; - -/** \internal - * \brief Template functors for comparison of two scalars - * \todo Implement packet-comparisons - */ -template struct scalar_cmp_op; - -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = false - }; -}; - -template -struct result_of(LhsScalar,RhsScalar)> { - typedef bool type; -}; - - -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a==b;} -}; -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a<=b;} -}; -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>b;} -}; -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a>=b;} -}; -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return !(a<=b || b<=a);} -}; -template -struct scalar_cmp_op : binary_op_base -{ - typedef bool result_type; - EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const LhsScalar& a, const RhsScalar& b) const {return a!=b;} -}; - - -/** \internal - * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars - * - * \sa MatrixBase::stableNorm(), class Redux - */ -template -struct scalar_hypot_op : binary_op_base -{ - EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const - { - // This functor is used by hypotNorm only for which it is faster to first apply abs - // on all coefficients prior to reduction through hypot. - // This way we avoid calling abs on positive and real entries, and this also permits - // to seamlessly handle complexes. 
Otherwise we would have to handle both real and complexes - // through the same functor... - return internal::positive_real_hypot(x,y); - } -}; -template -struct functor_traits > { - enum - { - Cost = 3 * NumTraits::AddCost + - 2 * NumTraits::MulCost + - 2 * scalar_div_cost::value, - PacketAccess = false - }; -}; - -/** \internal - * \brief Template functor to compute the pow of two scalars - */ -template -struct scalar_pow_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_pow_op) -#else - scalar_pow_op() { - typedef Scalar LhsScalar; - typedef Exponent RhsScalar; - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC - inline result_type operator() (const Scalar& a, const Exponent& b) const { return numext::pow(a, b); } -}; -template -struct functor_traits > { - enum { Cost = 5 * NumTraits::MulCost, PacketAccess = false }; -}; - - - -//---------- non associative binary functors ---------- - -/** \internal - * \brief Template functor to compute the difference of two scalars - * - * \sa class CwiseBinaryOp, MatrixBase::operator- - */ -template -struct scalar_difference_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op) -#else - scalar_difference_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a - b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::psub(a,b); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = is_same::value && packet_traits::HasSub && packet_traits::HasSub - }; -}; - -/** \internal - * \brief Template functor to compute the quotient of two scalars - * - * \sa class CwiseBinaryOp, Cwise::operator/() - */ -template -struct scalar_quotient_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op) -#else - scalar_quotient_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pdiv(a,b); } -}; -template -struct functor_traits > { - typedef typename scalar_quotient_op::result_type result_type; - enum { - PacketAccess = is_same::value && packet_traits::HasDiv && packet_traits::HasDiv, - Cost = scalar_div_cost::value - }; -}; - - - -/** \internal - * \brief Template functor to compute the and of two booleans - * - * \sa class CwiseBinaryOp, ArrayBase::operator&& - */ -struct scalar_boolean_and_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; } -}; -template<> struct functor_traits { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - -/** \internal - * \brief Template functor to compute the or of two booleans - * - * \sa class CwiseBinaryOp, ArrayBase::operator|| - */ -struct scalar_boolean_or_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op) - 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; } -}; -template<> struct functor_traits { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - -/** \internal - * \brief Template functor to compute the xor of two booleans - * - * \sa class CwiseBinaryOp, ArrayBase::operator^ - */ -struct scalar_boolean_xor_op { - EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_xor_op) - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a ^ b; } -}; -template<> struct functor_traits { - enum { - Cost = NumTraits::AddCost, - PacketAccess = false - }; -}; - -/** \internal - * \brief Template functor to compute the absolute difference of two scalars - * - * \sa class CwiseBinaryOp, MatrixBase::absolute_difference - */ -template -struct scalar_absolute_difference_op : binary_op_base -{ - typedef typename ScalarBinaryOpTraits::ReturnType result_type; -#ifndef EIGEN_SCALAR_BINARY_OP_PLUGIN - EIGEN_EMPTY_STRUCT_CTOR(scalar_absolute_difference_op) -#else - scalar_absolute_difference_op() { - EIGEN_SCALAR_BINARY_OP_PLUGIN - } -#endif - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const - { return numext::absdiff(a,b); } - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const - { return internal::pabsdiff(a,b); } -}; -template -struct functor_traits > { - enum { - Cost = (NumTraits::AddCost+NumTraits::AddCost)/2, - PacketAccess = is_same::value && packet_traits::HasAbsDiff - }; -}; - - - -//---------- binary functors bound to a constant, thus appearing as a unary functor ---------- - -// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant value. 
-// They are analogues to std::binder1st/binder2nd but with the following differences: -// - they are compatible with packetOp -// - they are portable across C++ versions (the std::binder* are deprecated in C++11) -template struct bind1st_op : BinaryOp { - - typedef typename BinaryOp::first_argument_type first_argument_type; - typedef typename BinaryOp::second_argument_type second_argument_type; - typedef typename BinaryOp::result_type result_type; - - EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type &val) : m_value(val) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const second_argument_type& b) const { return BinaryOp::operator()(m_value,b); } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const - { return BinaryOp::packetOp(internal::pset1(m_value), b); } - - first_argument_type m_value; -}; -template struct functor_traits > : functor_traits {}; - - -template struct bind2nd_op : BinaryOp { - - typedef typename BinaryOp::first_argument_type first_argument_type; - typedef typename BinaryOp::second_argument_type second_argument_type; - typedef typename BinaryOp::result_type result_type; - - EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type &val) : m_value(val) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator() (const first_argument_type& a) const { return BinaryOp::operator()(a,m_value); } - - template - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const - { return BinaryOp::packetOp(a,internal::pset1(m_value)); } - - second_argument_type m_value; -}; -template struct functor_traits > : functor_traits {}; - - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_BINARY_FUNCTORS_H - -// clang-format on diff --git a/patches/eigen/Geometry_SSE.h b/patches/eigen/Geometry_SSE.h deleted file mode 100644 index f45d5eb8a01..00000000000 --- a/patches/eigen/Geometry_SSE.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2009 Rohit Garg -// Copyright (C) 2009-2010 Gael Guennebaud -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
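// Illustrative scalar reference (not part of the original patch file) for the
// vectorized quat_product specializations that follow. Eigen stores quaternion
// coefficients in (x, y, z, w) order with w as the real part; the helper name
// quat_mul is hypothetical.
#include <array>

std::array<float, 4> quat_mul(const std::array<float, 4>& a,
                              const std::array<float, 4>& b) {
  // Hamilton product, written out component by component.
  return {a[3] * b[0] + a[0] * b[3] + a[1] * b[2] - a[2] * b[1],   // x
          a[3] * b[1] + a[1] * b[3] + a[2] * b[0] - a[0] * b[2],   // y
          a[3] * b[2] + a[2] * b[3] + a[0] * b[1] - a[1] * b[0],   // z
          a[3] * b[3] - a[0] * b[0] - a[1] * b[1] - a[2] * b[2]};  // w
}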
- -#ifndef EIGEN_GEOMETRY_SSE_H -#define EIGEN_GEOMETRY_SSE_H - -namespace Eigen { - -namespace internal { - -template -struct quat_product { - enum { - AAlignment = traits::Alignment, - BAlignment = traits::Alignment, - ResAlignment = traits>::Alignment - }; - static inline Quaternion run(const QuaternionBase& _a, - const QuaternionBase& _b) { - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); - Quaternion res; - const __m128 mask = _mm_setr_ps(0.f, 0.f, 0.f, -0.f); - __m128 a = ae.template packet(0); - __m128 b = be.template packet(0); - __m128 s1 = - pmul(vec4f_swizzle1(a, 1, 2, 0, 2), vec4f_swizzle1(b, 2, 0, 1, 2)); - __m128 s2 = - pmul(vec4f_swizzle1(a, 3, 3, 3, 1), vec4f_swizzle1(b, 0, 1, 2, 1)); - pstoret( - &res.x(), - padd(psub(pmul(a, vec4f_swizzle1(b, 3, 3, 3, 3)), - pmul(vec4f_swizzle1(a, 2, 0, 1, 0), - vec4f_swizzle1(b, 1, 2, 0, 0))), - pxor(mask, padd(s1, s2)))); - - return res; - } -}; - -template -struct quat_conj { - enum { ResAlignment = traits>::Alignment }; - static inline Quaternion run(const QuaternionBase& q) { - evaluator qe(q.coeffs()); - Quaternion res; - const Packet4f mask = _mm_setr_ps(-0.f, -0.f, -0.f, 0.f); - pstoret( - &res.x(), - pxor(mask, - qe.template packet::Alignment, Packet4f>(0))); - return res; - } -}; - -template -struct cross3_impl { - enum { - ResAlignment = - traits::type>::Alignment - }; - static inline typename plain_matrix_type::type run( - const VectorLhs& lhs, const VectorRhs& rhs) { - evaluator lhs_eval(lhs); - evaluator rhs_eval(rhs); - __m128 a = - lhs_eval.template packet::Alignment, __m128>(0); - __m128 b = - rhs_eval.template packet::Alignment, __m128>(0); - __m128 mul1 = - pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3)); - __m128 mul2 = - pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3)); - typename plain_matrix_type::type res; - pstoret(&res.x(), psub(mul1, mul2)); - return res; - } -}; - -template -struct quat_product { - enum { - BAlignment = traits::Alignment, - ResAlignment = traits>::Alignment - }; - - static inline Quaternion run(const QuaternionBase& _a, - const QuaternionBase& _b) { - const Packet2d mask = - _mm_castsi128_pd(_mm_set_epi32(0x0, 0x0, 0x80000000, 0x0)); - - Quaternion res; - - evaluator ae(_a.coeffs()); - evaluator be(_b.coeffs()); - - const double* a = _a.coeffs().data(); - Packet2d b_xy = be.template packet(0); - Packet2d b_zw = be.template packet(2); - Packet2d a_xx = pset1(a[0]); - Packet2d a_yy = pset1(a[1]); - Packet2d a_zz = pset1(a[2]); - Packet2d a_ww = pset1(a[3]); - - // two temporaries: - Packet2d t1, t2; - - /* - * t1 = ww*xy + yy*zw - * t2 = zz*xy - xx*zw - * res.xy = t1 +/- swap(t2) - */ - t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw)); - t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret(&res.x(), - _mm_addsub_pd(t1, preverse(t2))); -#else - pstoret(&res.x(), - padd(t1, pxor(mask, preverse(t2)))); -#endif - - /* - * t1 = ww*zw - yy*xy - * t2 = zz*zw + xx*xy - * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2) - */ - t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy)); - t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); -#ifdef EIGEN_VECTORIZE_SSE3 - EIGEN_UNUSED_VARIABLE(mask) - pstoret( - &res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); -#else - pstoret(&res.z(), - psub(t1, pxor(mask, preverse(t2)))); -#endif - - return res; - } -}; - -template -struct quat_conj { - enum { ResAlignment = traits>::Alignment }; - static inline Quaternion run(const QuaternionBase& q) { - evaluator 
qe(q.coeffs()); - Quaternion res; - const Packet2d mask0 = _mm_setr_pd(-0., -0.); - const Packet2d mask2 = _mm_setr_pd(-0., 0.); - pstoret( - &res.x(), - pxor(mask0, - qe.template packet::Alignment, Packet2d>(0))); - pstoret( - &res.z(), - pxor(mask2, - qe.template packet::Alignment, Packet2d>(2))); - return res; - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_GEOMETRY_SSE_H diff --git a/patches/eigen/Half.h b/patches/eigen/Half.h deleted file mode 100644 index 2d4e0164b59..00000000000 --- a/patches/eigen/Half.h +++ /dev/null @@ -1,733 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. -// -// The conversion routines are Copyright (c) Fabian Giesen, 2016. -// The original license follows: -// -// Copyright (c) Fabian Giesen, 2016 -// All rights reserved. -// Redistribution and use in source and binary forms, with or without -// modification, are permitted. -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -// Standard 16-bit float type, mostly useful for GPUs. Defines a new -// type Eigen::half (inheriting from CUDA's __half struct) with -// operator overloads such that it behaves basically as an arithmetic -// type. It will be quite slow on CPUs (so it is recommended to stay -// in fp32 for CPUs, except for simple parameter conversions, I/O -// to disk and the likes), but fast on GPUs. - -#ifndef EIGEN_HALF_CUDA_H -#define EIGEN_HALF_CUDA_H - -#if __cplusplus > 199711L -#define EIGEN_EXPLICIT_CAST(tgt_type) explicit operator tgt_type() -#else -#define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() -#endif - -namespace Eigen { - -struct half; - -namespace half_impl { - -#if !defined(EIGEN_HAS_CUDA_FP16) -// Make our own __half_raw definition that is similar to CUDA's. 
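// For reference only (not part of the original file): the 16-bit layout this
// file manipulates is IEEE 754 binary16:
//   bit 15      sign
//   bits 14-10  exponent (bias 15)
//   bits 9-0    mantissa
// e.g. 0x3C00 = 1.0, 0x7C00 = +inf, 0x7E00 = quiet NaN, 0xFBFF = lowest finite (-65504).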
-struct __half_raw { - EIGEN_DEVICE_FUNC __half_raw() : x(0) {} - explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} - unsigned short x; -}; -#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 -// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw -typedef __half __half_raw; -#endif - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw -raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); - -struct half_base : public __half_raw { - EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} - EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \ - EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} -#endif -}; - -} // namespace half_impl - -// Class definition. -struct half : public half_impl::half_base { -#if !defined(EIGEN_HAS_CUDA_FP16) || \ - (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) - typedef half_impl::__half_raw __half_raw; -#endif - - EIGEN_DEVICE_FUNC half() {} - - EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} - EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && \ - EIGEN_CUDACC_VER >= 90000 - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} -#endif - - explicit EIGEN_DEVICE_FUNC half(bool b) - : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {} - template - explicit EIGEN_DEVICE_FUNC half(const T& val) - : half_impl::half_base( - half_impl::float_to_half_rtne(static_cast(val))) {} - explicit EIGEN_DEVICE_FUNC half(float f) - : half_impl::half_base(half_impl::float_to_half_rtne(f)) {} - - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(bool) const { - // +0.0 and -0.0 become false, everything else becomes true. 
- return (x & 0x7fff) != 0; - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(signed char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned char) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned short) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned int) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const { - return static_cast(half_impl::half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const { - return static_cast(half_to_float(*this)); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const { - return half_impl::half_to_float(*this); - } - EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(double) const { - return static_cast(half_impl::half_to_float(*this)); - } - - EIGEN_DEVICE_FUNC half& operator=(const half& other) { - x = other.x; - return *this; - } -}; - -namespace half_impl { - -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530 - -// Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than fp32 arithmetic (you need to use the half2 -// versions to get the ALU speed increased), but you do save the -// conversion steps back and forth. 
- -EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) { -#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - return __hadd(::__half(a), ::__half(b)); -#else - return __hadd(a, b); -#endif -} -EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { - return __hmul(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { - return __hsub(a, b); -} -EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) { -#if defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 - return __hdiv(a, b); -#else - float num = __half2float(a); - float denom = __half2float(b); - return __float2half(num / denom); -#endif -} -EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { - return __hneg(a); -} -EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) { - a = a + b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) { - a = a * b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) { - a = a - b; - return a; -} -EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) { - a = a / b; - return a; -} -EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { - return __heq(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { - return __hne(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { - return __hlt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { - return __hle(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { - return __hgt(a, b); -} -EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { - return __hge(a, b); -} - -#else // Emulate support for half floats - -// Definitions for CPUs and older CUDA, mostly working through conversion -// to/from fp32. 
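// Illustrative sketch (not part of the original file): every CPU-side operator
// below round-trips through float, so long reductions over Eigen::half are
// both faster and more accurate when the accumulator itself stays in float.
// The helper name sum_halves is hypothetical.
#include <vector>

inline Eigen::half sum_halves(const std::vector<Eigen::half>& values) {
  float acc = 0.f;
  for (const Eigen::half& v : values) {
    acc += static_cast<float>(v);  // one half -> float conversion per element
  }
  return Eigen::half(acc);  // a single float -> half rounding at the end
}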
- -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, - const half& b) { - return half(float(a) + float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, - const half& b) { - return half(float(a) * float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, - const half& b) { - return half(float(a) - float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, - const half& b) { - return half(float(a) / float(b)); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { - half result; - result.x = a.x ^ 0x8000; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) { - a = half(float(a) + float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) { - a = half(float(a) * float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) { - a = half(float(a) - float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) { - a = half(float(a) / float(b)); - return a; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, - const half& b) { - return float(a) == float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, - const half& b) { - return float(a) != float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, - const half& b) { - return float(a) < float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, - const half& b) { - return float(a) <= float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, - const half& b) { - return float(a) > float(b); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, - const half& b) { - return float(a) >= float(b); -} - -#endif // Emulate support for half floats - -// Division by an index. Do it in full float precision to avoid accuracy -// issues in converting the denominator to half. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) { - return half(static_cast(a) / static_cast(b)); -} - -// Conversion routines, including fallbacks for the host or older CUDA. -// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of -// these in hardware. If we need more performance on older/other CPUs, they are -// also possible to vectorize directly. - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw -raw_uint16_to_half(unsigned short x) { - __half_raw h; - h.x = x; - return h; -} - -union FP32 { - unsigned int u; - float f; -}; - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 300 - __half tmp_ff = __float2half(ff); - return *(__half_raw*)&tmp_ff; - -#elif defined(EIGEN_HAS_FP16_C) - __half_raw h; - h.x = _cvtss_sh(ff, 0); - return h; - -#else - FP32 f; - f.f = ff; - - const FP32 f32infty = {255 << 23}; - const FP32 f16max = {(127 + 16) << 23}; - const FP32 denorm_magic = {((127 - 15) + (23 - 10) + 1) << 23}; - unsigned int sign_mask = 0x80000000u; - __half_raw o; - o.x = static_cast(0x0u); - - unsigned int sign = f.u & sign_mask; - f.u ^= sign; - - // NOTE all the integer compares in this function can be safely - // compiled into signed compares since all operands are below - // 0x80000000. Important if you want fast straight SSE2 code - // (since there's no unsigned PCMPGTD). 
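// Worked example of the normalized branch below (illustrative), converting
// ff = 1.0f with the constants defined above:
//   f.u = 0x3F800000, sign = 0
//   f.u < f16max.u (0x47800000) and f.u >= (113 << 23) (0x38800000), so:
//     mant_odd = (0x3F800000 >> 13) & 1 = 0
//     f.u += ((unsigned int)(15 - 127) << 23) + 0xfff   ->  f.u = 0x07800FFF
//     f.u += mant_odd                                    ->  f.u = 0x07800FFF
//     o.x  = f.u >> 13                                   =   0x3C00, i.e. half(1.0f)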
- - if (f.u >= f16max.u) { // result is Inf or NaN (all exponent bits set) - o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf - } else { // (De)normalized number or zero - if (f.u < (113 << 23)) { // resulting FP16 is subnormal or zero - // use a magic value to align our 10 mantissa bits at the bottom of - // the float. as long as FP addition is round-to-nearest-even this - // just works. - f.f += denorm_magic.f; - - // and one integer subtract of the bias later, we have our final float! - o.x = static_cast(f.u - denorm_magic.u); - } else { - unsigned int mant_odd = (f.u >> 13) & 1; // resulting mantissa is odd - - // update exponent, rounding bias part 1 - f.u += ((unsigned int)(15 - 127) << 23) + 0xfff; - // rounding bias part 2 - f.u += mant_odd; - // take the bits! - o.x = static_cast(f.u >> 13); - } - } - - o.x |= static_cast(sign >> 16); - return o; -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 300 - return __half2float(h); - -#elif defined(EIGEN_HAS_FP16_C) - return _cvtsh_ss(h.x); - -#else - const FP32 magic = {113 << 23}; - const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; - - o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits - unsigned int exp = shifted_exp & o.u; // just the exponent - o.u += (127 - 15) << 23; // exponent adjust - - // handle exponent special cases - if (exp == shifted_exp) { // Inf/NaN? - o.u += (128 - 16) << 23; // extra exp adjust - } else if (exp == 0) { // Zero/Denormal? - o.u += 1 << 23; // extra exp adjust - o.f -= magic.f; // renormalize - } - - o.u |= (h.x & 0x8000) << 16; // sign bit - return o.f; -#endif -} - -// --- standard functions --- - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) { - return (a.x & 0x7fff) == 0x7c00; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530 - return __hisnan(a); -#else - return (a.x & 0x7fff) > 0x7c00; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) { - return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a)); -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { - half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ - EIGEN_CUDA_ARCH >= 530 - return half(hexp(a)); -#else - return half(::expf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { - return half(numext::expm1(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && \ - defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 - return half(::hlog(a)); -#else - return half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { - return half(numext::log1p(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { - return half(::log10f(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ - EIGEN_CUDA_ARCH >= 530 - return half(hsqrt(a)); -#else - return half(::sqrtf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { - return 
half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { - return half(::sinf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { - return half(::cosf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { - return half(::tanf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { - return half(::tanhf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ - EIGEN_CUDA_ARCH >= 300 - return half(hfloor(a)); -#else - return half(::floorf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && \ - EIGEN_CUDA_ARCH >= 300 - return half(hceil(a)); -#else - return half(::ceilf(float(a))); -#endif -} - -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530 - return __hlt(b, a) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f2 < f1 ? b : a; -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530 - return __hlt(a, b) ? b : a; -#else - const float f1 = static_cast(a); - const float f2 = static_cast(b); - return f1 < f2 ? b : a; -#endif -} - -EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) { - os << static_cast(v); - return os; -} - -} // end namespace half_impl - -// import Eigen::half_impl::half into Eigen namespace -// using half_impl::half; - -namespace internal { - -template <> -struct random_default_impl { - static inline half run(const half& x, const half& y) { - return x + (y - x) * half(float(std::rand()) / float(RAND_MAX)); - } - static inline half run() { return run(half(-1.f), half(1.f)); } -}; - -template <> -struct is_arithmetic { - enum { value = true }; -}; - -} // end namespace internal - -} // end namespace Eigen - -namespace std { -template <> -struct numeric_limits { - static const bool is_specialized = true; - static const bool is_signed = true; - static const bool is_integer = false; - static const bool is_exact = false; - static const bool has_infinity = true; - static const bool has_quiet_NaN = true; - static const bool has_signaling_NaN = true; - static const float_denorm_style has_denorm = denorm_present; - static const bool has_denorm_loss = false; - static const std::float_round_style round_style = std::round_to_nearest; - static const bool is_iec559 = false; - static const bool is_bounded = false; - static const bool is_modulo = false; - static const int digits = 11; - static const int digits10 = 3; // according to - // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int max_digits10 = 5; // according to - // http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html - static const int radix = 2; - static const int min_exponent = -13; - static const int min_exponent10 = -4; - static const int max_exponent = 16; - static const int max_exponent10 = 4; - static const bool traps = true; - static const bool tinyness_before = false; - - static Eigen::half(min)() { - return Eigen::half_impl::raw_uint16_to_half(0x400); - } - static Eigen::half lowest() { - return 
Eigen::half_impl::raw_uint16_to_half(0xfbff); - } - static Eigen::half(max)() { - return Eigen::half_impl::raw_uint16_to_half(0x7bff); - } - static Eigen::half epsilon() { - return Eigen::half_impl::raw_uint16_to_half(0x0800); - } - static Eigen::half round_error() { return Eigen::half(0.5); } - static Eigen::half infinity() { - return Eigen::half_impl::raw_uint16_to_half(0x7c00); - } - static Eigen::half quiet_NaN() { - return Eigen::half_impl::raw_uint16_to_half(0x7e00); - } - static Eigen::half signaling_NaN() { - return Eigen::half_impl::raw_uint16_to_half(0x7e00); - } - static Eigen::half denorm_min() { - return Eigen::half_impl::raw_uint16_to_half(0x1); - } -}; -} - -namespace Eigen { - -template <> -struct NumTraits : GenericNumTraits { - enum { - IsSigned = true, - IsInteger = false, - IsComplex = false, - RequireInitialization = false - }; - - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { - return half_impl::raw_uint16_to_half(0x0800); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half dummy_precision() { - return Eigen::half(1e-2f); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half highest() { - return half_impl::raw_uint16_to_half(0x7bff); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half lowest() { - return half_impl::raw_uint16_to_half(0xfbff); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half infinity() { - return half_impl::raw_uint16_to_half(0x7c00); - } - EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() { - return half_impl::raw_uint16_to_half(0x7c01); - } -}; - -} // end namespace Eigen - -// C-like standard mathematical functions and trancendentals. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half fabsh(const Eigen::half& a) { - Eigen::half result; - result.x = a.x & 0x7FFF; - return result; -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { - return Eigen::half(::expf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \ - EIGEN_CUDA_ARCH >= 530 - return Eigen::half(::hlog(a)); -#else - return Eigen::half(::logf(float(a))); -#endif -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half sqrth(const Eigen::half& a) { - return Eigen::half(::sqrtf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half powh(const Eigen::half& a, - const Eigen::half& b) { - return Eigen::half(::powf(float(a), float(b))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half floorh(const Eigen::half& a) { - return Eigen::half(::floorf(float(a))); -} -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ceilh(const Eigen::half& a) { - return Eigen::half(::ceilf(float(a))); -} - -namespace std { - -#if __cplusplus > 199711L -template <> -struct hash { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()( - const Eigen::half& a) const { - return static_cast(a.x); - } -}; -#endif - -} // end namespace std - -// Add the missing shfl_xor intrinsic -#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 -__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, - int laneMask, - int width = warpSize) { -#if EIGEN_CUDACC_VER < 90000 - return static_cast( - __shfl_xor(static_cast(var), laneMask, width)); -#else - return static_cast( - __shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); -#endif -} -#endif - -// ldg() has an overload for __half_raw, but we also need one for Eigen::half. 
-#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg( - const Eigen::half* ptr) { - return Eigen::half_impl::raw_uint16_to_half( - __ldg(reinterpret_cast(ptr))); -} -#endif - -#if defined(EIGEN_CUDA_ARCH) -namespace Eigen { -namespace numext { - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) { - return (half_impl::isnan)(h); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) { - return (half_impl::isinf)(h); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) { - return (half_impl::isfinite)(h); -} - -} // namespace Eigen -} // namespace numext -#endif - -#endif // EIGEN_HALF_CUDA_H diff --git a/patches/eigen/MathFunctions.h b/patches/eigen/MathFunctions.h deleted file mode 100644 index 9f6a4d0e332..00000000000 --- a/patches/eigen/MathFunctions.h +++ /dev/null @@ -1,1938 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2006-2010 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_MATHFUNCTIONS_H -#define EIGEN_MATHFUNCTIONS_H - -// source: http://www.geom.uiuc.edu/~huberty/math5337/groupe/digits.html -// TODO this should better be moved to NumTraits -#define EIGEN_PI \ - 3.141592653589793238462643383279502884197169399375105820974944592307816406L - -namespace Eigen { - -// On WINCE, std::abs is defined for int only, so let's defined our own -// overloads: -// This issue has been confirmed with MSVC 2008 only, but the issue might exist -// for more recent versions too. -#if EIGEN_OS_WINCE && EIGEN_COMP_MSVC && EIGEN_COMP_MSVC <= 1500 -long abs(long x) { return (labs(x)); } -double abs(double x) { return (fabs(x)); } -float abs(float x) { return (fabsf(x)); } -long double abs(long double x) { return (fabsl(x)); } -#endif - -namespace internal { - -/** \internal \class global_math_functions_filtering_base - * - * What it does: - * Defines a typedef 'type' as follows: - * - if type T has a member typedef - * Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then - * global_math_functions_filtering_base::type is a typedef for it. - * - otherwise, global_math_functions_filtering_base::type is a typedef for - * T. - * - * How it's used: - * To allow to defined the global math functions (like sin...) in certain - * cases, like the Array expressions. - * When you do sin(array1+array2), the object array1+array2 has a complicated - * expression type, all what you want to know - * is that it inherits ArrayBase. So we implement a partial specialization of - * sin_impl for ArrayBase. 
- * So we must make sure to use sin_impl > and not - * sin_impl, otherwise our partial specialization - * won't be used. How does sin know that? That's exactly what - * global_math_functions_filtering_base tells it. - * - * How it's implemented: - * SFINAE in the style of enable_if. Highly susceptible of breaking compilers. - * With GCC, it sure does work, but if you replace - * the typename dummy by an integer template parameter, it doesn't work - * anymore! - */ - -template -struct global_math_functions_filtering_base { - typedef T type; -}; - -template -struct always_void { - typedef void type; -}; - -template -struct global_math_functions_filtering_base< - T, - typename always_void< - typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>:: - type> { - typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type; -}; - -#define EIGEN_MATHFUNC_IMPL(func, scalar) \ - Eigen::internal::func##_impl< \ - typename Eigen::internal::global_math_functions_filtering_base< \ - scalar>::type> -#define EIGEN_MATHFUNC_RETVAL(func, scalar) \ - typename Eigen::internal::func##_retval< \ - typename Eigen::internal::global_math_functions_filtering_base< \ - scalar>::type>::type - -/**************************************************************************** -* Implementation of real * -****************************************************************************/ - -template ::IsComplex> -struct real_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { return x; } -}; - -template -struct real_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - using std::real; - return real(x); - } -}; - -template -struct real_impl : real_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct real_impl> { - typedef T RealScalar; - EIGEN_DEVICE_FUNC - static inline T run(const std::complex& x) { return x.real(); } -}; -#endif - -template -struct real_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of imag * -****************************************************************************/ - -template ::IsComplex> -struct imag_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar&) { return RealScalar(0); } -}; - -template -struct imag_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - using std::imag; - return imag(x); - } -}; - -template -struct imag_impl : imag_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct imag_impl> { - typedef T RealScalar; - EIGEN_DEVICE_FUNC - static inline T run(const std::complex& x) { return x.imag(); } -}; -#endif - -template -struct imag_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of real_ref * -****************************************************************************/ - -template -struct real_ref_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar& run(Scalar& x) { - return reinterpret_cast(&x)[0]; - } - EIGEN_DEVICE_FUNC - static inline const RealScalar& run(const Scalar& x) { - return reinterpret_cast(&x)[0]; - } -}; - -template -struct 
real_ref_retval { - typedef typename NumTraits::Real& type; -}; - -/**************************************************************************** -* Implementation of imag_ref * -****************************************************************************/ - -template -struct imag_ref_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar& run(Scalar& x) { - return reinterpret_cast(&x)[1]; - } - EIGEN_DEVICE_FUNC - static inline const RealScalar& run(const Scalar& x) { - return reinterpret_cast(&x)[1]; - } -}; - -template -struct imag_ref_default_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(Scalar&) { return Scalar(0); } - EIGEN_DEVICE_FUNC - static inline const Scalar run(const Scalar&) { return Scalar(0); } -}; - -template -struct imag_ref_impl - : imag_ref_default_impl::IsComplex> {}; - -template -struct imag_ref_retval { - typedef typename NumTraits::Real& type; -}; - -/**************************************************************************** -* Implementation of conj * -****************************************************************************/ - -template ::IsComplex> -struct conj_default_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { return x; } -}; - -template -struct conj_default_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { - using std::conj; - return conj(x); - } -}; - -template -struct conj_impl : conj_default_impl {}; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template -struct conj_impl> { - EIGEN_DEVICE_FUNC - static inline std::complex run(const std::complex& x) { - return std::complex(x.real(), -x.imag()); - } -}; -#endif - -template -struct conj_retval { - typedef Scalar type; -}; - -/**************************************************************************** -* Implementation of abs2 * -****************************************************************************/ - -template -struct abs2_impl_default { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { return x * x; } -}; - -template -struct abs2_impl_default // IsComplex -{ - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - return x.real() * x.real() + x.imag() * x.imag(); - } -}; - -template -struct abs2_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - return abs2_impl_default::IsComplex>::run(x); - } -}; - -template -struct abs2_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of norm1 * -****************************************************************************/ - -template -struct norm1_default_impl; - -template -struct norm1_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); - return abs(x.real()) + abs(x.imag()); - } -}; - -template -struct norm1_default_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(abs); - return abs(x); - } -}; - -template -struct norm1_impl : norm1_default_impl::IsComplex> {}; - -template -struct norm1_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of hypot * 
-****************************************************************************/ - -template -struct hypot_impl; - -template -struct hypot_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of cast * -****************************************************************************/ - -template -struct cast_impl { - EIGEN_DEVICE_FUNC - static inline NewType run(const OldType& x) { - return static_cast(x); - } -}; - -// here, for once, we're plainly returning NewType: we don't want cast to do -// weird things. - -template -EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) { - return cast_impl::run(x); -} - -/**************************************************************************** -* Implementation of round * -****************************************************************************/ - -#if EIGEN_HAS_CXX11_MATH -template -struct round_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), - NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(round); - return round(x); - } -}; -#else -template -struct round_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), - NUMERIC_TYPE_MUST_BE_REAL) - EIGEN_USING_STD_MATH(floor); - EIGEN_USING_STD_MATH(ceil); - return (x > Scalar(0)) ? floor(x + Scalar(0.5)) : ceil(x - Scalar(0.5)); - } -}; -#endif - -template -struct round_retval { - typedef Scalar type; -}; - -/**************************************************************************** -* Implementation of rint * -****************************************************************************/ - -template -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT((!NumTraits::IsComplex), - NUMERIC_TYPE_MUST_BE_REAL) -#if EIGEN_HAS_CXX11_MATH - EIGEN_USING_STD_MATH(rint); -#endif - return rint(x); - } -}; - -#if !EIGEN_HAS_CXX11_MATH -template <> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline double run(const double& x) { return ::rint(x); } -}; -template <> -struct rint_impl { - EIGEN_DEVICE_FUNC - static inline float run(const float& x) { return ::rintf(x); } -}; -#endif - -template -struct rint_retval { - typedef Scalar type; -}; - -/**************************************************************************** -* Implementation of arg * -****************************************************************************/ - -#if EIGEN_HAS_CXX11_MATH -template -struct arg_impl { - EIGEN_DEVICE_FUNC - static inline Scalar run(const Scalar& x) { -#if defined(EIGEN_HIP_DEVICE_COMPILE) - // HIP does not seem to have a native device side implementation for the - // math routine "arg" - using std::arg; -#else - EIGEN_USING_STD_MATH(arg); -#endif - return arg(x); - } -}; -#else -template ::IsComplex> -struct arg_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - return (x < Scalar(0)) ? 
Scalar(EIGEN_PI) : Scalar(0); - } -}; - -template -struct arg_default_impl { - typedef typename NumTraits::Real RealScalar; - EIGEN_DEVICE_FUNC - static inline RealScalar run(const Scalar& x) { - EIGEN_USING_STD_MATH(arg); - return arg(x); - } -}; - -template -struct arg_impl : arg_default_impl {}; -#endif - -template -struct arg_retval { - typedef typename NumTraits::Real type; -}; - -/**************************************************************************** -* Implementation of expm1 * -****************************************************************************/ - -// This implementation is based on GSL Math's expm1. -namespace std_fallback { -// fallback expm1 implementation in case there is no expm1(Scalar) function in -// namespace of Scalar, -// or that there is no suitable std::expm1 function available. Implementation -// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php. -template -EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - - EIGEN_USING_STD_MATH(exp); - Scalar u = exp(x); - if (numext::equal_strict(u, Scalar(1))) { - return x; - } - Scalar um1 = u - RealScalar(1); - if (numext::equal_strict(um1, Scalar(-1))) { - return RealScalar(-1); - } - - EIGEN_USING_STD_MATH(log); - Scalar logu = log(u); - return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu; -} -} - -template -struct expm1_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) -#if EIGEN_HAS_CXX11_MATH - using std::expm1; -#else - using std_fallback::expm1; -#endif - return expm1(x); - } -}; - -// Specialization for complex types that are not supported by std::expm1. -template -struct expm1_impl> { - EIGEN_DEVICE_FUNC static inline std::complex run( - const std::complex& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) - RealScalar xr = x.real(); - RealScalar xi = x.imag(); - // expm1(z) = exp(z) - 1 - // = exp(x + i * y) - 1 - // = exp(x) * (cos(y) + i * sin(y)) - 1 - // = exp(x) * cos(y) - 1 + i * exp(x) * sin(y) - // Imag(expm1(z)) = exp(x) * sin(y) - // Real(expm1(z)) = exp(x) * cos(y) - 1 - // = exp(x) * cos(y) - 1. - // = expm1(x) + exp(x) * (cos(y) - 1) - // = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2) - - // TODO better use numext::expm1 and numext::sin (but that would require - // forward declarations or moving this specialization down). - RealScalar erm1 = expm1_impl::run(xr); - RealScalar er = erm1 + RealScalar(1.); - EIGEN_USING_STD_MATH(sin); - RealScalar sin2 = sin(xi / RealScalar(2.)); - sin2 = sin2 * sin2; - RealScalar s = sin(xi); - RealScalar real_part = erm1 - RealScalar(2.) 
* er * sin2; - return std::complex(real_part, er * s); - } -}; - -template -struct expm1_retval { - typedef Scalar type; -}; - -/**************************************************************************** -* Implementation of log1p * -****************************************************************************/ - -namespace std_fallback { -// fallback log1p implementation in case there is no log1p(Scalar) function in -// namespace of Scalar, -// or that there is no suitable std::log1p function available -template -EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) - typedef typename NumTraits::Real RealScalar; - EIGEN_USING_STD_MATH(log); - Scalar x1p = RealScalar(1) + x; - Scalar log_1p = log(x1p); - const bool is_small = numext::equal_strict(x1p, Scalar(1)); - const bool is_inf = numext::equal_strict(x1p, log_1p); - return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1))); -} -} - -template -struct log1p_impl { - EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar) -#if EIGEN_HAS_CXX11_MATH - using std::log1p; -#else - using std_fallback::log1p; -#endif - return log1p(x); - } -}; - -// Specialization for complex types that are not supported by std::log1p. -template -struct log1p_impl> { - EIGEN_DEVICE_FUNC static inline std::complex run( - const std::complex& x) { - EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar) - return std_fallback::log1p(x); - } -}; - -template -struct log1p_retval { - typedef Scalar type; -}; - -/**************************************************************************** -* Implementation of pow * -****************************************************************************/ - -template ::IsInteger&& NumTraits::IsInteger> -struct pow_impl { - // typedef Scalar retval; - typedef typename ScalarBinaryOpTraits< - ScalarX, - ScalarY, - internal::scalar_pow_op>::ReturnType result_type; - static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, - const ScalarY& y) { - EIGEN_USING_STD_MATH(pow); - return pow(x, y); - } -}; - -template -struct pow_impl { - typedef ScalarX result_type; - static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) { - ScalarX res(1); - eigen_assert(!NumTraits::IsSigned || y >= 0); - if (y & 1) res *= x; - y >>= 1; - while (y) { - x *= x; - if (y & 1) res *= x; - y >>= 1; - } - return res; - } -}; - -/**************************************************************************** -* Implementation of random * -****************************************************************************/ - -template -struct random_default_impl {}; - -template -struct random_impl : random_default_impl::IsComplex, - NumTraits::IsInteger> {}; - -template -struct random_retval { - typedef Scalar type; -}; - -template -inline EIGEN_MATHFUNC_RETVAL(random, Scalar) - random(const Scalar& x, const Scalar& y); -template -inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(); - -template -struct random_default_impl { - static inline Scalar run(const Scalar& x, const Scalar& y) { - return x + (y - x) * Scalar(std::rand()) / Scalar(RAND_MAX); - } - static inline Scalar run() { - return run(Scalar(NumTraits::IsSigned ? -1 : 0), Scalar(1)); - } -}; - -enum { - meta_floor_log2_terminate, - meta_floor_log2_move_up, - meta_floor_log2_move_down, - meta_floor_log2_bogus -}; - -template -struct meta_floor_log2_selector { - enum { - middle = (lower + upper) / 2, - value = (upper <= lower + 1) - ? int(meta_floor_log2_terminate) - : (n < (1 << middle)) ? 
int(meta_floor_log2_move_down) - : (n == 0) ? int(meta_floor_log2_bogus) - : int(meta_floor_log2_move_up) - }; -}; - -template ::value> -struct meta_floor_log2 {}; - -template -struct meta_floor_log2 { - enum { - value = meta_floor_log2< - n, - lower, - meta_floor_log2_selector::middle>::value - }; -}; - -template -struct meta_floor_log2 { - enum { - value = meta_floor_log2::middle, - upper>::value - }; -}; - -template -struct meta_floor_log2 { - enum { - value = (n >= ((unsigned int)(1) << (lower + 1))) ? lower + 1 : lower - }; -}; - -template -struct meta_floor_log2 { - // no value, error at compile time -}; - -template -struct random_default_impl { - static inline Scalar run(const Scalar& x, const Scalar& y) { - if (y <= x) return x; - // ScalarU is the unsigned counterpart of Scalar, possibly Scalar itself. - typedef typename make_unsigned::type ScalarU; - // ScalarX is the widest of ScalarU and unsigned int. - // We'll deal only with ScalarX and unsigned int below thus avoiding signed - // types and arithmetic and signed overflows (which are undefined behavior). - typedef typename conditional<(ScalarU(-1) > unsigned(-1)), - ScalarU, - unsigned>::type ScalarX; - // The following difference doesn't overflow, provided our integer types are - // two's - // complement and have the same number of padding bits in signed and - // unsigned variants. - // This is the case in most modern implementations of C++. - ScalarX range = ScalarX(y) - ScalarX(x); - ScalarX offset = 0; - ScalarX divisor = 1; - ScalarX multiplier = 1; - const unsigned rand_max = RAND_MAX; - if (range <= rand_max) - divisor = (rand_max + 1) / (range + 1); - else - multiplier = 1 + range / (rand_max + 1); - // Rejection sampling. - do { - offset = (unsigned(std::rand()) * multiplier) / divisor; - } while (offset > range); - return Scalar(ScalarX(x) + offset); - } - - static inline Scalar run() { -#ifdef EIGEN_MAKING_DOCS - return run(Scalar(NumTraits::IsSigned ? -10 : 0), Scalar(10)); -#else - enum { - rand_bits = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value, - scalar_bits = sizeof(Scalar) * CHAR_BIT, - shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)), - offset = NumTraits::IsSigned - ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits, scalar_bits) - 1)) - : 0}; - return Scalar((std::rand() >> shift) - offset); -#endif - } -}; - -template -struct random_default_impl { - static inline Scalar run(const Scalar& x, const Scalar& y) { - return Scalar(random(x.real(), y.real()), random(x.imag(), y.imag())); - } - static inline Scalar run() { - typedef typename NumTraits::Real RealScalar; - return Scalar(random(), random()); - } -}; - -template -inline EIGEN_MATHFUNC_RETVAL(random, Scalar) - random(const Scalar& x, const Scalar& y) { - return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y); -} - -template -inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() { - return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(); -} - -// Implementation of is* functions - -// std::is* do not work with fast-math and gcc, std::is* are available on MSVC -// 2013 and newer, as well as in clang. 
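For reference, the integer specialization above draws a uniform value in [x, y] by widening to an unsigned type and rejection-sampling std::rand(), which avoids both the bias of a plain modulo reduction and signed-overflow undefined behavior. A minimal standalone sketch of the same idea (plain unsigned int, std::rand() as the entropy source, names purely illustrative):

#include <cstdlib>

// Uniform draw in [lo, hi], mirroring the rejection loop in the code above.
inline unsigned random_in_range(unsigned lo, unsigned hi) {
  if (hi <= lo) return lo;
  const unsigned range = hi - lo;               // never overflows in unsigned arithmetic
  const unsigned rand_max = RAND_MAX;
  unsigned divisor = 1, multiplier = 1;
  if (range <= rand_max)
    divisor = (rand_max + 1u) / (range + 1u);   // compress rand() onto [0, range]
  else
    multiplier = 1u + range / (rand_max + 1u);  // stretch rand() over [0, range]
  unsigned offset;
  do {
    offset = (unsigned(std::rand()) * multiplier) / divisor;
  } while (offset > range);                     // reject samples that fall outside the range
  return lo + offset;
}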
-#if (EIGEN_HAS_CXX11_MATH && \ - !(EIGEN_COMP_GNUC_STRICT && __FINITE_MATH_ONLY__)) || \ - (EIGEN_COMP_MSVC >= 1800) || (EIGEN_COMP_CLANG) -#define EIGEN_USE_STD_FPCLASSIFY 1 -#else -#define EIGEN_USE_STD_FPCLASSIFY 0 -#endif - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if::value, bool>::type - isnan_impl(const T&) { - return false; -} - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if::value, bool>::type - isinf_impl(const T&) { - return false; -} - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if::value, bool>::type - isfinite_impl(const T&) { - return true; -} - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if<(!internal::is_integral::value) && - (!NumTraits::IsComplex), - bool>::type - isfinite_impl(const T& x) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return (::isfinite)(x); -#elif EIGEN_USE_STD_FPCLASSIFY - using std::isfinite; - return isfinite EIGEN_NOT_A_MACRO(x); -#else - return x <= NumTraits::highest() && x >= NumTraits::lowest(); -#endif -} - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if<(!internal::is_integral::value) && - (!NumTraits::IsComplex), - bool>::type - isinf_impl(const T& x) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return (::isinf)(x); -#elif EIGEN_USE_STD_FPCLASSIFY - using std::isinf; - return isinf EIGEN_NOT_A_MACRO(x); -#else - return x > NumTraits::highest() || x < NumTraits::lowest(); -#endif -} - -template -EIGEN_DEVICE_FUNC - typename internal::enable_if<(!internal::is_integral::value) && - (!NumTraits::IsComplex), - bool>::type - isnan_impl(const T& x) { -#if defined(EIGEN_GPU_COMPILE_PHASE) - return (::isnan)(x); -#elif EIGEN_USE_STD_FPCLASSIFY - using std::isnan; - return isnan EIGEN_NOT_A_MACRO(x); -#else - return x != x; -#endif -} - -#if (!EIGEN_USE_STD_FPCLASSIFY) - -#if EIGEN_COMP_MSVC - -template -EIGEN_DEVICE_FUNC bool isinf_msvc_helper(T x) { - return _fpclass(x) == _FPCLASS_NINF || _fpclass(x) == _FPCLASS_PINF; -} - -// MSVC defines a _isnan builtin function, but for double only -EIGEN_DEVICE_FUNC inline bool isnan_impl(const long double& x) { - return _isnan(x) != 0; -} -EIGEN_DEVICE_FUNC inline bool isnan_impl(const double& x) { - return _isnan(x) != 0; -} -EIGEN_DEVICE_FUNC inline bool isnan_impl(const float& x) { - return _isnan(x) != 0; -} - -EIGEN_DEVICE_FUNC inline bool isinf_impl(const long double& x) { - return isinf_msvc_helper(x); -} -EIGEN_DEVICE_FUNC inline bool isinf_impl(const double& x) { - return isinf_msvc_helper(x); -} -EIGEN_DEVICE_FUNC inline bool isinf_impl(const float& x) { - return isinf_msvc_helper(x); -} - -#elif (defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ && EIGEN_COMP_GNUC) - -#if EIGEN_GNUC_AT_LEAST(5, 0) -#define EIGEN_TMP_NOOPT_ATTRIB \ - EIGEN_DEVICE_FUNC inline __attribute__((optimize("no-finite-math-only"))) -#else -// NOTE the inline qualifier and noinline attribute are both needed: the former -// is to avoid linking issue (duplicate symbol), -// while the second prevent too aggressive optimizations in fast-math mode: -#define EIGEN_TMP_NOOPT_ATTRIB \ - EIGEN_DEVICE_FUNC inline \ - __attribute__((noinline, optimize("no-finite-math-only"))) -#endif - -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const long double& x) { - return __builtin_isnan(x); -} -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const double& x) { - return __builtin_isnan(x); -} -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isnan_impl(const float& x) { - return __builtin_isnan(x); -} -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const double& x) { - return 
__builtin_isinf(x); -} -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const float& x) { - return __builtin_isinf(x); -} -template <> -EIGEN_TMP_NOOPT_ATTRIB bool isinf_impl(const long double& x) { - return __builtin_isinf(x); -} - -#undef EIGEN_TMP_NOOPT_ATTRIB - -#endif - -#endif - -// The following overload are defined at the end of this file -template -EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex& x); -template -EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x); -template -EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x); - -template -T generic_fast_tanh_float(const T& a_x); -} // end namespace internal - -/**************************************************************************** -* Generic math functions * -****************************************************************************/ - -namespace numext { - -#if (!defined(EIGEN_GPUCC)) -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - EIGEN_USING_STD_MATH(min); - return min EIGEN_NOT_A_MACRO(x, y); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - EIGEN_USING_STD_MATH(max); - return max EIGEN_NOT_A_MACRO(x, y); -} -#else -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) { - return y < x ? y : x; -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float mini(const float& x, - const float& y) { - return fminf(x, y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double mini(const double& x, - const double& y) { - return fmin(x, y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x, - const long double& y) { -#if defined(EIGEN_HIPCC) - // no "fminl" on HIP yet - return (x < y) ? x : y; -#else - return fminl(x, y); -#endif -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) { - return x < y ? y : x; -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float maxi(const float& x, - const float& y) { - return fmaxf(x, y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double maxi(const double& x, - const double& y) { - return fmax(x, y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x, - const long double& y) { -#if defined(EIGEN_HIPCC) - // no "fmaxl" on HIP yet - return (x > y) ? 
x : y; -#else - return fmaxl(x, y); -#endif -} -#endif - -#if defined(SYCL_DEVICE_ONLY) - -#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long) -#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long) -#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) -#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong) -#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) -#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) -#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double) -#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double) -#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE( \ - NAME, FUNC, RET_TYPE) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double) - -#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \ - return cl::sycl::FUNC(x); \ - } - -#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) \ - SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE) - -#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC( \ - NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2) \ - template <> \ - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, \ - const ARG_TYPE2& y) { \ - return cl::sycl::FUNC(x, y); \ - } - -#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \ - SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE) - -#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) \ - SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE) - -SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin) -SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax) - -#endif - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar) - real(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x); -} - -template 
-EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type< - EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)>::type -real_ref(const Scalar& x) { - return internal::real_ref_impl::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) - real_ref(Scalar& x) { - return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) - imag(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) - arg(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline typename internal::add_const_on_value_type< - EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)>::type -imag_ref(const Scalar& x) { - return internal::imag_ref_impl::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) - imag_ref(Scalar& x) { - return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) - conj(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) - abs2(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); -} - -EIGEN_DEVICE_FUNC -inline bool abs2(bool x) { return x; } - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) { - return x > y ? x - y : y - x; -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float absdiff(const float& x, - const float& y) { - return fabsf(x - y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double absdiff(const double& x, - const double& y) { - return fabs(x - y); -} -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff( - const long double& x, const long double& y) { -#if defined(EIGEN_HIPCC) - // no "fabsl" on HIP yet - return (x > y) ? 
x : y; -#else - return fabsl(x - y); -#endif -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) - norm1(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) - hypot(const Scalar& x, const Scalar& y) { - return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot) -#endif - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) - log1p(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) { - return ::log1pf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) { - return ::log1p(x); -} -#endif - -template -EIGEN_DEVICE_FUNC inline - typename internal::pow_impl::result_type - pow(const ScalarX& x, const ScalarY& y) { - return internal::pow_impl::run(x, y); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow) -#endif - -template -EIGEN_DEVICE_FUNC bool(isnan)(const T& x) { - return internal::isnan_impl(x); -} -template -EIGEN_DEVICE_FUNC bool(isinf)(const T& x) { - return internal::isinf_impl(x); -} -template -EIGEN_DEVICE_FUNC bool(isfinite)(const T& x) { - return internal::isfinite_impl(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool) -#endif - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(rint, Scalar) - rint(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(rint, Scalar)::run(x); -} - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(round, Scalar) - round(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(round, Scalar)::run(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round) -#endif - -template -EIGEN_DEVICE_FUNC T(floor)(const T& x) { - EIGEN_USING_STD_MATH(floor); - return floor(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) { - return ::floorf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) { - return ::floor(x); -} -#endif - -template -EIGEN_DEVICE_FUNC T(ceil)(const T& x) { - EIGEN_USING_STD_MATH(ceil); - return ceil(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) { - return ::ceilf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) { - return ::ceil(x); -} -#endif - -/** Log base 2 for 32 bits positive integers. - * Conveniently returns 0 for x==0. 
*/ -inline int log2(int x) { - eigen_assert(x >= 0); - unsigned int v(x); - static const int table[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, - 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, - 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - return table[(v * 0x07C4ACDDU) >> 27]; -} - -/** \returns the square root of \a x. - * - * It is essentially equivalent to - * \code using std::sqrt; return sqrt(x); \endcode - * but slightly faster for float/double and some compilers (e.g., gcc), thanks - * to - * specializations when SSE is enabled. - * - * It's usage is justified in performance critical functions, like - * norm/normalize. - */ -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sqrt(const T& x) { - EIGEN_USING_STD_MATH(sqrt); - return sqrt(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt) -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T& x) { - EIGEN_USING_STD_MATH(log); - return log(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float& x) { - return ::logf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) { - return ::log(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::IsSigned || - NumTraits::IsComplex, - typename NumTraits::Real>::type - abs(const T& x) { - EIGEN_USING_STD_MATH(abs); - return abs(x); -} - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE - typename internal::enable_if::IsSigned || - NumTraits::IsComplex), - typename NumTraits::Real>::type - abs(const T& x) { - return x; -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float& x) { - return ::fabsf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double& x) { - return ::fabs(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const std::complex& x) { - return ::hypotf(x.real(), x.imag()); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs( - const std::complex& x) { - return ::hypot(x.real(), x.imag()); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T& x) { - EIGEN_USING_STD_MATH(exp); - return exp(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float& x) { - return ::expf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double& x) { - return ::exp(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex exp( - const std::complex& x) { - float com = ::expf(x.real()); - float res_real = com * ::cosf(x.imag()); - float res_imag = com * ::sinf(x.imag()); - return std::complex(res_real, res_imag); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex exp( - const std::complex& x) { - double com = ::exp(x.real()); - double res_real = com * ::cos(x.imag()); - double res_imag = com * ::sin(x.imag()); - return std::complex(res_real, res_imag); -} -#endif - -template -EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) - expm1(const Scalar& x) { - return EIGEN_MATHFUNC_IMPL(expm1, 
Scalar)::run(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) { - return ::expm1f(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) { - return ::expm1(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T& x) { - EIGEN_USING_STD_MATH(cos); - return cos(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos, cos) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float& x) { - return ::cosf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cos(const double& x) { - return ::cos(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T& x) { - EIGEN_USING_STD_MATH(sin); - return sin(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float& x) { - return ::sinf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sin(const double& x) { - return ::sin(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T& x) { - EIGEN_USING_STD_MATH(tan); - return tan(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float& x) { - return ::tanf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double& x) { - return ::tan(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T& x) { - EIGEN_USING_STD_MATH(acos); - return acos(x); -} - -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T& x) { - EIGEN_USING_STD_MATH(acosh); - return acosh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float& x) { - return ::acosf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double acos(const double& x) { - return ::acos(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T& x) { - EIGEN_USING_STD_MATH(asin); - return asin(x); -} - -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T& x) { - EIGEN_USING_STD_MATH(asinh); - return asinh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float& x) { - return ::asinf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double asin(const double& x) { - return ::asin(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T& x) { - EIGEN_USING_STD_MATH(atan); - return atan(x); -} - -#if EIGEN_HAS_CXX11_MATH -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T& x) { - EIGEN_USING_STD_MATH(atanh); - return atanh(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh) -#endif - -#if defined(EIGEN_GPUCC) -template <> 
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float& x) { - return ::atanf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double atan(const double& x) { - return ::atan(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T& x) { - EIGEN_USING_STD_MATH(cosh); - return cosh(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float& x) { - return ::coshf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cosh(const double& x) { - return ::cosh(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T& x) { - EIGEN_USING_STD_MATH(sinh); - return sinh(x); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float& x) { - return ::sinhf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sinh(const double& x) { - return ::sinh(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T& x) { - EIGEN_USING_STD_MATH(tanh); - return tanh(x); -} - -#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY) -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { - return internal::generic_fast_tanh_float(x); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float& x) { - return ::tanhf(x); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tanh(const double& x) { - return ::tanh(x); -} -#endif - -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) { - EIGEN_USING_STD_MATH(fmod); - return fmod(a, b); -} - -#if defined(SYCL_DEVICE_ONLY) -SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod) -#endif - -#if defined(EIGEN_GPUCC) -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, - const float& b) { - return ::fmodf(a, b); -} - -template <> -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a, - const double& b) { - return ::fmod(a, b); -} -#endif - -#if defined(SYCL_DEVICE_ONLY) -#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY -#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY -#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE -#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC -#undef SYCL_SPECIALIZE_UNARY_FUNC -#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC -#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC -#undef SYCL_SPECIALIZE_BINARY_FUNC -#endif - -} // end namespace numext - -namespace internal { - -template -EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex& x) { - return (numext::isfinite)(numext::real(x)) && - (numext::isfinite)(numext::imag(x)); -} - -template -EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex& x) { - return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x)); -} - -template -EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex& x) { - return ((numext::isinf)(numext::real(x)) || - (numext::isinf)(numext::imag(x))) && - 
(!(numext::isnan)(x)); -} - -/**************************************************************************** -* Implementation of fuzzy comparisons * -****************************************************************************/ - -template -struct scalar_fuzzy_default_impl {}; - -template -struct scalar_fuzzy_default_impl { - typedef typename NumTraits::Real RealScalar; - template - EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan( - const Scalar& x, const OtherScalar& y, const RealScalar& prec) { - return numext::abs(x) <= numext::abs(y) * prec; - } - EIGEN_DEVICE_FUNC - static inline bool isApprox(const Scalar& x, - const Scalar& y, - const RealScalar& prec) { - return numext::abs(x - y) <= - numext::mini(numext::abs(x), numext::abs(y)) * prec; - } - EIGEN_DEVICE_FUNC - static inline bool isApproxOrLessThan(const Scalar& x, - const Scalar& y, - const RealScalar& prec) { - return x <= y || isApprox(x, y, prec); - } -}; - -template -struct scalar_fuzzy_default_impl { - typedef typename NumTraits::Real RealScalar; - template - EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, - const Scalar&, - const RealScalar&) { - return x == Scalar(0); - } - EIGEN_DEVICE_FUNC - static inline bool isApprox(const Scalar& x, - const Scalar& y, - const RealScalar&) { - return x == y; - } - EIGEN_DEVICE_FUNC - static inline bool isApproxOrLessThan(const Scalar& x, - const Scalar& y, - const RealScalar&) { - return x <= y; - } -}; - -template -struct scalar_fuzzy_default_impl { - typedef typename NumTraits::Real RealScalar; - template - EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan( - const Scalar& x, const OtherScalar& y, const RealScalar& prec) { - return numext::abs2(x) <= numext::abs2(y) * prec * prec; - } - EIGEN_DEVICE_FUNC - static inline bool isApprox(const Scalar& x, - const Scalar& y, - const RealScalar& prec) { - return numext::abs2(x - y) <= - numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec; - } -}; - -template -struct scalar_fuzzy_impl - : scalar_fuzzy_default_impl::IsComplex, - NumTraits::IsInteger> {}; - -template -EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan( - const Scalar& x, - const OtherScalar& y, - const typename NumTraits::Real& precision = - NumTraits::dummy_precision()) { - return scalar_fuzzy_impl::template isMuchSmallerThan( - x, y, precision); -} - -template -EIGEN_DEVICE_FUNC inline bool isApprox( - const Scalar& x, - const Scalar& y, - const typename NumTraits::Real& precision = - NumTraits::dummy_precision()) { - return scalar_fuzzy_impl::isApprox(x, y, precision); -} - -template -EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan( - const Scalar& x, - const Scalar& y, - const typename NumTraits::Real& precision = - NumTraits::dummy_precision()) { - return scalar_fuzzy_impl::isApproxOrLessThan(x, y, precision); -} - -/****************************************** -*** The special case of the bool type *** -******************************************/ - -template <> -struct random_impl { - static inline bool run() { return random(0, 1) == 0 ? 
false : true; } -}; - -template <> -struct scalar_fuzzy_impl { - typedef bool RealScalar; - - template - EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, - const bool&, - const bool&) { - return !x; - } - - EIGEN_DEVICE_FUNC - static inline bool isApprox(bool x, bool y, bool) { return x == y; } - - EIGEN_DEVICE_FUNC - static inline bool isApproxOrLessThan(const bool& x, - const bool& y, - const bool&) { - return (!x) || y; - } -}; - -} // end namespace internal - -} // end namespace Eigen - -#endif // EIGEN_MATHFUNCTIONS_H diff --git a/patches/eigen/Meta.h b/patches/eigen/Meta.h deleted file mode 100755 index d7f5cbd240a..00000000000 --- a/patches/eigen/Meta.h +++ /dev/null @@ -1,722 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2008-2015 Gael Guennebaud -// Copyright (C) 2006-2008 Benoit Jacob -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -// clang-format off - -#ifndef EIGEN_META_H -#define EIGEN_META_H - -#if defined(EIGEN_GPU_COMPILE_PHASE) - - #include - - #if defined(EIGEN_CUDA_ARCH) - #include - #endif - - #if defined(EIGEN_HIP_DEVICE_COMPILE) - #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" - #endif - -#endif - -#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L -#include -#endif - -namespace Eigen { - -typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex; - -/** - * \brief The Index type as used for the API. - * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. - * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex. - */ - -typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index; - -namespace internal { - -/** \internal - * \file Meta.h - * This file contains generic metaprogramming classes which are not specifically related to Eigen. - * \note In case you wonder, yes we're aware that Boost already provides all these features, - * we however don't want to add a dependency to Boost. - */ - -// Only recent versions of ICC complain about using ptrdiff_t to hold pointers, -// and older versions do not provide *intptr_t types. 
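One more note on the fuzzy scalar comparisons removed further above: isApprox(x, y, prec) holds when |x - y| <= min(|x|, |y|) * prec, so the tolerance is relative to the smaller magnitude and the test is never satisfied when exactly one operand is zero; isMuchSmallerThan(x, y, prec) simply checks |x| <= |y| * prec. A self-contained sketch with plain doubles (the 1e-12 default is only illustrative, standing in for the traits-provided dummy precision):

#include <algorithm>
#include <cmath>

// Relative comparisons in the spirit of the scalar_fuzzy_default_impl removed above.
inline bool is_approx(double x, double y, double prec = 1e-12) {
  return std::abs(x - y) <= std::min(std::abs(x), std::abs(y)) * prec;
}

inline bool is_much_smaller_than(double x, double y, double prec = 1e-12) {
  return std::abs(x) <= std::abs(y) * prec;
}

// e.g. is_approx(1.0, 1.0 + 1e-15) is true, while is_approx(0.0, 1e-300) is false.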
-#if EIGEN_COMP_ICC>=1600 && __cplusplus >= 201103L -typedef std::intptr_t IntPtr; -typedef std::uintptr_t UIntPtr; -#else -typedef std::ptrdiff_t IntPtr; -typedef std::size_t UIntPtr; -#endif - -struct true_type { enum { value = 1 }; }; -struct false_type { enum { value = 0 }; }; - -template -struct bool_constant; - -template<> -struct bool_constant : true_type {}; - -template<> -struct bool_constant : false_type {}; - -template -struct conditional { typedef Then type; }; - -template -struct conditional { typedef Else type; }; - -template struct remove_reference { typedef T type; }; -template struct remove_reference { typedef T type; }; - -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; -template struct remove_pointer { typedef T type; }; - -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type; }; -template struct remove_const { typedef T type[]; }; -template struct remove_const { typedef T type[Size]; }; - -template struct remove_all { typedef T type; }; -template struct remove_all { typedef typename remove_all::type type; }; -template struct remove_all { typedef typename remove_all::type type; }; -template struct remove_all { typedef typename remove_all::type type; }; -template struct remove_all { typedef typename remove_all::type type; }; -template struct remove_all { typedef typename remove_all::type type; }; - -template struct is_arithmetic { enum { value = false }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic{ enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; - -template struct is_same { enum { value = 0 }; }; -template struct is_same { enum { value = 1 }; }; - -template< class T > -struct is_void : is_same::type> {}; - -#if EIGEN_HAS_CXX11 -template<> struct is_arithmetic { enum { value = true }; }; -template<> struct is_arithmetic { enum { value = true }; }; -using std::is_integral; -#else -template struct is_integral { enum { value = false }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -#if EIGEN_COMP_MSVC -template<> struct is_integral { enum { value = true }; }; -template<> struct is_integral { enum { value = true }; }; -#endif -#endif - -#if EIGEN_HAS_CXX11 -using std::make_unsigned; -#else -// TODO: Possibly improve this implementation of make_unsigned. 
-// It is currently used only by -// template struct random_default_impl. -template struct make_unsigned; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned char type; }; -template<> struct make_unsigned { typedef unsigned short type; }; -template<> struct make_unsigned { typedef unsigned short type; }; -template<> struct make_unsigned { typedef unsigned int type; }; -template<> struct make_unsigned { typedef unsigned int type; }; -template<> struct make_unsigned { typedef unsigned long type; }; -template<> struct make_unsigned { typedef unsigned long type; }; -#if EIGEN_COMP_MSVC -template<> struct make_unsigned { typedef unsigned __int64 type; }; -template<> struct make_unsigned { typedef unsigned __int64 type; }; -#endif -#endif - -template struct add_const { typedef const T type; }; -template struct add_const { typedef T& type; }; - -template struct is_const { enum { value = 0 }; }; -template struct is_const { enum { value = 1 }; }; - -template struct add_const_on_value_type { typedef const T type; }; -template struct add_const_on_value_type { typedef T const& type; }; -template struct add_const_on_value_type { typedef T const* type; }; -template struct add_const_on_value_type { typedef T const* const type; }; -template struct add_const_on_value_type { typedef T const* const type; }; - -#if EIGEN_HAS_CXX11 - -using std::is_convertible; - -#else - -template -struct is_convertible_impl -{ -private: - struct any_conversion - { - template any_conversion(const volatile T&); - template any_conversion(T&); - }; - struct yes {int a[1];}; - struct no {int a[2];}; - - template - static yes test(T, int); - - template - static no test(any_conversion, ...); - -public: - static typename internal::remove_reference::type* ms_from; -#ifdef __INTEL_COMPILER - #pragma warning push - #pragma warning ( disable : 2259 ) -#endif - enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; -#ifdef __INTEL_COMPILER - #pragma warning pop -#endif -}; - -template -struct is_convertible -{ - enum { value = is_convertible_impl::value }; -}; - -template -struct is_convertible { enum { value = false }; }; - -template -struct is_convertible { enum { value = true }; }; - -#endif - -/** \internal Allows to enable/disable an overload - * according to a compile time condition. 
- */ -template struct enable_if; - -template struct enable_if -{ typedef T type; }; - -#if defined(EIGEN_GPU_COMPILE_PHASE) -#if !defined(__FLT_EPSILON__) -#define __FLT_EPSILON__ FLT_EPSILON -#define __DBL_EPSILON__ DBL_EPSILON -#endif - -namespace device { - -template struct numeric_limits -{ - EIGEN_DEVICE_FUNC static T epsilon() { return 0; } - EIGEN_DEVICE_FUNC static T (max)() { assert(false && "Highest not supported for this type"); } - EIGEN_DEVICE_FUNC static T (min)() { assert(false && "Lowest not supported for this type"); } - EIGEN_DEVICE_FUNC static T infinity() { assert(false && "Infinity not supported for this type"); } - EIGEN_DEVICE_FUNC static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static float epsilon() { return __FLT_EPSILON__; } - EIGEN_DEVICE_FUNC - static float (max)() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_MAX_NORMAL_F; - #else - return HIPRT_MAX_NORMAL_F; - #endif - } - EIGEN_DEVICE_FUNC - static float (min)() { return FLT_MIN; } - EIGEN_DEVICE_FUNC - static float infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF_F; - #else - return HIPRT_INF_F; - #endif - } - EIGEN_DEVICE_FUNC - static float quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN_F; - #else - return HIPRT_NAN_F; - #endif - } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static double epsilon() { return __DBL_EPSILON__; } - EIGEN_DEVICE_FUNC - static double (max)() { return DBL_MAX; } - EIGEN_DEVICE_FUNC - static double (min)() { return DBL_MIN; } - EIGEN_DEVICE_FUNC - static double infinity() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_INF; - #else - return HIPRT_INF; - #endif - } - EIGEN_DEVICE_FUNC - static double quiet_NaN() { - #if defined(EIGEN_CUDA_ARCH) - return CUDART_NAN; - #else - return HIPRT_NAN; - #endif - } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static int epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static int (max)() { return INT_MAX; } - EIGEN_DEVICE_FUNC - static int (min)() { return INT_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static unsigned int epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static unsigned int (max)() { return UINT_MAX; } - EIGEN_DEVICE_FUNC - static unsigned int (min)() { return 0; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static long epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static long (max)() { return LONG_MAX; } - EIGEN_DEVICE_FUNC - static long (min)() { return LONG_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static unsigned long epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static unsigned long (max)() { return ULONG_MAX; } - EIGEN_DEVICE_FUNC - static unsigned long (min)() { return 0; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static long long (max)() { return LLONG_MAX; } - EIGEN_DEVICE_FUNC - static long long (min)() { return LLONG_MIN; } -}; -template<> struct numeric_limits -{ - EIGEN_DEVICE_FUNC - static unsigned long long epsilon() { return 0; } - EIGEN_DEVICE_FUNC - static unsigned long long (max)() { return ULLONG_MAX; } - EIGEN_DEVICE_FUNC - static unsigned long long (min)() { return 0; } -}; - -} - -#endif - -/** \internal - * A base class do disable default copy ctor and copy assignment operator. 
- */ -class noncopyable -{ - EIGEN_DEVICE_FUNC noncopyable(const noncopyable&); - EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&); -protected: - EIGEN_DEVICE_FUNC noncopyable() {} - EIGEN_DEVICE_FUNC ~noncopyable() {} -}; - -/** \internal - * Provides access to the number of elements in the object of as a compile-time constant expression. - * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). - * - * Similar to std::tuple_size, but more general. - * - * It currently supports: - * - any types T defining T::SizeAtCompileTime - * - plain C arrays as T[N] - * - std::array (c++11) - * - some internal types such as SingleRange and AllRange - * - * The second template parameter eases SFINAE-based specializations. - */ -template struct array_size { - enum { value = Dynamic }; -}; - -template struct array_size::type> { - enum { value = T::SizeAtCompileTime }; -}; - -template struct array_size { - enum { value = N }; -}; -template struct array_size { - enum { value = N }; -}; - -#if EIGEN_HAS_CXX11 -template struct array_size > { - enum { value = N }; -}; -template struct array_size > { - enum { value = N }; -}; -#endif - -/** \internal - * Analogue of the std::size free function. - * It returns the size of the container or view \a x of type \c T - * - * It currently supports: - * - any types T defining a member T::size() const - * - plain C arrays as T[N] - * - */ -template -Index size(const T& x) { return x.size(); } - -template -Index size(const T (&) [N]) { return N; } - -/** \internal - * Convenient struct to get the result type of a unary or binary functor. - * - * It supports both the current STL mechanism (using the result_type member) as well as - * upcoming next STL generation (using a templated result member). - * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack. 
- */ -#if EIGEN_HAS_STD_RESULT_OF -template struct result_of { - typedef typename std::result_of::type type1; - typedef typename remove_all::type type; -}; -#else -template struct result_of { }; - -struct has_none {int a[1];}; -struct has_std_result_type {int a[2];}; -struct has_tr1_result {int a[3];}; - -template -struct unary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct unary_result_of_select {typedef typename Func::result_type type;}; - -template -struct unary_result_of_select {typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename unary_result_of_select::type type; -}; - -template -struct binary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct binary_result_of_select -{typedef typename Func::result_type type;}; - -template -struct binary_result_of_select -{typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename binary_result_of_select::type type; -}; - -template -struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; - -template -struct ternary_result_of_select -{typedef typename Func::result_type type;}; - -template -struct ternary_result_of_select -{typedef typename Func::template result::type type;}; - -template -struct result_of { - template - static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); - template - static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); - static has_none testFunctor(...); - - // note that the following indirection is needed for gcc-3.3 - enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; - typedef typename ternary_result_of_select::type type; -}; -#endif - -struct meta_yes { char a[1]; }; -struct meta_no { char a[2]; }; - -// Check whether T::ReturnType does exist -template -struct has_ReturnType -{ - template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); - template static meta_no testFunctor(...); - - enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; -}; - -template const T* return_ptr(); - -template -struct has_nullary_operator -{ - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()())>0)>::type * = 0); - static meta_no testFunctor(...); - - enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; -}; - -template -struct has_unary_operator -{ - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0)))>0)>::type * = 0); - static meta_no testFunctor(...); - - enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; -}; - -template -struct has_binary_operator 
-{ - template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0); - static meta_no testFunctor(...); - - enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; -}; - -/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer. - * Usage example: \code meta_sqrt<1023>::ret \endcode - */ -template Y))) > - // use ?: instead of || just to shut up a stupid gcc 4.3 warning -class meta_sqrt -{ - enum { - MidX = (InfX+SupX)/2, - TakeInf = MidX*MidX > Y ? 1 : 0, - NewInf = int(TakeInf) ? InfX : int(MidX), - NewSup = int(TakeInf) ? int(MidX) : SupX - }; - public: - enum { ret = meta_sqrt::ret }; -}; - -template -class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; - - -/** \internal Computes the least common multiple of two positive integer A and B - * at compile-time. It implements a naive algorithm testing all multiples of A. - * It thus works better if A>=B. - */ -template -struct meta_least_common_multiple -{ - enum { ret = meta_least_common_multiple::ret }; -}; -template -struct meta_least_common_multiple -{ - enum { ret = A*K }; -}; - -/** \internal determines whether the product of two numeric types is allowed and what the return type is */ -template struct scalar_product_traits -{ - enum { Defined = 0 }; -}; - -// FIXME quick workaround around current limitation of result_of -// template -// struct result_of(ArgType0,ArgType1)> { -// typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; -// }; - -/** \internal Obtains a POD type suitable to use as storage for an object of a size - * of at most Len bytes, aligned as specified by \c Align. - */ -template -struct aligned_storage { - struct type { - EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; - }; -}; - -} // end namespace internal - -namespace numext { - -#if defined(EIGEN_GPU_COMPILE_PHASE) -template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } -#else -template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } -#endif - -#if defined(EIGEN_GPU_COMPILE_PHASE) -using internal::device::numeric_limits; -#else -using std::numeric_limits; -#endif - -// Integer division with rounding up. -// T is assumed to be an integer type with a>=0, and b>0 -template -EIGEN_DEVICE_FUNC -T div_ceil(const T &a, const T &b) -{ - return (a+b-1) / b; -} - -// The aim of the following functions is to bypass -Wfloat-equal warnings -// when we really want a strict equality comparison on floating points. 
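The div_ceil helper a few lines above computes the ceiling of an integer division as (a + b - 1) / b, under the stated assumption a >= 0 and b > 0. A tiny standalone sketch (names are illustrative), with an equivalent form that avoids the intermediate a + b - 1, which can overflow when a is near the top of T's range:

// ceil(a / b) for integer a >= 0, b > 0, written the same way as the helper above.
template <typename T>
T ceil_div(const T& a, const T& b) { return (a + b - 1) / b; }

// Equivalent result, but without forming a + b - 1.
template <typename T>
T ceil_div_no_overflow(const T& a, const T& b) { return a / b + T(a % b != 0); }

// e.g. ceil_div(10, 4) == 3, ceil_div(8, 4) == 2, ceil_div(0, 4) == 0.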
-template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool equal_strict(const X& x,const Y& y) { return x == y; } - -#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } - -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } -#endif - -template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool not_equal_strict(const X& x,const Y& y) { return x != y; } - -#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } - -template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC -bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } -#endif - -/** \internal extract the bits of the float \a x */ -inline unsigned int as_uint(float x) -{ - unsigned int ret; - std::memcpy(&ret, &x, sizeof(float)); - return ret; -} - -} // end namespace numext - -} // end namespace Eigen - -// Define portable (u)int{32,64} types -#if EIGEN_HAS_CXX11 -#include -namespace Eigen { -namespace numext { -typedef std::uint8_t uint8_t; -typedef std::int8_t int8_t; -typedef std::uint16_t uint16_t; -typedef std::int16_t int16_t; -typedef std::uint32_t uint32_t; -typedef std::int32_t int32_t; -typedef std::uint64_t uint64_t; -typedef std::int64_t int64_t; -} -} -#else -// Without c++11, all compilers able to compile Eigen also -// provides the C99 stdint.h header file. -#include -namespace Eigen { -namespace numext { -typedef ::uint8_t uint8_t; -typedef ::int8_t int8_t; -typedef ::uint16_t uint16_t; -typedef ::int16_t int16_t; -typedef ::uint32_t uint32_t; -typedef ::int32_t int32_t; -typedef ::uint64_t uint64_t; -typedef ::int64_t int64_t; -} -} -#endif - -#endif // EIGEN_META_H - -// clang-format on diff --git a/patches/eigen/Tensor b/patches/eigen/Tensor deleted file mode 100644 index 1f1016f9b44..00000000000 --- a/patches/eigen/Tensor +++ /dev/null @@ -1,156 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// Copyright (C) 2014 Benoit Steiner -// Copyright (C) 2013 Christian Seiler -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -//#ifndef EIGEN_CXX11_TENSOR_MODULE -//#define EIGEN_CXX11_TENSOR_MODULE - -#include "../../../Eigen/Core" - -#if EIGEN_HAS_CXX11 - -#include "../SpecialFunctions" - -#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h" -#include "src/util/CXX11Meta.h" -#include "src/util/MaxSizeVector.h" - -/** \defgroup CXX11_Tensor_Module Tensor Module - * - * This module provides a Tensor class for storing arbitrarily indexed - * objects. - * - * \code - * #include - * \endcode - * - * Much of the documentation can be found \ref eigen_tensors "here". 
- */ - -#include -#include -#include -#include - -#ifdef _WIN32 -typedef __int16 int16_t; -typedef unsigned __int16 uint16_t; -typedef __int32 int32_t; -typedef unsigned __int32 uint32_t; -typedef __int64 int64_t; -typedef unsigned __int64 uint64_t; -#include -#else -#include -#include -#endif - -#ifdef _WIN32 -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - -#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL) -#include "ThreadPool" -#endif - -#ifdef EIGEN_USE_GPU - #include - #if defined(EIGEN_USE_HIP) - #include - #else - #include - #endif - #include -#endif - -#include "src/Tensor/TensorMacros.h" -#include "src/Tensor/TensorForwardDeclarations.h" -#include "src/Tensor/TensorMeta.h" -#include "src/Tensor/TensorFunctors.h" -#include "src/Tensor/TensorCostModel.h" -#include "src/Tensor/TensorDeviceDefault.h" -#include "src/Tensor/TensorDeviceThreadPool.h" -#include "src/Tensor/TensorDeviceGpu.h" -#ifndef gpu_assert -#define gpu_assert(x) -#endif -#include "src/Tensor/TensorDeviceSycl.h" -#include "src/Tensor/TensorIndexList.h" -#include "src/Tensor/TensorDimensionList.h" -#include "src/Tensor/TensorDimensions.h" -#include "src/Tensor/TensorInitializer.h" -#include "src/Tensor/TensorTraits.h" -#include "src/Tensor/TensorRandom.h" -#include "src/Tensor/TensorUInt128.h" -#include "src/Tensor/TensorIntDiv.h" -#include "src/Tensor/TensorGlobalFunctions.h" - -#include "src/Tensor/TensorBase.h" -#include "src/Tensor/TensorBlock.h" - -#include "src/Tensor/TensorEvaluator.h" -#include "src/Tensor/TensorExpr.h" -#include "src/Tensor/TensorReduction.h" -#include "src/Tensor/TensorReductionGpu.h" -#include "src/Tensor/TensorArgMax.h" -#include "src/Tensor/TensorConcatenation.h" -#include "src/Tensor/TensorContractionMapper.h" -#include "src/Tensor/TensorContractionBlocking.h" -#include "src/Tensor/TensorContraction.h" -#include "src/Tensor/TensorContractionThreadPool.h" -#include "src/Tensor/TensorContractionGpu.h" -#include "src/Tensor/TensorConversion.h" -#include "src/Tensor/TensorConvolution.h" -#include "src/Tensor/TensorFFT.h" -#include "src/Tensor/TensorPatch.h" -#include "src/Tensor/TensorImagePatch.h" -#include "src/Tensor/TensorVolumePatch.h" -#include "src/Tensor/TensorBroadcasting.h" -#include "src/Tensor/TensorChipping.h" -#include "src/Tensor/TensorInflation.h" -#include "src/Tensor/TensorLayoutSwap.h" -#include "src/Tensor/TensorMorphing.h" -#include "src/Tensor/TensorPadding.h" -#include "src/Tensor/TensorReverse.h" -#include "src/Tensor/TensorShuffling.h" -#include "src/Tensor/TensorStriding.h" -#include "src/Tensor/TensorCustomOp.h" -#include "src/Tensor/TensorEvalTo.h" -#include "src/Tensor/TensorForcedEval.h" -#include "src/Tensor/TensorGenerator.h" -#include "src/Tensor/TensorAssign.h" -#include "src/Tensor/TensorScan.h" -#include "src/Tensor/TensorTrace.h" - -#ifdef EIGEN_USE_SYCL -#include "src/Tensor/TensorReductionSycl.h" -#include "src/Tensor/TensorConvolutionSycl.h" -#include "src/Tensor/TensorContractionSycl.h" -#include "src/Tensor/TensorScanSycl.h" -#endif - -#include "src/Tensor/TensorExecutor.h" -#include "src/Tensor/TensorDevice.h" - -#include "src/Tensor/TensorStorage.h" -#include "src/Tensor/Tensor.h" -#include "src/Tensor/TensorFixedSize.h" -#include "src/Tensor/TensorMap.h" -#include "src/Tensor/TensorRef.h" - -#include "src/Tensor/TensorIO.h" - -#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h" - -#endif // EIGEN_HAS_CXX11 -//#endif // EIGEN_CXX11_TENSOR_MODULE diff --git a/patches/eigen/TensorBlock.h 
b/patches/eigen/TensorBlock.h deleted file mode 100644 index 1e55d12c42f..00000000000 --- a/patches/eigen/TensorBlock.h +++ /dev/null @@ -1,1559 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H -#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H - -namespace Eigen { -namespace internal { - -// -------------------------------------------------------------------------- // -// Forward declarations for templates defined below. -template -class TensorBlockIO; - -// -------------------------------------------------------------------------- // -// Helper function to compute strides for densely stored buffer of given -// dimensions. - -// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use -// this function instead everywhere. -template -EIGEN_ALWAYS_INLINE DSizes strides( - const DSizes& dimensions) { - DSizes strides; - if (NumDims == 0) return strides; - - // TODO(ezhulenev): Use templates to unroll this loop (similar to - // h_array_reduce in CXX11meta.h)? Benchmark it. - if (static_cast(Layout) == static_cast(ColMajor)) { - strides[0] = 1; - for (int i = 1; i < NumDims; ++i) { - strides[i] = strides[i - 1] * dimensions[i - 1]; - } - } else { - strides[NumDims - 1] = 1; - for (int i = NumDims - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * dimensions[i + 1]; - } - } - - return strides; -} - -template -EIGEN_ALWAYS_INLINE DSizes strides( - const Eigen::array& dimensions) { - return strides(DSizes(dimensions)); -} - -template -EIGEN_STRONG_INLINE DSizes strides( - const Sizes& sizes) { - return strides(DSizes(sizes)); -} - -// -------------------------------------------------------------------------- // - -// Tensor block shape type defines what are the shape preference for the blocks -// extracted from the larger tensor. -// -// Example: blocks of 100 elements from the large 100x100 tensor: -// - tensor: 100x100 -// - target_block_size: 100 -// -// TensorBlockShapeType: -// - kUniformAllDims: 100 blocks of size 10x10 -// - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column -// or row major layout) -enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims }; - -struct TensorBlockResourceRequirements { - TensorBlockShapeType shape_type; // target block shape - size_t size; // target block size - TensorOpCost cost_per_coeff; // cost of computing a single block element - -#ifdef EIGEN_HIPCC - // For HIPCC, we need to explicitly declare as a "device fun", the constructor - // which is implicitly invoked in the "merge" / "any" routines. 
else HIPCC - // errors out complaining about the lack of a matching constructor - EIGEN_DEVICE_FUNC - TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, - TensorOpCost cost_) - : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) - {} -#endif - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes, - TensorOpCost cost) { - const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar)); - return {shape_type, size, cost}; - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize( - TensorBlockShapeType shape_type, size_t size_in_bytes) { - // This default cost per coefficient is valid for most materialized tensor - // block evaluation implementations, because they typically just read - // coefficients from the underlying tensor storage, and write to the tensor - // block buffer (scratch or destination memory, reads and writes have linear - // access pattern). We ignore the fixed cost of block evaluation, because in - // practice it should negligible. - // - // Lazy block evaluation adds the cost of calling a functor for each - // coefficient. - // - // All non-trivial block evaluation implementations must provide their own - // cost approximation (e.g. shuffling inner dimension has a much higher cost - // because it reads memory randomly, although the total number of moved - // bytes is the same). - return withShapeAndSize(shape_type, size_in_bytes, - {/*bytes_loaded=*/sizeof(Scalar), - /*bytes_stored=*/sizeof(Scalar), - /*compute_cycles=*/0}); - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed( - size_t size_in_bytes) { - return withShapeAndSize(TensorBlockShapeType::kSkewedInnerDims, - size_in_bytes); - } - - template - EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform( - size_t size_in_bytes) { - return withShapeAndSize(TensorBlockShapeType::kUniformAllDims, - size_in_bytes); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements - merge(const TensorBlockResourceRequirements& lhs, - const TensorBlockResourceRequirements& rhs) { - return {merge(lhs.shape_type, rhs.shape_type), // shape_type - merge(lhs.size, rhs.size), // size - merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff - } - - EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff( - TensorOpCost cost) { - cost_per_coeff += cost; - return *this; - } - - // This is a resource requirement that should be returned from expressions - // that do not have any block evaluation preference (e.g. default tensor - // expression with raw buffer access). - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() { - return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}}; - } - - private: - using Requirements = TensorBlockResourceRequirements; - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) { - return numext::maxi(lhs_size, rhs_size); - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorBlockShapeType - merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) { - return (lhs == TensorBlockShapeType::kSkewedInnerDims || - rhs == TensorBlockShapeType::kSkewedInnerDims) - ? 
TensorBlockShapeType::kSkewedInnerDims - : TensorBlockShapeType::kUniformAllDims; - } - - EIGEN_DEVICE_FUNC - static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, - TensorOpCost rhs_cost) { - return lhs_cost + rhs_cost; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockDescriptor specifies a block offset within a tensor and the block -// sizes along each of the tensor dimensions. - -template -class TensorBlockDescriptor { - public: - typedef DSizes Dimensions; - - // If we evaluate a Tensor assignment, and expression on the left, already has - // a memory buffer, then we might do performance optimization, and evaluate - // the root expression directly into the final output memory. Some time it's - // possible to reuse it for materializing subexpressions inside an expression - // tree, to to avoid dynamic memory allocation. - // - // The pointer type of the underlying storage is erased, because passing - // Scalar type through all the expression evaluation layers is way too many - // templates. In practice destination buffer type should always match the - // evaluated expression scalar type. - class DestinationBuffer { - public: - enum DestinationBufferKind : int { - // The above explicit specification of "int" as the enum basetype is - // needed to get around a HIPCC link error ("the field type is not - // amp-compatible") - // which is issued for class members with the enum type. - // TODO(rocm): - // remove the "int" basetype once HIPCC has been fixed to not error out - // in the above scenario. - - // Destination buffer is not defined (`m_data` == nullptr). - kEmpty, - - // Tensor block defined by an owning tensor block descriptor can fit - // contiguously into the destination buffer. In this case it's safe to - // materialize tensor block in the destination buffer, wrap it in a - // TensorMap, and use to build Eigen expression on top of it. - kContiguous, - - // Destination buffer strides do not match strides of the contiguously - // stored block, and it's impossible to define a TensorMap over this - // buffer. However if we are evaluating a root of an expression tree, we - // still can materialize an output into this destination, because we can - // guarantee that no one will ever access it through block API. - // - // In theory it is possible to build valid TensorStriding - // expression on top of this destination buffer, however it has - // inefficient coeff/packet access, and defeats the purpose of fast block - // evaluation API. 
- kStrided - }; - - template - Scalar* data() const { - eigen_assert(m_data_type_size == sizeof(Scalar)); - return static_cast(m_data); - } - - const Dimensions& strides() const { return m_strides; } - const DestinationBufferKind& kind() const { return m_kind; } - - private: - friend class TensorBlockDescriptor; - - DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {} - - template - DestinationBuffer(Scalar* data, const Dimensions& strides, - DestinationBufferKind kind) - : m_data(static_cast(data)), - m_data_type_size(sizeof(Scalar)), - m_strides(strides), - m_kind(kind) {} - - template - static DestinationBuffer make(const TensorBlockDescriptor& desc, - Scalar* data, const Dimensions& strides) { - return DestinationBuffer(data, strides, kind(desc, strides)); - } - - template - static DestinationBufferKind kind(const TensorBlockDescriptor& desc, - const Dimensions& strides) { - const Dimensions& desc_dims = desc.dimensions(); - const Dimensions& desc_strides = internal::strides(desc_dims); - for (int i = 0; i < NumDims; ++i) { - if (desc_dims[i] == 1) continue; - if (desc_strides[i] != strides[i]) return kStrided; - } - return kContiguous; - } - - // Storage pointer is type erased, to reduce template bloat, but we still - // keep the size of the underlying element type for error checking. - void* m_data; - size_t m_data_type_size; - - // Destination buffer dimensions always match the dimensions of a tensor - // block descriptor it belongs to, however strides might be different. - Dimensions m_strides; - - DestinationBufferKind m_kind; - }; - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, - const DestinationBuffer& destination) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(destination) {} - - TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions) - : m_offset(offset), - m_dimensions(dimensions), - m_destination(DestinationBuffer()) {} - - IndexType offset() const { return m_offset; } - const Dimensions& dimensions() const { return m_dimensions; } - IndexType dimension(int index) const { return m_dimensions[index]; } - IndexType size() const { return array_prod(m_dimensions); } - - const DestinationBuffer& destination() const { return m_destination; } - - template - void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) { - eigen_assert(dst_base != NULL); - m_destination = - DestinationBuffer::template make(*this, dst_base, dst_strides); - } - - template - void AddDestinationBuffer( - Scalar* dst_base, - const DSizes& dst_strides) { - // DSizes constructor will do index type promotion if it's safe. - AddDestinationBuffer(dst_base, Dimensions(dst_strides)); - } - - TensorBlockDescriptor& DropDestinationBuffer() { - m_destination.m_data = NULL; - m_destination.m_kind = DestinationBuffer::kEmpty; - return *this; - } - - bool HasDestinationBuffer() const { - return m_destination.kind() != DestinationBuffer::kEmpty; - } - - // Returns a copy of `*this` with updated offset. - TensorBlockDescriptor WithOffset(IndexType offset) const { - return TensorBlockDescriptor(offset, m_dimensions, m_destination); - } - - private: - // Offset and dimensions are immutable after construction. Block descriptor - // can only be mutated by adding or dropping destination. 
- const IndexType m_offset; - const Dimensions m_dimensions; - DestinationBuffer m_destination; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockMapper is responsible for iterating over the blocks of a tensor. - -template -class TensorBlockMapper { - typedef TensorBlockDescriptor BlockDescriptor; - - public: - typedef DSizes Dimensions; - - TensorBlockMapper() = default; - TensorBlockMapper(const DSizes& dimensions, - const TensorBlockResourceRequirements& requirements) - : m_tensor_dimensions(dimensions), m_requirements(requirements) { - // Compute block dimensions and the total number of blocks. - InitializeBlockDimensions(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { - return m_total_block_count; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { - return m_block_dimensions.TotalSize(); - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes& - blockDimensions() const { - return m_block_dimensions; - } - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor - blockDescriptor(IndexType block_index) const { - static const bool isColMajor = Layout == static_cast(ColMajor); - - IndexType offset = 0; - DSizes dimensions; - - if (NumDims == 0) return BlockDescriptor(offset, dimensions); - - // Iterate outer -> inner dimensions. - for (int i = NumDims - 1; i >= 0; --i) { - const int dim = isColMajor ? i : NumDims - i - 1; - - const IndexType idx = block_index / m_block_strides[dim]; - block_index -= idx * m_block_strides[dim]; - - const IndexType coord = idx * m_block_dimensions[dim]; - dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, - m_block_dimensions[dim]); - offset += coord * m_tensor_strides[dim]; - } - - return {offset, dimensions}; - } - - private: - void InitializeBlockDimensions() { - // Requested block shape and size. - const TensorBlockShapeType shape_type = m_requirements.shape_type; - IndexType target_block_size = - numext::maxi(1, static_cast(m_requirements.size)); - - IndexType tensor_size = m_tensor_dimensions.TotalSize(); - - // Corner case: one of the dimensions is zero. Logic below is too complex - // to handle this case on a general basis, just use unit block size. - // Note: we must not yield blocks with zero dimensions (recipe for - // overflows/underflows, divisions by zero and NaNs later). - if (tensor_size == 0) { - for (int i = 0; i < NumDims; ++i) { - m_block_dimensions[i] = 1; - } - m_total_block_count = 0; - return; - } - - // If tensor fits into a target block size, evaluate it as a single block. - if (tensor_size <= target_block_size) { - m_block_dimensions = m_tensor_dimensions; - m_total_block_count = 1; - // The only valid block index is `0`, and in this case we do not need - // to compute real strides for tensor or blocks (see blockDescriptor). - for (int i = 0; i < NumDims; ++i) { - m_tensor_strides[i] = 0; - m_block_strides[i] = 1; - } - return; - } - - static const bool isColMajor = Layout == static_cast(ColMajor); - - // Block shape skewed towards inner dimension. - if (shape_type == TensorBlockShapeType::kSkewedInnerDims) { - IndexType coeff_to_allocate = target_block_size; - - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? 
i : NumDims - i - 1; - m_block_dimensions[dim] = - numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]); - coeff_to_allocate = divup( - coeff_to_allocate, - numext::maxi(static_cast(1), m_block_dimensions[dim])); - } - eigen_assert(coeff_to_allocate == 1); - - } else if (shape_type == TensorBlockShapeType::kUniformAllDims) { - // Tensor will not fit within 'target_block_size' budget: calculate tensor - // block dimension sizes based on "square" dimension size target. - const IndexType dim_size_target = convert_index( - std::pow(static_cast(target_block_size), - 1.0f / static_cast(m_block_dimensions.rank()))); - - for (int i = 0; i < NumDims; ++i) { - // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it - // a multiple of the packet size. Note that reducing - // 'block_dim_size' in this manner can increase the number of - // blocks, and so will amplify any per-block overhead. - m_block_dimensions[i] = - numext::mini(dim_size_target, m_tensor_dimensions[i]); - } - - // Add any un-allocated coefficients to inner dimension(s). - IndexType total_size = m_block_dimensions.TotalSize(); - for (int i = 0; i < NumDims; ++i) { - const int dim = isColMajor ? i : NumDims - i - 1; - - if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) { - const IndexType total_size_other_dims = - total_size / m_block_dimensions[dim]; - const IndexType alloc_avail = - divup(target_block_size, total_size_other_dims); - if (alloc_avail == m_block_dimensions[dim]) { - // Insufficient excess coefficients to allocate. - break; - } - m_block_dimensions[dim] = - numext::mini(m_tensor_dimensions[dim], alloc_avail); - total_size = total_size_other_dims * m_block_dimensions[dim]; - } - } - - } else { - eigen_assert(false); // unknown block shape - } - - eigen_assert(m_block_dimensions.TotalSize() >= - numext::mini(target_block_size, - m_tensor_dimensions.TotalSize())); - - // Calculate block counts by dimension and total block count. - DSizes block_count; - for (int i = 0; i < NumDims; ++i) { - block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]); - } - m_total_block_count = array_prod(block_count); - - // Calculate block strides (used for enumerating blocks). - m_tensor_strides = strides(m_tensor_dimensions); - m_block_strides = strides(block_count); - } - - DSizes m_tensor_dimensions; - TensorBlockResourceRequirements m_requirements; - - DSizes m_block_dimensions; - IndexType m_total_block_count; - - DSizes m_tensor_strides; - DSizes m_block_strides; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockScratchAllocator is responsible for allocating temporary buffers -// for block evaluation (output or input block materialization). Given that -// Eigen expression traversal order is deterministic, all temporary allocations -// are happening in the same order, and usually have exactly the same size. -// Scratch allocator keeps a trace of all dynamic allocations, and after the -// first block evaluation is completed, we should be able to reuse all the -// temporary buffers for the next block evaluation. - -template -class TensorBlockScratchAllocator { - public: - explicit TensorBlockScratchAllocator(const Device& device) - : m_device(device), m_allocation_index(0) {} - - ~TensorBlockScratchAllocator() { - for (size_t i = 0; i < m_allocations.size(); ++i) { - m_device.deallocate(m_allocations[i].ptr); - } - } - - void* allocate(size_t size) { - // TODO(ezhulenev): Remove when replaced with inlined vector. 
- if (m_allocations.capacity() == 0) m_allocations.reserve(8); - - // Check if we already have an existing allocation att current index. - const int num_allocations = static_cast(m_allocations.size()); - const bool has_allocation = m_allocation_index < num_allocations; - - // Allocation index can't be larger than the number of allocations. - eigen_assert(m_allocation_index <= num_allocations); - - // If we have existing allocation, and its size is larger or equal to - // requested size, we do nothing. - - // If current allocation can't fit requested size, we deallocate it, and - // replace with a larger allocation. - if (has_allocation && m_allocations[m_allocation_index].size < size) { - m_device.deallocate(m_allocations[m_allocation_index].ptr); - m_allocations[m_allocation_index].ptr = m_device.allocate(size); - m_allocations[m_allocation_index].size = size; - } - - // Make a new allocation if we don't have and existing one. - if (!has_allocation) { - Allocation allocation; - allocation.ptr = m_device.allocate(size); - allocation.size = size; - m_allocations.push_back(allocation); - } - - eigen_assert(m_allocations[m_allocation_index].ptr != NULL); - eigen_assert(m_allocations[m_allocation_index].size >= size); - - return m_allocations[m_allocation_index++].ptr; - } - - void reset() { m_allocation_index = 0; } - - private: - struct Allocation { - void* ptr; - size_t size; - }; - - const Device& m_device; - int m_allocation_index; - // TODO(ezhulenev): This should be an inlined vector. - std::vector m_allocations; -}; - -// -------------------------------------------------------------------------- // -// TensorBlockKind represents all possible block kinds, that can be produced by -// TensorEvaluator::evalBlock function. -enum TensorBlockKind { - // Tensor block that is a lazy expression that must be assigned to a - // destination using TensorBlockAssign. - kExpr, - - // Tensor block that is a view into a memory buffer owned by an underlying - // Tensor expression (e.g. it can be a view into a Tensor buffer). - kView, - - // Tensor block that was materialized in a scratch memory buffer, allocated - // with TensorBlockScratchAllocator. This block must be copied to a - // destination, similar to a block of `kExpr` type. - kMaterializedInScratch, - - // Tensor block that was materialized directly into the final output memory - // buffer. For example if the left side of an assignment is a Tensor, we can - // directly materialize the block in the destination memory. - // - // If strides in the output buffer do not match tensor block strides, the - // Tensor expression will be invalid, and should not be used by - // TensorBlockAssign or for constructing another block expression. - kMaterializedInOutput -}; - -// -------------------------------------------------------------------------- // -// TensorBlockNotImplemented should be used to defined TensorBlock typedef in -// TensorEvaluators that do not support block evaluation. - -class TensorBlockNotImplemented { - public: - typedef void XprType; -}; - -// -------------------------------------------------------------------------- // -// XprScalar extracts Scalar type from the Eigen expressions (if expression type -// is not void). It's required to be able to define lazy block expression for -// argument types, that do not support block evaluation. 
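// Illustrative aside, not part of the patched header: a minimal standalone sketch of the
// XprScalar idea described above -- forward a nested Scalar typedef from an expression type,
// and fall back to void when the expression type itself is void. MyXprScalar and ToyExpr are
// hypothetical names used only for this example.
#include <type_traits>

template <typename XprType>
struct MyXprScalar {
  typedef typename XprType::Scalar type;   // forward the expression's Scalar
};
template <>
struct MyXprScalar<void> {
  typedef void type;                       // expressions without block support map to void
};

struct ToyExpr { typedef float Scalar; };  // stand-in for a real tensor expression

static_assert(std::is_same<MyXprScalar<ToyExpr>::type, float>::value,
              "nested Scalar is forwarded");
static_assert(std::is_same<MyXprScalar<void>::type, void>::value,
              "void falls through unchanged");

int main() { return 0; }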
- -template -struct XprScalar { - typedef typename XprType::Scalar type; -}; -template <> -struct XprScalar { - typedef void type; -}; - -// -------------------------------------------------------------------------- // -// TensorMaterializedBlock is a fully evaluated block of the original tensor, -// and XprType is just a TensorMap over the data. This block type is typically -// used to materialize blocks of tensor expressions, that can't be efficiently -// represented as lazy Tensor expressions with fast coeff/packet operations, -// e.g. we materialize all broadcasts into evaluated blocks. -// -// TensorMaterializedBlock does not own its memory buffer, it's either a memory -// buffer that backs the original expression (e.g. block is just a view into a -// Tensor), or a memory buffer allocated with scratch allocator, and in this -// case the scratch allocator will deallocate it at the end of block based -// expression execution. -// -// If the block was evaluated directly into the output buffer, and strides in -// the output buffer do not match block strides, the TensorMap expression will -// be invalid, and should never be used in block assignment or any other tensor -// expression. - -template -class TensorMaterializedBlock { - public: - typedef DSizes Dimensions; - typedef TensorMap > XprType; - - TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, - const Dimensions& dimensions, bool valid_expr = true) - : m_kind(kind), - m_data(data), - m_dimensions(dimensions), - m_expr(m_data, m_dimensions), - m_valid_expr(valid_expr) { - eigen_assert(m_kind == internal::TensorBlockKind::kView || - m_kind == internal::TensorBlockKind::kMaterializedInScratch || - m_kind == internal::TensorBlockKind::kMaterializedInOutput); - } - - TensorBlockKind kind() const { return m_kind; } - // NOTE(ezhulenev): Returning XprType by value like in other block types - // causes asan failures. The theory is that XprType::Nested doesn't work - // properly for TensorMap. - const XprType& expr() const { - eigen_assert(m_valid_expr); - return m_expr; - } - const Scalar* data() const { return m_data; } - void cleanup() {} - - typedef internal::TensorBlockDescriptor TensorBlockDesc; - - // TensorMaterializedBlock can be backed by different types of storage: - // - // (1) Contiguous block of memory allocated with scratch allocator. - // (2) Contiguous block of memory reused from tensor block descriptor - // destination buffer. - // (3) Strided block of memory reused from tensor block descriptor - // destination buffer. - // - class Storage { - public: - Scalar* data() const { return m_data; } - const Dimensions& dimensions() const { return m_dimensions; } - const Dimensions& strides() const { return m_strides; } - - TensorMaterializedBlock AsTensorMaterializedBlock() const { - return TensorMaterializedBlock( - m_materialized_in_output - ? 
internal::TensorBlockKind::kMaterializedInOutput - : internal::TensorBlockKind::kMaterializedInScratch, - m_data, m_dimensions, !m_strided_storage); - } - - private: - friend class TensorMaterializedBlock; - - Storage(Scalar* data, const Dimensions& dimensions, - const Dimensions& strides, bool materialized_in_output, - bool strided_storage) - : m_data(data), - m_dimensions(dimensions), - m_strides(strides), - m_materialized_in_output(materialized_in_output), - m_strided_storage(strided_storage) {} - - Scalar* m_data; - Dimensions m_dimensions; - Dimensions m_strides; - bool m_materialized_in_output; - bool m_strided_storage; - }; - - // Creates a storage for materialized block either from the block descriptor - // destination buffer, or allocates a new buffer with scratch allocator. - template - EIGEN_STRONG_INLINE static Storage prepareStorage( - TensorBlockDesc& desc, TensorBlockScratch& scratch, - bool allow_strided_storage = false) { - // Try to reuse destination as an output block buffer. - typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer; - - if (desc.destination().kind() == DestinationBuffer::kContiguous) { - Scalar* buffer = desc.destination().template data(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), - internal::strides(desc.dimensions()), - /*materialized_in_output=*/true, - /*strided_storage=*/false); - - } else if (desc.destination().kind() == DestinationBuffer::kStrided && - allow_strided_storage) { - Scalar* buffer = desc.destination().template data(); - desc.DropDestinationBuffer(); - return Storage(buffer, desc.dimensions(), desc.destination().strides(), - /*materialized_in_output=*/true, /*strided_storage=*/true); - - } else { - void* mem = scratch.allocate(desc.size() * sizeof(Scalar)); - return Storage(static_cast(mem), desc.dimensions(), - internal::strides(desc.dimensions()), - /*materialized_in_output=*/false, - /*strided_storage=*/false); - } - } - - // Creates a materialized block for the given descriptor from a memory buffer. - template - EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize( - const Scalar* data, const DataDimensions& data_dims, - TensorBlockDesc& desc, TensorBlockScratch& scratch) { - eigen_assert(array_size::value == desc.dimensions().size()); - - // If a tensor block dimensions covers a contiguous block of the underlying - // memory, we can skip block buffer memory allocation, and construct a block - // from existing `data` memory buffer. - // - // Example: (RowMajor layout) - // data_dims: [11, 12, 13, 14] - // desc.dimensions(): [1, 1, 3, 14] - // - // In this case we can construct a TensorBlock starting at - // `data + desc.offset()`, with a `desc.dimensions()` block sizes. - static const bool is_col_major = Layout == ColMajor; - - // Find out how many inner dimensions have a matching size. - int num_matching_inner_dims = 0; - for (int i = 0; i < NumDims; ++i) { - int dim = is_col_major ? i : NumDims - i - 1; - if (data_dims[dim] != desc.dimensions()[dim]) break; - ++num_matching_inner_dims; - } - - // All the outer dimensions must be of size `1`, except a single dimension - // before the matching inner dimension (`3` in the example above). - bool can_use_direct_access = true; - for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) { - int dim = is_col_major ? 
i : NumDims - i - 1; - if (desc.dimension(dim) != 1) { - can_use_direct_access = false; - break; - } - } - - if (can_use_direct_access) { - const Scalar* block_start = data + desc.offset(); - return TensorMaterializedBlock(internal::TensorBlockKind::kView, - block_start, desc.dimensions()); - - } else { - // Reuse destination buffer or allocate new buffer with scratch allocator. - const Storage storage = prepareStorage(desc, scratch); - - typedef internal::TensorBlockIO - TensorBlockIO; - typedef typename TensorBlockIO::Dst TensorBlockIODst; - typedef typename TensorBlockIO::Src TensorBlockIOSrc; - - TensorBlockIOSrc src(internal::strides(Dimensions(data_dims)), - data, desc.offset()); - TensorBlockIODst dst(storage.dimensions(), storage.strides(), - storage.data()); - - TensorBlockIO::Copy(dst, src); - return storage.AsTensorMaterializedBlock(); - } - } - - private: - TensorBlockKind m_kind; - const Scalar* m_data; - Dimensions m_dimensions; - XprType m_expr; - bool m_valid_expr; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp -// functor to the blocks produced by the underlying Tensor expression. - -template -class TensorCwiseUnaryBlock { - static const bool NoArgBlockAccess = - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseUnaryOp >:: - type XprType; - - typedef typename XprScalar::type Scalar; - - TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor) - : m_arg_block(arg_block), m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { return XprType(m_arg_block.expr(), m_functor); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - UnaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp -// functor to the blocks produced by the underlying Tensor expression. - -template -class TensorCwiseBinaryBlock { - static const bool NoArgBlockAccess = - internal::is_void::value || - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - TensorCwiseBinaryOp >::type - XprType; - - typedef typename XprScalar::type Scalar; - - TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, - const RhsTensorBlock& right_block, - const BinaryOp& functor) - : m_left_block(left_block), - m_right_block(right_block), - m_functor(functor) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - - XprType expr() const { - return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); - } - - const Scalar* data() const { return NULL; } - - void cleanup() { - m_left_block.cleanup(); - m_right_block.cleanup(); - } - - private: - LhsTensorBlock m_left_block; - RhsTensorBlock m_right_block; - BinaryOp m_functor; -}; - -// -------------------------------------------------------------------------- // -// TensorUnaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from a block of the underlying type (this is a -// generalization of the TensorCwiseUnaryBlock for arbitrary expressions). 
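// Illustrative aside, not part of the patched header: a simplified sketch of the
// "block factory" pattern described above. A factory names the expression type it builds
// for any argument expression and constructs it in expr(); a thin wrapper block forwards
// its argument block through the factory. All names below are hypothetical.
#include <cassert>

struct ToyArgBlock {                       // stand-in for a real argument tensor block
  typedef double XprType;
  XprType expr() const { return 2.0; }
  void cleanup() {}
};

struct ScaleFactory {                      // maps an argument expression to a scaled one
  double factor;
  template <typename ArgXpr>
  struct XprType { typedef ArgXpr type; };
  template <typename ArgXpr>
  ArgXpr expr(const ArgXpr& arg) const { return arg * factor; }
};

template <typename ArgBlock, typename Factory>
struct UnaryExprBlock {                    // builds its own expression lazily via the factory
  typedef typename Factory::template XprType<typename ArgBlock::XprType>::type XprType;
  ArgBlock arg;
  Factory factory;
  XprType expr() const { return factory.expr(arg.expr()); }
  void cleanup() { arg.cleanup(); }
};

int main() {
  UnaryExprBlock<ToyArgBlock, ScaleFactory> block{ToyArgBlock{}, ScaleFactory{3.0}};
  assert(block.expr() == 6.0);             // 2.0 from the argument block, scaled by 3.0
  return 0;
}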
- -template -class TensorUnaryExprBlock { - typedef typename ArgTensorBlock::XprType ArgXprType; - static const bool NoArgBlockAccess = internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType::type>::type XprType; - - typedef typename XprScalar::type Scalar; - - TensorUnaryExprBlock(const ArgTensorBlock& arg_block, - const BlockFactory& factory) - : m_arg_block(arg_block), m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { return m_factory.expr(m_arg_block.expr()); } - const Scalar* data() const { return NULL; } - void cleanup() { m_arg_block.cleanup(); } - - private: - ArgTensorBlock m_arg_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// TensorTernaryExprBlock is a lazy tensor expression block that can construct -// an arbitrary tensor expression from three blocks of the underlying type. - -template -class TensorTernaryExprBlock { - typedef typename Arg1TensorBlock::XprType Arg1XprType; - typedef typename Arg2TensorBlock::XprType Arg2XprType; - typedef typename Arg3TensorBlock::XprType Arg3XprType; - - static const bool NoArgBlockAccess = internal::is_void::value || - internal::is_void::value || - internal::is_void::value; - - public: - typedef typename conditional< - NoArgBlockAccess, void, - typename BlockFactory::template XprType::type>::type XprType; - - typedef typename XprScalar::type Scalar; - - TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, - const Arg2TensorBlock& arg2_block, - const Arg3TensorBlock& arg3_block, - const BlockFactory& factory) - : m_arg1_block(arg1_block), - m_arg2_block(arg2_block), - m_arg3_block(arg3_block), - m_factory(factory) {} - - TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; } - XprType expr() const { - return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), - m_arg3_block.expr()); - } - const Scalar* data() const { return NULL; } - void cleanup() { - m_arg1_block.cleanup(); - m_arg2_block.cleanup(); - m_arg3_block.cleanup(); - } - - private: - Arg1TensorBlock m_arg1_block; - Arg2TensorBlock m_arg2_block; - Arg3TensorBlock m_arg3_block; - BlockFactory m_factory; -}; - -// -------------------------------------------------------------------------- // -// StridedLinearBufferCopy provides a method to copy data between two linear -// buffers with different strides, with optimized paths for scatter/gather. - -template -class StridedLinearBufferCopy { - typedef typename packet_traits::type Packet; - enum { - Vectorizable = packet_traits::Vectorizable, - PacketSize = packet_traits::size - }; - - public: - // Specifying linear copy kind statically gives ~30% speedup for small sizes. 
- enum class Kind { - Linear = 0, // src_stride == 1 && dst_stride == 1 - Scatter = 1, // src_stride == 1 && dst_stride != 1 - FillLinear = 2, // src_stride == 0 && dst_stride == 1 - FillScatter = 3, // src_stride == 0 && dst_stride != 1 - Gather = 4, // dst_stride == 1 - Random = 5 // everything else - }; - - struct Dst { - Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - Scalar* data; - }; - - struct Src { - Src(IndexType o, IndexType s, const Scalar* d) - : offset(o), stride(s), data(d) {} - - IndexType offset; - IndexType stride; - const Scalar* data; - }; - - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, - const Src& src, - const size_t count) { - Run(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, - src.data); - } - - private: - template - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const IndexType count, const IndexType dst_offset, - const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data, - const IndexType src_offset, const IndexType src_stride, - const Scalar* EIGEN_RESTRICT src_data) { - const Scalar* src = &src_data[src_offset]; - Scalar* dst = &dst_data[dst_offset]; - - if (!Vectorizable) { - for (Index i = 0; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - return; - } - - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - if (kind == StridedLinearBufferCopy::Kind::Linear) { - // ******************************************************************** // - // Linear copy from `src` to `dst`. - const IndexType unrolled_size = count - 4 * PacketSize; - eigen_assert(src_stride == 1 && dst_stride == 1); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - Packet p = ploadu(src + i + j * PacketSize); - pstoreu(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Scatter) { - // Scatter from `src` to `dst`. - eigen_assert(src_stride == 1 && dst_stride != 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = ploadu(src + i); - pscatter(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = src[i]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) { - // Fill `dst` with value at `*src`. - eigen_assert(src_stride == 0 && dst_stride == 1); - const IndexType unrolled_size = count - 4 * PacketSize; - Packet p = pload1(src); - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - pstoreu(dst + i + j * PacketSize, p); - } - } - for (; i <= vectorized_size; i += PacketSize) { - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) { - // Scatter `*src` into `dst`. 
- eigen_assert(src_stride == 0 && dst_stride != 1); - Packet p = pload1(src); - for (; i <= vectorized_size; i += PacketSize) { - pscatter(dst + i * dst_stride, p, dst_stride); - } - for (; i < count; ++i) { - dst[i * dst_stride] = *src; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Gather) { - // Gather from `src` into `dst`. - eigen_assert(dst_stride == 1); - for (; i <= vectorized_size; i += PacketSize) { - Packet p = pgather(src + i * src_stride, src_stride); - pstoreu(dst + i, p); - } - for (; i < count; ++i) { - dst[i] = src[i * src_stride]; - } - // ******************************************************************** // - } else if (kind == StridedLinearBufferCopy::Kind::Random) { - // Random. - for (; i < count; ++i) { - dst[i * dst_stride] = src[i * src_stride]; - } - } else { - eigen_assert(false); - } - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block. -// It's possible to specify src->dst dimension mapping for the copy operation. -// Dimensions of `dst` specify how many elements have to be copied, for the -// `src` we need to know only stride to navigate through source memory buffer. - -template -class TensorBlockIO { - static const bool IsColMajor = (Layout == ColMajor); - - typedef StridedLinearBufferCopy LinCopy; - - public: - typedef DSizes Dimensions; - typedef DSizes DimensionsMap; - - struct Dst { - Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, - IndexType dst_offset = 0) - : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - struct Src { - Src(const Dimensions& src_strides, const Scalar* src, - IndexType src_offset = 0) - : strides(src_strides), data(src), offset(src_offset) {} - - Dimensions strides; - const Scalar* data; - IndexType offset; - }; - - // Copies data to `dst` from `src`, using provided dimensions mapping: - // - // src_dimension_index = dst_to_src_dim_map[dst_dimension_index] - // - // Returns the number of copied elements. - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy( - const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) { - // Copy single scalar value from `src` to `dst`. - if (NumDims == 0) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Both `dst` and `src` must have contiguous innermost dimension. We also - // accept the special case with stride '0', because it's used as a trick to - // implement broadcasting. - { - int inner_dim = IsColMajor ? 0 : NumDims - 1; - EIGEN_UNUSED_VARIABLE(inner_dim); - eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0); - eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0); - } - - // Give a shorter name to `dst_to_src_dim_map`. - const DimensionsMap& dim_map = dst_to_src_dim_map; - - // Do not squeeze reordered inner dimensions. - int num_squeezable_dims = NumSqueezableInnerDims(dim_map); - - // NOTE: We find the innermost dimension (contiguous in memory) in the dst - // block, and we write data linearly into that dimension, reading it from - // the src. If dimensions are reordered, we might end up reading data from - // the src with `stride != 1`. 
- // - // NOTE: Random-Read/Linear-Write can be up to ~2X faster than - // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680 - - // Find the innermost dimension in the dst whose size is not 1. This is the - // effective inner dim. - int num_size_one_inner_dims = 0; - for (int i = 0; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? i : NumDims - i - 1; - if (dst.dims[dst_dim] != 1) break; - num_size_one_inner_dims++; - } - - // If all dimensions are of size 1, just copy a scalar from `src` to `dst`. - if (num_size_one_inner_dims == NumDims) { - *(dst.data + dst.offset) = *(src.data + src.offset); - return 1; - } - - // Outermost dimension in the dst with `stride == 1` (contiguous in memory). - const int dst_stride1_dim = IsColMajor - ? num_size_one_inner_dims - : NumDims - num_size_one_inner_dims - 1; - - // Dimension in the src that corresponds to the dst innermost dimension. - const int src_dim_for_dst_stride1_dim = - NumDims == 0 ? 1 : dim_map[dst_stride1_dim]; - - // Size of the innermost dimension (length of contiguous blocks of memory). - IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim]; - - // Squeeze multiple inner dims into one if they are contiguous in `dst` and - // `src` memory, so we can do less linear copy calls. - for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) { - const int dst_dim = IsColMajor ? i : NumDims - i - 1; - const IndexType dst_stride = dst.strides[dst_dim]; - const IndexType src_stride = src.strides[dim_map[dst_dim]]; - if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) { - dst_inner_dim_size *= dst.dims[dst_dim]; - ++num_size_one_inner_dims; - } else { - break; - } - } - - // Setup strides to read data from `src` and write to `dst`. - IndexType input_offset = src.offset; - IndexType output_offset = dst.offset; - IndexType input_stride = - NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim]; - IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim]; - - const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1; - array it; - - // Initialize block iterator state. Squeeze away any dimension of size 1. - int idx = 0; // currently initialized iterator state index - for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) { - const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2; - if (dst.dims[dst_dim] == 1) continue; - - it[idx].size = dst.dims[dst_dim]; - it[idx].input_stride = src.strides[dim_map[dst_dim]]; - it[idx].output_stride = dst.strides[dst_dim]; - - it[idx].input_span = it[idx].input_stride * (it[idx].size - 1); - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - - idx++; - } - - // Iterate copying data from src to dst. - const IndexType block_total_size = NumDims == 0 ? 
1 : dst.dims.TotalSize(); - -#define COPY_INNER_DIM(KIND) \ - IndexType num_copied = 0; \ - for (num_copied = 0; num_copied < block_total_size; \ - num_copied += dst_inner_dim_size) { \ - LinCopy::template Run( \ - typename LinCopy::Dst(output_offset, output_stride, dst.data), \ - typename LinCopy::Src(input_offset, input_stride, src.data), \ - dst_inner_dim_size); \ - \ - for (int j = 0; j < idx; ++j) { \ - if (++it[j].count < it[j].size) { \ - input_offset += it[j].input_stride; \ - output_offset += it[j].output_stride; \ - break; \ - } \ - it[j].count = 0; \ - input_offset -= it[j].input_span; \ - output_offset -= it[j].output_span; \ - } \ - } \ - return num_copied; - - if (input_stride == 1 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Linear); - } else if (input_stride == 1 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::Scatter); - } else if (input_stride == 0 && output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::FillLinear); - } else if (input_stride == 0 && output_stride != 1) { - COPY_INNER_DIM(LinCopy::Kind::FillScatter); - } else if (output_stride == 1) { - COPY_INNER_DIM(LinCopy::Kind::Gather); - } else { - COPY_INNER_DIM(LinCopy::Kind::Random); - } - -#undef COPY_INNER_DIM - } - - // Copy from `src` to `dst` with an identity src->dst dimension map. Returns - // the number of copied elements. - static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, - const Src& src) { - DimensionsMap dst_to_src_map; - for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i; - return Copy(dst, src, dst_to_src_map); - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : size(0), - count(0), - input_stride(0), - output_stride(0), - input_span(0), - output_span(0) {} - - IndexType size; - IndexType count; - IndexType input_stride; - IndexType output_stride; - IndexType input_span; - IndexType output_span; - }; - - // Compute how many inner dimensions it's allowed to squeeze when doing IO - // between two tensor blocks. It's safe to squeeze inner dimensions, only - // if they are not reordered. - static int NumSqueezableInnerDims(const DimensionsMap& dim_map) { - int num_squeezable_dims = 0; - for (int i = 0; i < NumDims; ++i) { - const int dim = IsColMajor ? i : NumDims - i - 1; - if (dim_map[dim] != dim) break; - num_squeezable_dims++; - } - return num_squeezable_dims; - } -}; - -// -------------------------------------------------------------------------- // -// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to -// a Tensor block defined by `desc`, backed by a memory buffer at `target`. -// -// Currently there is no way to write from a Tensor expression to a block of -// memory, if dimensions are reordered. If you need to do that, you should -// materialize a Tensor block expression into a memory buffer, and then use -// TensorBlockIO to copy data between two memory buffers with a custom -// `target->src` dimension map (see definition above). -// -// Also currently the innermost dimension of `target` must have a stride '1' -// (contiguous in memory). This restriction could be lifted with a `pscatter`, -// but in practice it's never needed, and there is a similar TensorBlockIO -// workaround for that. -// -// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO -// where `src` is a tensor expression. Explore if it is possible to rewrite IO -// to use expressions instead of pointers, and after that TensorBlockAssignment -// will become an alias to IO. 
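// Illustrative aside, not part of the patched header: a standalone sketch of the assignment
// idea described above -- evaluate an expression coefficient by coefficient and write the
// results into a destination buffer whose innermost dimension is contiguous (stride 1),
// while outer dimensions may be strided. All names below are hypothetical.
#include <cassert>
#include <vector>

struct Iota {                              // toy expression: coeff(i) returns i
  int coeff(int i) const { return i; }
};

// Assign a rows x cols expression into a destination with an arbitrary row stride;
// the inner (column) dimension of the destination must be contiguous.
template <typename Expr>
void assign_block(const Expr& expr, int rows, int cols, int* dst, int dst_row_stride) {
  int input_offset = 0;
  for (int r = 0; r < rows; ++r) {
    int* out = dst + r * dst_row_stride;
    for (int c = 0; c < cols; ++c) out[c] = expr.coeff(input_offset + c);
    input_offset += cols;                  // the expression is read linearly from the start
  }
}

int main() {
  std::vector<int> dst(16, -1);            // 2x8 destination, only a 2x3 block is written
  assign_block(Iota{}, /*rows=*/2, /*cols=*/3, dst.data(), /*dst_row_stride=*/8);
  assert(dst[0] == 0 && dst[1] == 1 && dst[2] == 2);   // first row of the block
  assert(dst[8] == 3 && dst[9] == 4 && dst[10] == 5);  // second row, one stride apart
  assert(dst[3] == -1);                                // the rest of the buffer is untouched
  return 0;
}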
-template -class TensorBlockAssignment { - // We will use coeff/packet path to evaluate block expressions. - typedef TensorEvaluator - TensorBlockEvaluator; - - typedef DSizes Dimensions; - - enum { - Vectorizable = packet_traits::Vectorizable, - PacketSize = packet_traits::size - }; - - template - struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - for (IndexType i = 0; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - template - struct InnerDimAssign { - EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, - const Evaluator& eval, - IndexType eval_offset) { - typedef typename packet_traits::type Packet; - - const IndexType unrolled_size = count - 4 * PacketSize; - const IndexType vectorized_size = count - PacketSize; - IndexType i = 0; - - for (; i <= unrolled_size; i += 4 * PacketSize) { - for (int j = 0; j < 4; ++j) { - const IndexType idx = eval_offset + i + j * PacketSize; - Packet p = eval.template packet(idx); - pstoreu(target + i + j * PacketSize, p); - } - } - - for (; i <= vectorized_size; i += PacketSize) { - Packet p = eval.template packet(eval_offset + i); - pstoreu(target + i, p); - } - - for (; i < count; ++i) { - target[i] = eval.coeff(eval_offset + i); - } - } - }; - - public: - struct Target { - Target(const Dimensions& target_dims, const Dimensions& target_strides, - Scalar* target_data, IndexType target_offset = 0) - : dims(target_dims), - strides(target_strides), - data(target_data), - offset(target_offset) {} - - Dimensions dims; - Dimensions strides; - Scalar* data; - IndexType offset; - }; - - static Target target(const Dimensions& target_dims, - const Dimensions& target_strides, Scalar* target_data, - IndexType target_offset = 0) { - return Target(target_dims, target_strides, target_data, target_offset); - } - - template - static Target target( - const DSizes& target_dims, - const DSizes& target_strides, - Scalar* target_data, IndexType target_offset = 0) { - // DSizes constructor will do index type promotion if it's safe. - return Target(Dimensions(target_dims), Dimensions(target_strides), - target_data, target_offset); - } - - static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run( - const Target& target, const TensorBlockExpr& expr) { - // Prepare evaluator for block expression. - DefaultDevice default_device; - TensorBlockEvaluator eval(expr, default_device); - - // Tensor block expression dimension should match destination dimensions. - eigen_assert(dimensions_match(target.dims, eval.dimensions())); - - static const int Layout = TensorBlockEvaluator::Layout; - static const bool is_col_major = Layout == ColMajor; - - // Initialize output inner dimension size based on a layout. - const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize(); - const int inner_dim_idx = is_col_major ? 0 : NumDims - 1; - IndexType output_inner_dim_size = target.dims[inner_dim_idx]; - - // Target inner dimension stride must be '1'. - eigen_assert(target.strides[inner_dim_idx] == 1); - - // Squeeze multiple inner dims into one if they are contiguous in `target`. - IndexType num_squeezed_dims = 0; - for (Index i = 1; i < NumDims; ++i) { - const Index dim = is_col_major ? i : NumDims - i - 1; - const IndexType target_stride = target.strides[dim]; - - if (output_inner_dim_size == target_stride) { - output_inner_dim_size *= target.dims[dim]; - num_squeezed_dims++; - } else { - break; - } - } - - // Initialize output block iterator state. 
Dimension in this array are - // always in inner_most -> outer_most order (col major layout). - array it; - - int idx = 0; // currently initialized iterator state index - for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) { - const Index dim = is_col_major ? i + 1 : NumDims - i - 2; - - it[idx].count = 0; - it[idx].size = target.dims[dim]; - it[idx].output_stride = target.strides[dim]; - it[idx].output_span = it[idx].output_stride * (it[idx].size - 1); - idx++; - } - - // We read block expression from the beginning, and start writing data to - // `target` at given offset. - IndexType input_offset = 0; - IndexType output_offset = target.offset; - - // Iterate copying data from `eval` to `target`. - for (IndexType i = 0; i < output_size; i += output_inner_dim_size) { - // Assign to `target` at current offset. - InnerDimAssign::Run(target.data + output_offset, - output_inner_dim_size, eval, - input_offset); - - // Move input offset forward by the number of assigned coefficients. - input_offset += output_inner_dim_size; - - // Update index. - for (int j = 0; j < idx; ++j) { - if (++it[j].count < it[j].size) { - output_offset += it[j].output_stride; - break; - } - it[j].count = 0; - output_offset -= it[j].output_span; - } - } - } - - private: - struct BlockIteratorState { - BlockIteratorState() - : count(0), size(0), output_stride(0), output_span(0) {} - - IndexType count; - IndexType size; - IndexType output_stride; - IndexType output_span; - }; -}; - -// -------------------------------------------------------------------------- // - -} // namespace internal -} // namespace Eigen - -#endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index bcf80fa4771..ea183e94448 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -1971,9 +1971,9 @@ class TestPow_factor_tensor(TestActivation): feed={"x": input}, fetch_list=[out_1, out_2, res, out_6]) - assert np.array_equal(res_1, np.power(input, 2)) - assert np.array_equal(res_2, np.power(input, 3)) - assert np.array_equal(res_6, np.power(input, 3)) + assert np.allclose(res_1, np.power(input, 2)) + assert np.allclose(res_2, np.power(input, 3)) + assert np.allclose(res_6, np.power(input, 3)) def test_error(self): in1 = fluid.layers.data( -- GitLab From e973bd732d80c4c9ffa0395c1db7d80b9dfabda5 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Wed, 31 Mar 2021 15:16:31 +0800 Subject: [PATCH 117/486] Polish tensor pipeline (#31701) * polish tensor pipeline. test=develop --- paddle/fluid/pybind/imperative.cc | 62 +++++++++++++++++++ python/paddle/fluid/core.py | 2 + .../fluid/dataloader/dataloader_iter.py | 8 ++- python/paddle/fluid/dataloader/flat.py | 12 +--- python/paddle/fluid/dataloader/worker.py | 6 +- 5 files changed, 77 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index eed3b3b7691..40cf6cd84be 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -494,6 +494,39 @@ void BindImperative(py::module *m_ptr) { }, py::return_value_policy::take_ownership); + m.def("_array_to_share_memory_tensor", + [](py::object &obj) { + // 1. 
cast to python array
+          auto array = obj.cast<py::array>();
+          PADDLE_ENFORCE_NE(
+              string::Sprintf("%s", array.dtype()).compare("object"), 0,
+              platform::errors::InvalidArgument(
+                  "Failed to convert input data to a regular ndarray.\n * "
+                  "Usually this means the input data contains nested "
+                  "lists with different lengths.\n * Check the reader "
+                  "function passed to 'set_(sample/sample_list/batch)"
+                  "_generator' to locate the data causes this issue."));
+          // 2. construct LoDTensor
+          framework::LoDTensor t;
+          SetTensorFromPyArray<platform::CPUPlace>(&t, array,
+                                                   platform::CPUPlace(), true);
+          // 3. allocate shared memory
+          void *data_ptr = t.data<void>();
+          size_t data_size = t.numel() * framework::SizeOfType(t.type());
+          auto shared_writer_holder =
+              memory::allocation::AllocateMemoryMapWriterAllocation(data_size);
+          // 4. maintain mmap fd set & backup ipc_name
+          const std::string &ipc_name = shared_writer_holder->ipc_name();
+          memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+          // 5. copy data & reset holder
+          memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(),
+                       platform::CPUPlace(), data_ptr, data_size);
+          t.ResetHolder(shared_writer_holder);
+
+          return t;
+        },
+        py::return_value_policy::take_ownership);
+
   m.def("_remove_tensor_list_mmap_fds", [](py::list &tensor_list) {
     for (size_t i = 0; i < tensor_list.size(); ++i) {
       auto t = tensor_list[i].cast<framework::LoDTensor>();
@@ -1111,6 +1144,35 @@ void BindImperative(py::module *m_ptr) {
           y = x.cuda(1)
           print(y.place)        # CUDAPlace(1)
        )DOC")
+      .def("_share_memory",
+           [](const std::shared_ptr<imperative::VarBase> &self) {
+#ifndef _WIN32
+             PADDLE_ENFORCE_EQ(
+                 platform::is_cpu_place(self->Place()), true,
+                 platform::errors::InvalidArgument(
+                     "Sharing memory only supports CPU Tensor currently"));
+             // 1. get LoDTensor
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             // 2. allocate shared memory
+             void *data_ptr = t->data<void>();
+             size_t data_size = t->numel() * framework::SizeOfType(t->type());
+             auto shared_writer_holder =
+                 memory::allocation::AllocateMemoryMapWriterAllocation(
+                     data_size);
+             // 3. maintain mmap fd set & backup ipc_name
+             const std::string &ipc_name = shared_writer_holder->ipc_name();
+             memory::allocation::MemoryMapFdSet::Instance().Insert(ipc_name);
+             // 4.
copy data & reset holder + memory::Copy(platform::CPUPlace(), shared_writer_holder->ptr(), + platform::CPUPlace(), data_ptr, data_size); + t->ResetHolder(shared_writer_holder); + return *t; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Sharing memory in Windows OS is not supported currently")); +#endif + }, + py::return_value_policy::reference) .def("copy_", &imperative::VarBase::CopyFrom) .def("_copy_to", [](const std::shared_ptr &self, diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 4c24eb3d7fc..d3dc26c946d 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -279,6 +279,7 @@ if avx_supported(): from .core_avx import _set_process_signal_handler from .core_avx import _throw_error_if_process_failed from .core_avx import _convert_to_tensor_list + from .core_avx import _array_to_share_memory_tensor from .core_avx import _cleanup_mmap_fds from .core_avx import _remove_tensor_list_mmap_fds except Exception as e: @@ -333,6 +334,7 @@ if load_noavx: from .core_noavx import _set_process_signal_handler from .core_noavx import _throw_error_if_process_failed from .core_noavx import _convert_to_tensor_list + from .core_noavx import _array_to_share_memory_tensor from .core_noavx import _cleanup_mmap_fds from .core_noavx import _remove_tensor_list_mmap_fds except Exception as e: diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 0cd12e874d9..167c7987c55 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -166,7 +166,9 @@ class _DataLoaderIterSingleProcess(_DataLoaderIterBase): # pack as LoDTensorArray array = core.LoDTensorArray() for slot in batch: - if not isinstance(slot, core.LoDTensor): + if isinstance(slot, paddle.Tensor): + slot = slot.value().get_tensor() + elif not isinstance(slot, core.LoDTensor): tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp @@ -388,7 +390,9 @@ class _DataLoaderIterMultiProcess(_DataLoaderIterBase): # LoDTensor not in shared memory is not # serializable, cannot be create in workers for slot in batch: - if not isinstance(slot, core.LoDTensor): + if isinstance(slot, paddle.Tensor): + slot = slot.value().get_tensor() + elif not isinstance(slot, core.LoDTensor): tmp = core.LoDTensor() tmp.set(slot, core.CPUPlace()) slot = tmp diff --git a/python/paddle/fluid/dataloader/flat.py b/python/paddle/fluid/dataloader/flat.py index 6cccbc7ee4e..db3a725ece0 100644 --- a/python/paddle/fluid/dataloader/flat.py +++ b/python/paddle/fluid/dataloader/flat.py @@ -36,14 +36,10 @@ def _flatten_batch(batch): def _flatten(batch, flat_batch, structure, field_idx): if isinstance(batch, Sequence): for field in batch: - if isinstance(field, np.ndarray): + if isinstance(field, (np.ndarray, paddle.Tensor)): structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) flat_batch.append(field) field_idx += 1 - elif isinstance(field, paddle.Tensor): - structure.append('{}{}'.format(FIELD_PREFIX, field_idx)) - flat_batch.append(field.numpy()) - field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): structure.append(field) elif isinstance(field, Sequence): @@ -58,14 +54,10 @@ def _flatten_batch(batch): structure.append(field) elif isinstance(batch, Mapping): for k, field in batch.items(): - if isinstance(field, np.ndarray): + if isinstance(field, (np.ndarray, paddle.Tensor)): structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) flat_batch.append(field) field_idx += 1 - elif 
isinstance(field, paddle.Tensor): - structure[k] = '{}{}'.format(FIELD_PREFIX, field_idx) - flat_batch.append(field.numpy()) - field_idx += 1 elif isinstance(field, (str, bytes, numbers.Number)): structure[k] = field elif isinstance(field, Sequence): diff --git a/python/paddle/fluid/dataloader/worker.py b/python/paddle/fluid/dataloader/worker.py index 2d1b554e53d..26bd1f06e12 100644 --- a/python/paddle/fluid/dataloader/worker.py +++ b/python/paddle/fluid/dataloader/worker.py @@ -238,7 +238,11 @@ def _worker_loop(dataset, dataset_kind, indices_queue, out_queue, done_event, out_queue.put((idx, batch, None)) batch, structure = _flatten_batch(batch) if use_shared_memory: - tensor_list = core._convert_to_tensor_list(batch) + tensor_list = [ + core._array_to_share_memory_tensor(b) + if isinstance(b, np.ndarray) else b._share_memory() + for b in batch + ] out_queue.put((idx, tensor_list, structure)) core._remove_tensor_list_mmap_fds(tensor_list) else: -- GitLab From ea738ddaf42633654f06f4c6c88497c94cd546bc Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 31 Mar 2021 15:39:04 +0800 Subject: [PATCH 118/486] delete cuda9 code (#31883) --- cmake/configure.cmake | 4 +-- cmake/cuda.cmake | 28 ++----------------- .../elementwise/elementwise_op_function.cu.h | 4 --- paddle/fluid/platform/cuda_device_function.h | 12 +++----- paddle/fluid/platform/cuda_helper.h | 4 --- paddle/fluid/platform/float16_test.cu | 3 +- 6 files changed, 9 insertions(+), 46 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 9c1bd52e7fb..851520328f3 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -93,8 +93,8 @@ if(WITH_GPU) FIND_PACKAGE(CUDA REQUIRED) - if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 7) - message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile") + if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 10.1) + message(FATAL_ERROR "Paddle needs CUDA >= 10.1 to compile") endif() if(NOT CUDNN_FOUND) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 05b55952074..7f2addb02d3 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -6,15 +6,9 @@ endif() if (WITH_NV_JETSON) add_definitions(-DWITH_NV_JETSON) set(paddle_known_gpu_archs "53 62 72") - set(paddle_known_gpu_archs7 "53") - set(paddle_known_gpu_archs8 "53 62") - set(paddle_known_gpu_archs9 "53 62") set(paddle_known_gpu_archs10 "53 62 72") else() - set(paddle_known_gpu_archs "30 35 50 52 60 61 70") - set(paddle_known_gpu_archs7 "30 35 50 52") - set(paddle_known_gpu_archs8 "30 35 50 52 60 61") - set(paddle_known_gpu_archs9 "30 35 50 52 60 61 70") + set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80") set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75") set(paddle_known_gpu_archs11 "52 60 61 70 75 80") endif() @@ -160,25 +154,7 @@ function(select_nvcc_arch_flags out_variable) endfunction() message(STATUS "CUDA detected: " ${CMAKE_CUDA_COMPILER_VERSION}) -if (${CMAKE_CUDA_COMPILER_VERSION} LESS 7.0) - set(paddle_known_gpu_archs ${paddle_known_gpu_archs}) -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 8.0) # CUDA 7.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs7}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 9.0) # CUDA 8.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs8}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - # CUDA 8 may complain that sm_20 is no 
longer supported. Suppress the - # warning for now. - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 10.0) # CUDA 9.x - set(paddle_known_gpu_archs ${paddle_known_gpu_archs9}) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Wno-deprecated-gpu-targets") -elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x +if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) # CUDA 10.x set(paddle_known_gpu_archs ${paddle_known_gpu_archs10}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D_MWAITXINTRIN_H_INCLUDED") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h index 1121d0ef68c..6d5dcc4dd6f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h @@ -34,10 +34,6 @@ limitations under the License. */ #endif #endif // PADDLE_WITH_HIP -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000 -#define __h2div h2div -#endif - #define DIV_ERROR_INFO \ "InvalidArgumentError: Integer division by zero encountered in divide. " \ "Please check.\n" diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index 4f504b414de..dde9531e591 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -26,14 +26,10 @@ namespace platform { #ifdef PADDLE_WITH_HIP #define CREATE_SHFL_MASK(mask, predicate) mask = __ballot((predicate)) #else -#if CUDA_VERSION < 9000 -#define CREATE_SHFL_MASK(mask, predicate) mask = 0u; -#else #define FULL_WARP_MASK 0xFFFFFFFF #define CREATE_SHFL_MASK(mask, predicate) \ mask = __ballot_sync(FULL_WARP_MASK, (predicate)) #endif -#endif inline static int RoundToPowerOfTwo(int dim) { if (dim > 512) { @@ -69,7 +65,7 @@ template __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, int delta, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) return __shfl_down(val, delta, width); #else return __shfl_down_sync(mask, val, static_cast(delta), width); @@ -79,7 +75,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, template __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, int width = warpSize) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) return __shfl_xor(val, width); #else return __shfl_xor_sync(mask, val, width); @@ -87,7 +83,7 @@ __forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, } // CUDA 9.0 have native compatible float16 shfl_down -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) template <> __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, float16 val, int delta, @@ -170,7 +166,7 @@ __forceinline__ __device__ paddle::platform::complex128 CudaShuffleXorSync( template __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { -#if defined(PADDLE_WITH_HIP) || CUDA_VERSION < 9000 +#if defined(PADDLE_WITH_HIP) return __shfl(val, src_line, width); #else return __shfl_sync(mask, val, src_line, width); diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h index 
fa4ef3f8c12..202be920c55 100644
--- a/paddle/fluid/platform/cuda_helper.h
+++ b/paddle/fluid/platform/cuda_helper.h
@@ -25,10 +25,6 @@
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
 
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000
-enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 };
-#endif
-
 namespace paddle {
 namespace platform {
 
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index d181660e311..75e35d398c2 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -197,8 +197,7 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-#if defined(PADDLE_WITH_HIP) || \
-    (defined(PADDLE_WITH_CUDA) && CUDA_VERSION < 9000)
+#if defined(PADDLE_WITH_HIP)
 ARITHMETIC_KERNEL(Add, +)
 ARITHMETIC_KERNEL(Sub, -)
 ARITHMETIC_KERNEL(Mul, *)
-- 
GitLab


From 6f85e24187bb1c29dce10eaccbac6a4935d3513c Mon Sep 17 00:00:00 2001
From: Kqnonrime <36952116+Kqnonrime@users.noreply.github.com>
Date: Wed, 31 Mar 2021 15:58:30 +0800
Subject: [PATCH 119/486] fix one error message (#31904)

* fix one error message

* fix an error message

* new fix three error messages

* new fix three error messages

* new fix some error

* new fix one error message
---
 paddle/fluid/operators/meshgrid_op.cc |  4 +++-
 paddle/fluid/operators/meshgrid_op.h  | 10 +++++++---
 paddle/fluid/operators/range_op.h     |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc
index 12b255329da..33f71b4adc0 100644
--- a/paddle/fluid/operators/meshgrid_op.cc
+++ b/paddle/fluid/operators/meshgrid_op.cc
@@ -108,7 +108,9 @@ class MeshgridGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Out")).size(), 1,
                       platform::errors::InvalidArgument(
-                          "Number of Inputs(Out@Grad) must be larger than 1"));
+                          "Number of Inputs(Out@Grad) should be larger than 1."
+                          " But received Inputs(Out@Grad)'s size = %d.",
+                          ctx->Inputs(framework::GradVarName("Out")).size()));
     ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
   }
 
diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h
index 11cd43b2204..162622c7d01 100644
--- a/paddle/fluid/operators/meshgrid_op.h
+++ b/paddle/fluid/operators/meshgrid_op.h
@@ -60,7 +60,8 @@ class MeshgridKernel : public framework::OpKernel<T> {
       REP_MESHGRID_TEMPLATE(MAX_RANK_SUPPORTED)
       default:
         PADDLE_THROW(platform::errors::InvalidArgument(
-            "Only support tensor nums between 1 and 6."));
+            "Expected Tensor numbers between 1 and 6, but only received %d.",
+            rank));
     }
   }
 
@@ -71,7 +72,9 @@ class MeshgridKernel : public framework::OpKernel<T> {
     auto outs = context.MultiOutput<framework::Tensor>("Out");
     PADDLE_ENFORCE_EQ(
         ins.size() > 1, true,
-        platform::errors::InvalidArgument("expect at least 2 input tensors"));
+        platform::errors::InvalidArgument(
+            "Expected at least 2 input tensors, but only received %d.",
+            ins.size()));
 
     int64_t size = ins.size();
     std::vector<int64_t> shape(size);
@@ -131,7 +134,8 @@ class MeshgridGradKernel : public framework::OpKernel<T> {
       REP_MESHGRID_GRAD_TEMPLATE(MAX_RANK_SUPPORTED)
       default:
         PADDLE_THROW(platform::errors::InvalidArgument(
-            "only support tensor nums being between 1 and 6."));
+            "Expected Tensor numbers between 1 and 6, but only received %d.",
+            n));
     }
   }
 
diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h
index a793d12f522..5344147a906 100644
--- a/paddle/fluid/operators/range_op.h
+++ b/paddle/fluid/operators/range_op.h
@@ -34,7 +34,7 @@ void GetSize(T start, T end, T step, int64_t* size) {
   if (start > end) {
     PADDLE_ENFORCE_LT(step, 0,
                       platform::errors::InvalidArgument(
-                          "step should be less than 0 while start > end."));
+                          "The step should be less than 0 while start > end."));
   }
 
   *size = std::is_integral<T>::value
-- 
GitLab


From 695dd37182824d9c82ea4490d7e13e924de1c00b Mon Sep 17 00:00:00 2001
From: lilong12
Date: Wed, 31 Mar 2021 16:06:11 +0800
Subject: [PATCH 120/486] Adjust pipeline optimizer for 3d parallelism (#31939)

* update, test=develop
---
 paddle/fluid/framework/pipeline_trainer.cc    |  27 +-
 .../fleet/meta_optimizers/common.py           |   5 +
 .../meta_optimizers/pipeline_optimizer.py     |  11 +-
 python/paddle/fluid/optimizer.py              | 286 ++++++++++--------
 4 files changed, 168 insertions(+), 161 deletions(-)

diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc
index a97fc2e75aa..5968df548df 100644
--- a/paddle/fluid/framework/pipeline_trainer.cc
+++ b/paddle/fluid/framework/pipeline_trainer.cc
@@ -71,37 +71,16 @@ void PipelineTrainer::CopyParameters(int microbatch_id,
                                      const ProgramDesc& program,
                                      const platform::Place& place) {
   auto& global_block = program.Block(0);
-  std::map<std::string, int> param_map;
-  for (auto& var : global_block.AllVars()) {
-    if (var->Persistable()) {
-      param_map[var->Name()] = 1;
-    }
-  }
   for (auto& var : global_block.AllVars()) {
-    bool is_param_grad = false;
-    size_t pos = 0;
-    // A magic suffix to indicate the merged gradient
-    std::string magicSuffix = std::string(kGradVarSuffix) + "@MERGED";
-    if ((pos = var->Name().find(magicSuffix)) != std::string::npos) {
-      auto prefix_name = var->Name().substr(0, pos);
-      if (param_map.find(prefix_name) != param_map.end()) {
-        is_param_grad = true;
-      }
-    }
     if (var->Persistable() && microbatch_id == 0) {
       auto* ptr = root_scope_->Var(var->Name());
      InitializeVariable(ptr, var->GetType());
-      VLOG(3) << "Create persistable var: " << var->Name()
-              << ", which
pointer is " << ptr; - } else if (is_param_grad && microbatch_id == 0) { - auto* ptr = minibatch_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(3) << "Create grad for persistable var: " << var->Name() + VLOG(5) << "Create persistable var: " << var->Name() << ", which pointer is " << ptr; - } else if (!var->Persistable() && !is_param_grad) { + } else if (!var->Persistable()) { auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(3) << "Create variable " << var->Name() << " for microbatch " + VLOG(5) << "Create variable " << var->Name() << " for microbatch " << microbatch_id << ", which pointer is " << ptr; InitializeVariable(ptr, var->GetType()); } diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index c3d27bcc4ea..a7f938647ad 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -106,6 +106,11 @@ class CollectiveHelper(object): 'use_calc_stream': True, OP_ROLE_KEY: OpRole.Forward }) + block.append_op( + type='c_sync_calc_stream', + inputs={'X': sync_var}, + outputs={'Out': sync_var}, + attrs={OP_ROLE_KEY: OpRole.Forward}) block = program.global_block() if core.is_compiled_with_cuda(): diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py index 6f435bb86ba..6cb7593b6bf 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -171,6 +171,7 @@ class PipelineOptimizer(MetaOptimizerBase): program._pipeline_opt['ring_id'] = self.start_pipeline_ring_id program._pipeline_opt['micro_batch_size'] = self.micro_batch_size program._pipeline_opt['schedule_mode'] = self.schedule_mode + program._pipeline_opt['use_sharding'] = False optimize_ops, params_grads, prog_list, pp_pair, ring_map = self.wrapped_opt.minimize( loss, startup_program, parameter_list, no_grad_set) self.startup_program = orig_startup_program._pipeline_opt[ @@ -218,7 +219,6 @@ class PipelineOptimizer(MetaOptimizerBase): grad = None processed_param_name = set() first_optimize_op_idx = None - add_sync_calc_stream = False for idx, op in reversed(list(enumerate(block.ops))): if is_backward_op(op) and not first_optimize_op_idx: first_optimize_op_idx = idx + 1 @@ -242,15 +242,6 @@ class PipelineOptimizer(MetaOptimizerBase): origin_param = origin_block.vars[op_role_var[i]] if origin_param.is_distributed: continue - if not add_sync_calc_stream: - add_sync_calc_stream = True - block._insert_op( - first_optimize_op_idx + offset, - type='c_sync_calc_stream', - inputs={'X': grad}, - outputs={'Out': grad}, - attrs={OP_ROLE_KEY: OpRole.Optimize}) - offset += 1 block._insert_op( first_optimize_op_idx + offset, diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 2aa918bf806..76c5a309103 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -3805,7 +3805,6 @@ class PipelineOptimizer(object): self._param_device_map = None self._pipeline_pair = [] self._pp_ring_map = dict() - self._global_ring_id = None # insert allreduce op to sync global information for global # gradient clip and amp @@ -3841,7 +3840,7 @@ class PipelineOptimizer(object): inputs={'X': temp_var if op.type == "reduce_any" else out_var}, outputs={'Out': temp_var if op.type == "reduce_any" else out_var}, attrs={ 
- 'ring_id': self._global_ring_id, + 'ring_id': self.global_ring_id, self._op_role_key: self._op_role.Optimize, 'use_calc_stream': True }) @@ -3887,6 +3886,16 @@ class PipelineOptimizer(object): reserved_x.append(input_name) op.desc.set_input('X', reserved_x) op.desc.set_output('Out', reserved_x) + elif op.type == 'check_finite_and_unscale': + for input_name in op.desc.input("X"): + if block._find_var_recursive(input_name): + reserved_x.append(input_name) + op.desc.set_input('X', reserved_x) + op.desc.set_output('Out', reserved_x) + if len(reserved_x) == 0: + block._remove_op(op_idx) + op_size -= 1 + continue elif op.type == 'sum' and self._is_gradient_clip_op(op): for input_name in op.desc.input("X"): if block._find_var_recursive(input_name): @@ -4020,63 +4029,32 @@ class PipelineOptimizer(object): self._create_vars(new_startup_program.global_block(), block) return new_startup_program - def _find_post_op(self, ops, cur_op, var_name): + def _find_post_op(self, index, var_name): """ - Find the real post op that has variable named var_name as input. - - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as output. - var_name (string): Variable name. + Find the post op that has variable named var_name as input. """ - # To skip the cast op added by amp which has no op_device set - if '.cast_fp32' in var_name: - var_name = var_name.replace('.cast_fp32', '') - elif '.cast_fp16' in var_name: - var_name = var_name.replace('.cast_fp16', '') - post_op = [] - before = True - for op in ops: - if op == cur_op: - before = False - continue - if before: - continue - for in_var_name in op.input_arg_names: - if in_var_name == var_name: - post_op.append(op) - break - if post_op: - return post_op[0] - return None + post_ops = self.input_var_to_op[var_name] + if post_ops == None: return None + result_op = None + for post_op, post_idx in reversed(post_ops): + if post_idx > index: + result_op = post_op + break + return result_op - def _find_real_prev_op(self, ops, cur_op, var_name): + def _find_prev_op(self, index, var_name): """ - Find the real previous op that outputs variable named var_name. - - Args: - ops (list): A list of ops. - cur_op (Operator): Current operator which has variable named - var_name as input. - var_name (string): Variable name. + Find the previous op of op with index that outputs + variable named var_name. """ - prev_op = [] - for op in ops: - if op.type == 'send_v2' or op.type == 'recv_v2' \ - or op.type == 'c_broadcast': - continue - if op == cur_op: + prev_ops = self.output_var_to_op[var_name] + if prev_ops == None: return None + result_op = None + for prev_op, prev_idx in reversed(prev_ops): + if prev_idx < index: + result_op = prev_op break - for out_var_name in op.output_arg_names: - if out_var_name == var_name: - prev_op.append(op) - if prev_op: - # A op may have more than one prev op, - # e.g., for 'learning_rate', there may be multiple ops have it as - # output. 
- return prev_op[-1] - return None + return result_op def _rename_arg(self, op, old_name, new_name): op._rename_input(old_name, new_name) @@ -4136,23 +4114,21 @@ class PipelineOptimizer(object): # For LRSched ops, we should put them on all sub-programs to # make sure each sub-program update the lr correctly op._set_attr(self._op_device_key, "gpu:all") - elif (op.type == "cast" or - op.type == "scale") and self._is_backward_op(op): - prev_op = self._find_real_prev_op(block.ops, op, - op.desc.input("X")[0]) + elif op.type == "scale" and self._is_backward_op(op): + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) elif op.type == "memcpy" and not self._is_optimize_op(op): + # for checkpoint offloading assert len(op.input_arg_names) == 1 and len( op.output_arg_names) == 1 input_name = op.input_arg_names[0] output_name = op.output_arg_names[0] if '@Fetch' in output_name: - post_op = self._find_post_op(block.ops, op, output_name) + post_op = self._find_post_op(idx, output_name) op._set_attr(self._op_device_key, post_op.attr(self._op_device_key)) else: - prev_op = self._find_real_prev_op(block.ops, op, - op.desc.input("X")[0]) + prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) elif self._is_loss_op(op): @@ -4165,16 +4141,11 @@ class PipelineOptimizer(object): assert device, "Please put you program within device_guard scope." for i in range(offset): block.ops[idx + i]._set_attr(self._op_device_key, device) - elif self._is_optimize_op(op) and op.type == "check_finite_and_unscale": - op_role_var = op.attr(self._op_role_var_key) - param_name = op_role_var[0] - device = self._param_device_map[param_name] - op._set_attr(self._op_device_key, device) elif self._is_optimize_op(op) and op.type == "cast": # For fp16-->fp32 cast added by AMP grad_name = op.output('Out') assert len(grad_name) == 1 - param_name = grad_name[0].strip(core.grad_var_suffix()) + param_name = self._strip_grad_suffix(grad_name[0]) device = self._param_device_map[param_name] op._set_attr(self._op_device_key, device) elif self._is_gradient_clip_op(op) or self._is_regularization_op(op): @@ -4197,7 +4168,11 @@ class PipelineOptimizer(object): op._set_attr(self._op_device_key, device) else: other_known_ops = [ - 'update_loss_scaling', 'reduce_any', 'concat', 'sum' + 'update_loss_scaling', + 'reduce_any', + 'concat', + 'sum', + 'check_finite_and_unscale', ] assert op.type in other_known_ops, "For other ops without " \ "op_device set, they must be one of {}, but it " \ @@ -4274,41 +4249,70 @@ class PipelineOptimizer(object): Insert a pair of send and recv ops for every two consecutive ops on different devices. """ - extra_index = 0 + extra_index_info = {'index': 0} # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. - var_dev_map = dict() + input_var_to_device = dict() for index, op in enumerate(list(block.ops)): cur_device = op.attr(self._op_device_key) if cur_device == "gpu:all": continue for var_name in op.input_arg_names: - # i.e., lod_tensor_blocking_queue created by DataLoader, - # which only exists in startup program. 
var = block.var(var_name) - # skip data, because we will process it later + # skip data var if var.is_data: continue prev_device = None - if var_name in self._param_device_map: + generate_ops = self.output_var_to_op.get(var_name) + if generate_ops is None: + if var_name not in self._param_device_map: + continue prev_device = self._param_device_map[var_name] - prev_op = self._find_real_prev_op(block.ops, op, var_name) + + prev_op = self._find_prev_op(index, var_name) + if not prev_device: prev_device = prev_op.attr(self._op_device_key) \ if prev_op else None - if not prev_device or prev_device == 'gpu:all': continue - if prev_device != cur_device: - if var_name not in var_dev_map: var_dev_map[var_name] = [] - if cur_device in var_dev_map[var_name]: continue - var_dev_map[var_name].append(cur_device) + if prev_device is None or prev_device == "gpu:all": continue + + if prev_device == cur_device: continue - op_role = op.all_attrs()[self._op_role_key] + if var_name not in input_var_to_device: + input_var_to_device[var_name] = [] + if (cur_device, prev_device) in input_var_to_device[var_name]: + continue + + device_type = cur_device.split(':')[0] + ':' + + def _insert_send_recv(cur_id, prev_id): + cur_dev = device_type + str(cur_id) + prev_dev = device_type + str(prev_id) + if (cur_dev, prev_dev) in input_var_to_device[var_name]: + return + + if cur_id - prev_id > 1: + _insert_send_recv(cur_id - 1, prev_id) + _insert_send_recv(cur_id, cur_id - 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev)) + return + elif cur_id - prev_id < -1: + _insert_send_recv(cur_id + 1, prev_id) + _insert_send_recv(cur_id, cur_id + 1) + input_var_to_device[var_name].append( + (cur_dev, prev_dev)) + return + + assert abs(cur_id - prev_id) == 1 + input_var_to_device[var_name].append((cur_dev, prev_dev)) + + op_role = op.attr(self._op_role_key) var = block.vars[var_name] - prev_device_index = int(prev_device.split(':')[1]) - cur_device_index = int(cur_device.split(':')[1]) - pair = (prev_device_index, cur_device_index) - pair_key = prev_device_index * 1000 + cur_device_index + pair = (prev_id, cur_id) + # 1000 is just a magic number + pair_key = prev_id * 1000 + cur_id if pair not in self._pipeline_pair: self._pipeline_pair.append(pair) self._pp_ring_map[pair_key] = self.ring_id @@ -4316,89 +4320,95 @@ class PipelineOptimizer(object): self.ring_id += 1 else: ring_id = self._pp_ring_map[pair_key] + if self.schedule_mode == 'F-then-B': # F-then-B block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='send_v2', inputs={'X': var}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 1, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ 'out_shape': var.shape, 'dtype': var.dtype, - self._op_device_key: cur_device, + self._op_device_key: cur_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 0, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 elif self.schedule_mode == '1F1B': # 1F1B block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='c_sync_calc_stream', inputs={'X': [var]}, outputs={'Out': [var]}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, }) - extra_index += 1 + 
extra_index_info['index'] += 1 block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='send_v2', inputs={'X': var}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: op_role, 'use_calc_stream': False, 'ring_id': ring_id, 'peer': 1, }) - extra_index += 1 + extra_index_info['index'] += 1 block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='c_sync_comm_stream', inputs={'X': [var]}, outputs={'Out': [var]}, attrs={ - self._op_device_key: prev_device, + self._op_device_key: prev_dev, self._op_role_key: self._op_role.Backward, 'ring_id': ring_id, }) - extra_index += 1 + extra_index_info['index'] += 1 var_shape = list(var.shape) var_shape[0] = self.micro_batch_size if var_shape[ 0] < 0 else var_shape[0] block._insert_op( - index=index + extra_index, + index=index + extra_index_info['index'], type='recv_v2', outputs={'Out': [var]}, attrs={ 'out_shape': var_shape, 'dtype': var.dtype, - self._op_device_key: cur_device, + self._op_device_key: cur_dev, self._op_role_key: op_role, 'use_calc_stream': True, 'peer': 0, 'ring_id': ring_id }) - extra_index += 1 + extra_index_info['index'] += 1 else: raise ValueError( "Now only 'F-then-B' and '1F1B' are supported." "The given value is {}.".format(self.schedule_mode)) + _insert_send_recv( + int(cur_device.split(':')[1]), + int(prev_device.split(':')[1])) + block._sync_with_cpp() + def _insert_loss_scale(self, block): """ Scale the loss corresponding to number of micro-batches. @@ -4675,6 +4685,23 @@ class PipelineOptimizer(object): return op.desc.has_attr("op_namescope") \ and op.desc.attr("op_namescope").startswith("/regularization") + def _get_input_output_info(self, block): + ''' + Get info of op input and output. + ''' + # A map from output var to op which generate it. + self.output_var_to_op = dict() + # A map from var to op which takes it as input. + self.input_var_to_op = dict() + + for index, op in enumerate(list(block.ops)): + for var_name in op.input_arg_names: + ops = self.input_var_to_op.setdefault(var_name, []) + ops.append([op, index]) + for var_name in op.output_arg_names: + ops = self.output_var_to_op.setdefault(var_name, []) + ops.append([op, index]) + def minimize(self, loss, startup_program=None, @@ -4682,30 +4709,35 @@ class PipelineOptimizer(object): no_grad_set=None): main_block = loss.block self.origin_main_block = main_block + main_program = main_block.program if startup_program is None: startup_program = default_startup_program() - optimize_ops, params_grads = self._optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set) - self._param_device_map = self._origin_optimizer._param_device_map - assert main_block.program._pipeline_opt \ - and 'local_rank' in main_block.program._pipeline_opt, \ - 'Please use pipeline with fleet.' - local_rank = main_block.program._pipeline_opt['local_rank'] - self._global_ring_id = main_block.program._pipeline_opt[ - 'global_ring_id'] - schedule_mode = 0 - if 'schedule_mode' in main_block.program._pipeline_opt: - schedule_mode = main_block.program._pipeline_opt['schedule_mode'] - self.schedule_mode = schedule_mode - # micro batch size + + assert main_program._pipeline_opt, 'Please use pipeline with fleet.' 
+ required_keys = [ + 'local_rank', + 'schedule_mode', + 'micro_batch_size', + 'ring_id', + 'global_ring_id', + 'use_sharding', + ] + for key in required_keys: + assert key in main_program._pipeline_opt, \ + 'Please use pipeline with fleet to use {}.'.format(key) + self.local_rank = main_block.program._pipeline_opt['local_rank'] + self.schedule_mode = main_block.program._pipeline_opt['schedule_mode'] self.micro_batch_size = main_block.program._pipeline_opt[ 'micro_batch_size'] - - self.use_sharding = False - if 'use_sharding' in main_block.program._pipeline_opt: - self.use_sharding = main_block.program._pipeline_opt['use_sharding'] + self.use_sharding = main_block.program._pipeline_opt['use_sharding'] self.ring_id = main_block.program._pipeline_opt['ring_id'] + self.global_ring_id = main_block.program._pipeline_opt['global_ring_id'] + + optimize_ops, params_grads = self._optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self._param_device_map = self._origin_optimizer._param_device_map + self._get_input_output_info(main_block) # Step1: add default op_device attribute for ops. self._add_op_device_attr(main_block) device_list = self._check_validation(main_block) @@ -4742,20 +4774,20 @@ class PipelineOptimizer(object): # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) - local_rank = main_program._pipeline_opt['local_rank'] % len(device_list) + self.local_rank %= len(device_list) place_list = [] for dev in device_list: dev_index = int(dev.split(":")[1]) - place_list.append(core.CUDAPlace(dev_index % 8)) + place_list.append(core.CUDAPlace(0)) # Step6: Split startup program new_startup_program = self._split_startup_program(startup_program, - local_rank) + self.local_rank) startup_program._pipeline_opt = { "startup_program": new_startup_program, } - real_block = program_list[local_rank].global_block() + real_block = program_list[self.local_rank].global_block() self._insert_loss_scale(real_block) if not self.use_sharding: # Step7: clear gradients before each mini-batch and @@ -4769,12 +4801,12 @@ class PipelineOptimizer(object): main_program._pipeline_opt = { "trainer": "PipelineTrainer", "device_worker": "Section", - "pipeline_stage": local_rank, + "pipeline_stage": self.local_rank, "num_pipeline_stages": len(device_list), "schedule_mode": self.schedule_mode, "inner_parallelism": len(device_list), - "section_program": program_list[local_rank], - "place": place_list[local_rank], + "section_program": program_list[self.local_rank], + "place": place_list[self.local_rank], "place_id": place_id, "sync_steps": -1, "num_microbatches": self._num_microbatches, -- GitLab From b05f61421011e5a710be9547c48046156e26a8ef Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 31 Mar 2021 21:11:44 +0800 Subject: [PATCH 121/486] [Parallel UT]Improve Parallel UT level on Windows/Linux (#31377) * [Parallel UT]improve Parallel UT level on Windows/Linux * [Parallel UT]improve Parallel UT level on Windows/Linux * [Parallel UT]Improve Parallel UT level on Windows/Linux * [Parallel UT]Improve Parallel UT level on Windows/Linux * fix CI --- paddle/fluid/inference/api/CMakeLists.txt | 2 - paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/scripts/paddle_build.bat | 1 + paddle/scripts/paddle_build.sh | 93 +-- python/CMakeLists.txt | 4 +- tools/parallel_UT_rule.py | 908 +++++++++++++++++----- tools/windows/run_unittests.sh | 28 +- 7 files changed, 761 insertions(+), 277 deletions(-) diff --git 
a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9a4637306bb..03f86cc7ba6 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -57,11 +57,9 @@ if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) - set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 5c9655edfb7..97ebd64a07e 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -105,7 +105,7 @@ if(WITH_PYTHON) set(tmp_impl_file ${impl_file}.tmp) if(WIN32) - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}") else() set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 2edb062ac80..b04c5f490c1 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -499,6 +499,7 @@ setlocal enabledelayedexpansion :: if %errorlevel% NEQ 0 exit /b 8 :: for /F %%# in ('cmd /C nvidia-smi -L ^|find "GPU" /C') do set CUDA_DEVICE_COUNT=%%# set CUDA_DEVICE_COUNT=1 +set FLAGS_fraction_of_gpu_memory_to_use=0.92 %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7f184f18986..7516e4c99ea 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -991,7 +991,7 @@ function case_count(){ EOF testcases=$1 num=$(echo $testcases|grep -o '\^'|wc -l) - if [ "$2" == "" ]; then + if (( $2 == -1 )); then echo "exclusive TestCases count is $num" echo "ipipe_log_param_Exclusive_TestCases_Count: $num" else @@ -1034,6 +1034,11 @@ function card_test() { set -m case_count $1 $2 ut_startTime_s=`date +%s` + + testcases=$1 + cardnumber=$2 + parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} + # get the CUDA device count, XPU device count is one if [ "${WITH_XPU}" == "ON" ];then CUDA_DEVICE_COUNT=1 @@ -1043,20 +1048,13 @@ function card_test() { CUDA_DEVICE_COUNT=$(nvidia-smi -L | wc -l) fi - testcases=$1 - parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} - if (( $# > 1 )); then - cardnumber=$2 - if (( $cardnumber > $CUDA_DEVICE_COUNT )); then - cardnumber=$CUDA_DEVICE_COUNT - fi - if (( $# > 2 )); then - parallel_job=`expr $3 \* $parallel_level_base` - else - parallel_job=$parallel_level_base - fi - else + if (( $cardnumber == -1 ));then cardnumber=$CUDA_DEVICE_COUNT + fi + + if (( $# > 2 )); then + parallel_job=`expr $3 \* $parallel_level_base` + else parallel_job=$parallel_level_base fi @@ -1098,7 +1096,7 @@ function card_test() { done wait; # wait for all subshells to finish ut_endTime_s=`date +%s` - if [ "$2" == "" ]; then + if (( $2 == -1 )); then echo "exclusive TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s 
- $ut_startTime_s ]s" else @@ -1153,13 +1151,18 @@ set -x set +x EXIT_CODE=0; test_cases=$(ctest -N -V) # get all test cases - single_card_tests_eight_parallel='^job$' # cases list which would run 8 job each time with single GPU - single_card_tests_tetrad_parallel='^job$' # cases list which would run 4 job each time with single GPU - single_card_tests_non_parallel_1='^job$' # cases list which would run 1 job each time with single GPU - single_card_tests_non_parallel_2='^job$' # cases list which would run 1 job each time with single GPU - single_card_tests='^job$' # all cases list which would take one graph card - exclusive_tests='' # cases list which would be run exclusively - multiple_card_tests='' # cases list which would take multiple GPUs, most cases would be two GPUs + # Note(zhouwei): Parallel runs are relative to 'CTEST_PARALLEL_LEVEL', e.g: '4 job each time' means 4*CTEST_PARALLEL_LEVEL + single_card_tests_high_parallel='^job$' # cases list which would run the most job each time with single GPU + single_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with single GPU + single_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with single GPU + single_card_tests='^job$' # all cases list which would take single GPU + + multiple_card_tests_two_parallel='^job$' # cases list which would run 2 job each time with multiple GPUs, most cases would be two GPUs + multiple_card_tests_non_parallel='^job$' # cases list which would run 1 job each time with multiple GPUs, most cases would be two GPUs + + exclusive_tests_two_parallel='^job$' # cases list which would run 2 job exclusively(with all GPUs) + exclusive_tests_non_parallel='^job$' # cases list which would run 1 job exclusively(with all GPUs) + is_exclusive='' # indicate whether the case is exclusive type is_multicard='' # indicate whether the case is multiple GPUs type is_nightly='' # indicate whether the case will only run at night @@ -1167,9 +1170,10 @@ set +x UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") - eight_parallel_job=$(echo $output | cut -d ";" -f 1) - tetrad_parallel_jog=$(echo $output | cut -d ";" -f 2) - non_parallel_job=$(echo $output | cut -d ";" -f 3) + cpu_parallel_job=$(echo $output | cut -d ";" -f 1) + tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) + two_parallel_job=$(echo $output | cut -d ";" -f 3) + non_parallel_job=$(echo $output | cut -d ";" -f 4) while read -r line; do if [[ "$line" == "" ]]; then continue @@ -1211,26 +1215,24 @@ set +x fi if [[ "$is_exclusive" != "" ]]; then - if [[ "$exclusive_tests" == "" ]]; then - exclusive_tests="^$testcase$" + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + exclusive_tests_two_parallel="$exclusive_tests_two_parallel|^$testcase$" else - exclusive_tests="$exclusive_tests|^$testcase$" + exclusive_tests_non_parallel="$exclusive_tests_non_parallel|^$testcase$" fi elif [[ "$is_multicard" != "" ]]; then - if [[ "$multiple_card_tests" == "" ]]; then - multiple_card_tests="^$testcase$" + if [[ $(echo $cpu_parallel_job$tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + multiple_card_tests_two_parallel="$multiple_card_tests_two_parallel|^$testcase$" else - multiple_card_tests="$multiple_card_tests|^$testcase$" + multiple_card_tests_non_parallel="$multiple_card_tests_non_parallel|^$testcase$" fi else - if [[ $(echo $eight_parallel_job | grep 
$testcase) != "" ]]; then - single_card_tests_eight_parallel="$single_card_tests_eight_parallel|^$testcase$" - elif [[ $(echo $tetrad_parallel_jog | grep $testcase) != "" ]]; then - single_card_tests_tetrad_parallel="$single_card_tests_tetrad_parallel|^$testcase$" - elif [[ "${#single_card_tests_non_parallel_1}" -gt 10000 ]];then - single_card_tests_non_parallel_2="$single_card_tests_non_parallel_2|^$testcase$" + if [[ $(echo $cpu_parallel_job | grep -o $testcase) != "" ]]; then + single_card_tests_high_parallel="$single_card_tests_high_parallel|^$testcase$" + elif [[ $(echo $tetrad_parallel_job$two_parallel_job | grep -o $testcase) != "" ]]; then + single_card_tests_two_parallel="$single_card_tests_two_parallel|^$testcase$" else - single_card_tests_non_parallel_1="$single_card_tests_non_parallel_1|^$testcase$" + single_card_tests_non_parallel="$single_card_tests_non_parallel|^$testcase$" fi single_card_tests="$single_card_tests|^$testcase$" fi @@ -1241,12 +1243,13 @@ set +x testcase='' done <<< "$test_cases"; - card_test "$single_card_tests_eight_parallel" 1 8 # run cases 8 job each time with single GPU - card_test "$single_card_tests_tetrad_parallel" 1 4 # run cases 4 job each time with single GPU - card_test "$single_card_tests_non_parallel_1" 1 # run cases 1 job each time with single GPU - card_test "$single_card_tests_non_parallel_2" 1 # run cases 1 job each time with single GPU - card_test "$multiple_card_tests" 2 # run cases with two GPUs - card_test "$exclusive_tests" # run cases exclusively, in this cases would be run with 4/8 GPUs + card_test "$single_card_tests_high_parallel" 1 8 # run cases the most each time with single GPU + card_test "$single_card_tests_two_parallel" 1 2 # run cases 2 job each time with single GPU + card_test "$single_card_tests_non_parallel" 1 # run cases 1 job each time with single GPU + card_test "$multiple_card_tests_two_parallel" 2 2 # run cases 2 job each time with two GPUs + card_test "$multiple_card_tests_non_parallel" 2 # run cases 1 job each time with two GPUs + card_test "$exclusive_tests_two_parallel" -1 2 # run cases exclusively, in this cases would be run with 2/4/8 GPUs + card_test "$exclusive_tests_non_parallel" -1 # run cases exclusively, in this cases would be run with 2/4/8 GPUs collect_failed_tests rm -f $tmp_dir/* exec_times=0 @@ -1319,7 +1322,7 @@ set +x fi if [[ "$exclusive_retry" != "" ]]; then - card_test "$exclusive_retry" + card_test "$exclusive_retry" -1 fi exec_times=$[$exec_times+1] diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 938547f363c..9b03cd08ba9 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -18,10 +18,10 @@ set(FLUID_CORE_NAME "core") if(WITH_AVX AND AVX_FOUND) set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx") if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "") - message(STATUS "WARNING: This is just a warning for publishing release. + message(STATUS "MESSAGE: This is just a message for publishing release. You are building AVX version without NOAVX core. So the wheel package may fail on NOAVX machine. - You can add -DFLUID_CORE_NAME=/path/to/your/core_noavx.* in cmake command + You can add -DNOAVX_CORE_FILE=/path/to/your/core_noavx.* in cmake command to get a full wheel package to resolve this warning. 
While, this version will still work on local machine.") endif() diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 3fb78b0d0a1..0f745f21207 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -16,166 +16,148 @@ import sys import os # *=======These unittest doesn't occupy GPU memory, just run as CPU unittest=======* # -# It run 8 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, +# It run 16 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. CPU_PARALLEL_JOB = [ - 'test_row_conv', - 'test_nce', - 'test_conv3d_mkldnn_op', - 'dim_test', - 'test_limit_gpu_memory', - 'profiler_test', - 'test_dequantize_mkldnn_op', - 'test_elementwise_add_bf16_mkldnn_op', - 'test_rpn_target_assign_op', - 'test_hash_op', - 'reader_blocking_queue_test', - 'jit_kernel_test', - 'test_tdm_child_op', - 'test_simplify_with_basic_ops_pass', - 'test_sequence_last_step', - 'test_sequence_first_step', - 'test_seq_concat_fc_fuse_pass', - 'test_fc_gru_fuse_pass', - 'test_dataset_imdb', - 'dlpack_tensor_test', - 'check_reduce_rank_test', + 'test_static_save_load_large', + 'version_test', 'var_type_traits_test', 'var_type_inference_test', + 'variable_test', + 'unroll_array_ops_test', + 'tuple_test', 'to_string_test', + 'timer_test', 'threadpool_test', + 'test_zeros_op', + 'test_while_op', + 'test_weight_quantization_mobilenetv1', 'test_version', 'test_var_info', 'test_var_conv_2d', + 'test_utils', 'test_unique_name', 'test_transpose_int8_mkldnn_op', 'test_transpose_bf16_mkldnn_op', + 'test_trainer_desc', 'test_trainable', 'test_teacher_student_sigmoid_loss_op', 'test_tdm_sampler_op', + 'test_tdm_child_op', + 'test_sysconfig', + 'test_sync_batch_norm_pass', 'test_switch', 'test_static_shape_inferrence_for_shape_tensor', - 'test_squared_mat_sub_fuse_pass', - 'test_sequence_scatter_op', - 'test_sequence_scatter_op', - 'test_scaled_dot_product_attention', - 'test_rnn_memory_helper_op', - 'test_requantize_mkldnn_op', - 'test_quantize_transpiler', - 'test_quantize_mkldnn_op', - 'test_py_reader_sample_generator', - 'test_parallel_executor_seresnext_with_reduce_cpu', - 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', - 'test_parallel_executor_seresnext_base_cpu', - 'test_parallel_dygraph_sync_batch_norm', - 'test_origin_info', - 'test_multiclass_nms_op', - 'test_mkldnn_conv_bias_fuse_pass', - 'test_mkldnn_conv_activation_fuse_pass', - 'test_matrix_nms_op', - 'test_ir_graph', - 'test_inference_api', - 'test_infer_shape', - 'test_infer_no_need_buffer_slots', - 'test_imperative_numpy_bridge', - 'test_imperative_decorator', - 'test_hooks', - 'test_gpu_package_without_gpu_device', - 'test_global_var_getter_setter', - 'test_get_set_flags', - 'test_fusion_repeated_fc_relu_op', - 'test_fused_emb_seq_pool_op', - 'test_fleet_base_4', - 'test_fc_lstm_fuse_pass', - 'test_executor_feed_non_tensor', - 'test_executor_check_feed', - 'test_executor_and_use_program_cache', - 'test_exception', - 'test_error_clip', - 'test_embedding_eltwise_layernorm_fuse_pass', - 'test_dyn_rnn', - 'test_dpsgd_op', - 'test_distributed_reader', - 'test_directory_migration', - 'test_dataset_wmt', - 'test_dataset_uci_housing', - 'test_dataset_cifar', - 'test_data_feeder', - 'test_cudnn_placement_pass', - 'test_conv3d_layer', - 'test_concat_bf16_mkldnn_op', - 'test_common_infer_shape_functions', - 'test_check_import_scipy', - 'test_calc_gradient', - 'test_bipartite_match_op', - 'test_attention_lstm_op', - 
'test_array_read_write_op', - 'stringprintf_test', - 'stringpiece_test', - 'selected_rows_test', - 'scope_test', - 'reader_test', - 'prune_test', - 'op_tester', - 'eigen_test', - 'device_worker_test', - 'cudnn_helper_test', - 'cudnn_desc_test', - 'tuple_test', - 'timer_test', - 'test_zeros_op', - 'test_while_op', - 'test_utils', 'test_static_analysis', + 'test_squared_mat_sub_fuse_pass', 'test_split_and_merge_lod_tensor_op', 'test_spawn_and_init_parallel_env', 'test_slice_var', + 'test_skip_layernorm_fuse_pass', + 'test_simplify_with_basic_ops_pass', 'test_similarity_focus_op', 'test_shuffle_batch_op', 'test_shrink_rnn_memory', 'test_set_bool_attr', 'test_sequence_topk_avg_pooling', + 'test_sequence_scatter_op', + 'test_sequence_scatter_op', + 'test_sequence_last_step', + 'test_sequence_first_step', + 'test_seqpool_cvm_concat_fuse_pass', + 'test_seqpool_concat_fuse_pass', + 'test_seq_concat_fc_fuse_pass', 'test_selected_rows', 'test_scope', + 'test_scale_matmul_fuse_pass', + 'test_scaled_dot_product_attention', 'test_sampling_id_op', 'test_runtime_and_compiletime_exception', 'test_run_fluid_by_module_or_command_line', + 'test_rpn_target_assign_op', + 'test_row_conv', + 'test_rnn_memory_helper_op', 'test_retinanet_detection_output', + 'test_reshape_transpose_matmul_mkldnn_fuse_pass', + 'test_reshape_bf16_op', 'test_require_version', + 'test_requantize_mkldnn_op', + 'test_repeated_fc_relu_fuse_pass', 'test_repeated_fc_relu_fuse_pass', 'test_registry', + 'test_reducescatter_api', + 'test_reducescatter', 'test_recurrent_op', 'test_recommender_system', 'test_query_op', + 'test_quantize_transpiler', + 'test_quantize_mkldnn_op', 'test_quantization_mkldnn_pass', + 'test_quant_int8_resnet50_mkldnn', + 'test_quant_int8_mobilenetv2_mkldnn', + 'test_quant_int8_mobilenetv1_mkldnn', + 'test_quant_int8_googlenet_mkldnn', + 'test_quant2_int8_resnet50_range_mkldnn', + 'test_quant2_int8_resnet50_mkldnn', + 'test_quant2_int8_resnet50_channelwise_mkldnn', + 'test_quant2_int8_mobilenetv1_mkldnn', 'test_quant2_int8_mkldnn_pass', - 'test_pybind_interface', + 'test_quant2_int8_ernie_mkldnn', + 'test_py_reader_sample_generator', + 'test_py_reader_return_list', + 'test_py_reader_lod_level_share', 'test_py_reader_error_msg', + 'test_pyramid_hash_op', + 'test_pybind_interface', + 'test_ps_dispatcher', 'test_prune', + 'test_protobuf_descs', 'test_protobuf', 'test_progressbar', 'test_program_to_string', 'test_program_code', 'test_program', 'test_precision_recall_op', + 'test_post_training_quantization_resnet50', + 'test_post_training_quantization_mobilenetv1', + 'test_post_training_quantization_mnist', 'test_positive_negative_pair_op', - 'test_parallel_executor_run_load_infer_program', + 'test_paddle_inference_api', + 'test_origin_info', 'test_op_version', 'test_op_support_gpu', + 'test_operator_desc', + 'test_operator', 'test_ones_op', 'test_npair_loss_op', 'test_nn_functional_embedding_static', + 'test_nce', 'test_name_scope', + 'test_naive_executor', 'test_multiprocess_dataloader_iterable_dataset_split', + 'test_multiprocess_dataloader_exception', + 'test_multihead_matmul_fuse_pass', + 'test_multi_gru_seq_fuse_pass', 'test_multi_gru_mkldnn_op', + 'test_multi_gru_fuse_pass', + 'test_multiclass_nms_op', 'test_mul_int8_mkldnn_op', 'test_mkldnn_scale_matmul_fuse_pass', + 'test_mkldnn_placement_pass', + 'test_mkldnn_op_nhwc', 'test_mkldnn_op_inplace', 'test_mkldnn_matmul_transpose_reshape_fuse_pass', + 'test_mkldnn_matmul_op_output_fuse_pass', + 'test_mkldnn_inplace_pass', 'test_mkldnn_inplace_fuse_pass', 
'test_mkldnn_cpu_bfloat16_pass', + 'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass', + 'test_mkldnn_conv_bias_fuse_pass', + 'test_mkldnn_conv_activation_fuse_pass', 'test_mine_hard_examples_op', 'test_memory_usage', + 'test_matrix_nms_op', + 'test_matmul_transpose_reshape_fuse_pass', 'test_matmul_mkldnn_op', 'test_matmul_bf16_mkldnn_op', 'test_math_op_patch', @@ -186,53 +168,100 @@ CPU_PARALLEL_JOB = [ 'test_lod_tensor_array_ops', 'test_lod_tensor_array', 'test_lod_rank_table', - 'test_lod_array_length_op', 'test_locality_aware_nms_op', 'test_load_vars_shape_check', 'test_load_op_xpu', 'test_load_op', - 'test_linear_chain_crf_op', + 'test_limit_gpu_memory', 'test_layer_norm_mkldnn_op', 'test_layer_norm_bf16_mkldnn_op', + 'test_layer', 'test_lambv2_op', + 'test_is_test_pass', 'test_ir_skip_layernorm_pass', + 'test_ir_graph', 'test_io_save_load', 'test_input_spec', + 'test_infer_shape', + 'test_infer_no_need_buffer_slots', 'test_inference_model_io', + 'test_inference_api', + 'test_imperative_signal_handler', + 'test_imperative_numpy_bridge', + 'test_imperative_group', + 'test_imperative_decorator', + 'test_imperative_data_loader_process', + 'test_imperative_data_loader_exit_func', 'test_imperative_base', 'test_image_classification_layer', 'test_image', 'test_ifelse_basic', 'test_hsigmoid_op', + 'test_hooks', + 'test_hash_op', + 'test_group', + 'test_graph_pattern_detector', + 'test_gpu_package_without_gpu_device', + 'test_global_var_getter_setter', + 'test_get_set_flags', 'test_generator', 'test_generate_proposal_labels_op', 'test_generate_mask_labels_op', 'test_gast_with_compatibility', 'test_fusion_squared_mat_sub_op', + 'test_fusion_seqpool_cvm_concat_op', + 'test_fusion_seqpool_concat_op', + 'test_fusion_seqexpand_concat_fc_op', 'test_fusion_seqconv_eltadd_relu_op', + 'test_fusion_repeated_fc_relu_op', 'test_fusion_lstm_op', 'test_fusion_gru_op', + 'test_fusion_gru_mkldnn_op', 'test_fusion_gru_int8_mkldnn_op', 'test_fusion_gru_bf16_mkldnn_op', + 'test_fused_emb_seq_pool_op', 'test_fused_embedding_fc_lstm_op', 'test_function_spec', 'test_full_op', + 'test_fs_interface', + 'test_fs', 'test_framework_debug_str', 'test_fp16_utils', + 'test_fleet_util', + 'test_fleet_unitaccessor', + 'test_fleet_runtime', + 'test_fleet_rolemaker_init', 'test_bf16_utils', 'test_fleet_rolemaker_4', + 'test_fleet_rolemaker_3', + 'test_fleet_rolemaker', + 'test_fleet_nocvm_1', + 'test_fleet_base_4', + 'test_fleet', + 'test_fleet', 'test_flags_use_mkldnn', + 'test_flags_mkldnn_ops_on_off', 'test_filter_by_instag_op', 'test_fetch_var', 'test_fetch_handler', 'test_feed_fetch_method', 'test_fc_mkldnn_op', 'test_fc_lstm_fuse_pass', + 'test_fc_lstm_fuse_pass', 'test_fc_gru_fuse_pass', + 'test_fc_gru_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass', 'test_fc_bf16_mkldnn_op', - 'test_entry_attr', + 'test_executor_feed_non_tensor', + 'test_executor_check_feed', + 'test_executor_and_use_program_cache', + 'test_exception', + 'test_error_clip', 'test_entry_attr2', + 'test_entry_attr', + 'test_embedding_eltwise_layernorm_fuse_pass', 'test_elementwise_mul_bf16_mkldnn_op', + 'test_elementwise_add_bf16_mkldnn_op', 'test_eager_deletion_recurrent_op', 'test_eager_deletion_padding_rnn', 'test_eager_deletion_mnist', @@ -240,203 +269,658 @@ CPU_PARALLEL_JOB = [ 'test_eager_deletion_conditional_block', 'test_dynrnn_static_input', 'test_dynrnn_gradient_check', + 'test_dyn_rnn', 'test_dygraph_mode_of_unittest', + 'test_dpsgd_op', + 'test_downpoursgd', 'test_download', 'test_distributions', + 'test_distributed_reader', + 
'test_directory_migration', 'test_detection_map_op', 'test_desc_clone', + 'test_dequantize_mkldnn_op', 'test_depthwise_conv_mkldnn_pass', 'test_deprecated_memory_optimize_interfaces', 'test_default_scope_funcs', 'test_default_dtype', + 'test_debugger', + 'test_dataset_wmt', 'test_dataset_voc', + 'test_dataset_uci_housing', 'test_dataset_movielens', 'test_dataset_imikolov', + 'test_dataset_imdb', 'test_dataset_conll05', + 'test_dataset_cifar', + 'test_dataloader_unkeep_order', + 'test_dataloader_keep_order', + 'test_dataloader_dataset', 'test_data_generator', + 'test_data_feeder', 'test_data', 'test_cyclic_cifar_dataset', + 'test_cudnn_placement_pass', 'test_crypto', + 'test_crf_decoding_op', + 'test_create_parameter', 'test_create_op_doc_string', 'test_create_global_var', + 'test_cpu_quantize_squash_pass', + 'test_cpu_quantize_placement_pass', + 'test_cpu_quantize_pass', + 'test_cpu_bfloat16_placement_pass', + 'test_cpu_bfloat16_pass', + 'test_conv_elementwise_add_mkldnn_fuse_pass', + 'test_conv_concat_relu_mkldnn_fuse_pass', + 'test_conv_bias_mkldnn_fuse_pass', + 'test_conv_batch_norm_mkldnn_fuse_pass', + 'test_conv_activation_mkldnn_fuse_pass', 'test_conv3d_transpose_layer', + 'test_conv3d_mkldnn_op', + 'test_conv3d_layer', 'test_conv2d_transpose_layer', 'test_conv2d_mkldnn_op', 'test_conv2d_layer', 'test_conv2d_int8_mkldnn_op', 'test_conv2d_bf16_mkldnn_op', + 'test_context_manager', 'test_const_value', 'test_conditional_block', 'test_concat_int8_mkldnn_op', + 'test_concat_bf16_mkldnn_op', 'test_compat', - 'test_collective_base', - 'test_collective_api_base', + 'test_common_infer_shape_functions', 'test_chunk_eval_op', + 'test_check_import_scipy', + 'test_c_comm_init_all_op', + 'test_calc_gradient', 'test_broadcast_to_op', 'test_broadcast_shape', 'test_broadcast_error', + 'test_broadcast', 'test_bpr_loss_op', + 'test_boxps', + 'test_bipartite_match_op', + 'test_benchmark', 'test_beam_search_op', 'test_batch_sampler', + 'test_batch_norm_act_fuse_pass', 'test_basic_rnn_name', + 'test_attention_lstm_op', + 'test_analyzer', + 'test_allreduce', + 'test_allgather', 'test_aligned_allocator', + 'system_allocator_test', + 'stringprintf_test', + 'stringpiece_test', + 'split_test', + 'selected_rows_test', + 'selected_rows_functor_test', + 'scope_test', 'scatter_test', + 'save_quant2_model_resnet50', + 'save_quant2_model_gru', + 'save_quant2_model_ernie', + 'save_load_util_test', + 'save_load_op_test', 'save_load_combine_op_test', + 'rw_lock_test', + 'retry_allocator_test', + 'reader_test', + 'reader_blocking_queue_test', + 'prune_test', 'program_desc_test', - 'lodtensor_printer_test', - 'lod_tensor_test', - 'gather_test', - 'gather_op_test', - 'fused_broadcast_op_test', - 'exception_holder_test', - 'decorator_test', - 'ddim_test', - 'data_layout_transform_test', - 'cpu_vec_test', - 'cow_ptr_tests', - 'conditional_block_op_test', - 'bfloat16_test', - 'assign_op_test', - 'unroll_array_ops_test', - 'test_seqpool_cvm_concat_fuse_pass', - 'test_seqpool_concat_fuse_pass', - 'test_reshape_bf16_op', - 'test_repeated_fc_relu_fuse_pass', - 'test_py_reader_return_list', - 'test_py_reader_lod_level_share', - 'test_protobuf_descs', - 'test_paddle_inference_api', - 'test_operator_desc', - 'test_operator', - 'test_mkldnn_matmul_op_output_fuse_pass', - 'test_mkldnn_inplace_pass', - 'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass', - 'test_layer', - 'test_is_test_pass', - 'test_graph_pattern_detector', - 'test_fusion_seqpool_cvm_concat_op', - 'test_fusion_seqpool_concat_op', - 'test_fusion_seqexpand_concat_fc_op', - 
'test_fusion_gru_mkldnn_op', - 'test_fleet_util', - 'test_fleet_runtime', - 'test_fleet_rolemaker_init', - 'test_flags_mkldnn_ops_on_off', - 'test_dataset_download', - 'test_dataloader_unkeep_order', - 'test_dataloader_keep_order', - 'test_dataloader_dataset', - 'test_crf_decoding_op', - 'test_create_parameter', - 'test_context_manager', - 'test_analyzer', - 'tensor_test', - 'split_test', - 'save_load_op_test', + 'profiler_test', 'place_test', + 'pass_test', 'op_version_registry_test', + 'op_tester', 'op_proto_maker_test', 'op_kernel_type_test', - 'mask_util_test', - 'inlined_vector_test', - 'infer_io_utils_tester', - 'errors_test', - 'enforce_test', - 'dropout_op_test', - 'data_type_test', - 'cpu_info_test', - 'cpu_helper_test', - 'beam_search_decode_op_test', - 'auto_growth_best_fit_allocator_test', - 'test_skip_layernorm_fuse_pass', - 'test_multihead_matmul_fuse_pass', - 'test_fc_elementwise_layernorm_fuse_pass', - 'version_test', - 'variable_test', - 'test_scale_matmul_fuse_pass', - 'test_reshape_transpose_matmul_mkldnn_fuse_pass', - 'test_multi_gru_seq_fuse_pass', - 'test_multi_gru_fuse_pass', - 'test_mkldnn_placement_pass', - 'test_mkldnn_op_nhwc', - 'test_matmul_transpose_reshape_fuse_pass', - 'test_fs', - 'test_fleet', - 'test_cpu_quantize_squash_pass', - 'test_cpu_quantize_placement_pass', - 'test_cpu_quantize_pass', - 'test_cpu_bfloat16_placement_pass', - 'test_cpu_bfloat16_pass', - 'test_conv_elementwise_add_mkldnn_fuse_pass', - 'test_conv_concat_relu_mkldnn_fuse_pass', - 'test_conv_bias_mkldnn_fuse_pass', - 'test_conv_batch_norm_mkldnn_fuse_pass', - 'test_conv_activation_mkldnn_fuse_pass', - 'test_benchmark', - 'test_batch_norm_act_fuse_pass', - 'selected_rows_functor_test', - 'save_load_util_test', - 'pass_test', 'operator_test', 'operator_exception_test', 'op_debug_string_test', 'op_compatible_info_test', 'op_call_stack_test', - 'node_test', 'no_need_buffer_vars_inference_test', + 'node_test', 'nccl_context_test', + 'mmap_allocator_test', 'math_function_test', + 'mask_util_test', + 'lod_tensor_test', + 'test_check_abi', + 'lodtensor_printer_test', + 'jit_kernel_test', + 'test_dispatch_jit', + 'inlined_vector_test', 'init_test', + 'infer_io_utils_tester', 'graph_to_program_pass_test', 'graph_test', 'graph_helper_test', + 'gather_test', + 'gather_op_test', + 'fused_broadcast_op_test', 'float16_test', + 'exception_holder_test', + 'errors_test', + 'enforce_test', + 'eigen_test', + 'dropout_op_test', + 'dlpack_tensor_test', 'dist_multi_trainer_test', + 'dim_test', + 'device_worker_test', + 'decorator_test', + 'ddim_test', + 'data_type_test', + 'test_check_error', + 'data_layout_transform_test', + 'cudnn_helper_test', + 'cudnn_desc_test', + 'cpu_vec_test', + 'cpu_info_test', + 'cpu_helper_test', + 'cow_ptr_tests', + 'convert_model2dot_ernie', + 'conditional_block_op_test', 'cipher_utils_test', + 'check_reduce_rank_test', + 'buffered_allocator_test', 'broadcast_op_test', + 'bfloat16_test', + 'beam_search_decode_op_test', + 'auto_growth_best_fit_allocator_test', + 'assign_op_test', + 'allocator_facade_frac_flags_test', 'aes_cipher_test', ] # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, # just remove it from this list. 
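
The name buckets in this file (CPU_PARALLEL_JOB above, TETRAD_PARALLEL_JOB and TWO_PARALLEL_JOB below) are consumed by main(), which joins each bucket into a '^job$|^name$|...' regex and prints the four groups separated by ';' for run_unittests.sh to split and run at different parallel levels. A minimal, illustrative sketch of how one such group could be run is shown below; the run_group() helper and the exact ctest invocation are assumptions for demonstration, not part of this patch.

```python
# Illustrative sketch only (not part of this patch): how one name bucket can
# be turned into the '^job$|^name$|...' filter that main() builds and handed
# to ctest. run_group() and the exact ctest flags are assumptions here.
import subprocess


def run_group(test_names, jobs):
    # '^job$' is the seed main() starts from, so the regex stays valid even
    # when the bucket is empty.
    pattern = '^job$' + ''.join('|^' + name + '$' for name in test_names)
    # -R selects tests whose names match the regex; -j sets the parallel level.
    subprocess.check_call(
        ['ctest', '-R', pattern, '--output-on-failure', '-j', str(jobs)])


# Example usage: CPU-only tests tolerate high concurrency, GPU-heavy ones less so.
# run_group(CPU_PARALLEL_JOB, 16)
# run_group(TWO_PARALLEL_JOB, 2)
```
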
TETRAD_PARALLEL_JOB = [ - 'system_allocator_test', 'buffered_allocator_test', - 'test_tensor_to_numpy', + 'allocator_facade_frac_flags_test', + 'cuda_helper_test', + 'sequence_padding_test', + 'test_auto_growth_gpu_memory_limit', 'test_imperative_framework', + 'device_context_test', + 'test_reference_count_pass_last_lived_ops', + 'copy_same_tensor_test', + 'float16_gpu_test', + 'test_leaky_relu_grad_grad_functor', + 'sequence_pooling_test', + 'mixed_vector_test', + 'op_registry_test', + 'strided_memcpy_test', + 'selected_rows_functor_gpu_test', + 'test_prepare_op', + 'data_device_transform_test', + 'test_tensor_to_numpy', 'test_naive_best_fit_gpu_memory_limit', - 'test_auto_growth_gpu_memory_limit', + 'vol2col_test', 'test_imperative_using_non_zero_gpu', - 'cuda_helper_test', 'retry_allocator_test', - 'allocator_facade_frac_flags_test', + 'system_allocator_test', + 'test_fc_fuse_pass_cc', + 'test_fc_lstm_fuse_pass_cc', + 'test_fc_gru_fuse_pass_cc', + 'test_conv_bn_fuse_pass_cc', + 'test_adaptive_pool2d_convert_global_pass', + 'test_unsqueeze2_eltwise_fuse_pass', + 'test_layer_norm_fuse_pass_cc', + 'test_fc_act_mkldnn_fuse_pass', + 'test_fleet_cc', + 'tensor_test', + 'test_repeated_fc_relu_fuse_pass_cc', + 'test_mkldnn_caching', +] + +# It run 2 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, +# just remove it from this list. +TWO_PARALLEL_JOB = [ + 'im2col_test', + 'test_elementwise_add_grad_grad', + 'test_logical_op', + 'test_imperative_mnist', + 'test_imperative_deepcf', + 'test_cholesky_op', + 'test_multiprocess_dataloader_iterable_dataset_static', + 'test_sample_logits_op', + 'test_ir_fc_fuse_pass', + 'test_imperative_qat_channelwise', + 'test_fleet_base_single', + 'test_imperative_out_scale', + 'test_multiprocess_dataloader_iterable_dataset_dynamic', + 'test_fill_op', + 'test_slice_op', + 'test_cond', + 'test_compiled_program', + 'test_lstm', + 'test_ema', + 'test_py_reader_using_executor', + 'test_nan_inf', + 'test_isinstance', + 'test_jit_save_load', + 'test_box_clip_op', + 'test_group_norm_op', + 'test_seed_op', + 'test_activation_nn_grad', + 'test_pool2d_int8_mkldnn_op', + 'test_adagrad_op_v2', + 'test_elementwise_add_op', + 'test_nn_functional_hot_op', + 'test_op_name_conflict', + 'test_softmax_with_cross_entropy_op', + 'test_imperative_gan', + 'test_simnet', + 'test_instance_norm_op', + 'test_amp_check_finite_and_scale_op', + 'test_random_seed', + 'test_histogram_op', + 'test_sequence_conv', + 'test_eye_op', + 'test_row_conv_op', + 'test_full_like_op', + 'test_optimizer_in_control_flow', + 'test_gru_unit_op', + 'test_distribute_fpn_proposals_op', + 'test_log_loss_op', + 'test_adadelta_op', + 'test_diag_embed', + 'test_unsqueeze2_op', + 'test_fused_fc_elementwise_layernorm_op', + 'test_sum_bf16_mkldnn_op', + 'test_sequence_erase_op', + 'test_sigmoid_cross_entropy_with_logits_op', + 'test_regularizer_api', + 'test_lrn_op', + 'test_rank_attention_op', + 'test_parallel_ssa_graph_inference_feed_partial_data', + 'test_lod_reset_op', + 'test_install_check', + 'test_anchor_generator_op', + 'test_imperative_ptb_rnn', + 'test_gather_nd_op', + 'test_flatten_contiguous_range_op', + 'test_network_with_dtype', + 'test_elementwise_sub_op', + 'test_assert_op', + 'test_elementwise_div_op', + 'test_gather_tree_op', + 'test_decoupled_py_reader', + 'test_imperative_named_members', + 'test_conv3d_op', + 'test_seqconv_eltadd_relu_fuse_pass', + 'test_analysis_predictor', + 'test_convert_operators', + 'test_add_reader_dependency', + 'test_is_tensor', + 
'test_variable', + 'test_unsqueeze_op', + 'test_save_model_without_var', + 'test_unfold_op', + 'test_conv_bn_fuse_pass', + 'test_truncated_gaussian_random_op', + 'test_tree_conv_op', + 'test_traced_layer_err_msg', + 'test_unique_with_counts', + 'test_auc_single_pred_op', + 'test_stack_op', + 'test_conv_bn_fuse_pass', + 'test_instance_norm_op_v2', + 'test_softmax_bf16_mkldnn_op', + 'test_mean_iou', + 'test_sequence_slice_op', + 'test_polygon_box_transform', + 'test_sequence_pad_op', + 'test_sequence_expand', + 'test_cudnn_grucell', + 'test_pool2d_bf16_mkldnn_op', + 'test_bilinear_api', + 'test_parallel_executor_inference_feed_partial_data', + 'test_initializer_nn', + 'test_modified_huber_loss_op', + 'test_lookup_table_op', + 'test_conv1d_layer', + 'test_kron_op', + 'test_isfinite_v2_op', + 'test_ctc_align', + 'test_imperative_save_load_v2', + 'test_decayed_adagrad_op', + 'test_generator_dataloader', + 'test_dropout_op', + 'test_functional_conv3d', + 'test_executor_return_tensor_not_overwriting', + 'test_flatten2_op', + 'test_fsp_op', + 'test_fusion_transpose_flatten_concat_op', + 'test_elementwise_nn_grad', + 'test_hinge_loss_op', + 'test_elementwise_add_mkldnn_op', + 'test_optimizer', + 'test_deformable_conv_op', + 'test_py_reader_push_pop', + 'test_random_crop_op', + 'test_shuffle_channel_op', + 'test_center_loss', + 'test_temporal_shift_op', + 'test_case', + 'test_transformer_api', + 'test_bmm_op', + 'test_adagrad_op', + 'test_batch_norm_mkldnn_op', + 'test_adam_op_multi_thread', + 'test_adamax_op', + 'test_while_loop_op', + 'test_affine_grid_function', + 'test_trilinear_interp_op', + 'test_transpose_flatten_concat_fuse_pass', + 'test_trace_op', + 'test_backward', + 'test_top_k_op', + 'test_batch_fc_op', + 'test_tensor_scalar_type_promotion_static', + 'test_squared_l2_distance_op', + 'test_bicubic_interp_op', + 'test_spp_op', + 'test_space_to_depth_op', + 'test_callbacks', + 'test_sigmoid_focal_loss_op', + 'test_collect_fpn_proposals_op', + 'test_sgd_op', + 'test_sequence_unpad_op', + 'test_conv1d_transpose_layer', + 'test_sequence_slice_op', + 'test_sequence_pool', + 'test_conv_elementwise_add_fuse_pass', + 'test_sequence_pad_op', + 'test_conv_shift_op', + 'test_sequence_expand_as', + 'test_cos_sim_op', + 'test_sequence_enumerate_op', + 'test_cross_entropy2_op', + 'test_sequence_concat', + 'test_cudnn_lstmcell', + 'test_data_norm_op', + 'test_decoupled_py_reader_data_check', + 'test_deformable_conv_v1_op', + 'test_roi_align_op', + 'test_detach', + 'test_rnn_cells', + 'test_elementwise_floordiv_op', + 'test_elementwise_min_op', + 'test_reduce_op', + 'test_embedding_id_stop_gradient', + 'test_empty_op', + 'test_py_reader_combination', + 'test_ptb_lm', + 'test_expand_op', + 'test_prroi_pool_op', + 'test_fake_dequantize_op', + 'test_fetch_feed', + 'test_prelu_op', + 'test_fill_zeros_like_op', + 'test_pool2d_op', + 'test_for_enumerate', + 'test_gather_op', + 'test_partial_concat_op', + 'test_gaussian_random_op', + 'test_paddle_imperative_double_grad', + 'test_generate_proposals_v2_op', + 'test_pad_constant_like', + 'test_grid_sample_function', + 'test_pad2d_op', + 'test_huber_loss_op', + 'test_one_hot_op', + 'test_normal', + 'test_imperative_auto_prune', + 'test_nn_grad', + 'test_nearest_interp_op', + 'test_minus_op', + 'test_imperative_reinforcement', + 'test_maxout_op', + 'test_matmul_op', + 'test_increment', + 'test_masked_select_op', + 'test_lstmp_op', + 'test_loop', + 'test_label_smooth_op', + 'test_logsumexp', + 'test_log_softmax', + 'test_learning_rate_scheduler', + 'test_linspace', + 
'test_linear_interp_op', + 'test_layer_norm_op_v2', + 'test_lamb_op', + 'test_lookup_table_v2_op', + 'test_l1_norm_op', + 'test_lstm_op', + 'test_margin_rank_loss_op', + 'test_index_sample_op', + 'test_imperative_static_runner_while', + 'test_imperative_save_load', + 'test_imperative_ptb_rnn_sorted_gradient', + 'test_mul_op', + 'test_imperative_lod_tensor_to_selected_rows', + 'test_imperative_data_parallel', + 'test_norm_nn_grad', + 'test_im2sequence_op', + 'test_if_else_op', + 'test_one_hot_v2_op', + 'test_grid_sampler_op', + 'test_pad_op', + 'test_generate_proposals_op', + 'test_parameter', + 'test_gaussian_random_mkldnn_op', + 'test_partial_sum_op', + 'test_ftrl_op', + 'test_flip', + 'test_pool_max_op', + 'test_prior_box_op', + 'test_fake_quantize_op', + 'test_proximal_gd_op', + 'test_expand_v2_op', + 'test_psroi_pool_op', + 'test_expand_as_v2_op', + 'test_ptb_lm_v2', + 'test_rand_op', + 'test_empty_like_op', + 'test_rank_loss_op', + 'test_elementwise_mod_op', + 'test_reinforcement_learning', + 'test_elementwise_max_op', + 'test_retain_graph', + 'test_edit_distance_op', + 'test_reverse_op', + 'test_device_guard', + 'test_rnn_cells_static', + 'test_deformable_psroi_pooling', + 'test_roi_perspective_transform_op', + 'test_segment_ops', + 'test_cvm_op', + 'test_selu_op', + 'test_cross_op', + 'test_sequence_conv', + 'test_crop_tensor_op', + 'test_sequence_expand', + 'test_sequence_mask', + 'test_conv_nn_grad', + 'test_sequence_pool', + 'test_conv_elementwise_add2_act_fuse_pass', + 'test_sequence_reshape', + 'test_conv2d_fusion_op', + 'test_sequence_softmax_op', + 'test_sequence_unpad_op', + 'test_compare_reduce_op', + 'test_clip_by_norm_op', + 'test_box_coder_op', + 'test_smooth_l1_loss_op', + 'test_bilinear_interp_op', + 'test_spectral_norm_op', + 'test_sum_mkldnn_op', + 'test_batch_norm_op', + 'test_base_layer', + 'test_argsort_op', + 'test_arg_min_max_op', + 'test_transpose_op', + 'test_affine_grid_op', + 'test_unpool_op', + 'test_addmm_op', + 'test_adam_optimizer_fp32_fp64', + 'test_auc_op', + 'test_adam_op', + 'test_bilinear_tensor_product_op', + 'test_break_continue', + 'test_transpose_mkldnn_op', + 'test_callback_reduce_lr_on_plateau', + 'test_cast_op', + 'test_scatter_nd_op', + 'test_conv2d_transpose_op_depthwise_conv', + 'test_queue', + 'test_cross_entropy_op', + 'test_detection', + 'test_elementwise_mul_mkldnn_op', + 'test_grid_generator', + 'test_functional_conv2d', + 'test_fit_a_line', + 'test_fill_any_like_op', + 'test_functional_conv2d_transpose', + 'test_functional_conv3d_transpose', + 'test_dot_op', + 'test_gru_op', + 'test_device', + 'test_imperative_layer_apply', + 'test_dataloader_early_reset', + 'test_imperative_selected_rows_to_lod_tensor', + 'test_crop_op', + 'test_linear_interp_v2_op', + 'test_lr_scheduler', + 'test_tensor_array_to_tensor', + 'test_mean_op', + 'test_momentum_op', + 'test_iou_similarity_op', + 'test_optimizer_grad', + 'test_dygraph_weight_norm', + 'test_batch_norm_op_v2', + 'test_pool2d_mkldnn_op', + 'test_regularizer', + 'test_sequence_concat', + 'test_sequence_expand_as', + 'test_sequence_reverse', + 'test_shape_op', + 'test_lod_tensor', + 'test_diag', + 'test_strided_slice_op', + 'test_switch_case', + 'test_target_assign_op', + 'test_translated_layer', + 'test_isfinite_op', + 'test_conv_elementwise_add_act_fuse_pass', + 'test_unbind_op', + 'test_size_op', + 'test_unique', + 'test_unstack_op', + 'test_wrappers', + 'test_deprecated_decorator', + 'test_affine_channel_op', + 'test_arange', + 'test_lrn_mkldnn_op', + 'test_imperative_gnn', + 
'test_eager_deletion_while_op', + 'test_dequantize_abs_max_op', + 'test_elementwise_mul_op', + 'test_tensor_scalar_type_promotion_dynamic', + 'test_fc_op', + 'test_mish_op', + 'test_flatten_op', + 'test_gradient_clip', + 'test_allclose_layer', + 'test_meshgrid_op', + 'test_get_places_op', + 'test_reader_reset', + 'test_squared_l2_norm_op', + 'test_softmax_mkldnn_op', + 'test_numel_op', + 'test_squeeze2_op', + 'test_dygraph_mnist_fp16', + 'test_activation_mkldnn_op', + 'test_imperative_layer_children', + 'test_nearest_interp_v2_op', + 'test_fill_zeros_like2_op', + 'test_sync_batch_norm_op', + 'test_static_save_load', + 'test_coalesce_tensor_op', + 'test_fuse_bn_act_pass', + 'test_simnet_v2', + 'test_shard_index_op', + 'test_cuda_random_seed', + 'test_dequantize_log_op', + 'test_mkldnn_batch_norm_act_fuse_pass', + 'test_imperative_skip_op', + 'test_proximal_adagrad_op', + 'test_word2vec', + 'test_conv2d_transpose_mkldnn_op', + 'test_imperative_optimizer', + 'test_assign_value_op', + 'test_roi_pool_op', + 'test_imperative_basic', + 'test_word2vec', + 'test_manual_seed', + 'test_buffer_shared_memory_reuse_pass', + 'test_range', + 'test_activation_op', + 'test_box_decoder_and_assign_op', + 'test_imperative_optimizer_v2', + 'test_python_operator_overriding', + 'test_is_empty_op', + 'test_imperative_qat', + 'test_py_reader_pin_memory', + 'test_train_recognize_digits', + 'test_parallel_executor_feed_persistable_var', + 'test_mnist', + 'test_update_loss_scaling_op', + 'test_rnn_cell_api', + 'test_parallel_executor_fetch_isolated_var', + 'test_imperative_load_static_param', + 'test_fuse_bn_add_act_pass', + 'test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass', ] def main(): - eight_parallel_job = '^job$' + cpu_parallel_job = '^job$' tetrad_parallel_job = '^job$' - non_parallel_job_1 = '^job$' - non_parallel_job_2 = '^job$' + two_parallel_job = '^job$' + non_parallel_job = '^job$' test_cases = sys.argv[1] test_cases = test_cases.split("\n") - for unittest in test_cases: - if unittest in CPU_PARALLEL_JOB: - eight_parallel_job = eight_parallel_job + '|^' + unittest + '$' - continue - if unittest in TETRAD_PARALLEL_JOB: + + for unittest in CPU_PARALLEL_JOB: + if unittest in test_cases: + cpu_parallel_job = cpu_parallel_job + '|^' + unittest + '$' + test_cases.remove(unittest) + + for unittest in TETRAD_PARALLEL_JOB: + if unittest in test_cases: tetrad_parallel_job = tetrad_parallel_job + '|^' + unittest + '$' - continue + test_cases.remove(unittest) - if len(non_parallel_job_1) < 10000: - non_parallel_job_1 = non_parallel_job_1 + '|^' + unittest + '$' - else: - non_parallel_job_2 = non_parallel_job_2 + '|^' + unittest + '$' + for unittest in TWO_PARALLEL_JOB: + if unittest in test_cases: + two_parallel_job = two_parallel_job + '|^' + unittest + '$' + test_cases.remove(unittest) + + for unittest in test_cases: + non_parallel_job = non_parallel_job + '|^' + unittest + '$' - non_parallel_job = ",".join([non_parallel_job_1, non_parallel_job_2]) - print("{};{};{}".format(eight_parallel_job, tetrad_parallel_job, - non_parallel_job)) + print("{};{};{};{}".format(cpu_parallel_job, tetrad_parallel_job, + two_parallel_job, non_parallel_job)) if __name__ == '__main__': diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index dd4b21c80d9..db3f3648ce2 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -214,10 +214,8 @@ echo "Windows 1 card TestCases count is $num" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python 
${PADDLE_ROOT}/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then - set +x echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list` - set -x fi fi @@ -242,12 +240,11 @@ fi set -e output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") -eight_parallel_job=$(echo $output | cut -d ";" -f 1) -tetrad_parallel_jog=$(echo $output | cut -d ";" -f 2) -non_parallel_job=$(echo $output | cut -d ";" -f 3) +cpu_parallel_job=$(echo $output | cut -d ";" -f 1) +tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) +two_parallel_job=$(echo $output | cut -d ";" -f 3) +non_parallel_job=$(echo $output | cut -d ";" -f 4) -non_parallel_job_1=$(echo $non_parallel_job | cut -d "," -f 1) -non_parallel_job_2=$(echo $non_parallel_job | cut -d "," -f 2) failed_test_lists='' tmp_dir=`mktemp -d` @@ -270,10 +267,11 @@ function collect_failed_tests() { function run_unittest() { test_case=$1 parallel_job=$2 + parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} if [ "$2" == "" ]; then - parallel_job=1 + parallel_job=$parallel_level_base else - parallel_job=$2 + parallel_job=`expr $2 \* $parallel_level_base` fi echo "************************************************************************" echo "********These unittests run $parallel_job job each time with 1 GPU**********" @@ -336,7 +334,7 @@ function unittests_retry(){ function show_ut_retry_result() { if [[ "$is_retry_execuate" != "0" ]];then - failed_test_lists_ult=`echo "${failed_test_lists}" | grep -Po '[^ ].*$'` + failed_test_lists_ult=`echo "${failed_test_lists}" | grep -o '[^ ].*$'` echo "=========================================" echo "There are more than 10 failed unit tests, so no unit test retry!!!" echo "=========================================" @@ -349,7 +347,7 @@ function show_ut_retry_result() { echo "========================================" echo "There are failed tests, which have been successful after re-run:" echo "========================================" - echo "The following tests have been re-ran:" + echo "The following tests have been re-run:" echo "${retry_unittests_record}" else failed_ut_re=$(echo "${retry_unittests_record_judge}" | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"|"$1}} END{print all_str}') @@ -365,10 +363,10 @@ function show_ut_retry_result() { } set +e -run_unittest $eight_parallel_job 8 -run_unittest $tetrad_parallel_jog 4 -run_unittest $non_parallel_job_1 -run_unittest $non_parallel_job_2 +run_unittest $cpu_parallel_job 12 +run_unittest $tetrad_parallel_job 4 +run_unittest $two_parallel_job 2 +run_unittest $non_parallel_job collect_failed_tests set -e rm -f $tmp_dir/* -- GitLab From d5b5004b5b6b3b010b959f73806c5fa87cf8cd82 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 31 Mar 2021 22:39:11 +0800 Subject: [PATCH 122/486] Delete legacy C++ training user-interface (#31949) * delete include framework.pb.h * fix error * delete fluid_train --- paddle/fluid/CMakeLists.txt | 1 - paddle/fluid/train/CMakeLists.txt | 31 --- paddle/fluid/train/demo/CMakeLists.txt | 77 ------- paddle/fluid/train/demo/README.md | 65 ------ paddle/fluid/train/demo/clean.sh | 20 -- paddle/fluid/train/demo/demo_network.py | 47 ----- paddle/fluid/train/demo/demo_trainer.cc | 118 ----------- paddle/fluid/train/demo/run.sh | 30 --- paddle/fluid/train/imdb_demo/CMakeLists.txt | 76 ------- paddle/fluid/train/imdb_demo/README.md | 115 ----------- paddle/fluid/train/imdb_demo/demo_trainer.cc | 192 ------------------ .../fluid/train/imdb_demo/generate_program.py | 72 ------- 
paddle/fluid/train/imdb_demo/imdb_reader.py | 75 ------- .../train/imdb_demo/include/save_model.h | 41 ---- paddle/fluid/train/imdb_demo/nets.py | 140 ------------- paddle/fluid/train/imdb_demo/run.sh | 3 - paddle/fluid/train/imdb_demo/save_model.cc | 77 ------- paddle/fluid/train/imdb_demo/train.cfg | 7 - .../fluid/train/imdb_demo/train_filelist.txt | 12 -- .../train/test_train_recognize_digits.cc | 95 --------- 20 files changed, 1294 deletions(-) delete mode 100644 paddle/fluid/train/CMakeLists.txt delete mode 100644 paddle/fluid/train/demo/CMakeLists.txt delete mode 100644 paddle/fluid/train/demo/README.md delete mode 100755 paddle/fluid/train/demo/clean.sh delete mode 100644 paddle/fluid/train/demo/demo_network.py delete mode 100644 paddle/fluid/train/demo/demo_trainer.cc delete mode 100755 paddle/fluid/train/demo/run.sh delete mode 100644 paddle/fluid/train/imdb_demo/CMakeLists.txt delete mode 100644 paddle/fluid/train/imdb_demo/README.md delete mode 100644 paddle/fluid/train/imdb_demo/demo_trainer.cc delete mode 100644 paddle/fluid/train/imdb_demo/generate_program.py delete mode 100644 paddle/fluid/train/imdb_demo/imdb_reader.py delete mode 100644 paddle/fluid/train/imdb_demo/include/save_model.h delete mode 100644 paddle/fluid/train/imdb_demo/nets.py delete mode 100644 paddle/fluid/train/imdb_demo/run.sh delete mode 100644 paddle/fluid/train/imdb_demo/save_model.cc delete mode 100644 paddle/fluid/train/imdb_demo/train.cfg delete mode 100644 paddle/fluid/train/imdb_demo/train_filelist.txt delete mode 100644 paddle/fluid/train/test_train_recognize_digits.cc diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index c18332d3b87..dcff02a662e 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,3 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. 
add_subdirectory(inference) -add_subdirectory(train) diff --git a/paddle/fluid/train/CMakeLists.txt b/paddle/fluid/train/CMakeLists.txt deleted file mode 100644 index 0688c63cac3..00000000000 --- a/paddle/fluid/train/CMakeLists.txt +++ /dev/null @@ -1,31 +0,0 @@ -function(train_test TARGET_NAME) - set(options "") - set(oneValueArgs "") - set(multiValueArgs ARGS) - cmake_parse_arguments(train_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - if (NOT APPLE AND NOT WIN32) - cc_test(test_train_${TARGET_NAME} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_inference_shared - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - else() - cc_test(test_train_${TARGET_NAME} - SRCS test_train_${TARGET_NAME}.cc - DEPS paddle_inference_io - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - endif() - if(TEST test_train_${TARGET_NAME}) - set_tests_properties(test_train_${TARGET_NAME} - PROPERTIES FIXTURES_REQUIRED test_${TARGET_NAME}_infer_model) - if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_train_${TARGET_NAME} - PROPERTIES TIMEOUT 150) - endif() - endif() -endfunction(train_test) - - -if(WITH_TESTING) - train_test(recognize_digits) -endif() diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt deleted file mode 100644 index 95da77d68d4..00000000000 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ /dev/null @@ -1,77 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(cpp_train_demo CXX C) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir") -endif() - -option(WITH_MKLDNN "Compile PaddlePaddle with MKLDNN" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support, default use openblas." OFF) - -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") -include_directories("${PADDLE_LIB}/third_party/threadpool") -include_directories("${PADDLE_LIB}/third_party/dlpack") - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") - -add_executable(demo_trainer demo_trainer.cc) - -if(WITH_MKLDNN) - add_definitions(-DPADDLE_WITH_MKLDNN) - include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - if(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) - endif(WIN32) -endif(WITH_MKLDNN) - -if(WITH_MKL) - include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - if(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) - else(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) - endif(WIN32) -else() - if(APPLE) - set(MATH_LIB cblas) - elseif(WIN32) - set(MATH_LIB 
${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) - else() - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) - endif(APPLE) -endif() - -if(APPLE) - set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security") -else(APPLE) - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") - set(EXTERNAL_LIB "-lrt -ldl -lpthread") -endif(APPLE) - -target_link_libraries(demo_trainer - ${MACOS_LD_FLAGS} - ${ARCHIVE_START} - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference.so - ${ARCHIVE_END} - ${MATH_LIB} - ${MKLDNN_LIB} - glog gflags protobuf z xxhash - ${EXTERNAL_LIB}) diff --git a/paddle/fluid/train/demo/README.md b/paddle/fluid/train/demo/README.md deleted file mode 100644 index 8a44c25aea9..00000000000 --- a/paddle/fluid/train/demo/README.md +++ /dev/null @@ -1,65 +0,0 @@ - -### step 1. build paddle lib - -``` - -# WITH_MKL=ON|OFF -# WITH_MKLDNN=ON|OFF - -PADDLE_LIB=/paddle/lib/dir -cmake .. -DPADDLE_INSTALL_DIR=$PADDLE_LIB \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_GPU=OFF \ - -DWITH_STYLE_CHECK=OFF \ - -DWITH_MKL=OFF \ - -DWITH_MKLDNN=OFF -make -j8 -make -j8 fluid_lib_dist -``` - -### step 2. generate program desc -``` -# please install paddle before run this scripe -pip install --upgrade paddlepaddle-*.whl -python demo_network.py -``` - -This will generate two program desc files: - - startup_program: used to init all parameters - - main_program: main logic of the network - -### step 3. build demo_trainer and run it. - - -``` -# Make a build dir at the same dir of this README.md document. -# The demo dir can be put anywhere. -mkdir build -cd build - -# WITH_MKL=ON|OFF -# WITH_MKLDNN=ON|OFF -PADDLE_LIB=/paddle/lib/dir - -# PADDLE_LIB is the same with PADDLE_INSTALL_DIR when building the lib -cmake .. -DPADDLE_LIB=$PADDLE_LIB \ - -DWITH_MKLDNN=OFF \ - -DWITH_MKL=OFF -make - -# copy startup_program and main_program to this dir -cp ../startup_program . -cp ../main_program . - -# run demo cpp trainer -./demo_trainer - -``` - -The output will be: -``` -step: 0 loss: 1069.02 -step: 1 loss: 1069.02 -step: 2 loss: 1069.02 -.... -``` diff --git a/paddle/fluid/train/demo/clean.sh b/paddle/fluid/train/demo/clean.sh deleted file mode 100755 index a2064492c08..00000000000 --- a/paddle/fluid/train/demo/clean.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -x -cd "$(dirname "$0")" -rm -rf build/ -set +x diff --git a/paddle/fluid/train/demo/demo_network.py b/paddle/fluid/train/demo/demo_network.py deleted file mode 100644 index 41e98c6a24a..00000000000 --- a/paddle/fluid/train/demo/demo_network.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import paddle.fluid as fluid -import paddle.fluid.framework as framework - - -def train_network(with_optimize): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - - y = fluid.layers.data(name='y', shape=[1], dtype='float32') - cost = fluid.layers.square_error_cost(input=y_predict, label=y) - avg_cost = fluid.layers.mean(cost) - - if with_optimize: - sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.00001) - sgd_optimizer.minimize(avg_cost) - else: - fluid.backward.append_backward(avg_cost) - - -def save_program_desc(network_func): - startup_program = framework.Program() - train_program = framework.Program() - - with framework.program_guard(train_program, startup_program): - network_func(with_optimize=False) - - with open("startup_program", "w") as f: - f.write(startup_program.desc.serialize_to_string()) - with open("main_program", "w") as f: - f.write(train_program.desc.serialize_to_string()) - - -save_program_desc(train_network) diff --git a/paddle/fluid/train/demo/demo_trainer.cc b/paddle/fluid/train/demo/demo_trainer.cc deleted file mode 100644 index 830f00b8db1..00000000000 --- a/paddle/fluid/train/demo/demo_trainer.cc +++ /dev/null @@ -1,118 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace train { - -void ReadBinaryFile(const std::string& filename, std::string* contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - fin.is_open(), true, - platform::errors::Unavailable("Failed to open file %s.", filename)); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); -} - -std::unique_ptr Load( - paddle::framework::Executor* executor, const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; - std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); - - std::unique_ptr main_program( - new paddle::framework::ProgramDesc(program_desc_str)); - return main_program; -} - -} // namespace train -} // namespace paddle - -int main() { - paddle::framework::InitDevices(); - - const auto cpu_place = paddle::platform::CPUPlace(); - - paddle::framework::Executor executor(cpu_place); - paddle::framework::Scope scope; - auto startup_program = paddle::train::Load(&executor, "startup_program"); - auto train_program = paddle::train::Load(&executor, "main_program"); - - std::string loss_name = ""; - for (auto op_desc : train_program->Block(0).AllOps()) { - if (op_desc->Type() == "mean") { - loss_name = op_desc->Output("Out")[0]; - break; - } - } - - PADDLE_ENFORCE_NE(loss_name, "", - platform::errors::NotFound("Loss name is not found.")); - - // init all parameters - executor.Run(*startup_program, &scope, 0); - - // prepare data - auto x_var = scope.Var("x"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({2, 13}); - - auto x_data = x_tensor->mutable_data(cpu_place); - for (int i = 0; i < 2 * 13; ++i) { - x_data[i] = static_cast(i); - } - - auto y_var = scope.Var("y"); - auto y_tensor = y_var->GetMutable(); - y_tensor->Resize({2, 1}); - auto y_data = y_tensor->mutable_data(cpu_place); - for (int i = 0; i < 2 * 1; ++i) { - y_data[i] = static_cast(i); - } - - auto loss_var = scope.Var(loss_name); - - paddle::platform::ProfilerState pf_state; - pf_state = paddle::platform::ProfilerState::kCPU; - paddle::platform::EnableProfiler(pf_state); - clock_t t1 = clock(); - - for (int i = 0; i < 10; ++i) { - executor.Run(*train_program, &scope, 0, false, true); - std::cout << "step: " << i << " loss: " - << loss_var->Get().data()[0] - << std::endl; - } - - clock_t t2 = clock(); - paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kTotal, - "run_paddle_op_profiler"); - std::cout << "run_time = " << t2 - t1 << std::endl; - return 0; -} diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh deleted file mode 100755 index 2955e7574da..00000000000 --- a/paddle/fluid/train/demo/run.sh +++ /dev/null @@ -1,30 +0,0 @@ -#!/bin/bash - -set -x - -PADDLE_ROOT=$1 -TURN_ON_MKL=$2 # use MKL or Openblas - -# download models -function download() { - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/main_program - wget -q http://paddle-tar.bj.bcebos.com/train_demo/LR-1-7/startup_program -} - -download - -# build demo trainer 
-paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir - -mkdir -p build -cd build -rm -rf * -cmake .. -DPADDLE_LIB=$paddle_install_dir \ - -DWITH_MKLDNN=$TURN_ON_MKL \ - -DWITH_MKL=$TURN_ON_MKL -make - -cd .. - -# run demo trainer -build/demo_trainer diff --git a/paddle/fluid/train/imdb_demo/CMakeLists.txt b/paddle/fluid/train/imdb_demo/CMakeLists.txt deleted file mode 100644 index e943d6bc78e..00000000000 --- a/paddle/fluid/train/imdb_demo/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -cmake_minimum_required(VERSION 3.0) - -project(cpp_imdb_train_demo CXX C) - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/paddle/lib/dir") -endif() - -option(WITH_MKLDNN "Compile PaddlePaddle with MKLDNN" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support, default use openblas." OFF) - -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") -include_directories("${PADDLE_LIB}/third_party/threadpool") -include_directories("${PADDLE_LIB}/third_party/dlpack") - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") - -add_executable(demo_trainer save_model.cc demo_trainer.cc) - -if(WITH_MKLDNN) - include_directories("${PADDLE_LIB}/third_party/install/mkldnn/include") - if(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/mkldnn.lib) - else(WIN32) - set(MKLDNN_LIB ${PADDLE_LIB}/third_party/install/mkldnn/lib/libmkldnn.so.0) - endif(WIN32) -endif(WITH_MKLDNN) - -if(WITH_MKL) - include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - if(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/mklml.lib) - else(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel.so) - endif(WIN32) -else() - if(APPLE) - set(MATH_LIB cblas) - elseif(WIN32) - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.lib) - else() - set(MATH_LIB ${PADDLE_LIB}/third_party/install/openblas/lib/libopenblas.a) - endif(APPLE) -endif() - -if(APPLE) - set(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security") -else(APPLE) - set(ARCHIVE_START "-Wl,--whole-archive") - set(ARCHIVE_END "-Wl,--no-whole-archive") - set(EXTERNAL_LIB "-lrt -ldl -lpthread") -endif(APPLE) - -target_link_libraries(demo_trainer - ${MACOS_LD_FLAGS} - ${ARCHIVE_START} - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference.so - ${ARCHIVE_END} - ${MATH_LIB} - ${MKLDNN_LIB} - glog gflags protobuf z xxhash - ${EXTERNAL_LIB}) diff --git a/paddle/fluid/train/imdb_demo/README.md b/paddle/fluid/train/imdb_demo/README.md deleted file mode 100644 index 28fd66710f8..00000000000 --- a/paddle/fluid/train/imdb_demo/README.md +++ /dev/null @@ -1,115 +0,0 @@ -# Train with C++ inference API - -What is C++ inference 
API and how to install it: - -see: [PaddlePaddle Fluid 提供了 C++ API 来支持模型的部署上线](https://paddlepaddle.org.cn/documentation/docs/zh/1.5/advanced_usage/deploy/inference/index_cn.html) - -After downloading the source code of Paddle, you can build your own inference lib: - -```shell -PADDLE_ROOT=./Paddle -cd Paddle -mkdir build -cd build -cmake -DPADDLE_INFERENCE_INSTALL_DIR=$PADDLE_ROOT \ - -DCMAKE_BUILD_TYPE=Release \ - -DWITH_PYTHON=OFF \ - -DWITH_MKL=OFF \ - -DWITH_GPU=OFF \ - -DON_INFER=ON \ - .. -make -make inference_lib_dist -``` - -## IMDB task - -see: [IMDB Dataset of 50K Movie Reviews | Kaggle](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) - -## Quick Start - -### prepare data - -```shell - wget https://fleet.bj.bcebos.com/text_classification_data.tar.gz - tar -zxvf text_classification_data.tar.gz -``` -### build - -```shell - mkdir build - cd build - rm -rf * - PADDLE_LIB=path/to/Paddle/build/paddle_install_dir - cmake .. -DPADDLE_LIB=$PADDLE_LIB -DWITH_MKLDNN=OFF -DWITH_MKL=OFF - make -``` - -### generate program description - -``` - python generate_program.py bow -``` - -### run - -```shell - # After editing train.cfg - sh run.sh -``` - -## results - -Below are training logs on BOW model, the losses go down as expected. - -``` -WARNING: Logging before InitGoogleLogging() is written to STDERR -I0731 22:39:06.974232 10965 demo_trainer.cc:130] Start training... -I0731 22:39:57.395229 10965 demo_trainer.cc:164] epoch: 0; average loss: 0.405706 -I0731 22:40:50.262344 10965 demo_trainer.cc:164] epoch: 1; average loss: 0.110746 -I0731 22:41:49.731079 10965 demo_trainer.cc:164] epoch: 2; average loss: 0.0475805 -I0731 22:43:31.398355 10965 demo_trainer.cc:164] epoch: 3; average loss: 0.0233249 -I0731 22:44:58.744391 10965 demo_trainer.cc:164] epoch: 4; average loss: 0.00701507 -I0731 22:46:30.451735 10965 demo_trainer.cc:164] epoch: 5; average loss: 0.00258187 -I0731 22:48:14.396687 10965 demo_trainer.cc:164] epoch: 6; average loss: 0.00113157 -I0731 22:49:56.242744 10965 demo_trainer.cc:164] epoch: 7; average loss: 0.000698234 -I0731 22:51:11.585919 10965 demo_trainer.cc:164] epoch: 8; average loss: 0.000510136 -I0731 22:52:50.573947 10965 demo_trainer.cc:164] epoch: 9; average loss: 0.000400932 -I0731 22:54:02.686152 10965 demo_trainer.cc:164] epoch: 10; average loss: 0.000329259 -I0731 22:54:55.233342 10965 demo_trainer.cc:164] epoch: 11; average loss: 0.000278644 -I0731 22:56:15.496256 10965 demo_trainer.cc:164] epoch: 12; average loss: 0.000241055 -I0731 22:57:45.015926 10965 demo_trainer.cc:164] epoch: 13; average loss: 0.000212085 -I0731 22:59:18.419997 10965 demo_trainer.cc:164] epoch: 14; average loss: 0.000189109 -I0731 23:00:15.409077 10965 demo_trainer.cc:164] epoch: 15; average loss: 0.000170465 -I0731 23:01:38.795770 10965 demo_trainer.cc:164] epoch: 16; average loss: 0.000155051 -I0731 23:02:57.289487 10965 demo_trainer.cc:164] epoch: 17; average loss: 0.000142106 -I0731 23:03:48.032507 10965 demo_trainer.cc:164] epoch: 18; average loss: 0.000131089 -I0731 23:04:51.195230 10965 demo_trainer.cc:164] epoch: 19; average loss: 0.000121605 -I0731 23:06:27.008040 10965 demo_trainer.cc:164] epoch: 20; average loss: 0.00011336 -I0731 23:07:56.568284 10965 demo_trainer.cc:164] epoch: 21; average loss: 0.000106129 -I0731 23:09:23.948290 10965 demo_trainer.cc:164] epoch: 22; average loss: 9.97393e-05 -I0731 23:10:56.062590 10965 demo_trainer.cc:164] epoch: 23; average loss: 9.40532e-05 -I0731 23:12:23.014047 10965 demo_trainer.cc:164] epoch: 24; average loss: 
8.89622e-05 -I0731 23:13:21.439818 10965 demo_trainer.cc:164] epoch: 25; average loss: 8.43784e-05 -I0731 23:14:56.171597 10965 demo_trainer.cc:164] epoch: 26; average loss: 8.02322e-05 -I0731 23:16:01.513542 10965 demo_trainer.cc:164] epoch: 27; average loss: 7.64629e-05 -I0731 23:17:18.709139 10965 demo_trainer.cc:164] epoch: 28; average loss: 7.30239e-05 -I0731 23:18:41.421555 10965 demo_trainer.cc:164] epoch: 29; average loss: 6.98716e-05 -``` - -I trained a Bow model and a CNN model on IMDB dataset using the trainer. At the same time, I also trained the same models using traditional Python training methods. -Results show that the two methods achieve almost the same dev accuracy: - -CNN: - - - -BOW: - - - -I also recorded the training speed of the C++ Trainer and the python training methods, C++ trainer is quicker on CNN model: - - - -#TODO (mapingshuo): find the reason why C++ trainer is quicker on CNN model than python method. diff --git a/paddle/fluid/train/imdb_demo/demo_trainer.cc b/paddle/fluid/train/imdb_demo/demo_trainer.cc deleted file mode 100644 index 6d3b8e7ca4a..00000000000 --- a/paddle/fluid/train/imdb_demo/demo_trainer.cc +++ /dev/null @@ -1,192 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include - -#include "include/save_model.h" -#include "paddle/fluid/framework/data_feed_factory.h" -#include "paddle/fluid/framework/dataset_factory.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" - -#include "gflags/gflags.h" - -DEFINE_string(filelist, "train_filelist.txt", "filelist for fluid dataset"); -DEFINE_string(data_proto_desc, "data.proto", "data feed protobuf description"); -DEFINE_string(startup_program_file, "startup_program", - "startup program description"); -DEFINE_string(main_program_file, "", "main program description"); -DEFINE_string(loss_name, "mean_0.tmp_0", - "loss tensor name in the main program"); -DEFINE_string(save_dir, "cnn_model", "directory to save trained models"); -DEFINE_int32(epoch_num, 30, "number of epochs to run when training"); - -namespace paddle { -namespace train { - -void ReadBinaryFile(const std::string& filename, std::string* contents) { - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - fin.is_open(), true, - platform::errors::Unavailable("Failed to open file %s.", filename)); - fin.seekg(0, std::ios::end); - contents->clear(); - contents->resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(contents->at(0)), contents->size()); - fin.close(); -} - -std::unique_ptr LoadProgramDesc( - const std::string& model_filename) { - VLOG(3) << "loading model from " << model_filename; - std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); - std::unique_ptr main_program( - new paddle::framework::ProgramDesc(program_desc_str)); - return main_program; -} - -bool IsPersistable(const paddle::framework::VarDesc* var) { - if (var->Persistable() && - var->GetType() != paddle::framework::proto::VarType::FEED_MINIBATCH && - var->GetType() != paddle::framework::proto::VarType::FETCH_LIST && - var->GetType() != paddle::framework::proto::VarType::RAW) { - return true; - } - return false; -} - -} // namespace train -} // namespace paddle - -int main(int argc, char* argv[]) { - ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); - - std::cerr << "filelist: " << FLAGS_filelist << std::endl; - std::cerr << "data_proto_desc: " << FLAGS_data_proto_desc << std::endl; - std::cerr << "startup_program_file: " << FLAGS_startup_program_file - << std::endl; - std::cerr << "main_program_file: " << FLAGS_main_program_file << std::endl; - std::cerr << "loss_name: " << FLAGS_loss_name << std::endl; - std::cerr << "save_dir: " << FLAGS_save_dir << std::endl; - std::cerr << "epoch_num: " << FLAGS_epoch_num << std::endl; - - std::string filelist = std::string(FLAGS_filelist); - std::vector file_vec; - std::ifstream fin(filelist); - if (fin) { - std::string filename; - while (fin >> filename) { - file_vec.push_back(filename); - } - } - PADDLE_ENFORCE_GE( - file_vec.size(), 1, - platform::errors::InvalidArgument( - "At least one file to train, but received number of file is %d.", - file_vec.size())); - paddle::framework::InitDevices(); - const auto cpu_place = paddle::platform::CPUPlace(); - paddle::framework::Executor executor(cpu_place); - paddle::framework::Scope scope; - auto startup_program = - 
paddle::train::LoadProgramDesc(std::string(FLAGS_startup_program_file)); - auto main_program = - paddle::train::LoadProgramDesc(std::string(FLAGS_main_program_file)); - - executor.Run(*startup_program, &scope, 0); - - std::string data_feed_desc_str; - paddle::train::ReadBinaryFile(std::string(FLAGS_data_proto_desc), - &data_feed_desc_str); - VLOG(3) << "load data feed desc done."; - std::unique_ptr dataset_ptr; - dataset_ptr = - paddle::framework::DatasetFactory::CreateDataset("MultiSlotDataset"); - VLOG(3) << "initialize dataset ptr done"; - - // find all params - std::vector param_names; - const paddle::framework::BlockDesc& global_block = main_program->Block(0); - for (auto* var : global_block.AllVars()) { - if (paddle::train::IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); - param_names.push_back(var->Name()); - } - } - - int epoch_num = FLAGS_epoch_num; - std::string loss_name = FLAGS_loss_name; - auto loss_var = scope.Var(loss_name); - - LOG(INFO) << "Start training..."; - - for (int epoch = 0; epoch < epoch_num; ++epoch) { - VLOG(3) << "Epoch:" << epoch; - // get reader - dataset_ptr->SetFileList(file_vec); - VLOG(3) << "set file list done"; - dataset_ptr->SetThreadNum(1); - VLOG(3) << "set thread num done"; - dataset_ptr->SetDataFeedDesc(data_feed_desc_str); - VLOG(3) << "set data feed desc done"; - dataset_ptr->CreateReaders(); - const std::vector readers = - dataset_ptr->GetReaders(); - PADDLE_ENFORCE_EQ(readers.size(), 1, - platform::errors::InvalidArgument( - "Readers num(%d) should be equal to thread num(1).", - readers.size())); - readers[0]->SetPlace(paddle::platform::CPUPlace()); - const std::vector& input_feed_names = - readers[0]->GetUseSlotAlias(); - for (auto name : input_feed_names) { - readers[0]->AddFeedVar(scope.Var(name), name); - } - VLOG(3) << "get reader done"; - readers[0]->Start(); - VLOG(3) << "start a reader"; - VLOG(3) << "readers size: " << readers.size(); - - int step = 0; - std::vector loss_vec; - - while (readers[0]->Next() > 0) { - executor.Run(*main_program, &scope, 0, false, true); - loss_vec.push_back( - loss_var->Get().data()[0]); - } - float average_loss = - accumulate(loss_vec.begin(), loss_vec.end(), 0.0) / loss_vec.size(); - - LOG(INFO) << "epoch: " << epoch << "; average loss: " << average_loss; - dataset_ptr->DestroyReaders(); - - // save model - std::string save_dir_root = FLAGS_save_dir; - std::string save_dir = - save_dir_root + "/epoch" + std::to_string(epoch) + ".model"; - paddle::framework::save_model(main_program, &scope, param_names, save_dir, - false); - } -} diff --git a/paddle/fluid/train/imdb_demo/generate_program.py b/paddle/fluid/train/imdb_demo/generate_program.py deleted file mode 100644 index a12282d94dd..00000000000 --- a/paddle/fluid/train/imdb_demo/generate_program.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import sys -import paddle -import logging -import paddle.fluid as fluid - -logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') -logger = logging.getLogger("fluid") -logger.setLevel(logging.INFO) - - -def load_vocab(filename): - vocab = {} - with open(filename) as f: - wid = 0 - for line in f: - vocab[line.strip()] = wid - wid += 1 - vocab[""] = len(vocab) - return vocab - - -if __name__ == "__main__": - vocab = load_vocab('imdb.vocab') - dict_dim = len(vocab) - model_name = sys.argv[1] - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - dataset = fluid.DatasetFactory().create_dataset() - dataset.set_batch_size(128) - dataset.set_pipe_command("python imdb_reader.py") - - dataset.set_use_var([data, label]) - desc = dataset.proto_desc - - with open("data.proto", "w") as f: - f.write(dataset.desc()) - - from nets import * - if model_name == 'cnn': - logger.info("Generate program description of CNN net") - avg_cost, acc, prediction = cnn_net(data, label, dict_dim) - elif model_name == 'bow': - logger.info("Generate program description of BOW net") - avg_cost, acc, prediction = bow_net(data, label, dict_dim) - else: - logger.error("no such model: " + model_name) - exit(0) - # optimizer = fluid.optimizer.SGD(learning_rate=0.01) - optimizer = fluid.optimizer.Adagrad(learning_rate=0.01) - optimizer.minimize(avg_cost) - - with open(model_name + "_main_program", "wb") as f: - f.write(fluid.default_main_program().desc.serialize_to_string()) - - with open(model_name + "_startup_program", "wb") as f: - f.write(fluid.default_startup_program().desc.serialize_to_string()) diff --git a/paddle/fluid/train/imdb_demo/imdb_reader.py b/paddle/fluid/train/imdb_demo/imdb_reader.py deleted file mode 100644 index f197c95ec32..00000000000 --- a/paddle/fluid/train/imdb_demo/imdb_reader.py +++ /dev/null @@ -1,75 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import sys -import os -import paddle -import re -import paddle.fluid.incubate.data_generator as dg - - -class IMDBDataset(dg.MultiSlotDataGenerator): - def load_resource(self, dictfile): - self._vocab = {} - wid = 0 - with open(dictfile) as f: - for line in f: - self._vocab[line.strip()] = wid - wid += 1 - self._unk_id = len(self._vocab) - self._pattern = re.compile(r'(;|,|\.|\?|!|\s|\(|\))') - self.return_value = ("words", [1, 2, 3, 4, 5, 6]), ("label", [0]) - - def get_words_and_label(self, line): - send = '|'.join(line.split('|')[:-1]).lower().replace("
", - " ").strip() - label = [int(line.split('|')[-1])] - - words = [x for x in self._pattern.split(send) if x and x != " "] - feas = [ - self._vocab[x] if x in self._vocab else self._unk_id for x in words - ] - return feas, label - - def infer_reader(self, infer_filelist, batch, buf_size): - def local_iter(): - for fname in infer_filelist: - with open(fname, "r") as fin: - for line in fin: - feas, label = self.get_words_and_label(line) - yield feas, label - - import paddle - batch_iter = paddle.batch( - paddle.reader.shuffle( - local_iter, buf_size=buf_size), - batch_size=batch) - return batch_iter - - def generate_sample(self, line): - def memory_iter(): - for i in range(1000): - yield self.return_value - - def data_iter(): - feas, label = self.get_words_and_label(line) - yield ("words", feas), ("label", label) - - return data_iter - - -if __name__ == "__main__": - imdb = IMDBDataset() - imdb.load_resource("imdb.vocab") - imdb.run_from_stdin() diff --git a/paddle/fluid/train/imdb_demo/include/save_model.h b/paddle/fluid/train/imdb_demo/include/save_model.h deleted file mode 100644 index 45205286685..00000000000 --- a/paddle/fluid/train/imdb_demo/include/save_model.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -void save_model(const std::unique_ptr& main_program, Scope* scope, - const std::vector& param_names, - const std::string& model_name, bool save_combine); -} -} diff --git a/paddle/fluid/train/imdb_demo/nets.py b/paddle/fluid/train/imdb_demo/nets.py deleted file mode 100644 index a25e67e3b5d..00000000000 --- a/paddle/fluid/train/imdb_demo/nets.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import time -import numpy as np - -import paddle -import paddle.fluid as fluid - - -def bow_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2): - """ - bow net - """ - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') - bow_tanh = fluid.layers.tanh(bow) - fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") - fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") - prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def cnn_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - win_size=3): - """ - conv net - """ - emb = fluid.layers.embedding( - input=data, size=[dict_dim, emb_dim], is_sparse=True) - conv_3 = fluid.nets.sequence_conv_pool( - input=emb, - num_filters=hid_dim, - filter_size=win_size, - act="tanh", - pool_type="max") - - fc_1 = fluid.layers.fc(input=[conv_3], size=hid_dim2) - - prediction = fluid.layers.fc(input=[fc_1], size=class_dim, act="softmax") - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def lstm_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - emb_lr=30.0): - """ - lstm net - """ - emb = fluid.layers.embedding( - input=data, - size=[dict_dim, emb_dim], - param_attr=fluid.ParamAttr(learning_rate=emb_lr), - is_sparse=True) - - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) - - lstm_h, c = fluid.layers.dynamic_lstm( - input=fc0, size=hid_dim * 4, is_reverse=False) - - lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') - lstm_max_tanh = fluid.layers.tanh(lstm_max) - - fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') - - prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction - - -def gru_net(data, - label, - dict_dim, - emb_dim=128, - hid_dim=128, - hid_dim2=96, - class_dim=2, - emb_lr=400.0): - """ - gru net - """ - emb = fluid.layers.embedding( - input=data, - size=[dict_dim, emb_dim], - param_attr=fluid.ParamAttr(learning_rate=emb_lr)) - - fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) - gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) - gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') - gru_max_tanh = fluid.layers.tanh(gru_max) - fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') - prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = fluid.layers.mean(x=cost) - acc = fluid.layers.accuracy(input=prediction, label=label) - - return avg_cost, acc, prediction diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh deleted file mode 100644 index f71b4bac602..00000000000 --- a/paddle/fluid/train/imdb_demo/run.sh +++ /dev/null @@ -1,3 +0,0 @@ - -set -exu -build/demo_trainer 
--flagfile="train.cfg" diff --git a/paddle/fluid/train/imdb_demo/save_model.cc b/paddle/fluid/train/imdb_demo/save_model.cc deleted file mode 100644 index 49da550dbb7..00000000000 --- a/paddle/fluid/train/imdb_demo/save_model.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "include/save_model.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/feed_fetch_method.h" -#include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/framework/lod_rank_table.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/reader.h" -#include "paddle/fluid/platform/place.h" - -using std::unique_ptr; - -namespace paddle { -namespace framework { -void save_model(const unique_ptr& main_program, Scope* scope, - const std::vector& param_names, - const std::string& model_name, bool save_combine) { - auto place = platform::CPUPlace(); - const BlockDesc& global_block = main_program->Block(0); - std::vector paralist; - for (auto* var : global_block.AllVars()) { - bool is_model_param = false; - for (auto param_name : param_names) { - if (var->Name() == param_name) { - is_model_param = true; - break; - } - } - - if (!is_model_param) continue; - - if (!save_combine) { - VLOG(3) << "model var name: %s" << var->Name().c_str(); - - paddle::framework::AttributeMap attrs; - attrs.insert({"file_path", model_name + "/" + var->Name()}); - auto save_op = paddle::framework::OpRegistry::CreateOp( - "save", {{"X", {var->Name()}}}, {}, attrs); - - save_op->Run(*scope, place); - } else { - paralist.push_back(var->Name()); - } - } - if (save_combine) { - std::sort(paralist.begin(), paralist.end()); - paddle::framework::AttributeMap attrs; - attrs.insert({"file_path", model_name}); - auto save_op = paddle::framework::OpRegistry::CreateOp( - "save_combine", {{"X", paralist}}, {}, attrs); - save_op->Run(*scope, place); - } -} -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/train/imdb_demo/train.cfg b/paddle/fluid/train/imdb_demo/train.cfg deleted file mode 100644 index 1821498890b..00000000000 --- a/paddle/fluid/train/imdb_demo/train.cfg +++ /dev/null @@ -1,7 +0,0 @@ ---filelist=train_filelist.txt ---data_proto_desc=data.proto ---loss_name=mean_0.tmp_0 ---startup_program_file=bow_startup_program ---main_program_file=bow_main_program ---save_dir=bow_model ---epoch_num=30 diff --git a/paddle/fluid/train/imdb_demo/train_filelist.txt b/paddle/fluid/train/imdb_demo/train_filelist.txt deleted file mode 100644 index dcf088af417..00000000000 --- a/paddle/fluid/train/imdb_demo/train_filelist.txt +++ /dev/null @@ -1,12 +0,0 @@ -train_data/part-0 -train_data/part-1 -train_data/part-10 -train_data/part-11 -train_data/part-2 -train_data/part-3 
-train_data/part-4 -train_data/part-5 -train_data/part-6 -train_data/part-7 -train_data/part-8 -train_data/part-9 diff --git a/paddle/fluid/train/test_train_recognize_digits.cc b/paddle/fluid/train/test_train_recognize_digits.cc deleted file mode 100644 index 7a980cbac8b..00000000000 --- a/paddle/fluid/train/test_train_recognize_digits.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include - -#include "gflags/gflags.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/io/fs.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/inference/io.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/init.h" -#include "paddle/fluid/platform/place.h" - -DEFINE_string(dirname, "", "Directory of the train model."); - -namespace paddle { - -void Train(std::string model_dir) { - framework::InitDevices(); - const auto cpu_place = platform::CPUPlace(); - framework::Executor executor(cpu_place); - framework::Scope scope; - - auto train_program = inference::Load( - &executor, &scope, model_dir + "__model_combined__.main_program", - model_dir + "__params_combined__"); - - std::string loss_name = ""; - for (auto op_desc : train_program->Block(0).AllOps()) { - if (op_desc->Type() == "mean") { - loss_name = op_desc->Output("Out")[0]; - break; - } - } - - PADDLE_ENFORCE_NE(loss_name, "", - platform::errors::NotFound("Loss name is not found.")); - - // prepare data - auto x_var = scope.Var("img"); - auto x_tensor = x_var->GetMutable(); - x_tensor->Resize({64, 1, 28, 28}); - - auto x_data = x_tensor->mutable_data(cpu_place); - for (int i = 0; i < 64 * 28 * 28; ++i) { - x_data[i] = 1.0; - } - - auto y_var = scope.Var("label"); - auto y_tensor = y_var->GetMutable(); - y_tensor->Resize({64, 1}); - auto y_data = y_tensor->mutable_data(cpu_place); - for (int i = 0; i < 64 * 1; ++i) { - y_data[i] = static_cast(1); - } - - auto loss_var = scope.Var(loss_name); - float first_loss = 0.0; - float last_loss = 0.0; - for (int i = 0; i < 100; ++i) { - executor.Run(*train_program, &scope, 0, false, true, - {loss_name, "img", "label"}); - if (i == 0) { - first_loss = loss_var->Get().data()[0]; - } else if (i == 99) { - last_loss = loss_var->Get().data()[0]; - } - } - EXPECT_LT(last_loss, first_loss); -} - -TEST(train, recognize_digits) { - CHECK(!FLAGS_dirname.empty()); - Train(FLAGS_dirname + "recognize_digits_mlp.train.model/"); - Train(FLAGS_dirname + "recognize_digits_conv.train.model/"); -} - -} // namespace paddle -- GitLab From eb3199fc54f04d6b25b6bb271dff9e97375d7f77 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Thu, 1 Apr 2021 09:58:00 +0800 Subject: [PATCH 123/486] fix compilation error on rocm, test=develop (#31991) --- cmake/external/eigen.cmake | 10 +- patches/eigen/Meta.h | 806 
+++++++++++++++++++++++++++++++++++++ 2 files changed, 815 insertions(+), 1 deletion(-) create mode 100755 patches/eigen/Meta.h diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index f68db1eab3d..4619f9f7b7e 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -27,6 +27,14 @@ cache_third_party(extern_eigen3 if(WIN32) add_definitions(-DEIGEN_STRONG_INLINE=inline) +elseif(LINUX) + if(WITH_ROCM) + # For HIPCC Eigen::internal::device::numeric_limits is not EIGEN_DEVICE_FUNC + # which will cause compiler error of using __host__ funciont in __host__ __device__ + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) + endif() endif() set(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}) @@ -40,7 +48,7 @@ ExternalProject_Add( PREFIX ${EIGEN_PREFIX_DIR} SOURCE_DIR ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" - PATCH_COMMAND "" + PATCH_COMMAND ${EIGEN_PATCH_COMMAND} CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/patches/eigen/Meta.h b/patches/eigen/Meta.h new file mode 100755 index 00000000000..b7b789a19c4 --- /dev/null +++ b/patches/eigen/Meta.h @@ -0,0 +1,806 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2008-2015 Gael Guennebaud +// Copyright (C) 2006-2008 Benoit Jacob +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_META_H +#define EIGEN_META_H + +#if defined(EIGEN_GPU_COMPILE_PHASE) + + #include + + #if defined(EIGEN_CUDA_ARCH) + #include + #endif + + #if defined(EIGEN_HIP_DEVICE_COMPILE) + #include "Eigen/src/Core/arch/HIP/hcc/math_constants.h" + #endif + +#endif + +// Recent versions of ICC require for pointer types below. +#define EIGEN_ICC_NEEDS_CSTDINT (EIGEN_COMP_ICC>=1600 && EIGEN_COMP_CXXVER >= 11) + +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 || EIGEN_ICC_NEEDS_CSTDINT +#include +namespace Eigen { +namespace numext { +typedef std::uint8_t uint8_t; +typedef std::int8_t int8_t; +typedef std::uint16_t uint16_t; +typedef std::int16_t int16_t; +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without c++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint8_t uint8_t; +typedef ::int8_t int8_t; +typedef ::uint16_t uint16_t; +typedef ::int16_t int16_t; +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + +namespace Eigen { + +typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex; + +/** + * \brief The Index type as used for the API. + * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE. + * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex. + */ + +typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index; + +namespace internal { + +/** \internal + * \file Meta.h + * This file contains generic metaprogramming classes which are not specifically related to Eigen. + * \note In case you wonder, yes we're aware that Boost already provides all these features, + * we however don't want to add a dependency to Boost. 
+ */ + +// Only recent versions of ICC complain about using ptrdiff_t to hold pointers, +// and older versions do not provide *intptr_t types. +#if EIGEN_ICC_NEEDS_CSTDINT +typedef std::intptr_t IntPtr; +typedef std::uintptr_t UIntPtr; +#else +typedef std::ptrdiff_t IntPtr; +typedef std::size_t UIntPtr; +#endif +#undef EIGEN_ICC_NEEDS_CSTDINT + +struct true_type { enum { value = 1 }; }; +struct false_type { enum { value = 0 }; }; + +template +struct bool_constant; + +template<> +struct bool_constant : true_type {}; + +template<> +struct bool_constant : false_type {}; + +template +struct conditional { typedef Then type; }; + +template +struct conditional { typedef Else type; }; + +template struct remove_reference { typedef T type; }; +template struct remove_reference { typedef T type; }; + +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; +template struct remove_pointer { typedef T type; }; + +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type; }; +template struct remove_const { typedef T type[]; }; +template struct remove_const { typedef T type[Size]; }; + +template struct remove_all { typedef T type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; +template struct remove_all { typedef typename remove_all::type type; }; + +template struct is_arithmetic { enum { value = false }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic{ enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; + +template struct is_same { enum { value = 0 }; }; +template struct is_same { enum { value = 1 }; }; + +template< class T > +struct is_void : is_same::type> {}; + +#if EIGEN_HAS_CXX11 +template<> struct is_arithmetic { enum { value = true }; }; +template<> struct is_arithmetic { enum { value = true }; }; +using std::is_integral; +#else +template struct is_integral { enum { value = false }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral { enum { 
value = true }; }; +#endif +#endif + +#if EIGEN_HAS_CXX11 +using std::make_unsigned; +#else +// TODO: Possibly improve this implementation of make_unsigned. +// It is currently used only by +// template struct random_default_impl. +template struct make_unsigned; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +#if EIGEN_COMP_MSVC +template<> struct make_unsigned { typedef unsigned __int64 type; }; +template<> struct make_unsigned { typedef unsigned __int64 type; }; +#endif + +// Some platforms define int64_t as long long even for C++03. In this case we +// are missing the definition for make_unsigned. If we just define it, we get +// duplicated definitions for platforms defining int64_t as signed long for +// C++03. We therefore add the specialization for C++03 long long for these +// platforms only. +#if EIGEN_OS_MAC +template<> struct make_unsigned { typedef unsigned long long type; }; +template<> struct make_unsigned { typedef unsigned long long type; }; +#endif +#endif + +template struct add_const { typedef const T type; }; +template struct add_const { typedef T& type; }; + +template struct is_const { enum { value = 0 }; }; +template struct is_const { enum { value = 1 }; }; + +template struct add_const_on_value_type { typedef const T type; }; +template struct add_const_on_value_type { typedef T const& type; }; +template struct add_const_on_value_type { typedef T const* type; }; +template struct add_const_on_value_type { typedef T const* const type; }; +template struct add_const_on_value_type { typedef T const* const type; }; + +#if EIGEN_HAS_CXX11 + +using std::is_convertible; + +#else + +template +struct is_convertible_impl +{ +private: + struct any_conversion + { + template any_conversion(const volatile T&); + template any_conversion(T&); + }; + struct yes {int a[1];}; + struct no {int a[2];}; + + template + static yes test(T, int); + + template + static no test(any_conversion, ...); + +public: + static typename internal::remove_reference::type* ms_from; +#ifdef __INTEL_COMPILER + #pragma warning push + #pragma warning ( disable : 2259 ) +#endif + enum { value = sizeof(test(*ms_from, 0))==sizeof(yes) }; +#ifdef __INTEL_COMPILER + #pragma warning pop +#endif +}; + +template +struct is_convertible +{ + enum { value = is_convertible_impl::value }; +}; + +template +struct is_convertible { enum { value = false }; }; + +template +struct is_convertible { enum { value = true }; }; + +#endif + +/** \internal Allows to enable/disable an overload + * according to a compile time condition. 
+ */ +template struct enable_if; + +template struct enable_if +{ typedef T type; }; + +#if defined(EIGEN_GPU_COMPILE_PHASE) +#if !defined(__FLT_EPSILON__) +#define __FLT_EPSILON__ FLT_EPSILON +#define __DBL_EPSILON__ DBL_EPSILON +#endif + +namespace device { + +template struct numeric_limits +{ + EIGEN_DEVICE_FUNC static T epsilon() { return 0; } + EIGEN_DEVICE_FUNC static T (max)() { assert(false && "Highest not supported for this type"); } + EIGEN_DEVICE_FUNC static T (min)() { assert(false && "Lowest not supported for this type"); } + EIGEN_DEVICE_FUNC static T infinity() { assert(false && "Infinity not supported for this type"); } + EIGEN_DEVICE_FUNC static T quiet_NaN() { assert(false && "quiet_NaN not supported for this type"); } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static float epsilon() { return __FLT_EPSILON__; } + EIGEN_DEVICE_FUNC + static float (max)() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_MAX_NORMAL_F; + #else + return HIPRT_MAX_NORMAL_F; + #endif + } + EIGEN_DEVICE_FUNC + static float (min)() { return FLT_MIN; } + EIGEN_DEVICE_FUNC + static float infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF_F; + #else + return HIPRT_INF_F; + #endif + } + EIGEN_DEVICE_FUNC + static float quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN_F; + #else + return HIPRT_NAN_F; + #endif + } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static double epsilon() { return __DBL_EPSILON__; } + EIGEN_DEVICE_FUNC + static double (max)() { return DBL_MAX; } + EIGEN_DEVICE_FUNC + static double (min)() { return DBL_MIN; } + EIGEN_DEVICE_FUNC + static double infinity() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_INF; + #else + return HIPRT_INF; + #endif + } + EIGEN_DEVICE_FUNC + static double quiet_NaN() { + #if defined(EIGEN_CUDA_ARCH) + return CUDART_NAN; + #else + return HIPRT_NAN; + #endif + } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static int (max)() { return INT_MAX; } + EIGEN_DEVICE_FUNC + static int (min)() { return INT_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned int epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned int (max)() { return UINT_MAX; } + EIGEN_DEVICE_FUNC + static unsigned int (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long (max)() { return LONG_MAX; } + EIGEN_DEVICE_FUNC + static long (min)() { return LONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long (max)() { return ULONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static long long (max)() { return LLONG_MAX; } + EIGEN_DEVICE_FUNC + static long long (min)() { return LLONG_MIN; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static unsigned long long epsilon() { return 0; } + EIGEN_DEVICE_FUNC + static unsigned long long (max)() { return ULLONG_MAX; } + EIGEN_DEVICE_FUNC + static unsigned long long (min)() { return 0; } +}; +template<> struct numeric_limits +{ + EIGEN_DEVICE_FUNC + static bool epsilon() { return false; } + EIGEN_DEVICE_FUNC + static bool (max)() { return true; } + EIGEN_DEVICE_FUNC + static bool (min)() { return false; 
} +}; + +} + +#endif + +/** \internal + * A base class do disable default copy ctor and copy assignment operator. + */ +class noncopyable +{ + EIGEN_DEVICE_FUNC noncopyable(const noncopyable&); + EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&); +protected: + EIGEN_DEVICE_FUNC noncopyable() {} + EIGEN_DEVICE_FUNC ~noncopyable() {} +}; + +/** \internal + * Provides access to the number of elements in the object of as a compile-time constant expression. + * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default). + * + * Similar to std::tuple_size, but more general. + * + * It currently supports: + * - any types T defining T::SizeAtCompileTime + * - plain C arrays as T[N] + * - std::array (c++11) + * - some internal types such as SingleRange and AllRange + * + * The second template parameter eases SFINAE-based specializations. + */ +template struct array_size { + enum { value = Dynamic }; +}; + +template struct array_size::type> { + enum { value = T::SizeAtCompileTime }; +}; + +template struct array_size { + enum { value = N }; +}; +template struct array_size { + enum { value = N }; +}; + +#if EIGEN_HAS_CXX11 +template struct array_size > { + enum { value = N }; +}; +template struct array_size > { + enum { value = N }; +}; +#endif + +/** \internal + * Analogue of the std::size free function. + * It returns the size of the container or view \a x of type \c T + * + * It currently supports: + * - any types T defining a member T::size() const + * - plain C arrays as T[N] + * + */ +template +Index size(const T& x) { return x.size(); } + +template +Index size(const T (&) [N]) { return N; } + +/** \internal + * Convenient struct to get the result type of a nullary, unary, binary, or + * ternary functor. + * + * Pre C++11: + * Supports both a Func::result_type member and templated + * Func::result::type member. + * + * If none of these members is provided, then the type of the first + * argument is returned. + * + * Post C++11: + * This uses std::result_of. However, note the `type` member removes + * const and converts references/pointers to their corresponding value type. 
+ */ +#if EIGEN_HAS_STD_INVOKE_RESULT +template struct result_of; + +template +struct result_of { + typedef typename std::invoke_result::type type1; + typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_STD_RESULT_OF +template struct result_of { + typedef typename std::result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template struct result_of { }; + +struct has_none {int a[1];}; +struct has_std_result_type {int a[2];}; +struct has_tr1_result {int a[3];}; + +template +struct nullary_result_of_select {}; + +template +struct nullary_result_of_select {typedef typename Func::result_type type;}; + +template +struct nullary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename nullary_result_of_select::type type; +}; + +template +struct unary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct unary_result_of_select {typedef typename Func::result_type type;}; + +template +struct unary_result_of_select {typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename unary_result_of_select::type type; +}; + +template +struct binary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct binary_result_of_select +{typedef typename Func::result_type type;}; + +template +struct binary_result_of_select +{typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename binary_result_of_select::type type; +}; + +template +struct ternary_result_of_select {typedef typename internal::remove_all::type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::result_type type;}; + +template +struct ternary_result_of_select +{typedef typename Func::template result::type type;}; + +template +struct result_of { + template + static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0); + template + static has_tr1_result testFunctor(T const *, typename T::template result::type const * = 0); + static has_none testFunctor(...); + + // note that the following indirection is needed for gcc-3.3 + enum {FunctorType = sizeof(testFunctor(static_cast(0)))}; + typedef typename ternary_result_of_select::type type; +}; + +#endif + +#if EIGEN_HAS_STD_INVOKE_RESULT +template +struct invoke_result { + typedef typename std::invoke_result::type type1; + 
typedef typename remove_all::type type; +}; +#elif EIGEN_HAS_CXX11 +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#else +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; + +template +struct invoke_result { + typedef typename result_of::type type1; + typedef typename remove_all::type type; +}; +#endif + +struct meta_yes { char a[1]; }; +struct meta_no { char a[2]; }; + +// Check whether T::ReturnType does exist +template +struct has_ReturnType +{ + template static meta_yes testFunctor(C const *, typename C::ReturnType const * = 0); + template static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template const T* return_ptr(); + +template +struct has_nullary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()())>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template +struct has_unary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template +struct has_binary_operator +{ + template static meta_yes testFunctor(C const *,typename enable_if<(sizeof(return_ptr()->operator()(IndexType(0),IndexType(0)))>0)>::type * = 0); + static meta_no testFunctor(...); + + enum { value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer. + * Usage example: \code meta_sqrt<1023>::ret \endcode + */ +template Y))) > + // use ?: instead of || just to shut up a stupid gcc 4.3 warning +class meta_sqrt +{ + enum { + MidX = (InfX+SupX)/2, + TakeInf = MidX*MidX > Y ? 1 : 0, + NewInf = int(TakeInf) ? InfX : int(MidX), + NewSup = int(TakeInf) ? int(MidX) : SupX + }; + public: + enum { ret = meta_sqrt::ret }; +}; + +template +class meta_sqrt { public: enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; }; + + +/** \internal Computes the least common multiple of two positive integer A and B + * at compile-time. It implements a naive algorithm testing all multiples of A. + * It thus works better if A>=B. + */ +template +struct meta_least_common_multiple +{ + enum { ret = meta_least_common_multiple::ret }; +}; +template +struct meta_least_common_multiple +{ + enum { ret = A*K }; +}; + +/** \internal determines whether the product of two numeric types is allowed and what the return type is */ +template struct scalar_product_traits +{ + enum { Defined = 0 }; +}; + +// FIXME quick workaround around current limitation of result_of +// template +// struct result_of(ArgType0,ArgType1)> { +// typedef typename scalar_product_traits::type, typename remove_all::type>::ReturnType type; +// }; + +/** \internal Obtains a POD type suitable to use as storage for an object of a size + * of at most Len bytes, aligned as specified by \c Align. 
+ */ +template +struct aligned_storage { + struct type { + EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len]; + }; +}; + +} // end namespace internal + +namespace numext { + +#if defined(EIGEN_GPU_COMPILE_PHASE) +template EIGEN_DEVICE_FUNC void swap(T &a, T &b) { T tmp = b; b = a; a = tmp; } +#else +template EIGEN_STRONG_INLINE void swap(T &a, T &b) { std::swap(a,b); } +#endif + +#if defined(EIGEN_GPU_COMPILE_PHASE) +using internal::device::numeric_limits; +#else +using std::numeric_limits; +#endif + +// Integer division with rounding up. +// T is assumed to be an integer type with a>=0, and b>0 +template +EIGEN_DEVICE_FUNC +T div_ceil(const T &a, const T &b) +{ + return (a+b-1) / b; +} + +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const X& x,const Y& y) { return x == y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } +#endif + +template EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC)) +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } +#endif + +} // end namespace numext + +} // end namespace Eigen + +#endif // EIGEN_META_H -- GitLab From 6b7448668d85ad9f831ea13a3a0c134a9ae99984 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 1 Apr 2021 10:18:38 +0800 Subject: [PATCH 124/486] fix en doc for emb (#31980) * fix en doc for emb, test=document_fix; Change-Id: I4757e67caacd7189f068493ed45a7445f87ffb40 --- python/paddle/nn/functional/input.py | 4 +--- python/paddle/nn/layer/common.py | 14 ++++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index bf389717518..b88a2b042ff 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -148,9 +148,7 @@ def embedding(x, weight, padding_idx=None, sparse=False, name=None): sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizers does not support sparse update, - such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , - :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , - :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these cases, sparse must be False. Default: False. padding_idx(int|long|None): padding_idx needs to be in the interval [-weight.shape[0], weight.shape[0]). 
If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 60c846f9f76..86a6fae0d68 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1229,7 +1229,7 @@ class Embedding(layers.Layer): For specific usage, refer to code examples. It implements the function of the Embedding Layer. This layer is used to lookup embeddings vector of ids provided by :attr:`x` . It automatically constructs a 2D embedding matrix based on the - input :attr:`num_embeddings` and attr:`embedding_dim`. + input :attr:`num_embeddings` and :attr:`embedding_dim`. The shape of output Tensor is generated by appending an emb_size dimension to the last dimension of the input Tensor shape. @@ -1241,9 +1241,9 @@ class Embedding(layers.Layer): Case 1: - input is a Tensor. padding_idx = -1 - input.data = [[1, 3], [2, 4], [4, 127] - input.shape = [3, 2] + x is a Tensor. padding_idx = -1 + x.data = [[1, 3], [2, 4], [4, 127] + x.shape = [3, 2] Given size = [128, 16] output is a Tensor: out.shape = [3, 2, 16] @@ -1261,7 +1261,7 @@ class Embedding(layers.Layer): Parameters: num_embeddings (int): Just one element which indicate the size of the dictionary of embeddings. - embedding_dim: Just one element which indicate the size of each embedding vector respectively. + embedding_dim (int): Just one element which indicate the size of each embedding vector respectively. padding_idx(int|long|None): padding_idx needs to be in the interval [-num_embeddings, num_embeddings). If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted to :math:`vocab\_size + padding\_idx` . It will output all-zero padding data whenever lookup @@ -1270,9 +1270,7 @@ class Embedding(layers.Layer): sparse(bool): The flag indicating whether to use sparse update. This parameter only affects the performance of the backwards gradient update. It is recommended to set True because sparse update is faster. But some optimizer does not support sparse update, - such as :ref:`api_optimizer_AdadeltaOptimizer` , :ref:`api_optimizer_AdamaxOptimizer` , - :ref:`api_optimizer_DecayedAdagradOptimizer` , :ref:`api_optimizer_FtrlOptimizer` , - :ref:`api_optimizer_LambOptimizer` and :ref:`api_optimizer_LarsMomentumOptimizer` . + such as :ref:`api_paddle_optimizer_adadelta_Adadelta` , :ref:`api_paddle_optimizer_adamax_Adamax` , :ref:`api_paddle_optimizer_lamb_Lamb`. In these case, sparse must be False. Default: False. weight_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . 
In addition, -- GitLab From dbeb3ea422acaf888684c588066b12fbfce9d52c Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 31 Mar 2021 21:44:36 -0500 Subject: [PATCH 125/486] Refactor and simplify hook design & add Tensor.register_hook API (#31775) * refactor and simplify hook design * fix reducer add hook error * add Tensor.register_hook basic impl * refine prepare data impl * revert prepare data change * support register_hook for Tensor * add hook test in model * polish tests and doc example * fix double grad test failed * remove reduce hook func * fix set empty error * polish code by comments * change reduce_hook to mutable_hook * remove useless tmp_ins * fix shape code format error * fix shape code format error --- paddle/fluid/imperative/basic_engine.cc | 79 +++- .../fluid/imperative/gradient_accumulator.cc | 61 ++- .../fluid/imperative/gradient_accumulator.h | 70 ++- paddle/fluid/imperative/hooks.h | 196 ++------- paddle/fluid/imperative/layer.h | 21 + paddle/fluid/imperative/op_base.h | 2 - .../fluid/imperative/partial_grad_engine.cc | 4 + paddle/fluid/imperative/reducer.cc | 8 +- paddle/fluid/imperative/tests/test_hooks.cc | 20 +- paddle/fluid/imperative/variable_wrapper.h | 120 ++--- paddle/fluid/pybind/imperative.cc | 125 ++++-- .../fluid/dygraph/varbase_patch_methods.py | 100 ++++- .../unittests/test_tensor_register_hook.py | 413 ++++++++++++++++++ 13 files changed, 863 insertions(+), 356 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_register_hook.py diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 29ba5498680..9e46af9cb72 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -141,17 +141,6 @@ void BasicEngine::PrepareGradAccumulators( << var.get() << ") that don't have grad node with reference count " << accumulator->RefCnt(); - - if (var->HasLeafHooks()) { - VLOG(3) << "Grad variable wrapper (" << var->Name() - << ") has leaf grad hooks."; - PADDLE_ENFORCE_NE( - var->HasGradNode(), true, - platform::errors::PermissionDenied( - "Only leaf Tensor's gradient can append hook to " - "Gradientaccumulator.")); - accumulator->SetPostHooks(var->GetLeafHooks()); - } } else { // Because Inplace op overwrites the grad_node of the input grad_var. So // only the information of grad_pending_node can be used to find the @@ -262,6 +251,30 @@ void BasicEngine::PrepareDeps() { } } +static std::shared_ptr> CallGradientHooks( + const NameVarMap& bwd_ins, const std::string& op_type) { + std::shared_ptr> tmp_ins_ptr = nullptr; + for (const auto& pair : bwd_ins) { + for (size_t i = 0; i < pair.second.size(); ++i) { + auto& var = pair.second[i]; + if (var->HasHook()) { + if (tmp_ins_ptr == nullptr) { + tmp_ins_ptr = std::make_shared>(bwd_ins); + } + VLOG(3) << "Call " << var->GetHooks().size() << " hooks of " << op_type + << "'s input `" << pair.first << "`'s var `" << var->Name() + << "`."; + auto tmp_var = var; + for (const auto& hook_pair : var->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + (*tmp_ins_ptr)[pair.first][i] = tmp_var; + } + } + } + return tmp_ins_ptr; +} + void BasicEngine::Execute() { if (init_node_ == nullptr) { return; @@ -292,10 +305,15 @@ void BasicEngine::Execute() { auto& bwd_ins = cur_op.GetInsMap(); auto& bwd_outs = cur_op.GetOutsMap(); + /** + * [ Why need temporary outputs here? 
] + * + * - construct the temp output map to avoid disrupting the graph + * - replace the element in the map by temp var, because a + * var may correspond to several grad vars in one op + */ NameVarMap tmp_outs(bwd_outs); - // 1. construct the temp output map, avoid to disrupt graph - // 2. replace the element in the map by temp var, because a - // var may be coresponding to several grad var in one op + for (auto& pair : tmp_outs) { if (!pair.second.IsGrad()) { continue; } @@ -408,10 +426,28 @@ void BasicEngine::Execute() { } } + /** + * [ Why need temporary inputs here? ] + * + * - Hook execution should not change the original input tensor. + * Users can register hooks for a Tensor's gradient. It is expected + * that the hook only affects the gradient of the backward + * propagation, and does not affect the gradient value passed + * in as the hook's input. + * - use `tmp_ins_ptr`: only copy bwd_ins when a var in bwd_ins + * holds hooks + */ + auto tmp_ins_ptr = CallGradientHooks(bwd_ins, cur_op.Type()); + { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } } for (auto& pair : inplace_output_grad_var_list_) { @@ -428,15 +464,14 @@ void BasicEngine::Execute() { if (!accumulator->SumGradCompleted()) { continue; } - // 1. Call Hooks for **inner_var_** + // 1. Call Hooks for `inner_var_` + accumulator->CallGradientHooks(); - // 2. Sum Gradient with Previous Graph + // 2. Sum Gradient `inner_var_` to `var_` of Current or Previous Graph accumulator->AccumulateGrad(); - // 3. Call backward Hooks for **var_** - if (accumulator->HasPostHooks()) { - accumulator->CallBackwardPostHooks(); - } + // 3. Call backward Hooks for `var_` + accumulator->CallReduceHooks(); } need_accu_var_list_.clear(); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index b9df88b1f1e..df5ff750c99 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -384,8 +384,8 @@ static platform::Place GetPlaceOfVar( void GradientAccumulator::AccumulateGrad() { /** - * If the gradient has been calculated by previous graph, - * it should be added to the previous graph result. + * If the leaf gradient has been fully calculated, the inner_var_ + * should be added to the var_.
*/ if (!var_->IsLeafGrad() || !SumGradCompleted() || !HasInnerVar()) { return; @@ -396,7 +396,7 @@ void GradientAccumulator::AccumulateGrad() { "this auto-grad")); PADDLE_ENFORCE_EQ(inner_var_->Var().IsInitialized(), true, platform::errors::InvalidArgument( - "Interior var of Leaf tensor should be initialized.")); + "Interior var of Leaf tensor should be initialized.")); auto* src = inner_var_->MutableVar(); auto* dst = var_->MutableVar(); if (!var_->IsEmpty()) { @@ -427,10 +427,65 @@ void GradientAccumulator::AccumulateGrad() { *(dst) = std::move(*src); var_->SetType(inner_var_->Type()); var_->SetDataType(inner_var_->DataType()); + var_->SetIsEmpty(false); } inner_var_.reset(); } +void GradientAccumulator::CallGradientHooks() { + PADDLE_ENFORCE_EQ(var_->IsLeafGrad(), true, + platform::errors::Unavailable( + "Only leaf gradient Tensor can deal with by gradient " + "hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ( + SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call gradient hooks after sum gradient completed.")); + PADDLE_ENFORCE_EQ( + HasInnerVar(), true, + platform::errors::PreconditionNotMet( + "Leaf Tensor's inner var is nullptr when call gradient hook.")); + PADDLE_ENFORCE_EQ( + inner_var_->Var().IsInitialized(), true, + platform::errors::PreconditionNotMet("Leaf Tensor's inner var " + "is not initialized when " + "call gradient hook.")); + if (var_->HasHook()) { + VLOG(3) << "Call " << var_->GetHooks().size() + << " hooks of leaf gradient accumulator's inner var `" + << var_->Name() << "`."; + auto tmp_var = inner_var_; + VLOG(3) << "Input var " << var_->Name() << "'s hook size - " + << var_->GetHooks().size(); + for (const auto& hook_pair : var_->GetHooks()) { + tmp_var = (*hook_pair.second)(tmp_var); + } + inner_var_ = tmp_var; + } +} + +void GradientAccumulator::CallReduceHooks() { + PADDLE_ENFORCE_EQ( + var_->IsLeafGrad(), true, + platform::errors::Unavailable("Only leaf gradient Tensor can deal with " + "by reduce hook in gradient accumulator.")); + PADDLE_ENFORCE_EQ(SumGradCompleted(), true, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the gradient " + "summation is completed in current batch.")); + PADDLE_ENFORCE_EQ(HasInnerVar(), false, + platform::errors::PreconditionNotMet( + "Only can call reduce hooks after the " + "gradient accumulation is completed in " + "current batch or across batchs.")); + if (var_->HasMutableHook()) { + for (const auto& hook : var_->GetMutableHooks()) { + VLOG(3) << "call gradient accumulator backward hooks."; + (*hook)(var_); + } + } +} + void EagerGradientAccumulator::SumGrad(std::shared_ptr var, size_t trace_id, bool unchange_input) { /** diff --git a/paddle/fluid/imperative/gradient_accumulator.h b/paddle/fluid/imperative/gradient_accumulator.h index e2dabc06a7d..6411dce4405 100644 --- a/paddle/fluid/imperative/gradient_accumulator.h +++ b/paddle/fluid/imperative/gradient_accumulator.h @@ -40,8 +40,8 @@ class GradientAccumulator { } // inner_var_ record the grad of this auto-grad. - // Only need to generate inner var for non-empty leaf-tensor. - if (var->IsLeafGrad() && !var->IsEmpty()) { + // Only need to generate inner var for leaf-tensor. 
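As a rough illustration of the behaviour described by the `[ Why need temporary inputs here? ]` note and by `CallGradientHooks` above: a hook registered on an interior (non-leaf) Tensor only rescales the gradient flowing backward through it and does not touch the forward value or the hook's own input gradient. The sketch below assumes the Python-side `Tensor.register_hook` API that this patch series adds (its Python wrapper is listed in the diffstat but not shown in these hunks), so the helper object `h` and the exact printed values are expectations, not guarantees.

    import paddle

    x = paddle.to_tensor([0., 1., 2.], stop_gradient=False)
    y = x + 1                                    # interior (non-leaf) tensor
    h = y.register_hook(lambda grad: grad * 2)   # returns a new gradient, no in-place change

    z = (y * y).sum()
    z.backward()

    # dz/dy = 2*y = [2., 4., 6.]; the hook doubles it before it reaches x,
    # so x.grad is expected to be [4., 8., 12.] while y's forward value is unchanged
    print(x.grad)
    h.remove()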
+ if (var->IsLeafGrad()) { inner_var_ = std::make_shared(var->Name()); inner_var_->SetType(var->Type()); inner_var_->SetDataType(var->DataType()); @@ -52,9 +52,6 @@ class GradientAccumulator { << ") to store result of this Graph"; } - // TODO(zhouwei): fix Tensor.clear_gradient() bug, remove this hard flag - var->SetIsEmpty(false); - // var_ is the final grad, processed by hooks and grad accumulation var_ = var; } @@ -93,42 +90,38 @@ class GradientAccumulator { inline bool HasInnerVar() const { return inner_var_ != nullptr; } - /* Hook related methods */ - inline bool HasPostHooks() const { return !post_hooks_.expired(); } - - void SetPostHooks(const std::shared_ptr& hooks) { - PADDLE_ENFORCE_NOT_NULL( - hooks, platform::errors::InvalidArgument( - "The hook set to GradientAccumulator is nullptr.")); - - auto shared_hooks = post_hooks_.lock(); - if (shared_hooks != hooks) { - PADDLE_ENFORCE_EQ( - shared_hooks, nullptr, - platform::errors::PermissionDenied( - "Cannot set post hooks twice to GradientAccumulator.")); - post_hooks_ = hooks; - } - } - // void CallHooks(){} - // ** inner_var_ ** - - // function that Sum Gradient with Previous Graph void AccumulateGrad(); - // call backward post hooks, such as reduce hook - void CallBackwardPostHooks() { - PADDLE_ENFORCE_NE( - post_hooks_.expired(), true, - platform::errors::NotFound( - "The post hooks of GradientAccumulator for Tensor `%s` expired.", - var_->Name())); - auto shared_hooks = post_hooks_.lock(); - for (const auto& hook : shared_hooks->backward_hooks()) { - VLOG(3) << "call gradient accumulator backward hooks."; - (*hook)(var_); - } - } + /** [ Hook related methods ] + * + * [Why need two types of VariableWrapperHook? ] + * + * There are two types of gradient accumulation: + * 1. Gradient accumulation in same batch + * 2. Gradient accumulation across batches + * The order of execution between Hooks and gradient accumulation: + + * [ Gradient accumulation in same batch] + * | + * [ leaf GradVarBase hooks ] + * | + * [ Gradient accumulation across batches ] + * | + * [ Gradient reduce / allreduce hooks ] + + * Because we currently intend to handle these two kinds of gradient + * accumulation in one GradientAccumulator, we must distinguish between + * two types of hooks. + + * And the InplaceVariableWrapperHook cannot be registered by users + * directly; it is currently only used to support the reduce strategy of + * parallel multi-card training.
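To make the ordering in the comment above concrete: for a leaf Tensor, the user-visible gradient hook is expected to run on the in-batch sum of the gradients, before that sum is accumulated into `.grad` and before any reduce hook. A minimal sketch, again assuming the `Tensor.register_hook` API added by this patch series and dygraph mode; the printed values follow from the ordering described here rather than from the hunks shown.

    import paddle

    x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
    h = x.register_hook(lambda grad: grad * 2)   # hook on a leaf tensor

    y = x * 2 + x * 3        # x receives gradient from two paths in the same batch
    y.sum().backward()

    # the two path gradients (2 and 3) are summed to 5 first, then the hook doubles it,
    # so each element of x.grad is expected to be 10
    print(x.grad)
    h.remove()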
+ */ + + void CallGradientHooks(); + + void CallReduceHooks(); protected: VariableWrapper* var_; @@ -137,7 +130,6 @@ class GradientAccumulator { std::shared_ptr inner_var_; size_t ref_cnt_{0}; size_t cur_cnt_{0}; - std::weak_ptr post_hooks_; }; class EagerGradientAccumulator : public GradientAccumulator { diff --git a/paddle/fluid/imperative/hooks.h b/paddle/fluid/imperative/hooks.h index 1211ec6ae6c..4d59298aed5 100644 --- a/paddle/fluid/imperative/hooks.h +++ b/paddle/fluid/imperative/hooks.h @@ -18,100 +18,67 @@ #include #include #include - -#include "paddle/fluid/imperative/type_defs.h" -#include "paddle/fluid/platform/macros.h" - namespace paddle { namespace imperative { class VariableWrapper; -/** [ Basic hook classes ] - * s - * @brief OpBasePreHook is executed before the grad OpBase is executed, +/** [ Const VariableWrapper Hook: Pre hook functor of OpBase ] + * + * @brief This hook functor is executed before the grad OpBase is executed, * taking the input of the current grad OpBase as input, and * executing python hooks (user-defined) or C++ hooks (developer-defined) * to achieve the purpose of custom operations on the interior VarBase * gradient. * - * @note OpBasePreHook will not change the input gradient VarBase. + * @note This hook functor will not change the input gradient VarBase. * * @note [Why need to be OpBase `PreHook`, why not `PostHook`?] * - * If set OpBase post hook, when the op executed end, the op's output - * gradient may not be the final state, because it may need other op's - * gradient output to accumulated to it. But before op can be executed, - * the gradient output must have been accumulated to final value. + * 1. We expect If set OpBase post hook, when the op executed end, the + * op's output gradient may not be the final state, because it may need + * other op's gradient output to accumulated to it. But before op can + * be executed, the gradient output must have been accumulated to final + * value. + * 2. We don’t want the hook to change its input Tensor value, so now + * we can't call all hooks in GradAccumulator. * * @note [Why only can be used for interior VarBase?] * * Because the leaf VarBase's GradVarBase has no GradOpNode, so leaf * GradVarBase has no next OpBase to executed, so if need to deal with - * the leaf GradVarBase, cannot use OpBasePreHook. For this case, we - * deal with by GradAccumulatorPostHook. + * the leaf GradVarBase, cannot use this hook functor. For this case, we + * deal with by other inplace hook method. */ -class OpBasePreHook { +class VariableWrapperHook { public: - virtual ~OpBasePreHook() = default; - virtual VariableWrapperList operator()( - const VariableWrapperList& grad_inputs) = 0; + virtual ~VariableWrapperHook() = default; + virtual std::shared_ptr operator()( + const std::shared_ptr& var) = 0; }; -/** - * @brief GradAccumulatorPostHook is the Hook that operates on the current +/** [ Inplace VariableWrapper Hook: Post hook functor of GradAccumulator ] + * + * @brief This hook functor is the Hook that operates on the current * gradientafter the GradientAccumulator has accumulated the gradient. * Leaf GradVarBase has no next OpBase, if we want to register hook * for it, we also need to wait until the leaf GradVarBase accumulation * is completed, so we can add post hook to GradientAccumulator. * - * @note GradAccumulatorPostHook will change the grad VarBase value. + * @note This hook functor will change the grad VarBase value. * - * @note Only allow leaf VarBase hold GradientAccumulatorPostHook. 
+ * @note Only allow leaf VarBase hold call this hook functor. */ -class GradAccumulatorPostHook { +class InplaceVariableWrapperHook { public: - virtual ~GradAccumulatorPostHook() = default; + virtual ~InplaceVariableWrapperHook() = default; virtual void operator()(VariableWrapper* var) = 0; }; -/** [ Hook for cpp functions ] - * - * Here we design three C++ hooks; - * 1. CppOpBasePreHook (Implement later): - * - used for developer-defined C++ interior VarBase hooks - * 2. CppGradAccumulatorPostHook (Implement later): - * - used for developer-defined C++ leaf VarBase hooks - * 3. LambdaGradAccumulatorPostHook: - * - used for VarBase reduce in parallel training - * - * @note [Why need two types of GradAccumulatorPostHook? ] - * - * There are two types of gradient accumulation: - * 1. Gradient accumulation in same batch - * 2. Gradient accumulation across batchs - * The order of execution between Hooks and gradient accumulation: - * - * [ Gradient accumulation in same batch] - * | - * [ leaf GradVarBase hooks ] - * | - * [ Gradient accumulation across batchs ] - * | - * [ Gradient reduce / allreduce] - * - * Because we currently intend to accumulate these two gradient - * accumulation in one GradientAccumulator, We must distinguish between - * two types of hooks. - * - * And the LambdaGradAccumulatorPostHook does not allow users to register - * directly, and is currently only used to support the reduce strategy of - * parallel multi-card training. - */ -class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { +class LambdaInplaceVariableWrapperHook : public InplaceVariableWrapperHook { public: - explicit LambdaGradAccumulatorPostHook( - std::function fn) + explicit LambdaInplaceVariableWrapperHook( + std::function&& fn) : fn_(std::move(fn)) {} void operator()(VariableWrapper* var) override { fn_(var); } @@ -120,114 +87,5 @@ class LambdaGradAccumulatorPostHook : public GradAccumulatorPostHook { std::function fn_; }; -/* Hooks for python function: in pybind/imperative.cc */ - -/** Add Python Hooks later: - * - PyOpBasePreHook (Implement later): used for user-defined interior python - * VarBase hooks - * - PyGradAccumulatorPostHook (Implement later): used for user-defined leaf - * python VarBase hooks - */ - -/** [ Hook Pipeline classes ] - * - * @note [Why need hook pipeline classes?] - * - * There are 2 purposes for adding Hook pipeline here: - * - * 1. Make the code implementation cleaner. - * - * If there are no Hook pipeline, we need to add 3 hook vector into - * VariableWrapper, 1 hook vector into OpBase, 2 hook vector into - * GradientAccumulator, like: - * - * - VariableWrapper: - * std::vector> - * interior_var_hooks_; - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * - OpBase: - * std::vector> - * interior_var_hooks_; - * - * - GradientAccumulator: - * std::vector> - * leaf_var_hooks_; - * std::vector> - * backward_hooks_; - * - * This seems more complicated, and std::vector> - * is not easy to destruct. - * - * 2. Make the code easier to understand. - * - * From these two packages, we can clearly understand that we - * have two types of Hooks, respectively for the interior - * gradient var and leaf gradient var inside the backward - * calculation graph. 
- */ - -class InteriorVarHookPipeline { - public: - InteriorVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { return hooks_; } - - private: - std::vector> hooks_; - - DISABLE_COPY_AND_ASSIGN(InteriorVarHookPipeline); -}; - -class LeafVarHookPipeline { - public: - LeafVarHookPipeline() = default; - - void add_hook(std::unique_ptr&& hook) { - hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& hooks() const { - return hooks_; - } - - std::vector>& hooks() { - return hooks_; - } - - void add_backward_hook(std::unique_ptr&& hook) { - backward_hooks_.emplace_back(std::move(hook)); - } - - const std::vector>& backward_hooks() - const { - return backward_hooks_; - } - - std::vector>& backward_hooks() { - return backward_hooks_; - } - - private: - std::vector> hooks_; - // NOTE: the `backward` here means the `whole backward process`, - // the `backward_hooks_` need to be executed after the `whole backward - // process`. - std::vector> backward_hooks_; - - DISABLE_COPY_AND_ASSIGN(LeafVarHookPipeline); -}; - } // namespace imperative } // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index ff5a780a5f9..f87db415768 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -30,6 +30,7 @@ #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/flags.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/saved_variable_wrapper_list.h" #include "paddle/fluid/imperative/type_defs.h" #include "paddle/fluid/imperative/variable_wrapper.h" @@ -220,6 +221,26 @@ class VarBase { void BumpInplaceVersion(); + /* Hook related method: now only used for GradVarBase */ + bool HasHook() const { return var_->HasHook(); } + + int64_t AddHook(std::shared_ptr&& hook) { + return var_->AddHook( + std::forward>(hook)); + } + + bool RemoveHook(const int64_t& hook_id) { return var_->RemoveHook(hook_id); } + + const std::map>& GetHooks() + const { + return var_->GetHooks(); + } + + void AddMutableHook(std::shared_ptr&& hook) { + var_->AddMutableHook( + std::forward>(hook)); + } + private: /** * NOTE(zengjinle): never remove the const qualifier of `var_` if you are diff --git a/paddle/fluid/imperative/op_base.h b/paddle/fluid/imperative/op_base.h index 2b7642ae7cf..0164ff9313c 100644 --- a/paddle/fluid/imperative/op_base.h +++ b/paddle/fluid/imperative/op_base.h @@ -177,8 +177,6 @@ class OpBase { std::unique_ptr op_; platform::Place place_; size_t id_{-1UL}; - - std::weak_ptr pre_hooks_; }; class GradOpNode { diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index 8dd8cafc835..3da3a05ed10 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -369,6 +369,10 @@ class GradientAccumulationInfo { *is_finished = (cur_ref_cnt_ == total_ref_cnt_); accumulator_->SumGrad(grad_var_partial, trace_id, unchange_input); + if (*is_finished && accumulator_->HasInnerVar()) { + accumulator_->AccumulateGrad(); + } + if (create_graph_) { VLOG(10) << "Store partial grad grad for double grad " << mapped_grad_var_->Name(); diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index e8b531d35ca..4b18886821b 100644 --- a/paddle/fluid/imperative/reducer.cc +++ 
b/paddle/fluid/imperative/reducer.cc @@ -310,11 +310,9 @@ Reducer::Reducer(const std::vector> &vars, for (size_t global_var_index = 0; global_var_index < vars_.size(); ++global_var_index) { auto var = vars_[global_var_index]; - var->SharedVar()->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper *grad) { - this->AddDistHook(global_var_index); - }))); + var->GradVarBase()->AddMutableHook( + std::make_shared([=]( + VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } } diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 7bf5f876681..9b75fac0ca5 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -74,16 +74,15 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { mul_attr_map["use_mkldnn"] = false; // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + x->GradVarBase()->AddMutableHook( + std::make_shared( + [=](VariableWrapper* grad) { auto* grad_tensor = grad->MutableVar()->GetMutable(); for (int i = 0; i < grad_tensor->numel(); ++i) { grad_tensor->mutable_data(place)[i] *= 2.0; } - }))); + })); // 2. forward tracer.TraceOp("mul", ins, outs, mul_attr_map, place, true); @@ -151,17 +150,16 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { memory::Copy(place, mutable_z, place, src_data.data(), sizeof(float) * src_data.size()); - // add GradAccumulatorPostHook - auto x_var_wrapper = x->SharedVar(); - x_var_wrapper->AddGradVarLeafBackwardHook( - std::unique_ptr( - new LambdaGradAccumulatorPostHook([=](VariableWrapper* grad) { + // add ReduceBackwardHook + x->GradVarBase()->AddMutableHook( + std::make_shared( + [=](VariableWrapper* grad) { auto* grad_tensor = grad->MutableVar()->GetMutable(); for (int i = 0; i < grad_tensor->numel(); ++i) { grad_tensor->mutable_data(place)[i] *= 2.0; } - }))); + })); // 2. 
forward var_pair x_pair = var_pair("X", vb_vector(1, x)); diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index b42f25dcc88..7d287c98291 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -27,8 +27,8 @@ namespace paddle { namespace imperative { -class InteriorVarHookPipeline; -class LeafVarHookPipeline; +class VariableWrapperHook; +class InplaceVariableWrapperHook; class VarBase; class GradOpNode; @@ -193,42 +193,6 @@ class VariableWrapper { } } - /* Hook related method: only can be call by GradVarBase */ - - bool HasInteriorHooks() const { return interior_hooks_ != nullptr; } - - bool HasLeafHooks() const { return leaf_hooks_ != nullptr; } - - void AddGradVarInteriorHook(std::unique_ptr&& hook) { - auto interior_hooks = GetGradVarInteriorHooksSafely(); - interior_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafHook(std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_hook(std::move(hook)); - } - - void AddGradVarLeafBackwardHook( - std::unique_ptr&& hook) { - auto leaf_hooks = GetGradVarLeafHooksSafely(); - leaf_hooks->add_backward_hook(std::move(hook)); - } - - const std::shared_ptr& GetInteriorHooks() const { - return interior_hooks_; - } - - std::shared_ptr& GetInteriorHooks() { - return interior_hooks_; - } - - const std::shared_ptr& GetLeafHooks() const { - return leaf_hooks_; - } - - std::shared_ptr& GetLeafHooks() { return leaf_hooks_; } - uint32_t InplaceVersionSnapshot() const { return inplace_version_snapshot_; } void ResetInplaceVersion() { @@ -255,6 +219,38 @@ class VariableWrapper { return; } + /* Hook related methods */ + bool HasHook() const { return !hooks_.empty(); } + + bool HasMutableHook() const { return !mutable_hooks_.empty(); } + + int64_t AddHook(std::shared_ptr&& hook) { + hooks_.emplace(next_hook_id_, std::move(hook)); + return next_hook_id_++; + } + + bool RemoveHook(const int64_t& hook_id) { + auto remove_cnt = hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } + + const std::map>& GetHooks() + const { + return hooks_; + } + + void AddMutableHook(std::shared_ptr&& hook) { + mutable_hooks_.emplace_back(std::move(hook)); + } + + const std::vector>& + GetMutableHooks() const { + return mutable_hooks_; + } + private: void SetGradVar(const std::shared_ptr& var) { auto shared_var = grad_var_.lock(); @@ -289,41 +285,6 @@ class VariableWrapper { } } - /* Hook related private methods */ - std::shared_ptr GetGradVarSafely() const { - auto shared_grad_var = grad_var_.lock(); - PADDLE_ENFORCE_NOT_NULL( - shared_grad_var, - platform::errors::PermissionDenied( - "Cannot add gradient hook on Tensor without gradient.")); - return shared_grad_var; - } - - std::shared_ptr& GetGradVarInteriorHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ(HasGradNode(), true, - platform::errors::PermissionDenied( - "Only interior Tensor in backward can register " - "interior gradient hook.")); - if (shared_grad_var->interior_hooks_ == nullptr) { - shared_grad_var->interior_hooks_ = - std::make_shared(); - } - return shared_grad_var->interior_hooks_; - } - - std::shared_ptr& GetGradVarLeafHooksSafely() { - auto shared_grad_var = GetGradVarSafely(); - PADDLE_ENFORCE_EQ( - HasGradNode(), false, - platform::errors::PermissionDenied( - "Only leaf Tensor in backward can register leaf gradient hook.")); - if (shared_grad_var->leaf_hooks_ == nullptr) { - 
shared_grad_var->leaf_hooks_ = std::make_shared(); - } - return shared_grad_var->leaf_hooks_; - } - private: framework::Variable var_; std::string name_; @@ -358,11 +319,14 @@ class VariableWrapper { // isn't need bool is_empty_{false}; - // NOTE: only grad var can hold hooks now - // only interior var can hold interior hooks - std::shared_ptr interior_hooks_; - // only leaf var can hold leaf hooks - std::shared_ptr leaf_hooks_; + // NOTE(chenweihang): only grad var can hold hooks now + int64_t next_hook_id_{0}; + // Hooks used to register hook for grad var, support adding and removing, + // key is the accumulated int64_t value + std::map> hooks_; + // Hooks executed after the execution of the entire backward process is over, + // currently only supported for reducing in distributed training + std::vector> mutable_hooks_; }; } // namespace imperative diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 40cf6cd84be..38ba1dc0293 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" #include "paddle/fluid/imperative/data_loader.h" +#include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/nccl_context.h" #include "paddle/fluid/imperative/partial_grad_engine.h" @@ -63,6 +64,65 @@ class Layer : public imperative::Layer { } }; +template +static T PyObjectCast(PyObject *obj) { + try { + return py::cast(py::handle(obj)); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Python object is not type of %s", typeid(T).name())); + } +} + +class PyVariableWrapperHook : public imperative::VariableWrapperHook { + public: + explicit PyVariableWrapperHook(PyObject *func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyVariableWrapperHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + std::shared_ptr operator()( + const std::shared_ptr &var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyVariableWrapperHook for var " << var->Name(); + + // 1. unpack temp VarBase from VariableWrapper + std::shared_ptr tmp_varbase = + std::make_shared(var); + + // 2. call hook and return + PyObject *res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, py::cast(tmp_varbase).ptr(), + nullptr); + } catch (platform::EnforceNotMet &e) { + throw std::move(e); + } catch (std::exception &e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) 
{ + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor return a nullptr.")); + if (res == Py_None) { + return var; + } + + return PyObjectCast>(res)->SharedVar(); + } + + private: + PyObject *py_func_; +}; + static const platform::Place PyObjectToPlace(const py::object &place_obj) { if (py::isinstance(place_obj)) { return place_obj.cast(); @@ -213,16 +273,6 @@ static std::string GetTypeName(const imperative::VarBase &var) { using PyNameVarBaseMap = std::unordered_map; -template -static T PyObjectCast(PyObject *obj) { - try { - return py::cast(py::handle(obj)); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Python object is not type of %s", typeid(T).name())); - } -} - // NOTE(zjl): py::handle is a very light wrapper of PyObject *. // Unlike py::object, py::handle does not change reference count of PyObject *. static std::vector> @@ -1023,6 +1073,23 @@ void BindImperative(py::module *m_ptr) { } }, py::call_guard()) + .def("_register_grad_hook", + [](imperative::VarBase &self, const py::handle &hook) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot register hook on a tensor without gradient.")); + return self.GradVarBase()->AddHook( + std::make_shared(hook.ptr())); + }) + .def("_remove_grad_hook", + [](imperative::VarBase &self, int64_t hook_id) { + PADDLE_ENFORCE_EQ( + self.HasGradVar(), true, + platform::errors::InvalidArgument( + "Cannot remove hook on a tensor without gradient.")); + return self.GradVarBase()->RemoveHook(hook_id); + }) .def("cpu", [](const std::shared_ptr &self) { if (platform::is_cpu_place(self->Place())) { @@ -1231,22 +1298,28 @@ void BindImperative(py::module *m_ptr) { &imperative::VarBase::SetOverridedStopGradient) .def_property("persistable", &imperative::VarBase::Persistable, &imperative::VarBase::SetPersistable) - .def_property_readonly( - "shape", - [](imperative::VarBase &self) { - if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().dims()); - } else if (self.Var().IsType()) { - return framework::vectorize( - self.Var().Get().value().dims()); - } else { - VLOG(2) << "It is meaningless to get shape of " - "variable type " - << GetTypeName(self); - return std::vector(); - } - }) + .def_property_readonly("shape", + [](imperative::VarBase &self) { + if (self.Var().IsType()) { + return framework::vectorize( + self.Var() + .Get() + .dims()); + } else if (self.Var() + .IsType< + framework::SelectedRows>()) { + return framework::vectorize( + self.Var() + .Get() + .value() + .dims()); + } else { + VLOG(2) << "It is meaningless to get shape of " + "variable type " + << GetTypeName(self); + return std::vector(); + } + }) .def_property_readonly("is_leaf", &imperative::VarBase::IsLeaf, R"DOC( Whether a Tensor is leaf Tensor. diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index ac0944c5718..e565552632f 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -14,6 +14,8 @@ import inspect import numpy as np +import warnings +import weakref import paddle from .. 
import framework @@ -26,6 +28,34 @@ from .parallel import scale_loss from paddle.fluid.data_feeder import convert_dtype, _PADDLE_DTYPE_2_NUMPY_DTYPE +class TensorHookRemoveHelper(object): + """ + A helper class that for removing Tensor gradient's hook. + """ + + def __init__(self, tensor, hook_id): + self._tensor_ref = weakref.ref(tensor) + self._hook_id = hook_id + + def remove(self): + """ + Remove reference Tensor's hook. + + Returns: + bool: Return True if removed successfully + """ + tensor = self._tensor_ref() + if tensor is not None: + res = tensor._remove_grad_hook(self._hook_id) + if res is True: + return True + else: + warnings.warn( + "The backward hook (ID: %d) of Tensor `%s` you want to remove does not exist or has been removed." + % (self._hook_id, tensor.name), RuntimeWarning) + return False + + def monkey_patch_varbase(): @switch_to_static_graph def _to_static_var(self, to_parameter=False, **kwargs): @@ -211,6 +241,73 @@ def monkey_patch_varbase(): else: return np.array(new_ivar.value().get_tensor()) + @framework.dygraph_only + def register_hook(self, hook): + """ + Registers a backward hook for current Tensor. + + The hook will be called every time the gradient Tensor of current Tensor is computed. + + The hook should not modify the input gradient Tensor, but it can optionally return + a new gradient Tensor which will be used in place of current Tensor's gradient. + + The hook should have the following signature: + + hook(grad) -> Tensor or None + + Args: + hook(function): A backward hook to be registered for Tensor.grad + + Returns: + TensorHookRemoveHelper: A helper object that can be used to remove the registered hook by calling `remove()` method. + + Examples: + .. code-block:: python + + import paddle + + # hook function return None + def print_hook_fn(grad): + print(grad) + + # hook function return Tensor + def double_hook_fn(grad): + grad = grad * 2 + return grad + + x = paddle.to_tensor([0., 1., 2., 3.], stop_gradient=False) + y = paddle.to_tensor([4., 5., 6., 7.], stop_gradient=False) + z = paddle.to_tensor([1., 2., 3., 4.]) + + # one Tensor can register multiple hooks + h = x.register_hook(print_hook_fn) + x.register_hook(double_hook_fn) + + w = x + y + # register hook by lambda function + w.register_hook(lambda grad: grad * 2) + + o = z.matmul(w) + o.backward() + # print_hook_fn print content in backward + # Tensor(shape=[4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [2., 4., 6., 8.]) + + print("w.grad:", w.grad) # w.grad: [1. 2. 3. 4.] + print("x.grad:", x.grad) # x.grad: [ 4. 8. 12. 16.] + print("y.grad:", y.grad) # y.grad: [2. 4. 6. 8.] 
+ + # remove hook + h.remove() + """ + if self.stop_gradient is True: + raise RuntimeError( + "Cannot register hook on a tensor that stop gradient.") + + hook_id = self._register_grad_hook(hook) + helper = TensorHookRemoveHelper(self, hook_id) + return helper + @property def grad(self): """ @@ -316,7 +413,8 @@ def monkey_patch_varbase(): ("_to_static_var", _to_static_var), ("set_value", set_value), ("block", block), ("backward", backward), ("clear_grad", clear_grad), ("inplace_version", inplace_version), ("grad", grad), - ("gradient", gradient), ("__str__", __str__), ("__repr__", __str__), + ("gradient", gradient), ("register_hook", register_hook), + ("__str__", __str__), ("__repr__", __str__), ("__deepcopy__", __deepcopy__), ("__module__", "paddle"), ("__name__", "Tensor")): setattr(core.VarBase, method_name, method) diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py new file mode 100644 index 00000000000..a390dd9d807 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -0,0 +1,413 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.nn as nn + + +class SimpleNet(nn.Layer): + def __init__(self, in_size, out_size): + super(SimpleNet, self).__init__() + self.linear1 = nn.Linear(in_size, in_size) + self.linear2 = nn.Linear(in_size, out_size) + + def forward(self, x, hook=None, register=False, remove=False): + ret1 = self.linear1(x) + if hook is not None: + if register: + h = ret1.register_hook(hook) + if remove: + h.remove() + ret2 = self.linear2(ret1) + out = paddle.mean(ret2, axis=-1) + return ret1, out + + +class TestTensorRegisterHook(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.in_size = 10 + self.out_size = 10 + self.batch_size = 4 + self.devices = ["cpu"] + if paddle.is_compiled_with_cuda(): + self.devices.append("gpu") + + def test_hook_for_interior_var(self): + def run_double_hook_for_interior_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(double_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # z.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + # w.grad is not changed by hook + self.assertTrue(np.array_equal(w.grad, z.numpy())) + # x.grad and y.grad are changed if run hook + self.assertTrue( + np.array_equal(x.grad, + z.numpy() * 2 if not removed else z.numpy())) + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + def 
run_print_hook_for_interior_var(print_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + helper = w.register_hook(print_hook) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # all grads are not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + self.assertTrue(np.array_equal(y.grad, z.numpy())) + + def double_hook(grad): + grad = grad * 2 + print(grad) + return grad + + def print_hook(grad): + print(grad) + + # register hook + run_double_hook_for_interior_var(double_hook) + # register hook and removed + run_double_hook_for_interior_var(double_hook, removed=True) + + # register hook + run_double_hook_for_interior_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_interior_var(lambda grad: grad * 2, removed=True) + + # register hook + run_print_hook_for_interior_var(print_hook) + # register hook and removed + run_print_hook_for_interior_var(print_hook, removed=True) + + def test_hook_for_leaf_var(self): + def run_double_hook_for_leaf_var(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + helper = y.register_hook(double_hook) + + w = x + y + w.stop_gradient = False + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + # remove hook before backward + if removed: + helper.remove() + + o.backward() + + # z.grad, w.grad, x.grad is not affected + self.assertTrue(np.array_equal(z.grad, w.numpy())) + self.assertTrue(np.array_equal(w.grad, z.numpy())) + self.assertTrue(np.array_equal(x.grad, z.numpy())) + # y.grad are changed if run hook + self.assertTrue( + np.array_equal(y.grad, + z.numpy() * 2 if not removed else z.numpy())) + + # register hook + run_double_hook_for_leaf_var(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) + + def test_hook_for_accumulated_grad(self): + def run_double_hook_for_accumulated_grad(double_hook, removed=False): + for device in self.devices: + paddle.set_device(device) + + a = paddle.to_tensor([0., 1., 1., 2.]) + b = paddle.to_tensor([0., 0., 1., 2.]) + a.stop_gradient = False + b.stop_gradient = False + + helper1 = a.register_hook(double_hook) + + x = a + b + x.stop_gradient = False + + helper2 = x.register_hook(double_hook) + + y = paddle.to_tensor([4., 5., 6., 7.]) + z = paddle.to_tensor([1., 2., 3., 4.]) + y.stop_gradient = False + z.stop_gradient = False + + o1 = x + y + o2 = x + z + o1.stop_gradient = False + o2.stop_gradient = False + + o = o1.matmul(o2) + + # remove hook before backward + if removed: + helper1.remove() + helper2.remove() + + o.backward() + + base_grad = np.array([5., 9., 13., 19.]) + # x.grad is not changed + self.assertTrue(np.array_equal(x.grad, base_grad)) + # b.grad is changed by x.hook + self.assertTrue( + np.array_equal(b.grad, base_grad * 2 + if not removed else base_grad)) + # a.grad is changed by x.hook and a.hook + self.assertTrue( + np.array_equal(a.grad, base_grad * 4 + if not 
removed else base_grad)) + + # register hook + run_double_hook_for_accumulated_grad(lambda grad: grad * 2) + # register hook and removed + run_double_hook_for_accumulated_grad( + lambda grad: grad * 2, removed=True) + + def test_hook_in_model(self): + def run_double_hook_in_model(data, + label, + hook=None, + register=False, + remove=False): + for device in self.devices: + paddle.seed(self.seed) + paddle.set_device(device) + + net = SimpleNet(self.in_size, self.out_size) + loss_fn = nn.MSELoss() + + data = paddle.to_tensor(data) + label = paddle.to_tensor(label) + + ret1, out = net(data, hook, register, remove) + loss = loss_fn(out, label) + loss.backward() + + return ret1.grad, net.linear1.weight.grad, net.linear1.bias.grad + + data = np.random.uniform( + size=[self.batch_size, self.in_size]).astype('float32') + label = np.random.uniform(size=[self.batch_size, 1]).astype('float32') + + # get original value + ret1_grad, linear1_w_grad, linear1_b_grad = run_double_hook_in_model( + data, label) + # get value changed by hook + ret1_grad_hook, linear1_w_grad_hook, linear1_b_grad_hook = run_double_hook_in_model( + data, label, lambda grad: grad * 2, True) + # get value after removing hook + ret1_grad_rm, linear1_w_grad_rm, linear1_b_grad_rm = run_double_hook_in_model( + data, label, lambda grad: grad * 2, True, True) + + # compare original value and with hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_hook)) + self.assertTrue(np.array_equal(linear1_w_grad * 2, linear1_w_grad_hook)) + self.assertTrue(np.array_equal(linear1_b_grad * 2, linear1_b_grad_hook)) + + # compare original value and remove hook + self.assertTrue(np.array_equal(ret1_grad, ret1_grad_rm)) + self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) + self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) + + def test_multiple_hooks_for_interior_var(self): + def run_multiple_hooks_for_interior_var(device, + hooks, + remove1=False, + remove2=False, + remove3=False): + paddle.set_device(device) + + x = paddle.to_tensor([0., 1., 2., 3.]) + y = paddle.to_tensor([4., 5., 6., 7.]) + x.stop_gradient = False + y.stop_gradient = False + + w = x + y + w.stop_gradient = False + + helpers = [] + for hook in hooks: + helper = w.register_hook(hook) + helpers.append(helper) + + z = paddle.to_tensor([1., 2., 3., 4.]) + z.stop_gradient = False + + o = z.matmul(w) + + if remove1: + helpers[0].remove() + if remove2: + helpers[1].remove() + if remove3: + helpers[2].remove() + + o.backward() + + return z.numpy(), w.grad, x.grad, y.grad + + def double_hook(grad): + return grad * 2 + + hooks = [double_hook, double_hook, double_hook] + + for device in self.devices: + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 8)) + self.assertTrue(np.array_equal(y_grad, z * 8)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove2=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + 
self.assertTrue(np.array_equal(x_grad, z * 4)) + self.assertTrue(np.array_equal(y_grad, z * 4)) + + z, w_grad, x_grad, y_grad = run_multiple_hooks_for_interior_var( + device, hooks, remove1=True, remove2=True, remove3=True) + + self.assertTrue(np.array_equal(w_grad, z)) + self.assertTrue(np.array_equal(x_grad, z)) + self.assertTrue(np.array_equal(y_grad, z)) + + def test_hook_in_double_grad(self): + def double_print_hook(grad): + grad = grad * 2 + print(grad) + return grad + + x = paddle.ones(shape=[1], dtype='float32') + x.stop_gradient = False + + # hook only works in backward + # for forward var x, the x.grad generated in + # paddle.grad will not deal with by hook + x.register_hook(double_print_hook) + + y = x * x + + # Since y = x * x, dx = 2 * x + dx = paddle.grad( + outputs=[y], inputs=[x], create_graph=True, retain_graph=True)[0] + + z = y + dx + self.assertTrue(x.grad is None) + + # If create_graph = True, the gradient of dx + # would be backpropagated. Therefore, + # z = x * x + dx = x * x + 2 * x, and + # x.gradient() = 2 * x + 2 = 4.0 + # after changed by hook: 8.0 + + z.backward() + self.assertTrue(np.array_equal(x.grad, np.array([8.]))) + + def test_remove_one_hook_multiple_times(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + x.stop_gradient = False + + h = x.register_hook(lambda grad: grad * 2) + self.assertTrue(h.remove()) + self.assertFalse(h.remove()) + + def test_register_hook_for_stop_gradient_var(self): + for device in self.devices: + paddle.set_device(device) + + x = paddle.to_tensor([1., 2., 3., 4.]) + + with self.assertRaises(RuntimeError): + x.register_hook(lambda grad: grad * 2) + + +if __name__ == '__main__': + unittest.main() -- GitLab From 07741593fadbda741f8b0f9935add64901b93f3b Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Thu, 1 Apr 2021 11:19:43 +0800 Subject: [PATCH 126/486] new group (#31682) * new group * ci compatible fix * assert nccl --- paddle/fluid/imperative/bkcl_context.cc | 29 +- paddle/fluid/imperative/bkcl_context.h | 2 + paddle/fluid/imperative/nccl_context.cc | 24 + paddle/fluid/imperative/nccl_context.h | 2 + paddle/fluid/imperative/parallel_context.h | 2 + .../collective/c_sync_calc_stream_op.cc | 70 +-- .../collective/c_sync_comm_stream_op.cc | 74 ++-- paddle/fluid/pybind/imperative.cc | 10 +- paddle/fluid/pybind/op_function_generator.cc | 2 + python/paddle/distributed/collective.py | 411 ++++++++++++++---- .../fluid/tests/unittests/CMakeLists.txt | 5 + .../collective_allreduce_new_group_api.py | 56 +++ .../paddle/fluid/tests/unittests/new_group.py | 83 ++++ .../fluid/tests/unittests/test_new_group.sh | 19 + .../tests/unittests/test_new_group_api.py | 35 ++ 15 files changed, 670 insertions(+), 154 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py create mode 100644 python/paddle/fluid/tests/unittests/new_group.py create mode 100755 python/paddle/fluid/tests/unittests/test_new_group.sh create mode 100644 python/paddle/fluid/tests/unittests/test_new_group_api.py diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 873068a0d31..886179feb19 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -19,12 +19,11 @@ #include #include +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/bkcl_helper.h" #include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/gen_comm_id_helper.h" - 
-#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" #include "paddle/fluid/string/string_helper.h" @@ -77,7 +76,7 @@ void BKCLParallelContext::Init() { bkcl_ids.resize(strategy_.nrings_); if (strategy_.local_rank_ == 0) { - // generate the unique ncclid on the root worker + // generate the unique bkclid on the root worker for (size_t i = 0; i < bkcl_ids.size(); ++i) { auto ret = bkcl_get_unique_id(&bkcl_ids[i]); PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, @@ -99,6 +98,28 @@ void BKCLParallelContext::Init() { } } +void BKCLParallelContext::InitWithRingID(int ring_id) { + std::vector bkcl_ids; + bkcl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique bkclid on the root worker + auto ret = bkcl_get_unique_id(&bkcl_ids[0]); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "BKCL get unique id failed [%d]", ret)); + } + BcastBKCLId(bkcl_ids, 0); + + int xpu_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; + VLOG(0) << "init BKCL context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " xpu id: " << xpu_id + << " ring id: " << ring_id; + // it will assign bkcl_comm in XPUDeviceContext within ring_id + platform::BKCLCommContext::Instance().CreateBKCLComm( + &bkcl_ids[0], strategy_.nranks_, strategy_.local_rank_, xpu_id, ring_id); +} + void BKCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index d7d917f2008..86e4d97b3c7 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -36,6 +36,8 @@ class BKCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index eb0135d15e0..7e7c4ceea0b 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -79,6 +79,30 @@ void NCCLParallelContext::Init() { } } +void NCCLParallelContext::InitWithRingID(int ring_id) { + std::vector nccl_ids; + nccl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique ncclid on the root worker + platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } + BcastNCCLId(nccl_ids, 0); + + int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; + VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " gpu id: " << gpu_id + << " ring id: " << ring_id; + // it will assign nccl_comm in CUDADeviceContext within ring_id + platform::NCCLCommContext::Instance().CreateNCCLComm( + &nccl_ids[0], strategy_.nranks_, strategy_.local_rank_, gpu_id, ring_id); + + compute_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); + comm_events_.emplace_back(platform::CudaEventResourcePool::Instance().New( + BOOST_GET_CONST(platform::CUDAPlace, place_).device)); +} + void NCCLParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { 
diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 51e5743aebd..292ef1661c3 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -53,6 +53,8 @@ class NCCLParallelContext : public ParallelContext { void Init() override; + void InitWithRingID(int ring_id) override; + void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) override; diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index ef0a9604092..9a76311f2ed 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ b/paddle/fluid/imperative/parallel_context.h @@ -50,6 +50,8 @@ class ParallelContext { virtual void Init() = 0; + virtual void InitWithRingID(int ring_id) = 0; + virtual void AllReduceByStream(const framework::Variable& src, framework::Variable* dst, int ring_id, bool use_calc_stream) = 0; diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index c4abe284d72..700d1173e2f 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -15,40 +15,20 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - namespace paddle { namespace operators { -class CSyncCalcStreamOp : public framework::OperatorBase { +class CSyncCalcStreamOp : public framework::OperatorWithKernel { public: - CSyncCalcStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) - auto dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -65,10 +45,36 @@ Call calculation stream synchronization. 
} }; +template +class CSyncCalcStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) + + auto place = ctx.GetPlace(); + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(dev_ctx->stream())); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_calc_stream, ops::CSyncCalcStreamOp, - ops::CSyncCalcStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, + ops::CSyncCalcStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamCudaKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index adf27069f52..95b9cd040fe 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -14,45 +14,25 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" #endif namespace paddle { namespace operators { -class CSyncCommStreamOp : public framework::OperatorBase { +class CSyncCommStreamOp : public framework::OperatorWithKernel { public: - CSyncCommStreamOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - PADDLE_ENFORCE_EQ(is_gpu_place(place), true, - platform::errors::PreconditionNotMet( - "Sync stream op can run on gpu place only for now.")); + using framework::OperatorWithKernel::OperatorWithKernel; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - int ring_id = Attr("ring_id"); - auto stream = - platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); -#ifdef PADDLE_WITH_RCCL - PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); -#else - PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); -#endif -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddlePaddle should compile with GPU.")); -#endif + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); } }; @@ -72,10 +52,38 @@ Call communication stream synchronization. 
} }; +template +class CSyncCommStreamCudaKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + + auto place = ctx.GetPlace(); + + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); + +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(c_sync_comm_stream, ops::CSyncCommStreamOp, - ops::CSyncCommStreamOpMaker); +REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, + ops::CSyncCommStreamOpMaker); + +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, + ops::CSyncCommStreamCudaKernel); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 38ba1dc0293..c1c1387a84c 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1578,7 +1578,10 @@ void BindImperative(py::module *m_ptr) { m, "NCCLParallelContext") .def(py::init()) - .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); }); + .def("init", [](imperative::NCCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::NCCLParallelContext::InitWithRingID, + py::arg("ring_id")); #endif #if defined(PADDLE_WITH_XPU_BKCL) @@ -1587,7 +1590,10 @@ void BindImperative(py::module *m_ptr) { m, "BKCLParallelContext") .def(py::init()) - .def("init", [](imperative::BKCLParallelContext &self) { self.Init(); }); + .def("init", [](imperative::BKCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::BKCLParallelContext::InitWithRingID, + py::arg("ring_id")); #endif } diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 69856fa4fa1..282b0e1d81c 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -119,6 +119,8 @@ std::map> op_passing_outs_map = { {"fill_constant", {"Out"}}, {"matmul", {"Out"}}, {"c_broadcast", {"Out"}}, + {"c_sync_calc_stream", {"Out"}}, + {"c_sync_comm_stream", {"Out"}}, {"c_allreduce_sum", {"Out"}}, {"c_allreduce_max", {"Out"}}, {"c_allreduce_min", {"Out"}}, diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index a6eb896802f..8e5c35995b2 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -26,6 +26,9 @@ import paddle.fluid as fluid import paddle.fluid.core as core __all__ = [ + 'wait', + 'new_group', + 'get_group', 'broadcast', 'all_reduce', 'reduce', @@ -75,30 +78,225 @@ class ReduceOp: PROD = 3 -class _Group(): - """The abstract representation of group.""" +class Group(): + """ + The abstract representation of group. 
+ """ - def __init__(self, rank, rank_num): + def __init__(self, rank, rank_num, id=0, ranks=[]): self.rank = rank self.nranks = rank_num + self.id = id + self.ranks = ranks + + def is_member(self): + if self.rank < 0: + return False + if self.nranks < 2: + return False + return True + + def get_group_rank(self, rank): + if self.id == 0: + return rank + if self.is_member() and rank in self.ranks: + return self.ranks.index(rank) + else: + return -1 + + +_global_env = None + + +def _get_global_env(): + global _global_env + if not _global_env: + _global_env = paddle.distributed.ParallelEnv() + return _global_env + + +# group map : the map of all group, 0 for GlobalGroup +# Dict[int, Group] +_group_map = {} + + +def _get_group_map(): + global _group_map + if not _group_map: + genv = _get_global_env() + _group_map[0] = Group(genv.rank, genv.world_size, 0) + return _group_map + + +def _get_global_group(): + return _get_group_map()[0] + + +def _new_ring_id(): + return len(_get_group_map()) + max(_get_global_env().nrings, 9) + + +def get_group(id=0): + """ + + Get group instance by group id. + + Args: + id (int): the group id + + Returns: + Group: the group instance. + + Examples: + .. code-block:: python + + ... + gid = paddle.distributed.new_group([2,4,6]) + paddle.distributed.get_group(gid.id) + + """ + + gm = _get_group_map() + return gm[group] if group in gm else None + + +def new_group(ranks=None, backend=None): + """ + + Creates a new distributed comminication group. + + Args: + ranks (list): The global ranks of group members, list as sorted. + backend (str): The backend used to create group, only nccl is supported now. + + Returns: + Group: The group instance. Nerver return None. + + Examples: + .. code-block:: python + + import numpy as np + import paddle + + paddle.distributed.init_parallel_env() + tindata = np.random.random([10, 1000]).astype('float32') + tindata = paddle.to_tensor(tindata) + gid = paddle.distributed.new_group([2,4,6]) + paddle.distributed.all_reduce(tindata, group=gid, use_calc_stream=False) + + """ + + if not backend: + backend = 'nccl' + assert backend == 'nccl', ("backend other than nccl is not supported yet") + + genv = _get_global_env() + global_rank = genv.rank + + ring_id = _new_ring_id() + + global _group_map + if global_rank not in ranks: + gp = Group(-1, -1, ring_id, ranks) + _group_map[ring_id] = gp + return gp + + ranks = sorted(ranks) + group_rank = ranks.index(global_rank) + group_size = len(ranks) + gp = Group(group_rank, group_size, ring_id, ranks) + _group_map[ring_id] = gp + + if group_size < 2: + return gp + + strategy = core.ParallelStrategy() + strategy.nranks = group_size + strategy.local_rank = group_rank + strategy.trainer_endpoints = [genv.trainer_endpoints[i] for i in ranks] + strategy.current_endpoint = genv.current_endpoint + strategy.nrings = 1 + + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(genv.device_id) + core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id) + else: + assert False + + return gp + +def wait(tensor, group=None, use_calc_stream=True): + """ + + wait to sync stream for group. + + Args: + tensor (Tensor): The Tensor used before sync. + group (Group): The Group instance to perform sync. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to False. + + Returns: + None. + + Examples: + .. 
code-block:: python + + + import numpy as np + import paddle + + paddle.distributed.init_parallel_env() + tindata = np.random.random([10, 1000]).astype('float32') + tindata = paddle.to_tensor(tindata) + paddle.distributed.all_reduce(tindata, use_calc_stream=True) + paddle.distributed.wait(tindata) + + """ + + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + + if use_calc_stream: + _sync_calc_stream(tensor) + else: + _sync_comm_stream(tensor, ring_id) + + +def _sync_calc_stream(tensor): + + if in_dygraph_mode(): + return core.ops.c_sync_calc_stream(tensor, tensor) + + op_type = 'c_sync_calc_stream' + + helper = LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, ) -# NOTE(chenweihang): Lazily initialized global group information -# If we initialize _default_group when import module, it will -# not update when we use spawn to run multi-process training -_default_group = None +def _sync_comm_stream(tensor, ring_id=0): -def _get_global_default_group(): - global _default_group - if _default_group is None: - _default_group = _Group( - int(os.getenv("PADDLE_TRAINER_ID", "0")), - int(os.getenv("PADDLE_TRAINERS_NUM", "1"))) - return _default_group + if in_dygraph_mode(): + return core.ops.c_sync_comm_stream([tensor], [tensor], 'ring_id', + ring_id) + op_type = 'c_sync_comm_stream' -def broadcast(tensor, src, group=0): + helper = LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={'ring_id': ring_id}, ) + + +def broadcast(tensor, src, group=None, use_calc_stream=True): """ Broadcast a tensor from the source to all others. @@ -107,7 +305,9 @@ def broadcast(tensor, src, group=0): tensor (Tensor): The Tensor to send if current rank is the source, or the tensor to receive otherwise. Its data type should be float16, float32, float64, int32 or int64. src (int): The source rank. - group (int): The process group to work on. It is Optional. + group (Group): The group instance return by new_group or None for global default group. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to True. Returns: None. 
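[Editor's note] A minimal usage sketch of the new `group` and `use_calc_stream` arguments, adapted from the docstrings in this patch; it assumes a launch with at least two GPU trainers:

    import numpy as np
    import paddle

    paddle.distributed.init_parallel_env()
    data = paddle.to_tensor(np.random.random([10, 1000]).astype('float32'))

    # run the broadcast on the communication stream of a sub-group ...
    gp = paddle.distributed.new_group([0, 1])
    paddle.distributed.broadcast(data, src=1, group=gp, use_calc_stream=False)

    # ... then block until that stream has finished before using the result
    paddle.distributed.wait(data, group=gp, use_calc_stream=False)
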
@@ -130,17 +330,25 @@ def broadcast(tensor, src, group=0): out = data.numpy() # [[1, 2, 3], [1, 2, 3]] """ + + if group is not None and not group.is_member(): + return + + if not isinstance(src, int): + raise ValueError("src should be int.") + + ring_id = 0 if group is None else group.id + gsrc = src if group is None else group.get_group_rank(src) + if in_dygraph_mode(): - return core.ops.c_broadcast(tensor, tensor, 'root', src, - 'use_calc_stream', True, 'ring_id', group) + return core.ops.c_broadcast(tensor, tensor, 'root', gsrc, + 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id) op_type = 'c_broadcast' check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], 'broadcast') - if not isinstance(src, int) or not isinstance(group, int): - raise ValueError("Both the type of 'src' and 'group' for broadcast " - "should be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( @@ -148,13 +356,13 @@ def broadcast(tensor, src, group=0): inputs={'X': [tensor]}, outputs={'Out': [tensor]}, attrs={ - 'root': src, - 'use_calc_stream': True, - 'ring_id': group, + 'root': gsrc, + 'use_calc_stream': use_calc_stream, + 'ring_id': ring_id, }) -def all_reduce(tensor, op=ReduceOp.SUM, group=0): +def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): """ Reduce a tensor over all ranks so that all get the result. @@ -163,7 +371,9 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type should be float16, float32, float64, int32 or int64. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. - group (int): Optional. The process group to work on. + group (Group): The group instance return by new_group or None for global default group. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to True. Returns: None. 
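For illustration, a hypothetical sketch (not part of this patch) of the asynchronous pattern the new all_reduce arguments enable: launch the collective on the communication stream, then synchronize explicitly with wait() before reading the result.

    import numpy as np
    import paddle

    paddle.distributed.init_parallel_env()
    gp = paddle.distributed.new_group([0, 1])
    data = paddle.to_tensor(np.array([1, 2, 3], dtype='float32'))
    # Run on the communication stream, then block until it has finished.
    paddle.distributed.all_reduce(data, group=gp, use_calc_stream=False)
    paddle.distributed.wait(data, group=gp, use_calc_stream=False)
    print(data.numpy())
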
@@ -187,19 +397,25 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): out = data.numpy() # [[5, 7, 9], [5, 7, 9]] """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + if in_dygraph_mode(): if op == ReduceOp.SUM: return core.ops.c_allreduce_sum(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group) + use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.MAX: return core.ops.c_allreduce_max(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group) + use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.MIN: return core.ops.c_allreduce_min(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group) + use_calc_stream, 'ring_id', ring_id) elif op == ReduceOp.PROD: return core.ops.c_allreduce_prod(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group) + use_calc_stream, 'ring_id', + ring_id) else: raise ValueError("Unknown parameter: {}.".format(op)) @@ -217,18 +433,18 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=0): op_type = 'c_allreduce_min' elif op == ReduceOp.PROD: op_type = 'c_allreduce_prod' - if not isinstance(group, int): - raise ValueError("The type of 'group' for all_reduce should be int.") + if not isinstance(ring_id, int): + raise ValueError("The type of 'ring_id' for all_reduce should be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( type=op_type, inputs={'X': [tensor]}, outputs={'Out': [tensor]}, - attrs={'ring_id': group, - 'use_calc_stream': True}) + attrs={'ring_id': ring_id, + 'use_calc_stream': use_calc_stream}) -def reduce(tensor, dst, op=ReduceOp.SUM, group=0): +def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): """ Reduce a tensor to the destination from all others. @@ -238,7 +454,9 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): should be float16, float32, float64, int32 or int64. dst (int): The destination rank id. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. - group (int): The id of the process group to work on. + group (Group): The group instance return by new_group or None for global default group. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to True. Returns: None. 
@@ -261,20 +479,32 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): out = data.numpy() # [[5, 7, 9], [5, 7, 9]] """ + if group is not None and not group.is_member(): + return + + if not isinstance(dst, int): + raise ValueError("dst should be int.") + + ring_id = 0 if group is None else group.id + gdst = dst if group is None else group.get_group_rank(dst) + if in_dygraph_mode(): if op == ReduceOp.SUM: return core.ops.c_reduce_sum(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group, 'root_id', dst) + use_calc_stream, 'ring_id', ring_id, + 'root_id', gdst) elif op == ReduceOp.MAX: return core.ops.c_reduce_max(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group, 'root_id', dst) + use_calc_stream, 'ring_id', ring_id, + 'root_id', gdst) elif op == ReduceOp.MIN: return core.ops.c_reduce_min(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group, 'root_id', dst) + use_calc_stream, 'ring_id', ring_id, + 'root_id', gdst) elif op == ReduceOp.PROD: return core.ops.c_reduce_prod(tensor, tensor, 'use_calc_stream', - True, 'ring_id', group, 'root_id', - dst) + use_calc_stream, 'ring_id', ring_id, + 'root_id', gdst) else: raise ValueError("Unknown parameter: {}.".format(op)) @@ -295,22 +525,19 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=0): elif op == ReduceOp.PROD: op_type = 'c_reduce_prod' - if not isinstance(dst, int) or not isinstance(group, int): - raise ValueError("Both the type of 'dst' and 'group' for reduce " - "should be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( type=op_type, inputs={'X': [tensor]}, outputs={'Out': [tensor]}, attrs={ - 'ring_id': group, - 'use_calc_stream': True, - 'root_id': dst, + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'root_id': gdst, }) -def all_gather(tensor_list, tensor, group=0): +def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): """ Gather tensors from all participators and all get the result. @@ -320,7 +547,9 @@ def all_gather(tensor_list, tensor, group=0): should be float16, float32, float64, int32 or int64. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32 or int64. - group (int): The id of the process group to work on. + group (Group): The group instance return by new_group or None for global default group. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to True. Returns: None. 
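For illustration, a hypothetical sketch (not part of this patch) of all_gather with a sub-group: the call appends group.nranks tensors to tensor_list, one per member, ordered by the members' sorted global ranks.

    import numpy as np
    import paddle

    paddle.distributed.init_parallel_env()
    gp = paddle.distributed.new_group([0, 1])
    result = []
    data = paddle.to_tensor(np.array([1, 2, 3], dtype='float32'))
    paddle.distributed.all_gather(result, data, group=gp)
    # result now holds gp.nranks tensors, one contribution per group member.
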
@@ -348,13 +577,19 @@ def all_gather(tensor_list, tensor, group=0): data2 = paddle.to_tensor(np_data2) paddle.distributed.all_gather(tensor_list, data2) """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + nranks = _get_global_group().nranks if group is None else group.nranks + op_type = 'c_allgather' helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - _default_group = _get_global_default_group() + if in_dygraph_mode(): - core.ops.c_allgather(tensor, out, 'use_calc_stream', True, 'ring_id', - group, 'nranks', _default_group.nranks) + core.ops.c_allgather(tensor, out, 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'nranks', nranks) else: if not isinstance(tensor_list, list): raise ValueError("The type of 'tensor_list' for all_gather " @@ -367,23 +602,20 @@ def all_gather(tensor_list, tensor, group=0): check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], 'all_gather') - if not isinstance(group, int): - raise ValueError("The type of 'group' for all_gather " - "should be int.") helper.append_op( type=op_type, inputs={'X': [tensor]}, outputs={'Out': [out]}, attrs={ - 'ring_id': group, - 'use_calc_stream': True, - 'nranks': _default_group.nranks + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'nranks': nranks }) - tensor_list.extend(paddle.split(out, _default_group.nranks, 0)) + tensor_list.extend(paddle.split(out, nranks, 0)) -def scatter(tensor, tensor_list=None, src=0, group=0): +def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): """ Scatter a tensor to all participators. @@ -394,7 +626,9 @@ def scatter(tensor, tensor_list=None, src=0, group=0): tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type should be float16, float32, float64, int32 or int64. src (int): The source rank id. - group (int): The id of the process group to work on. + group (Group): The group instance return by new_group or None for global default group. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), + default to True. Returns: None. 
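For illustration, a hypothetical sketch (not part of this patch) of scatter within a sub-group: tensor_list is only read on the source rank, and its entries are handed out in group-rank order, so group rank 0 receives tensor_list[0].

    import numpy as np
    import paddle

    paddle.distributed.init_parallel_env()
    gp = paddle.distributed.new_group([0, 1])
    d1 = paddle.to_tensor(np.array([1, 2, 3], dtype='float32'))
    d2 = paddle.to_tensor(np.array([4, 5, 6], dtype='float32'))
    # Only the source (global rank 1 here) needs a meaningful tensor_list;
    # the other members pass their receive buffer and get one slice back.
    paddle.distributed.scatter(d1, tensor_list=[d1, d2], src=1, group=gp)
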
@@ -422,45 +656,51 @@ def scatter(tensor, tensor_list=None, src=0, group=0): paddle.distributed.scatter(data1, tensor_list=[data1, data2], src=1) out = data1.numpy() """ + if group is not None and not group.is_member(): + return + + if not isinstance(src, int): + raise ValueError("src should be int.") + + ring_id = 0 if group is None else group.id + gsrc = src if group is None else group.get_group_rank(src) + rank = _get_global_group().rank if group is None else group.rank + nranks = _get_global_group().nranks if group is None else group.nranks + op_type = 'c_scatter' - _default_group = _get_global_default_group() - rank = _default_group.rank - nranks = _default_group.nranks - if rank != src: + + if rank != gsrc: tensor_list = [] for _ in range(nranks): tensor_list.append(tensor) temp = paddle.concat(tensor_list, axis=0) if in_dygraph_mode(): - return core.ops.c_scatter(temp, tensor, 'use_calc_stream', True, - 'ring_id', group, 'nranks', - _default_group.nranks, 'root', src) + return core.ops.c_scatter(temp, tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, 'nranks', + nranks, 'root', gsrc) check_variable_and_dtype( tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], 'scatter') - if not isinstance(group, int) or not isinstance(src, int): - raise ValueError("Both the type of 'src' and 'group' for scatter " - "should be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( type=op_type, inputs={'X': [temp]}, outputs={'Out': [tensor]}, attrs={ - 'ring_id': group, - 'root': src, - 'use_calc_stream': True, + 'ring_id': ring_id, + 'root': gsrc, + 'use_calc_stream': use_calc_stream, 'nranks': nranks, }) -def barrier(group=0): +def barrier(group=None): """ Barrier among all participators in the group. Args: - group (int): The id of the process group to work on. + group (Group): The group instance return by new_group or None for global default group. Returns: None. 
@@ -475,18 +715,23 @@ def barrier(group=0): init_parallel_env() paddle.distributed.barrier() """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + op_type = 'barrier' temp = fill_constant([1], dtype="int32", value="1") if in_dygraph_mode(): - return core.ops.barrier(temp, temp, 'ring_id', group) - if not isinstance(group, int): + return core.ops.barrier(temp, temp, 'ring_id', ring_id) + if not isinstance(ring_id, int): raise ValueError("The type of 'group' for barrier must be int.") helper = LayerHelper(op_type, **locals()) helper.append_op( type=op_type, inputs={'X': [temp]}, outputs={'Out': [temp]}, - attrs={'ring_id': group}) + attrs={'ring_id': ring_id}) def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, @@ -515,10 +760,10 @@ def _parallel_linear(x, num_rows, num_cols, axis, param_attr, bias_attr, if gather_out: if axis == 0: - paddle.distributed.all_reduce(linear_out, group=0) + paddle.distributed.all_reduce(linear_out) else: output = [] - paddle.distributed.all_gather(output, linear_out, group=0) + paddle.distributed.all_gather(output, linear_out) linear_out = paddle.concat(output, axis=len(linear_out.shape) - 1) return linear_out @@ -559,7 +804,7 @@ def _parallel_embedding(x, per_part_embeddings, origin_size, param_attr, main_block = paddle.static.default_main_program().global_block() startup_block.vars[embedding.weight.name].is_distributed = True main_block.vars[embedding.weight.name].is_distributed = True - paddle.distributed.all_reduce(emb_out, group=0) + paddle.distributed.all_reduce(emb_out, group=None) return emb_out @@ -584,7 +829,7 @@ def split(x, With parallel embedding, the weight is split into num_partitions partitions, each of which is a matrix with (N/num_partitions + 1) rows and M column where the last row as the padding idx. - + Suppose we split the NxM weight into two partitons on device_0 and device_1 respectively. Then, one each device, the final weight has (N/2 + 1) rows with the index range from 0 to N/2. 
On device_0, all values in the input within [0, N/2 -1] diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0c292d355dd..0abb61d95aa 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -82,6 +82,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api) LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api) LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api) + LIST(REMOVE_ITEM TEST_OPS test_new_group_api) LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api) LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api) LIST(REMOVE_ITEM TEST_OPS test_collective_wait) @@ -177,6 +178,7 @@ endif() if ((NOT WITH_NCCL) AND (NOT WITH_RCCL)) list(REMOVE_ITEM TEST_OPS test_imperative_group) + LIST(REMOVE_ITEM TEST_OPS test_new_group_api) endif() if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) @@ -518,6 +520,7 @@ if(WITH_DISTRIBUTE) if(WITH_GPU OR WITH_ROCM) bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) py_test_modules(test_launch_coverage MODULES test_launch_coverage) + bash_test_modules(test_new_group START_BASH test_new_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) @@ -831,6 +834,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE) set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) endif() @@ -853,6 +857,7 @@ if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) test_collective_barrier_api test_collective_reduce_api test_collective_allreduce_api + test_new_group_api test_collective_broadcast_api test_collective_allgather_api PROPERTIES LABELS "RUN_TYPE=DIST") diff --git a/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py new file mode 100644 index 00000000000..597765cfb98 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective_allreduce_new_group_api.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduceNewGroupAPI(TestCollectiveAPIRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program, rank): + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + gp = paddle.distributed.new_group([0, 1]) + paddle.distributed.all_reduce( + tindata, group=gp, use_calc_stream=False) + return [tindata] + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduceNewGroupAPI, "allreduce") diff --git a/python/paddle/fluid/tests/unittests/new_group.py b/python/paddle/fluid/tests/unittests/new_group.py new file mode 100644 index 00000000000..fb7beeee1df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/new_group.py @@ -0,0 +1,83 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import os +import paddle + + +class TestNewGroupAPI(object): + def __init__(self): + paddle.distributed.init_parallel_env() + d1 = np.array([1, 2, 3]) + d2 = np.array([2, 3, 4]) + self.tensor1 = paddle.to_tensor(d1) + self.tensor2 = paddle.to_tensor(d2) + + def test_all(self): + gp = paddle.distributed.new_group([0, 1]) + print("test new group api ok") + + tmp = np.array([0, 0, 0]) + result = paddle.to_tensor(tmp) + paddle.distributed.scatter( + result, [self.tensor2, self.tensor1], + src=0, + group=gp, + use_calc_stream=True) + if gp.rank == 0: + assert np.array_equal(result, self.tensor2) + elif gp.rank == 1: + assert np.array_equal(result, self.tensor1) + print("test scatter api ok") + + paddle.distributed.broadcast( + result, src=1, group=gp, use_calc_stream=True) + assert np.array_equal(result, self.tensor1) + print("test broadcast api ok") + + paddle.distributed.reduce(result, dst=0, group=gp, use_calc_stream=True) + if gp.rank == 0: + assert np.array_equal(result, + paddle.add(self.tensor1, self.tensor1)) + elif gp.rank == 1: + assert np.array_equal(result, self.tensor1) + print("test reduce api ok") + + paddle.distributed.all_reduce(result, use_calc_stream=True) + assert np.array_equal( + result, + paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) + print("test all_reduce api ok") + + paddle.distributed.wait(result, gp, use_calc_stream=True) + paddle.distributed.wait(result, gp, use_calc_stream=False) + print("test wait api ok") + + result = [] + paddle.distributed.all_gather( + result, self.tensor1, group=gp, use_calc_stream=True) + assert np.array_equal(result[0], self.tensor1) + assert np.array_equal(result[1], self.tensor1) + print("test all_gather api ok") + + paddle.distributed.barrier(group=gp) + print("test barrier api ok") + + return + + +if __name__ == "__main__": + gpt = TestNewGroupAPI() + gpt.test_all() diff --git a/python/paddle/fluid/tests/unittests/test_new_group.sh b/python/paddle/fluid/tests/unittests/test_new_group.sh new file mode 100755 index 00000000000..998ead8db32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_new_group.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 new_group.py diff --git a/python/paddle/fluid/tests/unittests/test_new_group_api.py b/python/paddle/fluid/tests/unittests/test_new_group_api.py new file mode 100644 index 00000000000..b9b80d3b431 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_new_group_api.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle + +from test_collective_api_base import TestDistBase + +paddle.enable_static() + + +class TestCollectiveAllreduceAPI(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_nccl(self): + self.check_with_place("collective_allreduce_new_group_api.py", + "allreduce", "nccl") + + +if __name__ == '__main__': + unittest.main() -- GitLab From 980227f9740b4f656e43bc99e0cc84a13185d5c1 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 1 Apr 2021 11:24:33 +0800 Subject: [PATCH 127/486] Support uint8_t for fill_constant_op (#31911) --- paddle/fluid/operators/fill_constant_op.cc | 1 + paddle/fluid/operators/fill_constant_op.cu.cc | 1 + paddle/fluid/operators/math/math_function.cc | 1 + paddle/fluid/operators/math/math_function.cu | 1 + python/paddle/fluid/layers/tensor.py | 13 +++++++------ .../fluid/tests/unittests/test_fill_constant_op.py | 8 +------- python/paddle/fluid/tests/unittests/test_full_op.py | 5 +---- 7 files changed, 13 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 8a96d057cbe..caa29309901 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR( REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, diff --git a/paddle/fluid/operators/fill_constant_op.cu.cc b/paddle/fluid/operators/fill_constant_op.cu.cc index 78c62a4053b..e784c20b8b8 100644 --- a/paddle/fluid/operators/fill_constant_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_op.cu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index a61b50faa75..5242d03c11c 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -51,6 +51,7 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index cc8925fcf8a..2b93cd92608 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -35,6 +35,7 @@ using complex128 = paddle::platform::complex128; template struct SetConstant; template struct SetConstant; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 84f99962e84..7458466b02f 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -635,7 +635,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): If ``shape`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``shape`` is an Tensor, it should be an 1-D Tensor with date type int32 or int64. dtype(np.dtype|str): Data type of the output Tensor which can - be float16, float32, float64, int32, int64. + be float16, float32, float64, uint8, int32, int64. value(bool|float|int|Tensor): The constant value used to initialize the Tensor to be created. If ``value`` is an Tensor, it should be an 1-D Tensor. force_cpu(bool, optional): data should be on CPU if it's true, default value is False. 
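For illustration, a hypothetical sketch (not part of this patch) of creating a uint8 constant once this change is in place; both the imperative API and the layers API are expected to route through the same fill_constant kernel registered above.

    import paddle

    # uint8 is now accepted by the dtype check extended in this patch.
    x = paddle.full(shape=[2, 3], fill_value=5, dtype='uint8')
    y = paddle.fluid.layers.fill_constant(shape=[2, 3], dtype='uint8', value=5)
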
@@ -673,7 +673,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): attrs = {'force_cpu': force_cpu} dtype = convert_dtype(dtype) if not isinstance(value, Variable): - if dtype in ['int64', 'int32']: + if dtype in ['uint8', 'int64', 'int32']: attrs['str_value'] = str(int(value)) attrs['value'] = int(value) else: @@ -686,7 +686,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): out = _varbase_creator(dtype=dtype) if isinstance(value, Variable): - if dtype in ['int64', 'int32']: + if dtype in ['uint8', 'int64', 'int32']: attrs['str_value'] = str(int(value.numpy().item(0))) else: attrs['str_value'] = str(float(value.numpy().item(0))) @@ -706,9 +706,10 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None, name=None): inputs['ValueTensor'] = value check_shape(shape) - check_dtype(dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'fill_constant') + check_dtype( + dtype, 'dtype', + ['bool', 'float16', 'float32', 'float64', 'uint8', 'int32', 'int64'], + 'fill_constant') check_type(shape, 'shape', (Variable, list, tuple), 'fill_constant') if out is not None: diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index c305f71aa53..0dd78ea53c2 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -375,15 +375,9 @@ class TestFillConstantOpError(unittest.TestCase): out=x1) # The argument dtype of fill_constant_op must be one of bool, float16, - #float32, float64, int32 or int64 + #float32, float64, uint8, int32 or int64 x2 = fluid.layers.data(name='x2', shape=[1], dtype="int32") - self.assertRaises( - TypeError, - fluid.layers.fill_constant, - shape=[1], - value=5, - dtype='uint8') self.assertRaises( TypeError, fluid.layers.fill_constant, diff --git a/python/paddle/fluid/tests/unittests/test_full_op.py b/python/paddle/fluid/tests/unittests/test_full_op.py index 2d850db7837..19944aba46d 100644 --- a/python/paddle/fluid/tests/unittests/test_full_op.py +++ b/python/paddle/fluid/tests/unittests/test_full_op.py @@ -84,10 +84,7 @@ class TestFullOpError(unittest.TestCase): TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint4') # The argument dtype of full must be one of bool, float16, - #float32, float64, int32 or int64 - - self.assertRaises( - TypeError, paddle.full, shape=[1], fill_value=5, dtype='uint8') + #float32, float64, uint8, int32 or int64 # The argument shape's type of full_op must be list, tuple or Variable. 
def test_shape_type(): -- GitLab From 4acc87beb2110e9966327dbd427e0cdc17e05841 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Thu, 1 Apr 2021 11:26:07 +0800 Subject: [PATCH 128/486] Optimize the perf of SameDimsAdd CUDA Kernel (#31872) --- .../elementwise/elementwise_add_op.cu | 88 +++++++++++++------ .../elementwise/elementwise_div_op.cu | 2 +- .../elementwise/elementwise_mul_op.cu | 2 +- .../elementwise/elementwise_op_function.cu.h | 86 ++++++++++++------ .../elementwise/elementwise_sub_op.cu | 2 +- 5 files changed, 125 insertions(+), 55 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 8de6416065d..68fd81f8264 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -24,7 +24,10 @@ namespace paddle { namespace operators { template -struct SameDimsElemwiseAdd { +struct SameDimsElemwiseAdd< + platform::CUDADeviceContext, T, + typename std::enable_if::value && + !std::is_same::value>::type> { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { @@ -36,38 +39,68 @@ struct SameDimsElemwiseAdd { } }; -template <> -struct SameDimsElemwiseAdd { +template +struct SameDimsElemwiseAdd< + platform::CUDADeviceContext, T, + typename std::enable_if::value || + std::is_same::value>::type> { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / - PADDLE_CUDA_THREAD_SIZE, - 1); + int vec_size = sizeof(float4) / sizeof(T); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - const half* x2 = - reinterpret_cast(x->data()); - const half* y2 = - reinterpret_cast(y->data()); - half* z2 = reinterpret_cast(z->data()); - SameDimsElemwiseAddCUDAKernel<<< - grid_size, block_size, 0, - ctx.template device_context().stream()>>>( - x2, y2, z2, size); + if (std::is_same::value) { + SameDimsElemwiseAddCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context() + .stream()>>>(x->data(), y->data(), z->data(), + size); + } else { + const half* x2 = + reinterpret_cast(x->data()); + const half* y2 = + reinterpret_cast(y->data()); + half* z2 = reinterpret_cast(z->data()); + SameDimsElemwiseAddCUDAKernel<<< + grid_size, block_size, 0, + ctx.template device_context() + .stream()>>>(x2, y2, z2, size); + } } }; template -static __global__ void SimpleElemwiseAddGradCUDAKernel(const T* dout, - int64_t size, T* dx, - T* dy) { - int col = blockIdx.x * blockDim.x + threadIdx.x; +static __global__ void SimpleElemwiseAddGradCUDAKernel( + const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = gridDim.x * blockDim.x; + int loop = size / vec_size; + int remainder = size % vec_size; + const float4* dout_vec = reinterpret_cast(dout); + float4* dx_vec = reinterpret_cast(dx); + float4* dy_vec = reinterpret_cast(dy); + float4 tmp_loop; + + for (int i = tid; i < loop; i += stride) { + tmp_loop = dout_vec[i]; + dx_vec[i] = tmp_loop; + dy_vec[i] = tmp_loop; + } - while (col < size) { - dx[col] = dout[col]; - dy[col] = dout[col]; - col += blockDim.x * 
gridDim.x; + if (tid == loop && remainder != 0) { + T tmp_rem; + while (remainder) { + int idx = size - remainder; + remainder--; + tmp_rem = dout[idx]; + dx[idx] = tmp_rem; + dy[idx] = tmp_rem; + } } } @@ -79,14 +112,17 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); auto size = x->numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); dim3 grid_size = - dim3((size + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); + dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); SimpleElemwiseAddGradCUDAKernel< T><<().stream()>>>( - dout->data(), size, dx->mutable_data(ctx.GetPlace()), + dout->data(), size, vec_size, dx->mutable_data(ctx.GetPlace()), dy->mutable_data(ctx.GetPlace())); } diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu index 96583d06571..0cf9294c9de 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu @@ -43,7 +43,7 @@ struct SameDimsElemwiseDiv { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index 5b598ab2d78..e01b5eb5fb7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -43,7 +43,7 @@ struct SameDimsElemwiseMul { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h index 6d5dcc4dd6f..8344b3d9838 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.cu.h @@ -18,7 +18,11 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#ifdef __HIPCC__ +#define PADDLE_CUDA_THREAD_SIZE 256 +#else #define PADDLE_CUDA_THREAD_SIZE 512 +#endif #ifdef PADDLE_WITH_CUDA #include @@ -158,32 +162,62 @@ inline DEVICE half2 half2_div(const half2& a, const half2& b) { #endif } -#define DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Func, expr, FP16Function) \ - template \ - __global__ void SameDimsElemwise##Func##CUDAKernel(const T* x, const T* y, \ - T* z, int64_t size) { \ - int col = blockIdx.x * blockDim.x + threadIdx.x; \ - while (col < size) { \ - z[col] = x[col] expr y[col]; \ - col += blockDim.x * gridDim.x; \ - } \ - } \ - template <> \ - inline __global__ void SameDimsElemwise##Func##CUDAKernel( \ - const half* x, const half* y, half* z, int64_t size) { \ - int start = threadIdx.x + blockDim.x * blockIdx.x; \ - int stride = blockDim.x * gridDim.x; \ - int n2 = size / 2; \ - const half2* x2 = reinterpret_cast(x); \ - const half2* y2 = reinterpret_cast(y); \ - half2* z2 = reinterpret_cast(z); \ - for (int i = start; i < n2; i += stride) { \ - z2[i] = FP16Function(x2[i], y2[i]); \ - } \ - if (start == 0 && (size % 2)) { \ - z[size - 1] = __float2half(__half2float(x[size - 1]) \ - expr __half2float(y[size - 1])); \ - } \ +#define DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Func, expr, FP16Function) \ + inline __global__ void SameDimsElemwise##Func##CUDAKernel( \ + const float* __restrict__ x, const float* __restrict__ y, float* z, \ + int64_t size) { \ + int tid = blockIdx.x * blockDim.x + threadIdx.x; \ + int stride = gridDim.x * blockDim.x; \ + int loop = size / 4; \ + int remainder = size % 4; \ + const float4* x_vec = reinterpret_cast(x); \ + const float4* y_vec = reinterpret_cast(y); \ + float4* z_vec = reinterpret_cast(z); \ + float4 x_f4, y_f4; \ + for (int i = tid; i < loop; i += stride) { \ + x_f4 = x_vec[i]; \ + y_f4 = y_vec[i]; \ + z_vec[i] = make_float4(x_f4.x expr y_f4.x, x_f4.y expr y_f4.y, \ + x_f4.z expr y_f4.z, x_f4.w expr y_f4.w); \ + } \ + if (tid == loop && remainder != 0) { \ + while (remainder) { \ + int idx = size - remainder; \ + remainder--; \ + z[idx] = x[idx] expr y[idx]; \ + } \ + } \ + } \ + inline __global__ void SameDimsElemwise##Func##CUDAKernel( \ + const half* __restrict__ x, const half* __restrict__ y, half* z, \ + int64_t size) { \ + int tid = blockIdx.x * blockDim.x + threadIdx.x; \ + int stride = gridDim.x * blockDim.x; \ + int loop = size / 8; \ + int remainder = size % 8; \ + const float4* x_vec = reinterpret_cast(x); \ + const float4* y_vec = reinterpret_cast(y); \ + float4* z_vec = reinterpret_cast(z); \ + float4 x_h8, y_h8, z_h8; \ + for (int i = tid; i < loop; i += stride) { \ + x_h8 = x_vec[i]; \ + y_h8 = y_vec[i]; \ + half2* x_h2 = reinterpret_cast(&x_h8); \ + half2* y_h2 = reinterpret_cast(&y_h8); \ + half2* z_h2 = reinterpret_cast(&z_h8); \ + z_h2[0] = FP16Function(x_h2[0], y_h2[0]); \ + z_h2[1] = FP16Function(x_h2[1], y_h2[1]); \ + z_h2[2] = FP16Function(x_h2[2], y_h2[2]); \ + z_h2[3] = FP16Function(x_h2[3], y_h2[3]); \ + z_vec[i] = z_h8; \ + } \ + if (tid == loop && remainder != 0) { \ + while (remainder) { \ + int idx = size - remainder; \ + remainder--; \ + z[idx] = __float2half(__half2float(x[idx]) expr __half2float(y[idx])); \ + } \ + } \ } DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Add, +, half2_add) DEFINE_SIMPLE_CUDA_BINARY_KERNEL(Sub, -, half2_sub) diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu 
b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 1996cc471ac..192999fd2ac 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -43,7 +43,7 @@ struct SameDimsElemwiseSub { const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z) { auto size = x->numel(); - dim3 grid_size = dim3(((size + 1) / 2 + PADDLE_CUDA_THREAD_SIZE - 1) / + dim3 grid_size = dim3(((size + 7) / 8 + PADDLE_CUDA_THREAD_SIZE - 1) / PADDLE_CUDA_THREAD_SIZE, 1); dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); -- GitLab From b807e4081ec8231ada96c1fd96aa0a93ebf9651d Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Thu, 1 Apr 2021 12:05:26 +0800 Subject: [PATCH 129/486] [Paddle-TRT] add anchor generator op plugin (#31730) * add anchor generator op plugin * add anchor generator unit_test * remove dbg info * remove redundant line * replace assertion with paddle enforce * dynamic plugin replaces assertion with paddle enforce * anchor generator support dynamic shape on spatial axis * anchor generator test with fp16, dynamic shape * add anchor generator test all * add back main * reduce test input size to not exceed the timelimit of ci * change super to InferencePassTest for python2 compatibility * reuse paddle operator anchor generator * move creator construct to header with default * add cuda ifdef * reduce line * change super to InferencePassTest for python2 compatibility * fix anchor generator fp16 serialize setting * split unittest from test_all * restrict anchor generator input format before version 7234 * anchor generator only support greater than trt7.1 * change min_graph_size to 2 * min_graph size to 3 if dynamic shape * reduce dynamic shape size to avoid trt search tactic too long to exceed time limit * remove anchor from fetch list * anchor generator support all trt version * fix memory not allocated but if serialized --- .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../tensorrt/convert/anchor_generator_op.cc | 79 +++ paddle/fluid/inference/tensorrt/op_teller.cc | 1 + .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../plugin/anchor_generator_op_plugin.cu | 566 ++++++++++++++++++ .../plugin/anchor_generator_op_plugin.h | 201 +++++++ .../detection/anchor_generator_op.cu | 13 +- .../operators/detection/anchor_generator_op.h | 13 + .../inference/test_trt_anchor_generator_op.py | 122 ++++ 10 files changed, 990 insertions(+), 8 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 21ef3b2312f..4b6c746d575 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1192,6 +1192,7 @@ USE_TRT_CONVERTER(scale); USE_TRT_CONVERTER(stack); USE_TRT_CONVERTER(clip); USE_TRT_CONVERTER(gather); +USE_TRT_CONVERTER(anchor_generator); USE_TRT_CONVERTER(yolo_box); USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 
3f792300942..3820ac5d7cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -6,6 +6,7 @@ nv_library(tensorrt_converter shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc gather_op.cc + anchor_generator_op.cc yolo_box_op.cc roi_align_op.cc affine_channel_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc new file mode 100644 index 00000000000..56aab9785c9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/anchor_generator_op.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* Anchor Generator Op */ +class AnchorGeneratorOpConverter : public OpConverter { + public: + void operator()(const paddle::framework::proto::OpDesc& op, + const paddle::framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a fluid anchor generator op to tensorrt plugin"; + framework::OpDesc op_desc(op, nullptr); + std::string input_name = op_desc.Input("Input").front(); + std::string anchor_name = op_desc.Output("Anchors").front(); + std::string variance_name = op_desc.Output("Variances").front(); + + auto* input = engine_->GetITensor(input_name); + const auto input_dims = input->getDimensions(); // C, H, W + std::vector output_names{anchor_name, variance_name}; + + const auto anchor_sizes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("anchor_sizes")); + const auto aspect_ratios = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("aspect_ratios")); + const auto stride = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("stride")); + const auto variances = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("variances")); + const auto offset = BOOST_GET_CONST(float, op_desc.GetAttr("offset")); + const int num_anchors = aspect_ratios.size() * anchor_sizes.size(); + bool is_dynamic = engine_->with_dynamic_shape(); + const auto height = input_dims.d[1]; + const auto width = input_dims.d[2]; + const int box_num = width * height * num_anchors; + const nvinfer1::DataType data_type = nvinfer1::DataType::kFLOAT; + + nvinfer1::IPluginV2* anchor_generator_plugin = nullptr; + if (is_dynamic) { + anchor_generator_plugin = new plugin::AnchorGeneratorPluginDynamic( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + num_anchors); + } else { + anchor_generator_plugin = new plugin::AnchorGeneratorPlugin( + data_type, anchor_sizes, aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); + } + + std::vector anchor_generator_inputs{input}; + auto* 
anchor_generator_layer = engine_->network()->addPluginV2( + anchor_generator_inputs.data(), anchor_generator_inputs.size(), + *anchor_generator_plugin); + + RreplenishLayerAndOutput(anchor_generator_layer, "anchor_generator", + output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(anchor_generator, AnchorGeneratorOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index c95912a931e..f4e7c334632 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -116,6 +116,7 @@ struct SimpleOpTypeSetTeller : public Teller { "affine_channel", "multiclass_nms", "nearest_interp", + "anchor_generator", }; }; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b4e948edd8a..1804e6c5571 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -5,6 +5,7 @@ nv_library(tensorrt_plugin instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu + anchor_generator_op_plugin.cu yolo_box_op_plugin.cu roi_align_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu new file mode 100644 index 00000000000..01ee86ceb48 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.cu @@ -0,0 +1,566 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +#include "paddle/fluid/operators/detection/anchor_generator_op.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#define PrepareParamsOnDevice() \ + constexpr int data_size = 4; \ + cudaMalloc(&anchor_sizes_device_, anchor_sizes_.size() * data_size); \ + cudaMalloc(&aspect_ratios_device_, aspect_ratios_.size() * data_size); \ + cudaMalloc(&stride_device_, stride_.size() * data_size); \ + cudaMalloc(&variances_device_, variances_.size() * data_size); \ + cudaMemcpy(anchor_sizes_device_, anchor_sizes_.data(), \ + anchor_sizes_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(aspect_ratios_device_, aspect_ratios_.data(), \ + aspect_ratios_.size() * data_size, cudaMemcpyHostToDevice); \ + cudaMemcpy(stride_device_, stride_.data(), stride_.size() * data_size, \ + cudaMemcpyHostToDevice); \ + cudaMemcpy(variances_device_, variances_.data(), \ + variances_.size() * data_size, cudaMemcpyHostToDevice); + +AnchorGeneratorPlugin::AnchorGeneratorPlugin( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + height_(height), + width_(width), + num_anchors_(num_anchors), + box_num_(box_num) { + // anchors must be float32, which is the generator proposals' input + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE(height_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts height " + "greater than 0, but receive height = %d.", + height_)); + PADDLE_ENFORCE_GE(width_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts width " + "greater than 0, but receive width = %d.", + width_)); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PADDLE_ENFORCE_GE(box_num_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts box_num " + "greater than 0, but receive box_num = %d.", + box_num_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPlugin::~AnchorGeneratorPlugin() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPlugin::AnchorGeneratorPlugin(const void* data, size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &height_); + DeserializeValue(&data, &length, &width_); + DeserializeValue(&data, 
&length, &num_anchors_); + DeserializeValue(&data, &length, &box_num_); + PrepareParamsOnDevice(); +} + +const char* AnchorGeneratorPlugin::getPluginType() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPlugin::getPluginVersion() const { return "1"; } + +int AnchorGeneratorPlugin::getNbOutputs() const { return 2; } + +nvinfer1::Dims AnchorGeneratorPlugin::getOutputDimensions( + int index, const nvinfer1::Dims* inputs, int nb_input_dims) { + nvinfer1::Dims dims{}; + dims.nbDims = 4; + dims.d[0] = height_; + dims.d[1] = width_; + dims.d[2] = num_anchors_; + dims.d[3] = 4; + return dims; +} + +bool AnchorGeneratorPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::TensorFormat format) const { + // static shape plugin can't support different type between input/out + // it may cause addition overhead in half mode + return (type == data_type_ && format == nvinfer1::TensorFormat::kLINEAR); +} + +size_t AnchorGeneratorPlugin::getWorkspaceSize(int max_batch_size) const { + return 0; +} + +template +int AnchorGeneratorPlugin::enqueue_impl(int batch_size, + const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + const int block = 512; + const int gen_anchor_grid = (box_num_ + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height_, width_, + offset_); + const int var_grid = (box_num_ * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num_ * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPlugin::enqueue(int batch_size, const void* const* inputs, + void** outputs, void* workspace, + cudaStream_t stream) { + return enqueue_impl(batch_size, inputs, outputs, workspace, stream); +} + +int AnchorGeneratorPlugin::initialize() { return 0; } + +void AnchorGeneratorPlugin::terminate() {} + +size_t AnchorGeneratorPlugin::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(height_); + serialize_size += SerializedSize(width_); + serialize_size += SerializedSize(num_anchors_); + serialize_size += SerializedSize(box_num_); + return serialize_size; +} + +void AnchorGeneratorPlugin::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, height_); + SerializeValue(&buffer, width_); + SerializeValue(&buffer, num_anchors_); + SerializeValue(&buffer, box_num_); +} + +void AnchorGeneratorPlugin::destroy() {} + +void AnchorGeneratorPlugin::setPluginNamespace(const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPlugin::getPluginNamespace() const { + return namespace_.c_str(); +} + +nvinfer1::DataType AnchorGeneratorPlugin::getOutputDataType( + int index, const nvinfer1::DataType* input_type, int nb_inputs) const { + return data_type_; +} + +bool AnchorGeneratorPlugin::isOutputBroadcastAcrossBatch( + int output_index, const bool* input_is_broadcast, int nb_inputs) const { + return true; +} + +bool AnchorGeneratorPlugin::canBroadcastInputAcrossBatch( + int input_index) const { + return false; +} + +void AnchorGeneratorPlugin::configurePlugin( + const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, const bool* input_is_broadcast, + const bool* output_is_broadcast, nvinfer1::PluginFormat float_format, + int max_batct_size) {} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPlugin::clone() const { + auto plugin = new AnchorGeneratorPlugin( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + height_, width_, num_anchors_, box_num_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +void AnchorGeneratorPluginCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const char* AnchorGeneratorPluginCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginCreator::getPluginName() const { + return "anchor_generator_plugin"; +} + +const char* AnchorGeneratorPluginCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int height = -1, width = -1; + int num_anchors = -1; + int box_num = -1; + + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("height")) { + height = *static_cast(fc->fields[i].data); + } else if (field_name.compare("width")) { + width = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else if (field_name.compare("box_num")) { + box_num = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new 
AnchorGeneratorPlugin(nvinfer1::DataType::kFLOAT, anchor_sizes, + aspect_ratios, stride, variances, offset, + height, width, num_anchors, box_num); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPlugin(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +#if IS_TRT_VERSION_GE(6000) +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic( + const nvinfer1::DataType data_type, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, + const int num_anchors) + : data_type_(data_type), + anchor_sizes_(anchor_sizes), + aspect_ratios_(aspect_ratios), + stride_(stride), + variances_(variances), + offset_(offset), + num_anchors_(num_anchors) { + // data_type_ is used to determine the output data type + // data_type_ can only be float32 + // height, width, num_anchors are calculated at configurePlugin + PADDLE_ENFORCE_EQ(data_type_, nvinfer1::DataType::kFLOAT, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts float32.")); + PADDLE_ENFORCE_GE( + num_anchors_, 0, + platform::errors::InvalidArgument( + "TRT anchor generator plugin only accepts number of anchors greater " + "than 0, but receive number of anchors = %d.", + num_anchors_)); + PrepareParamsOnDevice(); +} + +AnchorGeneratorPluginDynamic::~AnchorGeneratorPluginDynamic() { + auto release_device_ptr = [](void* ptr) { + if (ptr) { + cudaFree(ptr); + ptr = nullptr; + } + }; + release_device_ptr(anchor_sizes_device_); + release_device_ptr(aspect_ratios_device_); + release_device_ptr(stride_device_); + release_device_ptr(variances_device_); +} + +AnchorGeneratorPluginDynamic::AnchorGeneratorPluginDynamic(void const* data, + size_t length) { + DeserializeValue(&data, &length, &data_type_); + DeserializeValue(&data, &length, &anchor_sizes_); + DeserializeValue(&data, &length, &aspect_ratios_); + DeserializeValue(&data, &length, &stride_); + DeserializeValue(&data, &length, &variances_); + DeserializeValue(&data, &length, &offset_); + DeserializeValue(&data, &length, &num_anchors_); + PrepareParamsOnDevice(); +} + +nvinfer1::IPluginV2DynamicExt* AnchorGeneratorPluginDynamic::clone() const { + auto plugin = new AnchorGeneratorPluginDynamic( + data_type_, anchor_sizes_, aspect_ratios_, stride_, variances_, offset_, + num_anchors_); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} + +nvinfer1::DimsExprs AnchorGeneratorPluginDynamic::getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) { + nvinfer1::DimsExprs ret{}; + ret.nbDims = 4; + ret.d[0] = inputs[0].d[2]; // feature height + ret.d[1] = inputs[0].d[3]; // feature width + ret.d[2] = exprBuilder.constant(num_anchors_); + ret.d[3] = exprBuilder.constant(4); + return ret; +} + +bool AnchorGeneratorPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* inOut, int nbInputs, + int nbOutputs) { + // input can be any, doesn't matter + // anchor generator doesn't read input raw data, only need the shape info + auto type = inOut[pos].type; + auto format = inOut[pos].format; +#if IS_TRT_VERSION_GE(7234) + if (pos == 0) return true; +#else + if (pos == 0) return format == nvinfer1::TensorFormat::kLINEAR; +#endif + return (type == nvinfer1::DataType::kFLOAT && + format == 
nvinfer1::TensorFormat::kLINEAR); +} + +void AnchorGeneratorPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t AnchorGeneratorPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +template +int AnchorGeneratorPluginDynamic::enqueue_impl( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + const int height = inputDesc[0].dims.d[2]; + const int width = inputDesc[0].dims.d[3]; + const int box_num = height * width * num_anchors_; + const int block = 512; + const int gen_anchor_grid = (box_num + block - 1) / block; + T* anchors = static_cast(outputs[0]); + T* vars = static_cast(outputs[1]); + const T* anchor_sizes_device = static_cast(anchor_sizes_device_); + const T* aspect_ratios_device = static_cast(aspect_ratios_device_); + const T* stride_device = static_cast(stride_device_); + const T* variances_device = static_cast(variances_device_); + paddle::operators::GenAnchors<<>>( + anchors, aspect_ratios_device, aspect_ratios_.size(), anchor_sizes_device, + anchor_sizes_.size(), stride_device, stride_.size(), height, width, + offset_); + const int var_grid = (box_num * 4 + block - 1) / block; + paddle::operators::SetVariance<<>>( + vars, variances_device, variances_.size(), box_num * 4); + return cudaGetLastError() != cudaSuccess; +} + +int AnchorGeneratorPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + assert(outputDesc[0].type == nvinfer1::DataType::kFLOAT); + assert(outputDesc[1].type == nvinfer1::DataType::kFLOAT); + return enqueue_impl(inputDesc, outputDesc, inputs, outputs, workspace, + stream); +} + +nvinfer1::DataType AnchorGeneratorPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, int nbInputs) const { + return data_type_; +} + +const char* AnchorGeneratorPluginDynamic::getPluginType() const { + return "anchor_generator_plugin_dynamic"; +} + +int AnchorGeneratorPluginDynamic::getNbOutputs() const { return 2; } + +int AnchorGeneratorPluginDynamic::initialize() { return 0; } + +void AnchorGeneratorPluginDynamic::terminate() {} + +size_t AnchorGeneratorPluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + serialize_size += SerializedSize(data_type_); + serialize_size += SerializedSize(anchor_sizes_); + serialize_size += SerializedSize(aspect_ratios_); + serialize_size += SerializedSize(stride_); + serialize_size += SerializedSize(variances_); + serialize_size += SerializedSize(offset_); + serialize_size += SerializedSize(num_anchors_); + return serialize_size; +} + +void AnchorGeneratorPluginDynamic::serialize(void* buffer) const { + SerializeValue(&buffer, data_type_); + SerializeValue(&buffer, anchor_sizes_); + SerializeValue(&buffer, aspect_ratios_); + SerializeValue(&buffer, stride_); + SerializeValue(&buffer, variances_); + SerializeValue(&buffer, offset_); + SerializeValue(&buffer, num_anchors_); +} + +void AnchorGeneratorPluginDynamic::destroy() {} + +void AnchorGeneratorPluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + namespace_ = std::string(lib_namespace); +} + +const 
char* AnchorGeneratorPluginDynamicCreator::getPluginNamespace() const { + return namespace_.c_str(); +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginName() const { + return "anchor_generator_plugin_dynamic"; +} + +const char* AnchorGeneratorPluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +AnchorGeneratorPluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + const nvinfer1::PluginField* fields = fc->fields; + int type_id = -1; + std::vector anchor_sizes, aspect_ratios, stride, variances; + float offset = .5; + int num_anchors = -1; + for (int i = 0; i < fc->nbFields; ++i) { + const std::string field_name(fc->fields[i].name); + const auto length = fc->fields[i].length; + if (field_name.compare("type_id") == 0) { + type_id = *static_cast(fc->fields[i].data); + } else if (field_name.compare("anchor_sizes")) { + const auto* data = static_cast(fc->fields[i].data); + anchor_sizes.insert(anchor_sizes.end(), data, data + length); + } else if (field_name.compare("aspect_ratios")) { + const auto* data = static_cast(fc->fields[i].data); + aspect_ratios.insert(aspect_ratios.end(), data, data + length); + } else if (field_name.compare("stride")) { + const auto* data = static_cast(fc->fields[i].data); + stride.insert(stride.end(), data, data + length); + } else if (field_name.compare("variances")) { + const auto* data = static_cast(fc->fields[i].data); + variances.insert(variances.end(), data, data + length); + } else if (field_name.compare("offset")) { + offset = *static_cast(fc->fields[i].data); + } else if (field_name.compare("num_anchors")) { + num_anchors = *static_cast(fc->fields[i].data); + } else { + assert(false && "unknown plugin field name."); + } + } + return new AnchorGeneratorPluginDynamic(nvinfer1::DataType::kFLOAT, + anchor_sizes, aspect_ratios, stride, + variances, offset, num_anchors); +} + +nvinfer1::IPluginV2Ext* AnchorGeneratorPluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new AnchorGeneratorPluginDynamic(serial_data, serial_length); + plugin->setPluginNamespace(namespace_.c_str()); + return plugin; +} +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h new file mode 100644 index 00000000000..aff0b6a6802 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/anchor_generator_op_plugin.h @@ -0,0 +1,201 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class AnchorGeneratorPlugin : public nvinfer1::IPluginV2Ext { + public: + explicit AnchorGeneratorPlugin( + const nvinfer1::DataType, const std::vector& anchor_sizes, + const std::vector& aspect_ratios, const std::vector& stride, + const std::vector& variances, const float offset, const int height, + const int width, const int num_anchors, const int box_num); + AnchorGeneratorPlugin(const void* data, size_t length); + ~AnchorGeneratorPlugin() override; + const char* getPluginType() const override; + const char* getPluginVersion() const override; + int getNbOutputs() const override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nb_input_dims) override; + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::TensorFormat format) const override; + size_t getWorkspaceSize(int max_batch_size) const override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream) override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_type, + int nb_inputs) const override; + bool isOutputBroadcastAcrossBatch(int output_index, + const bool* input_is_broadcast, + int nb_inputs) const override; + bool canBroadcastInputAcrossBatch(int input_index) const override; + void configurePlugin(const nvinfer1::Dims* input_dims, int nb_inputs, + const nvinfer1::Dims* output_dims, int nb_outputs, + const nvinfer1::DataType* input_types, + const nvinfer1::DataType* output_types, + const bool* input_is_broadcast, + const bool* output_is_broadcast, + nvinfer1::PluginFormat float_format, + int max_batct_size) override; + nvinfer1::IPluginV2Ext* clone() const override; + + private: + template + int enqueue_impl(int batch_size, const void* const* inputs, void** outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int height_; + int width_; + int num_anchors_; + int box_num_; + std::string namespace_; +}; + +class AnchorGeneratorPluginCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginCreator() = default; + ~AnchorGeneratorPluginCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; + 
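For reference, the two outputs this plugin reports (see getNbOutputs and getOutputDimensions in the .cu implementation earlier) follow the anchor_generator operator: an anchor tensor of shape [H, W, num_anchors, 4] holding (xmin, ymin, xmax, ymax) corners, and a variance tensor of the same shape obtained by broadcasting the four configured variances over every box. The NumPy sketch below only illustrates that layout; the function name is made up for illustration and the box extents are simplified (the GenAnchors kernel additionally rounds a base size derived from the stride), so treat it as an approximation rather than the exact kernel math.

import numpy as np

def anchor_output_layout(height, width, anchor_sizes, aspect_ratios,
                         variances, stride, offset=0.5):
    # num_anchors per feature-map cell, as used for dims.d[2] above
    num_anchors = len(anchor_sizes) * len(aspect_ratios)
    anchors = np.zeros((height, width, num_anchors, 4), dtype=np.float32)
    for h in range(height):
        for w in range(width):
            # anchor centers walk over the input image with the given stride
            x_ctr = w * stride[0] + offset * (stride[0] - 1)
            y_ctr = h * stride[1] + offset * (stride[1] - 1)
            k = 0
            for ar in aspect_ratios:
                for size in anchor_sizes:
                    # simplified extent: width shrinks and height grows with
                    # the aspect ratio while the area tracks the anchor size
                    anchor_w = size / np.sqrt(ar)
                    anchor_h = size * np.sqrt(ar)
                    anchors[h, w, k] = [x_ctr - 0.5 * (anchor_w - 1),
                                        y_ctr - 0.5 * (anchor_h - 1),
                                        x_ctr + 0.5 * (anchor_w - 1),
                                        y_ctr + 0.5 * (anchor_h - 1)]
                    k += 1
    # SetVariance simply tiles the four variances over every box
    vars_out = np.tile(np.asarray(variances, dtype=np.float32),
                       (height, width, num_anchors, 1))
    return anchors, vars_out

With the values used by the unit test further below (four anchor sizes, three aspect ratios), num_anchors is 12, so the static plugin reports output dims [H, W, 12, 4] and box_num is H * W * 12.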
+REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginCreator); + +#if IS_TRT_VERSION_GE(6000) +class AnchorGeneratorPluginDynamic : public DynamicPluginTensorRT { + public: + explicit AnchorGeneratorPluginDynamic(const nvinfer1::DataType data_type, + const std::vector& anchor_sizes, + const std::vector& aspect_ratios, + const std::vector& stride, + const std::vector& variances, + const float offset, + const int num_anchors); + AnchorGeneratorPluginDynamic(void const* data, size_t length); + ~AnchorGeneratorPluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + template + int enqueue_impl(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, + void* workspace, cudaStream_t stream); + nvinfer1::DataType data_type_; + std::vector anchor_sizes_; + std::vector aspect_ratios_; + std::vector stride_; + std::vector variances_; + float offset_; + void* anchor_sizes_device_; + void* aspect_ratios_device_; + void* stride_device_; + void* variances_device_; + int num_anchors_; + std::string namespace_; +}; + +class AnchorGeneratorPluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + AnchorGeneratorPluginDynamicCreator() = default; + ~AnchorGeneratorPluginDynamicCreator() override = default; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2Ext* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2Ext* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + + private: + std::string namespace_; + nvinfer1::PluginFieldCollection field_collection_; +}; +REGISTER_TRT_PLUGIN_V2(AnchorGeneratorPluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cu b/paddle/fluid/operators/detection/anchor_generator_op.cu index b4c27a63dbd..388b8531571 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cu +++ 
b/paddle/fluid/operators/detection/anchor_generator_op.cu @@ -49,14 +49,11 @@ __global__ void GenAnchors(T* out, const T* aspect_ratios, const int ar_num, anchor_width = scale_w * base_w; anchor_height = scale_h * base_h; - T xmin = (x_ctr - 0.5 * (anchor_width - 1)); - T ymin = (y_ctr - 0.5 * (anchor_height - 1)); - T xmax = (x_ctr + 0.5 * (anchor_width - 1)); - T ymax = (y_ctr + 0.5 * (anchor_height - 1)); - out[i * 4] = xmin; - out[i * 4 + 1] = ymin; - out[i * 4 + 2] = xmax; - out[i * 4 + 3] = ymax; + T xmin = (x_ctr - .5f * (anchor_width - 1)); + T ymin = (y_ctr - .5f * (anchor_height - 1)); + T xmax = (x_ctr + .5f * (anchor_width - 1)); + T ymax = (y_ctr + .5f * (anchor_height - 1)); + reinterpret_cast(out)[i] = make_float4(xmin, ymin, xmax, ymax); } } diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index e0e499d76a1..599f6935736 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -22,6 +22,19 @@ limitations under the License. */ namespace paddle { namespace operators { +#ifdef PADDLE_WITH_CUDA +template +extern __global__ void GenAnchors(T* out, const T* aspect_ratios, + const int ar_num, const T* anchor_sizes, + const int as_num, const T* stride, + const int sd_num, const int height, + const int width, const T offset); + +template +extern __global__ void SetVariance(T* out, const T* var, const int vnum, + const int num); +#endif + template class AnchorGeneratorOpKernel : public framework::OpKernel { public: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py new file mode 100644 index 00000000000..1d6f1c2c459 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_anchor_generator_op.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import itertools +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TRTAnchorGeneratorBaseTest(InferencePassTest): + def setUp(self): + self.bs = 1 + self.channel = 16 + self.height = 32 + self.width = 32 + self.anchor_sizes = [64., 128., 256., 512.] + self.aspect_ratios = [.5, 1., 2.] + self.variance = [.1, .1, .2, .2] + self.stride = [8., 8.] 
+ self.precision = AnalysisConfig.Precision.Float32 + self.serialize = False + self.enable_trt = True + self.feeds = { + 'data': + np.random.random([self.bs, self.channel, self.height, + self.width]).astype('float32'), + } + + def build(self): + min_graph_size = 3 if self.dynamic_shape_params is not None else 2 + self.trt_parameters = InferencePassTest.TensorRTParam( + 1 << 30, self.bs, min_graph_size, self.precision, self.serialize, + False) + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', + shape=[-1, self.channel, self.height, self.width], + dtype='float32') + anchor, var = fluid.layers.detection.anchor_generator( + data, + anchor_sizes=self.anchor_sizes, + aspect_ratios=self.aspect_ratios, + variance=self.variance, + stride=self.stride) + if self.dynamic_shape_params is not None: + anchor = fluid.layers.transpose(anchor, [2, 3, 0, 1]) + out = fluid.layers.batch_norm(anchor, is_test=True) + + self.fetch_list = [out, var] + + def run_test(self): + self.build() + self.check_output() + + def set_dynamic(self): + self.dynamic_shape_params = InferencePassTest.DynamicShapeParam({ + 'data': [self.bs, self.channel, self.height // 2, self.width // 2] + }, { + 'data': [self.bs, self.channel, self.height, self.width] + }, {'data': [self.bs, self.channel, self.height, self.width]}, False) + + def test_base(self): + self.run_test() + + def test_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.run_test() + + def test_serialize(self): + self.serialize = True + self.run_test() + + def test_dynamic(self): + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16(self): + self.precision = AnalysisConfig.Precision.Half + self.set_dynamic() + self.run_test() + + def test_dynamic_serialize(self): + self.serialize = True + self.set_dynamic() + self.run_test() + + def test_dynamic_fp16_serialize(self): + self.serialize = True + self.precision = AnalysisConfig.Precision.Half + self.set_dynamic() + self.run_test() + + def check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + atol = 1e-5 + if self.trt_parameters.precision == AnalysisConfig.Precision.Half: + atol = 1e-3 + self.check_output_with_option(use_gpu, atol, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +if __name__ == "__main__": + unittest.main() -- GitLab From 0589ed21b66872b9f333b77d860eab5202df6d26 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 1 Apr 2021 13:25:23 +0800 Subject: [PATCH 130/486] LOG CLEAN (#31819) * upgrade vlog * train from dataset fetch optimize --- cmake/external/brpc.cmake | 2 +- .../distributed/service/brpc_ps_server.cc | 5 +-- .../fluid/distributed/service/brpc_utils.cc | 2 +- paddle/fluid/distributed/service/env.h | 10 +++--- paddle/fluid/distributed/service/ps_client.cc | 3 +- paddle/fluid/distributed/service/service.cc | 2 +- .../fluid/distributed/table/depends/dense.h | 2 -- .../fluid/distributed/table/depends/sparse.h | 2 -- .../fluid/framework/details/build_strategy.cc | 28 +++++++---------- paddle/fluid/framework/device_worker.h | 2 +- paddle/fluid/framework/hogwild_worker.cc | 31 +++++++++++++++---- paddle/fluid/platform/lodtensor_printer.cc | 30 +++++++++++++----- paddle/fluid/platform/lodtensor_printer.h | 2 +- .../fluid/platform/lodtensor_printer_test.cc | 3 +- .../distributed/fleet/base/fleet_base.py | 13 ++++---- .../distributed/fleet/runtime/the_one_ps.py | 2 +- .../fluid/tests/unittests/test_monitor.py | 17 ++++++++-- 17 files changed, 97 insertions(+), 59 
deletions(-) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 0eb590c42d0..582c06e88c1 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -41,7 +41,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} # TODO(gongwb): change to de newst repo when they changed. GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" - GIT_TAG "6d79e0b17f25107c35b705ea58d888083f59ff47" + GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index 8400e669182..d7ff0ecd95a 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -60,7 +60,8 @@ uint64_t BrpcPsServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); std::string ip_port = ip + ":" + std::to_string(port); - VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + VLOG(0) << "running server with rank id: " << _rank + << ", endpoint: " << ip_port; brpc::ServerOptions options; int num_threads = std::thread::hardware_concurrency(); @@ -538,7 +539,7 @@ int32_t BrpcPsService::stop_server(Table *table, auto *p_server = _server; std::thread t_stop([p_server]() { p_server->stop(); - LOG(INFO) << "Server Stoped"; + VLOG(3) << "Server Stoped"; }); t_stop.detach(); return 0; diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index 09671876814..a356b77e737 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -324,7 +324,7 @@ std::string GetIntTypeEndpoint(const std::string& ip, const uint32_t& port) { while (hp->h_addr_list[i] != NULL) { int_ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); - VLOG(0) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; + VLOG(3) << "Brpc Get host by name, host:" << ip << " -> ip: " << int_ip; break; } diff --git a/paddle/fluid/distributed/service/env.h b/paddle/fluid/distributed/service/env.h index 901aba0ad90..ca395a776af 100644 --- a/paddle/fluid/distributed/service/env.h +++ b/paddle/fluid/distributed/service/env.h @@ -39,7 +39,7 @@ struct PSHost { // |---ip---|---port---|--rank--| // |-32bit--|--20bit---|--12bit-| - // for pslib + uint64_t serialize_to_uint64() { uint64_t host_label = 0; host_label = inet_addr(ip.c_str()); @@ -175,14 +175,12 @@ class PSEnvironment { host.ip = ip; host.port = port; host.rank = rank; - if (sign_set.count(rank) > 0) { - LOG(WARNING) << "ps-host :" << host.ip << ":" << host.port - << ", rank:" << host.rank - << " already register, ignore register"; - } else { + + if (sign_set.count(rank) == 0) { host_list.push_back(host); sign_set.insert(rank); } + return 0; } diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 095b5dee0b2..d427ecfc538 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -78,8 +78,7 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { } TableManager::instance().initialize(); - LOG(INFO) << "Create PSClient[" << service_param.client_class() - << "] success"; + VLOG(3) << "Create PSClient[" << service_param.client_class() << "] success"; return client; } } // namespace distributed diff --git a/paddle/fluid/distributed/service/service.cc 
b/paddle/fluid/distributed/service/service.cc index 3d0f94fac27..2759e4614e6 100644 --- a/paddle/fluid/distributed/service/service.cc +++ b/paddle/fluid/distributed/service/service.cc @@ -47,7 +47,7 @@ paddle::distributed::PSParameter load_from_prototxt( } void PSCore::init_gflag(const std::string& gflags) { - LOG(INFO) << "Init With Gflags:" << gflags; + VLOG(3) << "Init With Gflags:" << gflags; std::vector flags = paddle::string::split_string(gflags); if (flags.size() < 1) { flags.push_back("-max_body_size=314217728"); diff --git a/paddle/fluid/distributed/table/depends/dense.h b/paddle/fluid/distributed/table/depends/dense.h index a2acdfd2014..8079003d1bf 100644 --- a/paddle/fluid/distributed/table/depends/dense.h +++ b/paddle/fluid/distributed/table/depends/dense.h @@ -89,7 +89,6 @@ class DSGD : public DenseOptimizer { auto blas = GetBlas(); float lr = *(global_learning_rate_) * (*learning_rate); - VLOG(4) << "DSGD LearningRate: " << lr; blas.VCOPY(update_numel, update_values + begin, grads.data()); blas.SCAL(update_numel, lr, grads.data()); blas.VSUB(update_numel, param + begin, grads.data(), param + begin); @@ -157,7 +156,6 @@ class DAdam : public DenseOptimizer { beta2_pow[0] = beta2_pow[0] * beta2; float lr_ = *(global_learning_rate_)*learning_rate[0]; - VLOG(4) << "DAdam LearningRate: " << lr_; lr_ *= sqrt(1 - beta2_pow[0]) / (1 - beta1_pow[0]); float* tmp_ = tmp.data(); diff --git a/paddle/fluid/distributed/table/depends/sparse.h b/paddle/fluid/distributed/table/depends/sparse.h index 672d6e7d396..0e1d7ef03c1 100644 --- a/paddle/fluid/distributed/table/depends/sparse.h +++ b/paddle/fluid/distributed/table/depends/sparse.h @@ -110,7 +110,6 @@ class SSGD : public SparseOptimizer { auto* value = block->Get(id); float learning_rate = *(global_learning_rate_) * (value + lr_offset)[0]; - VLOG(4) << "SSGD LearningRate: " << learning_rate; float* param = value + param_offset; std::vector grads; @@ -166,7 +165,6 @@ class SAdam : public SparseOptimizer { if (!block->GetEntry(id)) continue; auto* values = block->Get(id); float lr_ = *(global_learning_rate_) * (values + lr_offset)[0]; - VLOG(4) << "SAdam LearningRate: " << lr_; float* param = values + param_offset; float* moment1 = values + m1_offset; float* moment2 = values + m2_offset; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 34c87b83889..5636e3ed1b6 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -161,9 +161,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); -#else - LOG(WARNING) << "fusion_group is not enabled for Windows/MacOS now, and " - "only effective when running with CUDA GPU."; #endif AppendPassWithCheck(strategy_.fuse_elewise_add_act_ops_, "fuse_elewise_add_act_pass"); @@ -265,12 +262,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (FLAGS_use_mkldnn) { AppendPass(pass_name); } else if (!strategy_.mkldnn_enabled_op_types_.empty()) { - LOG(WARNING) - << "mkldnn_enabled_op_types specify the operator type list to " - "use MKLDNN acceleration. It is null in default, means " - "that all the operators supported by MKLDNN will be " - "accelerated. 
And it should not be set when " - "FLAGS_use_mkldnn=false."; + VLOG(1) << "mkldnn_enabled_op_types specify the operator type list to " + "use MKLDNN acceleration. It is null in default, means " + "that all the operators supported by MKLDNN will be " + "accelerated. And it should not be set when " + "FLAGS_use_mkldnn=false."; } #else PADDLE_ENFORCE_NE(FLAGS_use_mkldnn, true, @@ -403,26 +399,26 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_relu_depthwise_conv_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { pass->Set("use_gpu", new bool((use_device == p::kCUDA))); if (use_device != p::kCUDA) { - LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; + VLOG(1) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { if (use_device != p::kCUDA) { - LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " - "GPU, skipped."; + VLOG(1) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; continue; } } else if (pass->Type() == "mkldnn_placement_pass") { diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 05c54a90f7e..9ced4221e1d 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -205,7 +205,7 @@ class DeviceWorker { Scope* root_scope_ = nullptr; Scope* thread_scope_; paddle::platform::Place place_; - int64_t batch_num_; + int64_t batch_num_ = 0; FetchConfig fetch_config_; bool use_cvm_; bool no_cvm_; diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc index d8639643f2c..89dc5c7d3ea 100644 --- a/paddle/fluid/framework/hogwild_worker.cc +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" @@ -226,14 +227,32 @@ void HogwildWorker::PrintFetchVars() { // call count batch_num_++; int batch_per_print = fetch_config_.print_period(); - if (thread_id_ == 0) { - if (batch_num_ % batch_per_print == 0) { - int fetch_var_num = fetch_config_.fetch_var_names_size(); - for (int i = 0; i < fetch_var_num; ++i) { - platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), - fetch_config_.fetch_var_str_format(i)); + int fetch_var_num = fetch_config_.fetch_var_names_size(); + + if (fetch_var_num == 0) { + return; + } + + if (thread_id_ == 0 && batch_num_ % batch_per_print == 0) { + time_t curtime; + time(&curtime); + char mbstr[80]; + std::strftime(mbstr, sizeof(mbstr), "%Y-%m-%d %H:%M:%S", + std::localtime(&curtime)); + + std::stringstream ss; + ss << "time: [" << mbstr << "], "; + ss << "batch: [" << batch_num_ << "], "; + + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i), &ss); + if (i < fetch_var_num - 1) { + ss << ", "; } } + + std::cout << ss.str() << std::endl; } } diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index 0be4233269e..25ae0ab264f 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -27,24 +27,38 @@ namespace paddle { namespace platform { void PrintVar(framework::Scope* scope, const std::string& var_name, - const std::string& print_info) { + const std::string& print_info, std::stringstream* sstream) { framework::Variable* var = scope->FindVar(var_name); if (var == nullptr) { - VLOG(1) << "Variable Name " << var_name << " does not exist in your scope"; + VLOG(0) << "Variable Name " << var_name << " does not exist in your scope"; return; } framework::LoDTensor* tensor = var->GetMutable(); if (tensor == nullptr) { - VLOG(1) << "tensor of variable " << var_name + VLOG(0) << "tensor of variable " << var_name << " does not exist in your scope"; return; } - std::ostringstream sstream; - sstream << print_info << "\t"; - sstream << var_name << "\t"; - sstream << *tensor << "\t"; - std::cout << sstream.str() << std::endl; + *sstream << print_info << ": "; + +#define PrintTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + *sstream << "["; \ + auto* data = tensor->data(); \ + auto element_num = tensor->numel(); \ + if (element_num > 0) { \ + *sstream << data[0]; \ + for (int j = 1; j < element_num; ++j) { \ + *sstream << " " << data[j]; \ + } \ + } \ + *sstream << "]"; \ + } \ + } while (0) + + _ForEachDataType_(PrintTensorCallback); } } // end namespace platform diff --git a/paddle/fluid/platform/lodtensor_printer.h b/paddle/fluid/platform/lodtensor_printer.h index e0bd1fff197..d30afb62b0b 100644 --- a/paddle/fluid/platform/lodtensor_printer.h +++ b/paddle/fluid/platform/lodtensor_printer.h @@ -26,6 +26,6 @@ class Scope; namespace paddle { namespace platform { void PrintVar(framework::Scope* scope, const std::string& var_name, - const std::string& print_info); + const std::string& print_info, std::stringstream* out); } // end namespace platform } // end namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc index 5b2af270740..51bd55ebb7f 100644 --- 
a/paddle/fluid/platform/lodtensor_printer_test.cc +++ b/paddle/fluid/platform/lodtensor_printer_test.cc @@ -18,5 +18,6 @@ TEST(LodTensorPrinter, PrintVar) { paddle::framework::Scope scope; - paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var"); + std::stringstream ss; + paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var", &ss); } diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 19ba637cc96..cf802034cab 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -628,12 +628,13 @@ class Fleet(object): self.user_defined_optimizer = optimizer if strategy is not None: - warnings.warn( - "It is recommended to use DistributedStrategy " - "in fleet.init(). The strategy here is only for compatibility. " - "If the strategy in fleet.distributed_optimizer() is " - "not None, then it will overwrite the DistributedStrategy in fleet.init(), " - "which will take effect in distributed training.") + if self._is_collective: + warnings.warn( + "It is recommended to use DistributedStrategy " + "in fleet.init(). The strategy here is only for compatibility. " + "If the strategy in fleet.distributed_optimizer() is " + "not None, then it will overwrite the DistributedStrategy in fleet.init(), " + "which will take effect in distributed training.") self._user_defined_strategy = copy.deepcopy(strategy) self._context = {} diff --git a/python/paddle/distributed/fleet/runtime/the_one_ps.py b/python/paddle/distributed/fleet/runtime/the_one_ps.py index a5686806005..aa7df57e3c5 100644 --- a/python/paddle/distributed/fleet/runtime/the_one_ps.py +++ b/python/paddle/distributed/fleet/runtime/the_one_ps.py @@ -768,7 +768,7 @@ class TheOnePSRuntime(RuntimeBase): server = self._get_fleet_proto(is_server=True, is_sync=is_sync) proto_txt = str(server) - debug = bool(os.getenv("PSERVER_DEBUG", "0")) + debug = bool(int(os.getenv("PSERVER_DEBUG", "0"))) if debug: print("server: \n{}".format(proto_txt)) diff --git a/python/paddle/fluid/tests/unittests/test_monitor.py b/python/paddle/fluid/tests/unittests/test_monitor.py index cf273876b1f..bea2f6c8b38 100644 --- a/python/paddle/fluid/tests/unittests/test_monitor.py +++ b/python/paddle/fluid/tests/unittests/test_monitor.py @@ -17,6 +17,8 @@ TestCases for Monitor from __future__ import print_function import paddle +paddle.enable_static() + import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np @@ -52,6 +54,11 @@ class TestDatasetWithStat(unittest.TestCase): name=slot, shape=[1], dtype="int64", lod_level=1) slots_vars.append(var) + embs = [] + for x in slots_vars: + emb = fluid.layers.embedding(x, is_sparse=True, size=[100001, 4]) + embs.append(emb) + dataset = paddle.distributed.InMemoryDataset() dataset._set_batch_size(32) dataset._set_thread(3) @@ -74,11 +81,17 @@ class TestDatasetWithStat(unittest.TestCase): for i in range(self.epoch_num): for data in data_loader(): exe.run(fluid.default_main_program(), feed=data) + else: for i in range(self.epoch_num): try: - exe.train_from_dataset(fluid.default_main_program(), - dataset) + exe.train_from_dataset( + fluid.default_main_program(), + dataset, + fetch_list=[embs[0], embs[1]], + fetch_info=["emb0", "emb1"], + print_period=1) + except Exception as e: self.assertTrue(False) -- GitLab From 9c5d0286abb27fa11cae1a11a23625ffbdc4edb6 Mon Sep 17 00:00:00 2001 From: hutuxian Date: Thu, 1 Apr 2021 14:31:31 +0800 Subject: [PATCH 131/486] remove useless code 
(#32001) --- paddle/fluid/operators/sync_batch_norm_op.cu.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/operators/sync_batch_norm_op.cu.h b/paddle/fluid/operators/sync_batch_norm_op.cu.h index d08a34ade77..69617b7e208 100644 --- a/paddle/fluid/operators/sync_batch_norm_op.cu.h +++ b/paddle/fluid/operators/sync_batch_norm_op.cu.h @@ -187,12 +187,6 @@ void SyncBatchNormFunctor(const framework::ExecutionContext &ctx, x_d, N, H * W * D, C, stats); } - Tensor c_g_st; - auto *c_g_st_d = c_g_st.mutable_data>( - {2 * C + 1}, platform::CPUPlace()); - auto gplace = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - memory::Copy(platform::CPUPlace(), c_g_st_d, gplace, stats, bytes, 0); - #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) auto *comm = dev_ctx.nccl_comm(); if (comm) { -- GitLab From 83b953f56f68470cfb285d0c127a53681c32800f Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 1 Apr 2021 14:55:48 +0800 Subject: [PATCH 132/486] add custom init grad for backward function (#31540) * add custom init grad for backward function * add custom init grad for backward function * handle when the grad_tensor is none * handle when the grad_tensor is none * fix the args type error on windows platform * modify the args order and doc * format code * add grad_tensor to xpu * modify the grad_tensor type check * add paddle.backward api to support multi tensors gradient compute * add paddle.backward api to support multi tensors gradient compute * add paddle.atuograd module and backward api * change tensor.backward func args * modify tensor backward api * remove create_graph intputs args * add doc and examplex code for backward api * when have the same tensor, throw error * modify test Init func args * modify the execute.Init func args in test files * add paddle.autograd package in setup.py.in * modify error msg, remove _run_backward method in class Tensor * add test cases for backward api --- paddle/fluid/imperative/basic_engine.cc | 115 ++++++++++------- paddle/fluid/imperative/basic_engine.h | 6 +- paddle/fluid/imperative/tests/test_hooks.cc | 8 +- paddle/fluid/imperative/tests/test_tracer.cc | 9 +- paddle/fluid/pybind/imperative.cc | 26 ++-- python/paddle/__init__.py | 1 + python/paddle/autograd/__init__.py | 22 ++++ python/paddle/autograd/backward_mode.py | 119 ++++++++++++++++++ python/paddle/fluid/dygraph/base.py | 1 + .../fluid/dygraph/varbase_patch_methods.py | 36 +++++- .../tests/unittests/test_custom_grad_input.py | 119 ++++++++++++++++++ python/setup.py.in | 1 + 12 files changed, 397 insertions(+), 66 deletions(-) create mode 100644 python/paddle/autograd/__init__.py create mode 100644 python/paddle/autograd/backward_mode.py create mode 100644 python/paddle/fluid/tests/unittests/test_custom_grad_input.py diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 9e46af9cb72..2a439a6f1ea 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -36,48 +36,73 @@ DECLARE_bool(sort_sum_gradient); namespace paddle { namespace imperative { -void BasicEngine::Init(VarBase* var, bool retain_graph) { +void BasicEngine::Init( + const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph) { retain_graph_ = retain_graph; - init_node_ = var->GradVarBase()->GradNode(); - PADDLE_ENFORCE_EQ(var->GradVarBase()->GraphIsFreed(), false, - platform::errors::Unavailable( - "%s trying to backward through the same graph a second " - "time, but this graph have already been 
freed. Please " - "specify Tensor.backward(retain_graph=True) when " - "calling backward at the first time.", - var->Name())); - - if (!retain_graph) { - VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() - << " because of retain_graph=False when calling backward"; - var->GradVarBase()->SetGraphIsFreed(true); - var->GradVarBase()->ClearGradNode(); - } - if (init_node_ == nullptr || var->OverridedStopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " - "stop_gradient=True: " - << var->Name(); - return; - } + PADDLE_ENFORCE_EQ( + tensors.size(), grad_tensors.size(), + platform::errors::Unavailable( + "The size of tensors do not equal the size of grad_tensors," + "the size of tensors is %s, but the size of grad_tensors is %s.", + tensors.size(), grad_tensors.size())); + + for (size_t i = 0; i < tensors.size(); ++i) { + auto var = tensors[i]; + auto grad_tensor = grad_tensors[i]; + + auto init_node = var->GradVarBase()->GradNode(); + PADDLE_ENFORCE_EQ( + var->GradVarBase()->GraphIsFreed(), false, + platform::errors::Unavailable( + "%s trying to backward through the same graph a second " + "time, but this graph have already been freed. Please " + "specify Tensor.backward(retain_graph=True) when " + "calling backward at the first time.", + var->Name())); + + if (!retain_graph) { + VLOG(5) << "Clear the auto-grad graph from grad var " << var->Name() + << " because of retain_graph=False when calling backward"; + var->GradVarBase()->SetGraphIsFreed(true); + var->GradVarBase()->ClearGradNode(); + } - VLOG(3) << "Init first node of backward"; + if (init_node == nullptr || var->OverridedStopGradient()) { + VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + "stop_gradient=True: " + << var->Name(); + continue; + } - PADDLE_ENFORCE_EQ( - var->HasGradVar(), true, - platform::errors::NotFound("Grad variable not exist for variable %s", - var->Name())); - - auto& fwd_var = var->Var().Get(); - auto* grad_var = - var->GradVarBase()->MutableVar()->GetMutable(); - VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() - << " as stop_gradient false"; - var->GradVarBase()->InnerSetOverridedStopGradient(false); - auto* dev_ctx = platform::DeviceContextPool::Instance().Get(fwd_var.place()); - grad_var->Resize(fwd_var.dims()); - grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + VLOG(3) << "Init node of backward"; + + PADDLE_ENFORCE_EQ( + var->HasGradVar(), true, + platform::errors::NotFound("Tensor %s has no gradient", var->Name())); + + auto& fwd_var = var->Var().Get(); + auto* grad_var = + var->GradVarBase()->MutableVar()->GetMutable(); + VLOG(6) << "init loss grad:" << var->GradVarBase()->Name() + << " as stop_gradient false"; + var->GradVarBase()->InnerSetOverridedStopGradient(false); + auto* dev_ctx = + platform::DeviceContextPool::Instance().Get(fwd_var.place()); + if (grad_tensor == nullptr) { + grad_var->Resize(fwd_var.dims()); + grad_var->mutable_data(fwd_var.place(), fwd_var.type()); + operators::math::set_constant(*dev_ctx, grad_var, 1.0); + } else { + paddle::framework::TensorCopy( + grad_tensor->Var().Get(), fwd_var.place(), + *dev_ctx, grad_var); + } + + init_nodes_.push_back(init_node); + } } void BasicEngine::CheckBackwardInputs(const OpBase& op) { @@ -224,8 +249,10 @@ void BasicEngine::PrepareDeps() { std::queue q; std::unordered_set visited; - q.push(init_node_.get()); - visited.insert(init_node_.get()); + for (size_t i = 0; i < init_nodes_.size(); 
++i) { + q.push(init_nodes_[i].get()); + visited.insert(init_nodes_[i].get()); + } while (!q.empty()) { auto* cur_node = q.front(); @@ -276,14 +303,16 @@ static std::shared_ptr> CallGradientHooks( } void BasicEngine::Execute() { - if (init_node_ == nullptr) { + if (init_nodes_.empty()) { return; } PrepareDeps(); // Start execute Computation graph std::queue> q; - q.push(std::move(init_node_)); + for (size_t i = 0; i < init_nodes_.size(); ++i) { + q.push(std::move(init_nodes_[i])); + } size_t op_num = 0; @@ -505,7 +534,7 @@ void BasicEngine::Execute() { } void BasicEngine::Clear() { - init_node_.reset(); + init_nodes_.clear(); node_deps_.clear(); accumulators_.clear(); accumulators_with_grad_node_.clear(); diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h index a2ad8b5f8aa..49761a8df0b 100644 --- a/paddle/fluid/imperative/basic_engine.h +++ b/paddle/fluid/imperative/basic_engine.h @@ -30,7 +30,9 @@ class OpBase; class BasicEngine : public Engine { public: - void Init(VarBase* var, bool retain_graph = false); + void Init(const std::vector>& tensors, + const std::vector>& grad_tensors, + bool retain_graph = false); void Execute() override; @@ -46,7 +48,7 @@ class BasicEngine : public Engine { void Clear(); private: - std::shared_ptr init_node_; + std::vector> init_nodes_; std::unordered_map node_deps_; // The input and output of Inplace op are the same. If only `var` is used // as the key, then the input and output of inplace op must be gradient diff --git a/paddle/fluid/imperative/tests/test_hooks.cc b/paddle/fluid/imperative/tests/test_hooks.cc index 9b75fac0ca5..8c907b98906 100644 --- a/paddle/fluid/imperative/tests/test_hooks.cc +++ b/paddle/fluid/imperative/tests/test_hooks.cc @@ -92,8 +92,10 @@ TEST(TestHooks, TestGradVarLeafBackwardHook) { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; @@ -191,8 +193,10 @@ void GradVarLeafBackwardHookWithGradAccmulatedTest() { ASSERT_EQ(out->GradVarBase()->GradOpNum(), 1UL); // 3. 
backward + std::vector> tensors{out}; + std::vector> grad_tensors{nullptr}; BasicEngine engine; - engine.Init(out.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor x_grad; diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index 9e3b0ea5df6..76de413b3e6 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -250,7 +250,10 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) { tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map, gpu_place, true); imperative::BasicEngine engine; - engine.Init(reduce_sum_out.get()); + + std::vector> tensors{reduce_sum_out}; + std::vector> grad_tensors{nullptr}; + engine.Init(tensors, grad_tensors); engine.Execute(); framework::LoDTensor rlt; @@ -376,8 +379,10 @@ TEST(test_tracer, test_var_without_grad_var) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); + std::vector> tensors{vout}; + std::vector> grad_tensors{nullptr}; imperative::BasicEngine engine; - engine.Init(vout.get()); + engine.Init(tensors, grad_tensors); engine.Execute(); // check the grad diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index c1c1387a84c..4ab507fe367 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -805,6 +805,7 @@ void BindImperative(py::module *m_ptr) { Bump the version whenever the Tensor is modified through an inplace operation. )DOC") .def("numpy", + [](imperative::VarBase &self) -> py::array { const auto &tensor = self.MutableVar()->Get(); @@ -1003,18 +1004,6 @@ void BindImperative(py::module *m_ptr) { print(x.stop_gradient) # True print(x.grad) # None )DOC") - .def("_run_backward", - [](imperative::VarBase &self, const imperative::Tracer &tracer, - bool retain_graph) { - // TODO(jiabin): when we impl more backward execution we can - // select them - auto *engine = tracer.GetEngine(); - engine->Init(&self, retain_graph); - VLOG(3) << "Start backward"; - engine->Execute(); - VLOG(3) << "Finish backward"; - }, - py::call_guard()) .def("_grad_name", &imperative::VarBase::GradVarName) .def("_grad_value", [](imperative::VarBase &self) { @@ -1549,6 +1538,19 @@ void BindImperative(py::module *m_ptr) { }, py::call_guard()); + m.def( + "dygraph_run_backward", + [](const std::vector> &tensors, + const std::vector> &grad_tensors, + bool retain_graph, const imperative::Tracer &tracer) { + auto *engine = tracer.GetEngine(); + engine->Init(tensors, grad_tensors, retain_graph); + VLOG(3) << "Start backward"; + engine->Execute(); + VLOG(3) << "Finish backward"; + }, + py::call_guard()); + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) py::class_ 0, "{} connot be empyt".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, paddle. + Tensor), "Elements of {} must be paddle.Tensor".format(name) + return in_out_list + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format( + name) + return [in_out_list] + + tensors = check_tensors(tensors, "tensors") + + assert len(tensors) == len( + set(tensors) + ), "The argument 'tensors' of paddle.autograd.backward contains duplicate paddle.Tensor object." 
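The grad_tensors handling below mirrors the nullptr branch in BasicEngine::Init shown earlier: a None entry seeds the corresponding output with ones, while a user-provided tensor is copied in as the initial gradient, so the call effectively computes a vector-Jacobian product. A small standalone sketch of that behaviour, with made-up values and illustrative only:

import paddle

x = paddle.to_tensor([1., 2., 3.], stop_gradient=False)
y = x * x                          # dy/dx = 2 * x

# a None seed is replaced by ones, matching plain y.backward()
paddle.autograd.backward([y], [None], retain_graph=True)
print(x.grad)                      # [2., 4., 6.]

x.clear_grad()
# an explicit seed v is multiplied into dy/dx elementwise,
# i.e. a vector-Jacobian product
v = paddle.to_tensor([1., 10., 100.])
paddle.autograd.backward([y], [v])
print(x.grad)                      # [2., 40., 600.]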
+ + if grad_tensors is not None: + if not isinstance(grad_tensors, (list, tuple)): + grad_tensors = [grad_tensors] + + for each_tensor in grad_tensors: + if each_tensor is not None: + assert isinstance( + each_tensor, paddle.Tensor + ), "The argument 'grad_tensors' of paddle.autograd.backward is invalid, it can be 'None', 'paddle.Tensor' or 'list[None/paddle.Tensor]'." + else: + grad_tensors = [None] * len(tensors) + + if len(grad_tensors) > 0: + assert len(tensors) == len( + grad_tensors), "The length of grad_tensors must be equal to tensors" + + assert isinstance(retain_graph, bool), "retain_graph must be True or False" + + core.dygraph_run_backward(tensors, grad_tensors, retain_graph, + framework._dygraph_tracer()) diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index 08d58e0c808..be5d9ac5831 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -26,6 +26,7 @@ import logging from ..data_feeder import convert_dtype import warnings from ..framework import _get_paddle_place +import paddle __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index e565552632f..ac594709867 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -163,7 +163,7 @@ def monkey_patch_varbase(): framework._current_expected_place()) @framework.dygraph_only - def backward(self, retain_graph=False): + def backward(self, grad_tensor=None, retain_graph=False): """ Run backward of current Graph which starts from current Tensor. @@ -172,17 +172,22 @@ def monkey_patch_varbase(): You can clear gradient by ``Tensor.clear_grad()`` . Args: + grad_tensor(Tensor, optional): initial gradient values of the current Tensor. If `grad_tensor` is None, + the initial gradient values of the current Tensor would be Tensor filled with 1.0; + if `grad_tensor` is not None, it must have the same length as the current Tensor. + Teh default value is None. + retain_graph(bool, optional): If False, the graph used to compute grads will be freed. If you would like to add more ops to the built graph after calling this method( :code:`backward` ), set the parameter :code:`retain_graph` to True, then the grads will be retained. Thus, seting it to False is much more memory-efficient. Defaults to False. - Returns: NoneType: None Examples: .. code-block:: python + import paddle x = paddle.to_tensor(5., stop_gradient=False) for i in range(5): y = paddle.pow(x, 4.0) @@ -198,15 +203,36 @@ def monkey_patch_varbase(): print("{}".format(x.grad)) # 0. + grad_tensor=paddle.to_tensor(2.) + for i in range(5): + y = paddle.pow(x, 4.0) + y.backward(grad_tensor) + print("{}: {}".format(i, x.grad)) + # 0: [1000.] + # 1: [2000.] + # 2: [3000.] + # 3: [4000.] + # 4: [5000.] + """ if framework.in_dygraph_mode(): + if grad_tensor is not None: + assert isinstance( + grad_tensor, paddle. + Tensor), "The type of grad_tensot must be paddle.Tensor" + assert grad_tensor.shape == self.shape, \ + "Tensor shape not match, Tensor of grad_tensor [ {} ] with shape {} mismatch Tensor [ {} ] with shape {}".format( + grad_tensor.name, grad_tensor.shape, self.name, self.shape) + if paddle.is_compiled_with_xpu(): # TODO(liuyuhui): Currently only for xpu. Will be removed in the future. 
scaled_loss = scale_loss(self) - scaled_loss._run_backward(framework._dygraph_tracer(), - retain_graph) + core.dygraph_run_backward([scaled_loss], [grad_tensor], + retain_graph, + framework._dygraph_tracer()) else: - self._run_backward(framework._dygraph_tracer(), retain_graph) + core.dygraph_run_backward([self], [grad_tensor], retain_graph, + framework._dygraph_tracer()) else: raise ValueError( "Variable.backward() is only available in DyGraph mode") diff --git a/python/paddle/fluid/tests/unittests/test_custom_grad_input.py b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py new file mode 100644 index 00000000000..a7472e7ffd7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_custom_grad_input.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +import paddle.fluid.dygraph as dg +from op_test import OpTest + + +class TestTensorBackward(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_tensor_backward(self): + for dtype in self._dtypes: + x = np.random.random([2, 100]).astype(dtype) + y = np.random.random([100, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + z_tensor.backward(grad_tensor) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +class TestBackwardAPI(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_backward_api(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + z_tensor2 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward([z_tensor1, z_tensor2], + [grad_tensor, grad_tensor], True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad * 2, x_tensor.grad)) + + def test_backward_single_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.random.random(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = 
paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + grad_tensor = paddle.to_tensor(grad) + paddle.autograd.backward(z_tensor1, grad_tensor, True) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + def test_backward_none_grad_tensor(self): + for dtype in self._dtypes: + x = np.random.random([2, 2]).astype(dtype) + y = np.random.random([2, 2]).astype(dtype) + z = np.matmul(x, y) + grad = np.ones(z.shape).astype(dtype) + for place in self._places: + with dg.guard(place): + x_tensor = paddle.to_tensor(x, stop_gradient=False) + y_tensor = paddle.to_tensor(y) + z_tensor1 = paddle.matmul(x_tensor, y_tensor) + + paddle.autograd.backward(z_tensor1, None) + + x_grad = np.matmul(grad, y.T) + + self.assertTrue(np.allclose(x_grad, x_tensor.grad)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 73c773bab49..e4532b3e55d 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -216,6 +216,7 @@ packages=['paddle', 'paddle.static.amp', 'paddle.tensor', 'paddle.onnx', + 'paddle.autograd', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: -- GitLab From 40e6c57bed6c8363bb5695284c2cdc1bb61a42fc Mon Sep 17 00:00:00 2001 From: Wei Shengyu Date: Thu, 1 Apr 2021 15:06:01 +0800 Subject: [PATCH 133/486] fix doc of Pooling layers (#31977) * fix doc of MaxPool1D * fix doc * fix doc format error * dbg * fix doc * dbg doc format test=document_fix * fix format test=document_fix * test doc * remove - from doc * fix indent * remove space before bracket * dbg format * fix indent test=document_fix * remove new line * fix descrip of Shape test=document_fix * add description for default value test=document_fix * fix bug test=document_fix --- python/paddle/nn/layer/pooling.py | 659 ++++++++++++++++-------------- 1 file changed, 344 insertions(+), 315 deletions(-) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 5830af3a182..cdb87a1cb39 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -33,7 +33,7 @@ __all__ = [ class AvgPool1D(layers.Layer): - """ + r""" This operation applies a 1D average pooling over an input signal composed of several input planes, based on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch @@ -41,36 +41,33 @@ class AvgPool1D(layers.Layer): The output tensor shape will be [N, C, output_size]. The output value of the layer with input size (N, C, L), - output (N, C, L_{out}) and kernel_size k can be precisely described as + output (N, C, :math:`L_{out}`) and kernel_size ksize can be precisely described as For average pool1d: .. math:: - Output(N_i, C_i, l) &= mean(Input[N_i, C_i, stride \times l:stride \times l+k]) + Output(N_i, C_i, l) = \frac{Input[N_i, C_i, stride \times l:stride \times l+k]}{ksize} - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain an integer. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain an integer. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. 
Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is `True`. - ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. - If it is set to False, the floor function will be used. The default value is False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + exclusive(bool, optional): Whether to exclude padding points in average pooling mode, default is `True`. + ceil_mode(bool, optional): ${ceil_mode_comment}Whether to use the ceil function to calculate output height + and width. If it is set to False, the floor function will be used. The default value is False. + name(str, optional): For eed to detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no nset and None by default. Returns: - None. + A callable object of AvgPool1D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -79,23 +76,24 @@ class AvgPool1D(layers.Layer): ShapeError: If the input is not a 3-D tensor. ShapeError: If the output's shape calculated is not greater than 0. - Shape: - - inpuut: 3-D tensor. - - output: 3-D tensor + - x(Tensor): The input tensor of avg pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) - pool_out = AvgPool1D(data) - # pool_out shape: [1, 3, 16] + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + pool_out = AvgPool1D(data) + # pool_out shape: [1, 3, 16] """ @@ -132,49 +130,53 @@ class AvgPool2D(layers.Layer): H is the height of the feature, and W is the width of the feature. Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) = \frac{1}{ksize[0] * ksize[1]} \sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} - input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + Input: + X shape: :math:`(N, C, :math:`H_{in}`, :math:`W_{in}`)` + Attr: + kernel_size: ksize + + Output: + Out shape: :math:`(N, C, :math:`H_{out}`, :math:`W_{out}`)` + + .. 
math:: + + Output(N_i, C_j, h, w) = \frac{\sum_{m=0}^{ksize[0]-1} \sum_{n=0}^{ksize[1]-1} + Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n)}{ksize[0] * ksize[1]} + + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. - - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is `true`. - divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + ceil_mode(bool, optional): When True, will use `ceil` instead of `floor` to compute the output shape. + exclusive(bool, optional): Whether to exclude padding points in average pooling + mode, default is `true`. + divisor_override(float, optional): If specified, it will be used as divisor, otherwise kernel_size will be + used. Default None. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, + `"NDHW"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - - x: 4-D tensor. - - out: 2-D tensor + - x(Tensor): The input tensor of avg pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool2d operator, which is a 4-D tensor. + The data type is same as input x. - Returns: None. + Returns: + A callable object of AvgPool2D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". 
ValueError: If `padding` is "VALID", but `ceil_mode` is True. @@ -182,16 +184,16 @@ class AvgPool2D(layers.Layer): Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - AvgPool2D = nn.AvgPool2D(kernel_size=2, + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) - output = AvgPool2D(input) - # output.shape [1, 3, 16, 16] + output = AvgPool2D(input) + # output.shape [1, 3, 16, 16] """ @@ -238,61 +240,64 @@ class AvgPool3D(layers.Layer): in NCDHW format, where N is batch size, C is the number of channels, H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): ${ceil_mode_comment} - exclusive (bool): Whether to exclude padding points in average pooling - mode, default is True. - divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. + ceil_mode(bool, optional): ${ceil_mode_comment} + exclusive(bool, optional): Whether to exclude padding points in average pooling mode, default is True. + divisor_override(int|float, optional): if specified, it will be used as divisor, otherwise kernel_size will + be used. Default None. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCDHW"`, + `"NDHWC"`. The default is `"NCDHW"`. 
When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. - Returns: None. + Returns: + A callable object of AvgPool3D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 5-D tensor. - - out: 5-D tensor. - + - x(Tensor): The input tensor of avg pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # avg pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - AvgPool3D = nn.AvgPool3D(kernel_size=2, + # avg pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0) - output = AvgPool3D(input) - # output.shape [1, 2, 3, 16, 16] + output = AvgPool3D(input) + # output.shape [1, 2, 3, 16, 16] """ def __init__(self, kernel_size, - stride, + stride=None, padding=0, ceil_mode=False, exclusive=True, @@ -328,10 +333,11 @@ class AvgPool3D(layers.Layer): class MaxPool1D(layers.Layer): """ - Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_mask parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. + This operation applies 1D max pooling over input signal + composed of several input planes based on the input, + and kernel_size, stride, padding parameters. Input(X) and Output(Out) are + in NCL format, where N is batch size, C is the number of channels, + L is the length of the feature. The output value of the layer with input size (N, C, L), output (N, C, L_{out}) and kernel_size k can be precisely described as @@ -339,28 +345,27 @@ class MaxPool1D(layers.Layer): .. math:: - Output(N_i, C_i, l) &= max(Input[N_i, C_i, stride \times l:stride \times l+k])} + Output(N_i, C_i, l) = max(Input[N_i, C_i, stride \times l:stride \times l+k]) - Args: - kernel_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain an integer. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain an integer. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, + it must contain an integer. Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An integer, which means the feature map is zero padded by size of `padding` on every sides. 3. 
A list[int] or tuple(int) whose length is 1, which means the feature map is zero padded by the size of `padding[0]` on every sides. - 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. - 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). + 4. A list[int] or tuple(int) whose length is 2, It has the form [pad_before, pad_after]. + 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or(0,0). The default value is 0. - return_mask (bool): Whether return the max indices along with the outputs. default is `False`. - ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. - If it is set to False, the floor function will be used. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + return_mask(bool, optional): Whether return the max indices along with the outputs. default is `False`. + ceil_mode(bool, optional): Whether to use the ceil function to calculate output height and width. + False is the default. If it is set to False, the floor function will be used. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of MaxPool1D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". @@ -371,25 +376,27 @@ class MaxPool1D(layers.Layer): Shape: - - x: 3-D tensor. - - out: 3-D tensor. + - x(Tensor): The input tensor of max pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of max pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1D(data) - # pool_out shape: [1, 3, 16] + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1D(data) + # pool_out shape: [1, 3, 16] - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) - pool_out, indices = MaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) + pool_out, indices = MaxPool1D(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ @@ -426,70 +433,73 @@ class MaxPool2D(layers.Layer): H is the height of the feature, and W is the width of the feature. Example: - Input: - X shape: $(N, C, H_{in}, W_{in})$ - Attr: - kernel_size: ksize - - Output: - Out shape: $(N, C, H_{out}, W_{out})$ - $$ - out(N_i, C_j, h, w) ={} & \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} \\ - & \text{input}(N_i, C_j, \text{stride[0]} \times h + m, - \text{stride[1]} \times w + n) - $$ - - Args: - kernel_size (int|list|tuple): The pool kernel size. 
If pool kernel size is a tuple or list, + - Input: + X shape: :math:`(N, C, H_{in}, W_{in})` + - Attr: + kernel_size: ksize + + - Output: + Out shape: :math:`(N, C, H_{out}, W_{out})` + + .. math:: + + Output(N_i, C_j, h, w) = \max_{m=0, \ldots, ksize[0] -1} \max_{n=0, \ldots, ksize[1]-1} + Input(N_i, C_j, stride[0] \times h + m, stride[1] \times w + n) + + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain two integers, (pool_stride_Height, pool_stride_Width). Otherwise, the pool stride size will be a square of an int. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 2, [pad_height, pad_weight] whose value means the padding size of each dimension. - 4. A list[int] or tuple(int) whose length is 4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 4. A list[int] or tuple(int) whose length is \4. [pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_mask (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + ceil_mode(bool, optional): when True, will use `ceil` instead of `floor` to compute the output shape + return_mask(bool, optional): Whether to return the max indices along with the outputs. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. - Returns: None + Returns: + A callable object of MaxPool2D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 4-D tensor. - - out: 4-D tensor. + - x(Tensor): The input tensor of max pool2d operator, which is a 4-D tensor. 
+ The data type can be float32, float64. + - output(Tensor): The output tensor of max pool2d operator, which is a 4-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - MaxPool2D = nn.MaxPool2D(kernel_size=2, + # max pool2d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) - output = MaxPool2D(input) - # output.shape [1, 3, 16, 16] + output = MaxPool2D(input) + # output.shape [1, 3, 16, 16] - # for return_mask=True - MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool2D(input) - # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], + # for return_mask=True + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) + output, max_indices = MaxPool2D(input) + # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ def __init__(self, @@ -532,59 +542,62 @@ class MaxPool3D(layers.Layer): in NCDHW format, where N is batch size, C is the number of channels, H is the height of the feature, D is the depth of the feature, and W is the width of the feature. - Args: - kernel_size (int|list|tuple): The pool kernel size. If the kernel size + Parameters: + kernel_size(int|list|tuple): The pool kernel size. If the kernel size is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride(int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. - padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + Default None, then stride will be equal to the kernel_size. + padding(str|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. - 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. + 4. A list[int] or tuple(int) whose length is \6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): ${ceil_mode_comment} - return_mask (bool): Whether to return the max indices along with the outputs. - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. - The default is `"NCDHW"`. 
When it is `"NCDHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_depth, input_height, input_width]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + ceil_mode(bool, optional): ${ceil_mode_comment} + return_mask(bool, optional): Whether to return the max indices along with the outputs. + data_format(str, optional): The data format of the input and output data. An optional string from: `"NCDHW"`, + `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. - Returns:None. + Returns: + A callable object of MaxPool3D. Raises: ValueError: If `padding` is a string, but not "SAME" or "VALID". ValueError: If `padding` is "VALID", but `ceil_mode` is True. ShapeError: If the output's shape calculated is not greater than 0. Shape: - - x: 5-D tensor. - - out: 5-D tensor. + - x(Tensor): The input tensor of max pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of max pool3d operator, which is a 5-D tensor. + The data type is same as input x. Examples: .. code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + import paddle + import paddle.nn as nn + import numpy as np - # max pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - MaxPool3D = nn.MaxPool3D(kernel_size=2, + # max pool3d + input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0) - output = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16] + output = MaxPool3D(input) + # output.shape [1, 2, 3, 16, 16] - # for return_mask=True - MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) - output, max_indices = MaxPool3D(input) - # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], + # for return_mask=True + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) + output, max_indices = MaxPool3D(input) + # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], """ def __init__(self, @@ -633,51 +646,52 @@ class AdaptiveAvgPool1D(layers.Layer): .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= floor(i * L_{in} / L_{out}) - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= \\frac{sum(Input[lstart:lend])}{(lstart - lend)} + Output(i) &= \frac{ \sum Input[lstart:lend]}{lend - lstart} - Args: - output_size (int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + Parameters: + output_size(int): The target output size. It must be an integer. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of AdaptiveAvgPool1D. Raises: ValueError: 'output_size' should be an integer. Shape: - - x: 3-D tensor. - - out: 3-D tensor. + - x(Tensor): 3-D tensor. The input tensor of adaptive avg pool1d operator, which is a 3-D tensor. 
+ The data type can be float32, float64. + - output(Tensor): 3-D tensor. The output tensor of adaptive avg pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - # average adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = sum(input[:, :, lstart: lend])/(lstart - lend) - # - import paddle - import paddle.nn as nn - import numpy as np - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) - pool_out = AdaptiveAvgPool1D(data) - # pool_out shape: [1, 3, 16] + # average adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = sum(input[:, :, lstart: lend])/(lend - lstart) + # + import paddle + import paddle.nn as nn + import numpy as np + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) + pool_out = AdaptiveAvgPool1D(data) + # pool_out shape: [1, 3, 16] """ def __init__(self, output_size, name=None): @@ -702,31 +716,32 @@ class AdaptiveAvgPool2D(layers.Layer): .. math:: - hstart &= floor(i * H_{in} / H_{out}) + hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) + wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) &= \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format(str, optional): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. 
The data type is same as input x. + - x(Tensor): The input tensor of adaptive avg pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveAvgPool2D. @@ -787,34 +802,36 @@ class AdaptiveAvgPool3D(layers.Layer): .. math:: - dstart &= floor(i * D_{in} / D_{out}) + dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) + hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) + wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) &= \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} + {(dend - dstart) * (hend - hstart) * (wend - wstart)} Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format(str, optional): The data format of the input and output data. An optional string from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. + The data type can be float32, float64\. + - output(Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveAvgPool3D. @@ -881,58 +898,59 @@ class AdaptiveMaxPool1D(layers.Layer): .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= floor(i * L_{in} / L_{out}) - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= ceil((i + 1) * L_{in} / L_{out}) - Output(i) &= max(Input[lstart:lend]) + Output(i) &= max(Input[lstart:lend]) - Args: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain one int. - return_mask (bool): If true, the index of max pooling point will be returned along + Parameters: + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain one int. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. 
Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Returns: - None. + A callable object of AdaptiveMaxPool1D. Raises: ValueError: 'pool_size' should be a integer or list or tuple with length as 1. Shape: - x (Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool1d operator, which is a 3-D tensor. + The data type is same as input x. Examples: .. code-block:: python - # max adaptive pool1d - # suppose input data in shape of [N, C, L], `output_size` is m or [m], - # output shape is [N, C, m], adaptive pool divide L dimension - # of input data into m grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # lstart = floor(i * L / m) - # lend = ceil((i + 1) * L / m) - # output[:, :, i] = max(input[:, :, lstart: lend]) - # - import paddle - import paddle.nn as nn - import numpy as np - - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) - pool_out = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16] - - # for return_mask = true - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) - pool_out, indices = AdaptiveMaxPool1D(data) - # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] + # max adaptive pool1d + # suppose input data in shape of [N, C, L], `output_size` is m or [m], + # output shape is [N, C, m], adaptive pool divide L dimension + # of input data into m grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # lstart = floor(i * L / m) + # lend = ceil((i + 1) * L / m) + # output[:, :, i] = max(input[:, :, lstart: lend]) + # + import paddle + import paddle.nn as nn + import numpy as np + + data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) + pool_out = AdaptiveMaxPool1D(data) + # pool_out shape: [1, 3, 16] + + # for return_mask = true + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) + pool_out, indices = AdaptiveMaxPool1D(data) + # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ @@ -954,31 +972,36 @@ class AdaptiveMaxPool1D(layers.Layer): class AdaptiveMaxPool2D(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and + pooling is adaptive one focus on the output size. For adaptive max pool2d: .. 
math:: - hstart &= floor(i * H_{in} / H_{out}) + hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) + hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) + wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + wend &= ceil((j + 1) * W_{in} / W_{out}) - Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) + Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain + two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of + the input. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. + It cannot be set in average pooling type. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. + The data type is same as input x. Returns: A callable object of AdaptiveMaxPool2D. @@ -1029,36 +1052,42 @@ class AdaptiveMaxPool2D(layers.Layer): class AdaptiveMaxPool3D(layers.Layer): """ - This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions - of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. + This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are + determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus + on the output size. For adaptive max pool3d: .. math:: - dstart &= floor(i * D_{in} / D_{out}) + dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) + dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) + hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) + hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) + wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + wend &= ceil((k + 1) * W_{in} / W_{out}) - Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) + Output(i ,j, k) &= max(Input[dstart:dend, hstart:hend, wstart:wend]) Parameters: - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). 
D, H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + output_size(int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain + three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as + that of the input. + return_mask(bool, optional): If true, the index of max pooling point will be returned along with outputs. + Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. Shape: - x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. - output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x. + - x(Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. + The data type can be float32, float64. + - output(Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. + The data type is same as input x. + Returns: A callable object of AdaptiveMaxPool3D. Examples: -- GitLab From 8460698b59a3628e572c56dea79dff939bf446b8 Mon Sep 17 00:00:00 2001 From: ShenLiang Date: Thu, 1 Apr 2021 16:10:59 +0800 Subject: [PATCH 134/486] Support control flow in DataParallel (#31625) * support control flow * supoort sync_parameters_buffers * fix the bug of sparse embedding --- .../framework/distributed_strategy.proto | 1 + paddle/fluid/imperative/bkcl_context.cc | 8 +- paddle/fluid/imperative/bkcl_context.h | 2 + paddle/fluid/imperative/nccl_context.cc | 6 + paddle/fluid/imperative/nccl_context.h | 2 + paddle/fluid/imperative/parallel_context.h | 3 + paddle/fluid/imperative/reducer.cc | 326 ++++++++++++++---- paddle/fluid/imperative/reducer.h | 27 +- .../fleet/base/distributed_strategy.py | 28 ++ .../distributed/fleet/base/fleet_base.py | 4 +- python/paddle/fluid/dygraph/layers.py | 9 +- python/paddle/fluid/dygraph/parallel.py | 90 ++++- .../fluid/tests/unittests/CMakeLists.txt | 8 +- ...parallel_dygraph_control_flow_different.py | 122 +++++++ .../parallel_dygraph_control_flow_same.py | 87 +++++ .../parallel_dygraph_gradient_check.py | 136 ++++++++ .../unittests/parallel_dygraph_none_var.py | 80 +++++ .../parallel_dygraph_shared_unused_var.py | 74 ++++ .../parallel_dygraph_sparse_embedding_fp64.py | 2 - .../parallel_dygraph_unused_variables.py | 4 +- .../fluid/tests/unittests/test_dist_base.py | 22 +- .../test_fleet_distributed_strategy.py | 9 + .../test_parallel_dygraph_control_flow.py | 91 +++++ .../test_parallel_dygraph_dataparallel.py | 75 ++++ .../unittests/test_parallel_dygraph_mnist.py | 1 + .../test_parallel_dygraph_transformer.py | 1 + .../test_parallel_dygraph_unused_variables.py | 32 +- 27 files changed, 1153 insertions(+), 97 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py create mode 100644 
python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b36793507f5..04dc51f1b94 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -152,6 +152,7 @@ message DistributedStrategy { optional bool fp16_allreduce = 25 [ default = false ]; optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 27 [ default = 1 ]; + optional bool find_unused_parameters = 28 [ default = true ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; diff --git a/paddle/fluid/imperative/bkcl_context.cc b/paddle/fluid/imperative/bkcl_context.cc index 886179feb19..16f9454e937 100644 --- a/paddle/fluid/imperative/bkcl_context.cc +++ b/paddle/fluid/imperative/bkcl_context.cc @@ -167,8 +167,6 @@ void BKCLParallelContext::WaitCompute(int ring_id) { platform::errors::OutOfRange("Ring id expected < nrings," "but got ring id = %d, nrings = %d", ring_id, strategy_.nrings_)); - // TODO(wangxi16): [Performance optimize] Maybe need to put Wait and - // bkcl_allreduce to comm thread, for bkcl_allreduce is blocking now. auto compute_dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(place_)); compute_dev_ctx->Wait(); @@ -188,6 +186,12 @@ void BKCLParallelContext::WaitComm(int ring_id) { comm_dev_ctx->Wait(); } +void BKCLParallelContext::SynchronizeCompute() { + auto compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + } // namespace imperative } // namespace paddle #endif diff --git a/paddle/fluid/imperative/bkcl_context.h b/paddle/fluid/imperative/bkcl_context.h index 86e4d97b3c7..652b7689666 100644 --- a/paddle/fluid/imperative/bkcl_context.h +++ b/paddle/fluid/imperative/bkcl_context.h @@ -47,6 +47,8 @@ class BKCLParallelContext : public ParallelContext { void WaitCompute(int ring_id) override; void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; }; } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 7e7c4ceea0b..b91fc460781 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -173,6 +173,12 @@ void NCCLParallelContext::WaitComm(int ring_id) { #endif } +void NCCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + #endif } // namespace imperative diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 292ef1661c3..bcaeb811b10 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -65,6 +65,8 @@ class NCCLParallelContext : public ParallelContext { void WaitComm(int ring_id) override; + void SynchronizeCompute() override; + private: // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] std::vector> compute_events_; diff --git a/paddle/fluid/imperative/parallel_context.h b/paddle/fluid/imperative/parallel_context.h index 9a76311f2ed..f537a316014 100644 --- a/paddle/fluid/imperative/parallel_context.h +++ 
b/paddle/fluid/imperative/parallel_context.h @@ -66,6 +66,9 @@ class ParallelContext { // if CPU, should do nothing. virtual void WaitComm(int ring_id) = 0; + // synchorize compute stream + virtual void SynchronizeCompute() = 0; + inline int GetNRings() const { return strategy_.nrings_; } inline int64_t GetNRanks() const { return strategy_.nranks_; } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 4b18886821b..5422b7ce9c8 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -315,6 +315,12 @@ Reducer::Reducer(const std::vector> &vars, VariableWrapper *grad) { this->AddDistHook(global_var_index); })); var_index_map_[var->GradVarBase()->SharedVar().get()] = global_var_index; } + + // for checking var is ready once + vars_marked_ready_.resize(vars_.size(), false); + + // Initialize local used vars + local_used_vars_.resize(vars_.size(), 0); } void Reducer::InitializeDenseGroups( @@ -323,7 +329,7 @@ void Reducer::InitializeDenseGroups( for (size_t index = 0; index < variable_indices_.size(); ++index) { const auto variable_index = variable_indices_[index]; const auto &var = vars_[variable_index]; - const auto var_name = var->Name(); + const auto &var_name = var->Name(); PADDLE_ENFORCE_EQ(is_sparse_gradient_[variable_index], false, platform::errors::PreconditionNotMet( "Tensor %s's GRAD must be LoDTensor, but received " @@ -334,7 +340,7 @@ void Reducer::InitializeDenseGroups( PADDLE_ENFORCE_EQ(lod_tensor->IsInitialized(), true, platform::errors::PreconditionNotMet( "Tensor %s is not initialized.", var_name)); - auto size = lod_tensor->numel(); + const auto size = lod_tensor->numel(); PADDLE_ENFORCE_GT( size, 0, platform::errors::PreconditionNotMet( "The number of tensor %s's elements is 0.", var_name)); @@ -346,8 +352,8 @@ void Reducer::InitializeDenseGroups( p_group->dense_tensors_.push_back(framework::Tensor()); // check the dtype and place, it must be same. - auto dtype = var->DataType(); - auto place = var->Place(); + const auto &dtype = var->DataType(); + const auto &place = var->Place(); if (index > 0) { PADDLE_ENFORCE_EQ( dtype, p_group->dtype_, @@ -417,8 +423,7 @@ void Reducer::InitializeGroups( group.variable_indices_ = std::move(variable_indices_); groups_.emplace_back(std::move(group)); // Debug Message For Reducer - VLOG(3) << "The Group[" << group_index << "]:"; - VLOG(3) << groups_.back(); + VLOG(3) << "The Group[" << group_index << "]:" << groups_.back(); } } @@ -461,34 +466,38 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { // and allreudce sequence counter(next_group_) will be cleaned up again. void Reducer::PrepareForBackward( const std::vector> &outputs) { - VLOG(3) << "start reseting count.."; + VLOG(3) << "after forward, then reset count for backward."; next_group_ = 0; std::for_each(groups_.begin(), groups_.end(), [](Group &group) { group.pending_ = group.variable_indices_.size(); group.sparse_contents_ = nullptr; }); + // reinitialize vars_marked_ready_ for next iteration + vars_marked_ready_.clear(); + vars_marked_ready_.resize(vars_.size(), false); + PADDLE_ENFORCE_EQ( - all_group_ready_, false, + groups_need_finalize_, false, platform::errors::PreconditionNotMet( - "Please note that all forward outputs derived from the module " + "A serious error has occurred here. There may be several reasons: " + "1) Please note that all forward outputs derived from the module " "parameters must participate in the calculation of losses and " "subsequent gradient calculations. 
If not, the wrapper will hang, " "waiting for autograd to generate gradients for these parameters. " "you can use detach or stop_gradient to make the unused parameters " - "detached from the autograd graph.")); + "detached from the autograd graph. " + "2) Used multiple forwards and one backward. You may be able to wrap " + "multiple forwards in a model.")); // The first var to trigger the unused parameter has_marked_unused_vars_ = false; + unused_vars_.clear(); + if (!find_unused_vars_) { return; } - // TODO(shenliang03) "find_unused_vars" interface will be exposed in the - // future to handle control flow to process unused parameters - find_unused_vars_ = false; - - unused_vars_.clear(); node_deps_.clear(); std::queue> q; std::unordered_set var_visited; @@ -551,6 +560,23 @@ void Reducer::PrepareForBackward( << "] is not used"; } } + + if (unused_vars_.empty()) { + LOG_FIRST_N(WARNING, 1) + << "All parameters are involved in the backward pass. " + "It is recommended to set find_unused_parameters to False " + "to improve performance. However, if unused parameters " + "appear in subsequent iterative training, then an error " + "will occur. Please make it clear that in the subsequent " + "training, there will be no parameters that are not used " + "in the backward pass, and then set find_unused_parameters"; + } else if (unused_vars_.size() == vars_.size()) { + LOG_FIRST_N(WARNING, 1) + << "There is no parameter in the device involved " + "in the backward calculation. If there are " + "parameters on other devices involved in the " + "backward, then a serious error will occur here."; + } } // Add hook function to each leaf node. When the gradient of a leaf node is @@ -563,67 +589,133 @@ void Reducer::PrepareForBackward( // concat + allreduce + split is emitted in turn according to next_group_. // 3, FinalizeBackward: after the end, synchronize each stream. void Reducer::AddDistHook(size_t var_index) { + PADDLE_ENFORCE_LT(var_index, variable_locators_.size(), + platform::errors::OutOfRange( + "Out of bounds variable index. it must be less" + "than %d, but it is %d", + variable_locators_.size(), var_index)); + VLOG(3) << "Var[" << var_index << "] [" << vars_[var_index]->GradVarBase()->Name() << "] arrived and triggered disthook"; - if (!has_marked_unused_vars_) { - has_marked_unused_vars_ = true; - for (auto unused_index : unused_vars_) { - if (NeedRebuildGroup()) { - rebuild_vars_.push_back(vars_[unused_index]); - rebuild_var_indices_.push_back(unused_index); - } - MarkVarReady(unused_index, false); - } - } + local_used_vars_[var_index] = 1; + + // rebuild group when find_unused_vars_ is false if (NeedRebuildGroup()) { rebuild_vars_.push_back(vars_[var_index]); rebuild_var_indices_.push_back(var_index); } + + if (!has_marked_unused_vars_ && find_unused_vars_) { + has_marked_unused_vars_ = true; + for (const auto &unused_index : unused_vars_) { + MarkVarReady(unused_index, false); + } + } + MarkVarReady(var_index, true); } void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { - all_group_ready_ = true; + groups_need_finalize_ = true; + const auto &var_locator = variable_locators_[var_index]; - auto group_index = var_locator.group_index; + const auto group_index = var_locator.group_index; auto &group = groups_[group_index]; + // error happened, if the var is ready before. + if (vars_marked_ready_[var_index]) { + auto error_info = string::Sprintf( + "Error happened, when parameter[%d][%s] has been ready before. 
" + "There may be several reasons for this error: " + "1) In multiple reentrant backward phase, some parameters are reused." + "2) Using model parameters outside of forward function. Please " + "make sure that model parameters are not shared in concurrent " + "forward-backward passes.", + var_index, vars_[var_index]->GradVarBase()->Name()); + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, false, + platform::errors::PreconditionNotMet(error_info)); + + error_info += + "3) Unused parameters retrieval is incorrect. " + "The return value of forward will be used to retrieve" + " the unused parameters of the entire model. These " + "gradients of unused parameters will not be synchronized " + "between multiple cards. However, if the unused " + "parameters participate in the backward calculation " + "again at a later time (e.g. after the forward function, " + "the loss calculation uses the unused " + "paramters of the forward and trigger backward), " + "its gradient will be wrong."; + + PADDLE_ENFORCE_EQ(has_marked_unused_vars_, true, + platform::errors::PreconditionNotMet(error_info)); + } else { + vars_marked_ready_[var_index] = true; + } + if (!group.is_sparse_) { // process dense group - auto inside_group_index = var_locator.inside_group_index; - auto length = group.length_[inside_group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto length = group.length_[inside_group_index]; auto &group_tensor = group.dense_tensors_[inside_group_index]; + if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - auto tensor = - var_warpper->MutableVar()->GetMutable(); + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = var_base->MutableVar()->GetMutable(); group_tensor.ShareDataWith(*tensor).Resize( {static_cast(length)}); } else { + // TODO(shenliang03): maybe save the memory + // by avoiding tensor construction if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, group.dtype_); + } + #ifdef PADDLE_WITH_XPU_BKCL - if (platform::is_xpu_place(group_tensor.place())) { - // TODO(liuyuhui) support XPU set constant - VLOG(3) << "XPU doesn't support set_constant"; - } + if (platform::is_xpu_place(group_tensor.place())) { + // TODO(liuyuhui) support XPU set constant + VLOG(3) << "XPU doesn't support set_constant"; + } #else - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + if (HasGrad(var_index)) { + auto var_base = vars_[var_index]->GradVarBase(); + auto tensor = + var_base->MutableVar()->GetMutable(); + TensorCopy(*tensor, place_, *dev_ctx, &group_tensor); + group_tensor.Resize({static_cast(length)}); + } else { + group_tensor.Resize({static_cast(length)}); operators::math::set_constant(*dev_ctx, &group_tensor, 0.0); -#endif } +#endif } } else { // process sparse group - if (is_used_var) { - auto var_warpper = vars_[var_index]->GradVarBase()->SharedVar(); - group.sparse_contents_ = var_warpper->MutableVar(); - } else { - group.sparse_contents_ = nullptr; - } + PADDLE_ENFORCE_EQ(HasGrad(var_index), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a gradient", + var_index, vars_[var_index]->Name())); + auto var_base = vars_[var_index]->GradVarBase(); + // need to check tensor type + PADDLE_ENFORCE_EQ( + var_base->Var().IsType(), true, + platform::errors::PreconditionNotMet( + "The sparse parameter[%d][%s] must have a selectedrows gradient. 
" + "Before forward pass, the parameter type is inferred to be " + "SelectedRows, but after backward pass, its actual type becomes " + "LodTensor. It is currently not supported by DataParallel. " + "For example, if sparse embedding is used, and the weight of " + "embedding is shared with subsequent dense parameters, then " + "the parameter gradient of the embedding will be converted " + "to dense parameters.", + var_index, vars_[var_index]->Name())); + + group.sparse_contents_ = var_base->MutableVar(); } if (--group.pending_ == 0) { @@ -639,6 +731,14 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { // TODO(liuyuhui): If BKCL support non-blocking communication, it should be // fixed as same as multi gpus card trainging. void Reducer::MarkGroupReady(size_t group_index) { + PADDLE_ENFORCE_GE( + group_index, next_group_, + platform::errors::PreconditionNotMet( + "The index of the incoming group must be greater " + "than or equal to the previously synchronized group index, " + "expect it to greater than or equal to %d, but got %d.", + next_group_, group_index)); + if (group_index > next_group_) { VLOG(3) << "It will adjust the order of group in next batch automatically"; return; @@ -647,7 +747,7 @@ void Reducer::MarkGroupReady(size_t group_index) { for (; next_group_ < groups_.size() && groups_[next_group_].pending_ == 0; ++next_group_) { auto &group = groups_[next_group_]; - int run_order = next_group_ % nrings_; + const int run_order = next_group_ % nrings_; // For CUDA or XPU, compute_stream --> comm_stream. // For CPU, do nothing. @@ -666,7 +766,7 @@ void Reducer::MarkGroupReady(size_t group_index) { comm_pool_->enqueue([&] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock @@ -674,7 +774,7 @@ void Reducer::MarkGroupReady(size_t group_index) { } }); #elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) - FusedAllReduceSchedule(run_order, group); + FusedAllReduceSchedule(run_order, group, next_group_); #else PADDLE_THROW(platform::errors::PreconditionNotMet( "Not compiled with BKCL or NCCL.")); @@ -682,24 +782,23 @@ void Reducer::MarkGroupReady(size_t group_index) { } } -void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { +void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, + const int curr_group_index) { + // The overall timeline: concat > div_nranks > allreduce > split + // dev_context is used to select different stream + const auto &dev_context = *parallel_ctx_->GetDeviceContext(run_order); if (group.is_sparse_) { - if (group.sparse_contents_ != nullptr) { - VLOG(3) << "sparse group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); - parallel_ctx_->AllReduceByStream( - *group.sparse_contents_, group.sparse_contents_, run_order, false); - } else { - VLOG(3) << "The sparse group[" << next_group_ - << "] has no var to allreduce"; - } + VLOG(3) << "sparse group [" << curr_group_index + << "] start allreduce in ring[" << run_order << "]"; + group.DivNRanks(dev_context, nranks_); + parallel_ctx_->AllReduceByStream(*group.sparse_contents_, + group.sparse_contents_, run_order, false); } else { - VLOG(3) << "dense group [" << next_group_ << "] start allreduce in ring[" - << run_order << "]"; + VLOG(3) << "dense group [" 
<< curr_group_index + << "] start allreduce in ring[" << run_order << "]"; // Select common commstream to concat tensors // group.dense_tensors ---> group.dense_contents_ - group.ConcatTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.ConcatTensors(dev_context); // NOTE(liuyuhui): ConcatTensors use communication stream, but BKCL only support // default stream for communicating, so there exist some problems in @@ -711,15 +810,15 @@ void Reducer::FusedAllReduceSchedule(int run_order, Group &group) { parallel_ctx_->WaitComm(run_order); } #endif - group.DivNRanks(*parallel_ctx_->GetDeviceContext(run_order), nranks_); + group.DivNRanks(dev_context, nranks_); // Start allreduce parallel_ctx_->AllReduceByStream( group.dense_contents_, &(group.dense_contents_), run_order, false); - // Select common commstream to split tensors + // Select communication stream to split tensors // group.dense_contents_ ---> group.dense_tensors - group.SplitTensors(*parallel_ctx_->GetDeviceContext(run_order)); + group.SplitTensors(dev_context); } } @@ -745,14 +844,98 @@ std::vector> Reducer::RebuildGruops() { return rebuild_group_indices; } +void Reducer::ProcessUnusedDenseVars() { + // The calculation stream must be used here to + // avoid conflicts with communication. + VLOG(3) << "Local used vars : " + << string::join_strings(local_used_vars_, ','); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + // H2D is to allreduce the local_used_vars_ + auto *global_used_tensor = + global_used_vars_.GetMutable(); + framework::TensorFromVector(local_used_vars_, *dev_ctx, + global_used_tensor); + parallel_ctx_->AllReduceByStream(global_used_vars_, &global_used_vars_, 0, + true); + framework::TensorToVector(*global_used_tensor, *dev_ctx, + &local_used_vars_); + + // sync compute stream to get global used var message, + // but maybe affect speed performance + parallel_ctx_->SynchronizeCompute(); + VLOG(3) << "Global used vars : " + << string::join_strings(local_used_vars_, ','); + + for (const auto var_index : unused_vars_) { + const bool global_unused = (local_used_vars_[var_index] == 0); + + // global used but local unused, set grad + VLOG(3) << "Var [" << var_index << "] [" << vars_[var_index]->Name() + << "] global_unused:" << global_unused + << " has grad: " << HasGrad(var_index); + + if (!global_unused) { + VLOG(3) << "Start process unused Var"; + // 1. source var base + const auto &var_locator = variable_locators_[var_index]; + const auto group_index = var_locator.group_index; + const auto &group = groups_[group_index]; + const auto inside_group_index = var_locator.inside_group_index; + const auto &src_tensor = group.dense_tensors_[inside_group_index]; + // sparse no need to check and no support find_unused_parameters + if (group.is_sparse_) { + continue; + } + // 2. destination var base + auto dest_var_base = vars_[var_index]; + auto *dest_tensor = + dest_var_base->MutableVar()->GetMutable(); + const auto &dest_dims = dest_tensor->dims(); + + // 3. create grad var base or get grad var base + auto grad_var_base_tmp = dest_var_base->MutableGradVarBase(); + + // 4. 
set grad tensor + auto *dest_grad_tensor = + grad_var_base_tmp->MutableVar()->GetMutable(); + const auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_); + TensorCopy(src_tensor, place_, *dev_ctx, dest_grad_tensor); + dest_grad_tensor->Resize(dest_dims); + } + } +} + +bool Reducer::HasGrad(size_t var_index) { + const auto grad_var = vars_[var_index]->GradVarBase(); + if (!grad_var || !grad_var->Var().IsInitialized()) { + return false; + } + + const auto &var = grad_var->Var(); + if (var.IsType()) { + if (var.Get().IsInitialized()) { + return true; + } + } else if (var.IsType()) { + if (var.Get().value().IsInitialized()) { + return true; + } + } else { + PADDLE_THROW(platform::errors::PermissionDenied( + "Only support LoDTensor and SelectedRows for gradient var")); + } + return false; +} + void Reducer::FinalizeBackward() { - all_group_ready_ = false; + groups_need_finalize_ = false; #ifdef PADDLE_WITH_XPU_BKCL { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return comm_op_count_ == 0; }); } #endif + // Must prevent compute_stream_ starting until all comm streams have finished for (int i = 0; i < nrings_; ++i) { parallel_ctx_->WaitComm(i); @@ -765,7 +948,18 @@ void Reducer::FinalizeBackward() { InitializeGroups(group_indices_); } - VLOG(3) << "In the batch, Reducer is finished..."; + if (find_unused_vars_) { +// TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + ProcessUnusedDenseVars(); +#endif + // Initialize local used vars + local_used_vars_.clear(); + local_used_vars_.resize(vars_.size(), 0); + VLOG(3) << "ProcessUnusedDenseVars is finished."; + } + + VLOG(3) << "In the batch, Reducer is finished."; } // According to the size of each parameter, it is allocated to different groups. diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index b2680d0dea7..0d613dbea89 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" @@ -153,13 +154,20 @@ class Reducer { void MarkGroupReady(size_t group_index); - void FusedAllReduceSchedule(int run_order, Group& group); // NOLINT + void FusedAllReduceSchedule(const int run_order, Group& group, // NOLINT + const int curr_group_index); void FinalizeBackward(); std::vector> RebuildGruops(); - inline bool NeedRebuildGroup() { return !has_rebuilt_group_; } + inline bool NeedRebuildGroup() { + return !has_rebuilt_group_ && !find_unused_vars_; + } + + void ProcessUnusedDenseVars(); + + bool HasGrad(size_t var_index); private: std::vector> vars_; @@ -188,7 +196,7 @@ class Reducer { std::vector unused_vars_; bool has_marked_unused_vars_{false}; bool find_unused_vars_{false}; - bool all_group_ready_{false}; + bool groups_need_finalize_{false}; #ifdef PADDLE_WITH_XPU_BKCL // comm_pool_ is used for scheduling allreduce in multi Kunlun cards training. std::unique_ptr<::ThreadPool> comm_pool_{nullptr}; @@ -196,6 +204,19 @@ class Reducer { std::mutex mutex_; std::condition_variable cv_; #endif + + // it just for checking hook, each parameter can only trigger one hook + std::vector vars_marked_ready_; + + // Following variables are to help control flow. 
+ // local_used_vars_ uses 0/1 to indicate whether the + // var is used in iteration. After the end of the + // iteration, global_used_vars_ is obtained synchronously + // globally. Choose whether to update the local + // gradient according to the global_used_vars_. + std::vector local_used_vars_; + // global_used_vars_ is used in comm stream to avoid wait + framework::Variable global_used_vars_; }; std::vector> AssignGroupBySize( diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index f79013d7347..626f6a37a98 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -620,6 +620,34 @@ class DistributedStrategy(object): else: raise ValueError("last_comm_group_size_MB should be greater than 0") + @property + def find_unused_parameters(self): + """ + Indicating whether we are using find_unused_parameters to + find unused parameters in DataParallel. + + Default value: True + + Examples: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.find_unused_parameters = True + """ + + return self.strategy.find_unused_parameters + + @find_unused_parameters.setter + @is_strict_auto + def find_unused_parameters(self, flag): + if isinstance(flag, bool): + self.strategy.find_unused_parameters = flag + else: + print( + "WARNING: find_unused_parameters should have value of bool type") + @property def _fuse_grad_size_in_TFLOPS(self): return self.strategy.fuse_grad_size_in_TFLOPS diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index cf802034cab..470d1a2b78f 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -706,7 +706,9 @@ class Fleet(object): model, comm_buffer_size=self._user_defined_strategy.fuse_grad_size_in_MB, last_comm_buffer_size=self._user_defined_strategy. - last_comm_group_size_MB) + last_comm_group_size_MB, + find_unused_parameters=self._user_defined_strategy. + find_unused_parameters) return self.model @dygraph_only diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index b157ce81d82..3df0c608527 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -22,6 +22,7 @@ import copy import weakref import warnings from copy import deepcopy +import paddle from . import parallel_helper from .. import unique_name @@ -894,9 +895,15 @@ class Layer(core.Layer): if not self._built: with program_desc_tracing_guard(False): self._build_once(*inputs, **kwargs) - if parallel_helper._is_data_parallel_mode(): + + # TODO(liuyuhui) Only xpu broadcast parameters here. + # The other device is to call _sync_params_buffers in DataParallel + # to realize the parameter synchronization among multiply cards. 
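The new find_unused_parameters option flows from DistributedStrategy through fleet.distributed_model into DataParallel. A minimal sketch of that end-to-end wiring (illustrative only, not part of the patch; it assumes the script is launched on multiple GPUs, e.g. via paddle.distributed.launch):

    import paddle
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    # every parameter is known to reach the loss, so skip the unused-parameter scan
    strategy.find_unused_parameters = False

    fleet.init(is_collective=True, strategy=strategy)
    layer = paddle.nn.Linear(10, 10)
    optimizer = paddle.optimizer.SGD(learning_rate=0.01,
                                     parameters=layer.parameters())
    # the strategy value is forwarded to DataParallel inside distributed_model
    dp_layer = fleet.distributed_model(layer)
    optimizer = fleet.distributed_optimizer(optimizer)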
+ if parallel_helper._is_data_parallel_mode( + ) and paddle.is_compiled_with_xpu(): parallel_helper._broadcast_parameters( self._parameters.values()) + self._built = True outputs = self.forward(*inputs, **kwargs) diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 2ef72f6c5aa..b80621e21f1 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -24,6 +24,7 @@ from paddle.fluid.dygraph import layers from paddle.fluid.dygraph import parallel_helper from paddle.fluid.dygraph import to_variable, no_grad from paddle.utils import deprecated +from ..layers import collective import warnings import paddle import itertools @@ -348,6 +349,18 @@ class DataParallel(layers.Layer): last_comm_buffer_size(float, optional): It limits memory size(MB) of last buffer in communication calling. Making the last communication buffer size small is useful to improve performance. Default: 1. + find_unused_parameters(bool, optional): Whether to traverse the entire backward graph from the + all tensors in the return value of the wrapped model's + forward function. For parameters not involved in loss + calculation, their gradients will be marked as ready in + advance to prepare reduce. Please note that all forward + outputs derived from the wrapped model parameters must + participate in the calculation of loss and subsequent + gradient calculations. If not, serious error will occur. + Note that setting the find_unused_parameters to True + will affect computing performance. Therefore, if all parameters + are sure to participate in the loss calculation and the + autograd graph construction, please set it False. Default: True. Returns: Layer: The data paralleled module. @@ -403,11 +416,13 @@ class DataParallel(layers.Layer): layers, strategy=None, comm_buffer_size=25, - last_comm_buffer_size=1): + last_comm_buffer_size=1, + find_unused_parameters=True): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers + self.find_unused_parameters = find_unused_parameters # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by @@ -419,6 +434,17 @@ class DataParallel(layers.Layer): self._strategy = _build_default_parallel_strategy() if self._strategy.nranks > 1: + # check the environment + assert parallel_helper.__parallel_ctx__clz__ is not None, \ + "ParallelContext must be initialized before. You should use init_parallel_env() before" \ + "constructing the DataParallel." + + # sync buffer and params + # TODO(liuyuhui) Currently not support xpu. xpu is + # still broadcasting parameters when calling layer + if not paddle.is_compiled_with_xpu(): + self._sync_params_buffers() + self.comm_buffer_size = int(comm_buffer_size * 1024 * 1024) # NOTE(shenliang03): We can set environment variables to control # the size of the group, Default: 1MB. The role of this small group is: @@ -449,6 +475,10 @@ class DataParallel(layers.Layer): trainable_parameters = [param for _, param in layers_param] + assert len(trainable_parameters) > 0, \ + "This model does not have any parameters to train, and " \ + "does not need to use DataParallel" + # NOTE(shenliang03): Here we can only use the attributes to judge whether # parameter is sparse(or SelectedRows). The reason is that the sparse message # can't be obtained when bp hasn't happened yet. 
So if layer supports sparse parameter, @@ -470,19 +500,12 @@ class DataParallel(layers.Layer): trainable_parameters, is_sparse_gradient, [self.last_comm_buffer_size, self.comm_buffer_size]) - assert parallel_helper.__parallel_ctx__clz__ is not None, \ - "ParallelContext must be initialized before. You should use init_parallel_env() before" \ - "constructing the DataParallel." - - # TODO(shenliang03) "find_unused_vars" interface will be exposed in the future - # to handle control flow to process unused parameters - find_unused_vars = True self._reducer = core.Reducer( trainable_parameters, list(reversed(self.group_indices)), is_sparse_gradient, parallel_helper.__parallel_ctx__clz__, [self.last_comm_buffer_size, self.comm_buffer_size], - find_unused_vars) + self.find_unused_parameters) def _find_varbase(self, obj): if isinstance(obj, core.VarBase): @@ -493,11 +516,54 @@ class DataParallel(layers.Layer): return itertools.chain(*map(self._find_varbase, obj.values())) return [] + def _sync_params_buffers(self): + model_vars = [] + for _, param in self._layers.state_dict().items(): + if not isinstance(param, core.VarBase): + raise TypeError("The data type of '%s' must be Varbase" % + param.name) + model_vars.append(param.detach()) + if len(model_vars) == 0: + return + + mega_bytes = 128 * 1024 * 1024 + group_idx = 0 + memory_counter = 0 + var_groups = OrderedDict() + dtype = model_vars[0].dtype + + for var in model_vars: + bytes = np.prod(var.shape) * core.size_of_dtype(var.dtype) + if memory_counter < mega_bytes and dtype == var.dtype: + memory_counter += bytes + else: + memory_counter = 0 + dtype = var.dtype + group_idx += 1 + var_groups.setdefault(group_idx, []).append(var) + + coalesced_vars = _coalesce_tensors(var_groups) + + for coalesced_var, _, _ in coalesced_vars: + collective._broadcast(coalesced_var, root=0, sync_mode=True) + + for coalesced_var, origin_vars, var_shapes in coalesced_vars: + var_len = [np.prod(v_shape) for v_shape in var_shapes] + framework._dygraph_tracer().trace_op( + type='split', + inputs={'X': coalesced_var}, + outputs={'Out': origin_vars}, + attrs={'sections': var_len, + 'axis': 0}) + def forward(self, *inputs, **kwargs): outputs = self._layers(*inputs, **kwargs) - if self._strategy.nranks > 1: - self._reducer.prepare_for_backward( - list(self._find_varbase(outputs))) + if self._strategy.nranks > 1 and framework._dygraph_tracer()._has_grad: + if self.find_unused_parameters: + self._reducer.prepare_for_backward( + list(self._find_varbase(outputs))) + else: + self._reducer.prepare_for_backward(list(self._find_varbase([]))) return outputs diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0abb61d95aa..28f5177c204 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -19,6 +19,8 @@ list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_control_flow) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_dataparallel) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
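For the plain dygraph entry point the same switch is exposed directly on paddle.DataParallel. A short sketch of one training step with the scan disabled (illustrative; assumes init_parallel_env has been called on every trainer):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    layer = paddle.nn.Linear(10, 1)
    # all parameters contribute to the loss below, so the cheaper path is safe
    dp_layer = paddle.DataParallel(layer, find_unused_parameters=False)

    data = paddle.randn([4, 10], 'float32')
    loss = dp_layer(data).mean()
    loss.backward()    # gradients are fused per group and all-reduced here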
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -160,6 +162,8 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_control_flow) + list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_dataparallel) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) elseif(WITH_GPU) @@ -824,10 +828,12 @@ set_tests_properties(test_dataloader_unkeep_order PROPERTIES TIMEOUT 120) set_tests_properties(test_reader_reset PROPERTIES TIMEOUT 120) set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120) if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) + set_tests_properties(test_parallel_dygraph_dataparallel PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_control_flow PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) - set_tests_properties(test_parallel_dygraph_unused_variables PROPERTIES TIMEOUT 120) endif() endif() if((WITH_ROCM OR WITH_GPU) AND NOT WIN32) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py new file mode 100644 index 00000000000..26c9944abd6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_different.py @@ -0,0 +1,122 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
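Since _sync_params_buffers now broadcasts rank 0's parameters and buffers when the wrapper is constructed (XPU still relies on the broadcast in Layer.__call__ for the time being), wrapping a freshly initialized layer is enough to start all GPU trainers from identical weights; a short sketch:

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    layer = paddle.nn.Linear(10, 5)        # each rank starts from its own random init
    dp_layer = paddle.DataParallel(layer)  # after this, every rank holds rank 0's values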
+ +from __future__ import print_function + +import numpy as np +import paddle.distributed as dist + +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Embedding +import paddle.nn.functional as F +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +paddle.seed(123) +np.random.seed(2021) + + +class SimpleNet(fluid.Layer): + def __init__(self, hidden_size, vocab_size, is_sparse=False): + super(SimpleNet, self).__init__() + self.hidden_size = hidden_size + self.vocab_size = vocab_size + self.embedding = Embedding( + size=[self.vocab_size, self.hidden_size], + dtype='float32', + is_sparse=is_sparse) + + self.lin_a = paddle.nn.Linear(self.hidden_size, self.vocab_size) + self.lin_b = paddle.nn.Linear(self.vocab_size, 1) + + self.unused_net = paddle.nn.Linear(5, 3) + self.phony = self.create_parameter(shape=[1], dtype="float32") + + def forward(self, input, label, conf): + x_emb = self.embedding(input) + fc = self.lin_a(x_emb) + mask = conf > 0 + mask = paddle.cast(mask, dtype="int64") + mask.stop_gradient = True + emb_mask = mask.max(1).flatten() + emb_mask_inds = paddle.nonzero(emb_mask > 0).flatten() + emb_mask_inds.stop_gradient = True + + if emb_mask_inds.numel() == 0: + loss_box = self.phony * 0 + else: + projection = self.lin_b(fc) + projection = paddle.reshape(projection, shape=[-1, 1]) + output = paddle.gather(projection, emb_mask_inds) + target = paddle.gather(label, emb_mask_inds) + loss_box = F.smooth_l1_loss( + output, target, reduction='sum', delta=1.0) + loss_box = loss_box / len(conf) + + return loss_box + + +# global configs +batch_size = 4 +batch_num = 2000 +hidden_size = 5 +vocab_size = 100 + +conf_dataset = [[0], [0], [0], [0], [1], [0], [1], [0], [0], [1], [0], [1], + [1], [1], [1], [1], [1], [1], [1], [1], [1], [0], [0], [1]] + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.randint(0, vocab_size) + y_data = np.random.random_sample((1, )).astype('float32') + conf_data = np.array(conf_dataset[i % len(conf_dataset)]).astype( + 'int64') + yield x_data, y_data, conf_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet( + hidden_size=hidden_size, vocab_size=vocab_size, is_sparse=False) + + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x[0] for x in batch]).astype('int64') + y_data = np.array([x[1] for x in batch]).astype('float32') + conf_data = np.array([x[2] for x in batch]).astype('int64') + x_data = x_data.reshape((-1, 1)) + y_data = y_data.reshape((-1, 1)) + conf_data = conf_data.reshape((-1, 1)) + + x = paddle.to_tensor(x_data) + y = paddle.to_tensor(y_data) + conf = paddle.to_tensor(conf_data) + + loss = model(x, y, conf) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py new file mode 100644 index 00000000000..3157d5e4129 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_control_flow_same.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
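The phony * 0 trick in parallel_dygraph_control_flow_different.py above is what keeps the backward graph non-empty on ranks where no sample passes the mask. The pattern, condensed (class and variable names here are illustrative, not part of the patch):

    import paddle
    import paddle.nn.functional as F

    class Head(paddle.nn.Layer):
        def __init__(self):
            super(Head, self).__init__()
            self.lin = paddle.nn.Linear(5, 1)
            self.phony = self.create_parameter(shape=[1], dtype='float32')

        def loss(self, feats, target, selected_inds):
            if selected_inds.numel() == 0:
                # nothing selected on this rank: return a value that still
                # depends on a parameter so backward stays well defined
                return self.phony * 0
            output = paddle.gather(self.lin(feats), selected_inds)
            target = paddle.gather(target, selected_inds)
            return F.smooth_l1_loss(output, target, reduction='sum')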
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Linear +from paddle.fluid.dygraph.base import to_variable + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + +batch_size = 4 +batch_num = 1000 + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_b = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_unused = Linear(10, 20) + self.step = 0 + + def forward(self, x): + if self.step % 2 == 0: + return self.net_a(x) + else: + return self.net_b(x) + + self.step = self.step + 1 + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = to_variable(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py new file mode 100644 index 00000000000..0d2631fa108 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_gradient_check.py @@ -0,0 +1,136 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
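In parallel_dygraph_control_flow_same.py above, net_unused never appears in forward, so under the default find_unused_parameters=True its parameters are simply marked ready without a local gradient. A quick way to see which parameters finished a step without gradients (a sketch, run after loss.backward() on the wrapped model):

    for name, param in dp_layer.named_parameters():
        if param._grad_ivar() is None:
            print(name, 'received no local gradient this step')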
+ +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear + +paddle.seed(1024) +np.random.seed(2021) + +batch = 5 +in_dim = 10 +out_dim = 20 + + +class SimpleNet(fluid.Layer): + def __init__(self, train_id): + super(SimpleNet, self).__init__() + self.w1 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.w2 = self.create_parameter( + shape=[in_dim, out_dim], dtype="float32") + self.share_net = Linear(out_dim, 10) + + self.unused_param = self.create_parameter( + shape=[out_dim, in_dim], dtype="float64") + + # just for test sync_params_buffers + self.register_buffer("queue", paddle.randn([10, 5])) + self.queue = paddle.nn.functional.normalize(self.queue, axis=0) + self.register_buffer("queue_ptr", paddle.zeros([1], 'int64')) + + self.trainer_id = train_id + + def forward(self, x): + is_use = (paddle.equal_all( + x, paddle.ones(shape=(batch, in_dim))).numpy()[0] and + self.trainer_id == 1) + + if is_use: + tmp = paddle.matmul(x, self.w1) + else: + tmp = paddle.matmul(x, self.w2) + + return self.share_net(tmp) + + +class TestDistTraning(unittest.TestCase): + def test_multiple_gpus(self): + dist.init_parallel_env() + self.trainer_id = dist.get_rank() + + model_a = SimpleNet(self.trainer_id) + model_b = SimpleNet(self.trainer_id) + + state_dict = model_a.state_dict() + model_b.set_state_dict(state_dict) + + model_a = paddle.DataParallel(model_a) + model_b = paddle.DataParallel(model_b) + + ones_input = paddle.ones(shape=(batch, in_dim)) + ones_input.stop_gradient = True + + w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32') + + for step_id in range(5): + random_input = paddle.rand(shape=(batch, in_dim)) + random_input.stop_gradient = True + + if step_id % 2 == 0: + out_a = model_a(random_input) + out_b = model_b(random_input) + else: + out_a = model_a(ones_input) + out_b = model_b(ones_input) + + out_a.sum().backward() + out_b.sum().backward() + + self.check_gradient(model_a.parameters()) + self.check_gradient(model_b.parameters()) + + # test acc gradient + w1_grad_sum = self.check_acc(model_a._layers.w1.grad, w1_grad_sum, + model_b._layers.w1.grad) + w2_grad_sum = self.check_acc(model_a._layers.w2.grad, w2_grad_sum, + model_b._layers.w2.grad) + + model_a.clear_gradients() + + def check_acc(self, grad, grad_sum, acc_grad): + if grad is not None: + grad_sum = grad_sum + grad + np.testing.assert_allclose(grad_sum, acc_grad, rtol=1e-6) + return grad_sum + + def print_trainer_0(self, *args): + if self.trainer_id == 0: + print(*args) + + def broadcast_param(self, param, root): + paddle.distributed.broadcast(param, root) + return param + + def check_gradient(self, params): + other_param = [] + for param in params: + if param.trainable and (param._grad_ivar() is not None): + grad = param._grad_ivar() + other_grad = self.broadcast_param(grad.clone(), root=1) + if self.trainer_id == 0: + np.testing.assert_allclose(other_grad.numpy(), grad.numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py new file mode 100644 index 00000000000..fc0246a9720 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_none_var.py @@ -0,0 +1,80 @@ +# Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import contextlib +import unittest +import numpy as np + +import paddle +import paddle.fluid as fluid +import paddle.fluid.dygraph as dygraph +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Linear + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + +batch_size = 4 +batch_num = 1000 + + +class SimpleNet(fluid.Layer): + def __init__(self): + super(SimpleNet, self).__init__() + self.net_a = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.net_b = paddle.nn.Sequential( + paddle.nn.Linear(10, 20), + paddle.nn.Linear(20, 20), paddle.nn.Linear(20, 5)) + self.step = 0 + + def forward(self, x): + return paddle.to_tensor(0.0, dtype='float32') + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = paddle.to_tensor(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py new file mode 100644 index 00000000000..facac33e4c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_shared_unused_var.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
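The check_gradient / broadcast_param pair in parallel_dygraph_gradient_check.py above boils down to broadcasting one rank's gradient and asserting that the local copy matches it. Condensed into a single helper (a sketch, not part of the patch):

    import numpy as np
    import paddle.distributed as dist

    def assert_grad_synced(param):
        grad = param._grad_ivar()          # gradient tensor held by this rank
        if grad is None:
            return
        remote = grad.clone()
        dist.broadcast(remote, src=1)      # take rank 1's gradient as reference
        if dist.get_rank() == 0:
            np.testing.assert_allclose(remote.numpy(), grad.numpy(), rtol=1e-6)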
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, Linear +from paddle.fluid.dygraph.base import to_variable +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +np.random.seed(2021) +paddle.seed(1024) + + +class SimpleNet(fluid.Layer): + def __init__(self): + # bias is unused parameters, and it share with net_a + super(SimpleNet, self).__init__() + self.net_a = Linear(input_dim=10, output_dim=5) + self.net_b = Linear(10, 10) + self.bias = self.net_a.bias + + def forward(self, x): + return self.net_b(x) + + +batch_size = 4 +batch_num = 1000 + + +def fake_sample_reader(): + def __reader__(): + for i in range(batch_num): + x_data = np.random.random_sample((10, )).astype('float32') + yield x_data + + return __reader__ + + +class TestSimpleNet(TestParallelDyGraphRunnerBase): + def get_model(self): + model = SimpleNet() + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.001, + parameters=model.parameters()) + return model, train_reader, optimizer + + def run_one_loop(self, model, optimizer, batch): + x_data = np.array([x for x in batch]) + x_data = x_data.reshape((-1, 10)) + x = to_variable(x_data) + out = model(x) + loss = out.sum() / len(batch) + return loss + + +if __name__ == "__main__": + runtime_main(TestSimpleNet) diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py index 65c242a7023..a15b263a295 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_fp64.py @@ -65,8 +65,6 @@ class SimpleNet(Layer): def forward(self, input, label): x_emb = self.embedding(input) fc = paddle.matmul(x_emb, self.softmax_weight) - # use detach to stop gradient - fc = fc.detach() fc = paddle.add(fc, self.softmax_bias) projection = paddle.reshape(fc, shape=[-1, self.vocab_size]) loss = paddle.nn.functional.softmax_with_cross_entropy( diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py index 1884eef15e9..9f877381101 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_unused_variables.py @@ -37,7 +37,7 @@ class SimpleNet(Layer): self.embedding = Embedding( self.vocab_size, self.hidden_size, - sparse=True, + sparse=is_sparse, weight_attr=paddle.ParamAttr( name='embedding_param', initializer=paddle.nn.initializer.Uniform( @@ -105,7 +105,7 @@ class TestSparseEmbeddingUnusedVars(TestParallelDyGraphRunnerBase): vocab_size=vocab_size, num_steps=num_steps, init_scale=init_scale, - is_sparse=True) + is_sparse=False) train_reader = paddle.batch( fake_sample_reader(), batch_size=batch_size, drop_last=True) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index d73698e7e02..fa5ce283985 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -501,7 +501,12 @@ class TestParallelDyGraphRunnerBase(object): type(self).__name__, "begin to prepare context in dygraph with 
nccl2") dygraph.parallel.prepare_context(strategy) - model = dygraph.parallel.DataParallel(model, strategy) + if not args.find_unused_parameters: + model = dygraph.parallel.DataParallel( + model, strategy, find_unused_parameters=False) + else: + model = dygraph.parallel.DataParallel( + model, strategy, find_unused_parameters=True) print_to_err(type(self).__name__, "model built in dygraph") out_losses = [] print_to_err(type(self).__name__, "begin to run dygraph training") @@ -574,9 +579,14 @@ class TestParallelDyGraphRunnerBase(object): # get trainer id args.trainer_id = paddle.distributed.get_rank() + # set strategy + strategy = fleet.DistributedStrategy() + if not args.find_unused_parameters: + strategy.find_unused_parameters = False + # 3. init parallel env if args.update_method == "nccl2" or "bkcl": - fleet.init(is_collective=True) + fleet.init(is_collective=True, strategy=strategy) # 4. train model model, train_reader, opt = self.get_model() @@ -628,6 +638,7 @@ def runtime_main(test_class): parser.add_argument('--use_xpu', action='store_true') parser.add_argument('--use_dgc', action='store_true') parser.add_argument('--accumulate_gradient', action='store_true') + parser.add_argument('--find_unused_parameters', action='store_true') parser.add_argument('--use_reduce', action='store_true') parser.add_argument('--dc_asgd', action='store_true') parser.add_argument('--hogwild', action='store_true') @@ -726,6 +737,7 @@ class TestDistBase(unittest.TestCase): self._save_model = False self._fuse_all_reduce = None self._accumulate_gradient = False + self._find_unused_parameters = True self._setup_config() global DIST_UT_PORT @@ -852,6 +864,9 @@ class TestDistBase(unittest.TestCase): if self._accumulate_gradient: cmd += " --accumulate_gradient" + if self._find_unused_parameters: + cmd += " --find_unused_parameters" + env_local.update(envs) print("local_cmd: {}, env: {}".format(cmd, env_local)) @@ -1021,6 +1036,9 @@ class TestDistBase(unittest.TestCase): if self._accumulate_gradient: tr_cmd += " --accumulate_gradient" + if self._find_unused_parameters: + tr_cmd += " --find_unused_parameters" + if self._pipeline_mode: tr_cmd += " --use_pipeline" if self._mp_mode: diff --git a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py index 31771ddbd68..d843e172763 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_distributed_strategy.py @@ -179,6 +179,15 @@ class TestStrategyConfig(unittest.TestCase): with self.assertRaises(ValueError): strategy.last_comm_group_size_MB = -1 + def test_find_unused_parameters(self): + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.find_unused_parameters = True + self.assertEqual(strategy.find_unused_parameters, True) + strategy.find_unused_parameters = False + self.assertEqual(strategy.find_unused_parameters, False) + strategy.find_unused_parameters = "True" + self.assertEqual(strategy.find_unused_parameters, False) + def test_fuse_grad_size_in_TFLOPS(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy._fuse_grad_size_in_TFLOPS = 0.1 diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py new file mode 100644 index 00000000000..fa571bde5e4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_control_flow.py @@ -0,0 +1,91 @@ +# 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner + +flag_name = os.path.splitext(__file__)[0] + + +class TestDygraphControlFlowSame(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_control_flow_same.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestFleetDygraphControlFlowSame(TestDygraphControlFlowSame): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + +class TestFleetDygraphControlFlowSameAccGrad(TestDygraphControlFlowSame): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._accumulate_gradient = True + + +class TestDygraphControlFlowDiff(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_control_flow_different.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestFleetDygraphControlFlowDiff(TestDygraphControlFlowDiff): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + +class TestFleetDygraphControlFlowDiffAccGrad(TestDygraphControlFlowDiff): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._accumulate_gradient = True + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py new file mode 100644 index 00000000000..1d2a3975190 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -0,0 +1,75 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
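test_parallel_dygraph_dataparallel.py below drives the two-GPU gradient check through its own local launcher built from find_free_ports, get_cluster and start_local_trainers. For ad-hoc runs the same child logic could also be started with paddle.distributed.spawn; a sketch (assumes two visible GPUs):

    import paddle.distributed as dist

    def train():
        dist.init_parallel_env()
        # build the model, wrap it with paddle.DataParallel and run a few
        # steps, as parallel_dygraph_gradient_check.py does

    if __name__ == '__main__':
        dist.spawn(train, nprocs=2)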
+ +from __future__ import print_function + +import unittest +import time +import paddle.fluid as fluid + +from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, get_gpus, start_local_trainers + + +def get_cluster_from_args(selected_gpus): + cluster_node_ips = '127.0.0.1' + node_ip = '127.0.0.1' + + node_ips = [x.strip() for x in cluster_node_ips.split(',')] + + node_ips.index(node_ip) + + free_ports = None + + free_ports = find_free_ports(len(selected_gpus)) + if free_ports is not None: + free_ports = list(free_ports) + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) + + +class TestMultipleGpus(unittest.TestCase): + def run_mnist_2gpu(self, target_file_name): + if not fluid.core.is_compiled_with_cuda( + ) or fluid.core.get_cuda_device_count() == 0: + return + + selected_gpus = get_gpus('0,1') + cluster = None + pod = None + + cluster, pod = get_cluster_from_args(selected_gpus) + + procs = start_local_trainers( + cluster, + pod, + training_script=target_file_name, + training_script_args=[]) + + while True: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) + + if not alive: + print("Local procs complete, POD info:{}".format(pod)) + break + time.sleep(3) + + def test_multiple_gpus_dynamic(self): + self.run_mnist_2gpu('parallel_dygraph_gradient_check.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py index a3a3c5bfe3d..782d2304619 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_mnist.py @@ -73,6 +73,7 @@ class TestParallelDygraphMnistAccGrad(TestDistBase): self._dygraph = True self._use_fleet_api = True self._accumulate_gradient = True + self._find_unused_parameters = False def test_mnist(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py index bef64385f13..e0aab8541a5 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_transformer.py @@ -54,6 +54,7 @@ class TestParallelDygraphTransformerAccGrad(TestDistBase): self._nccl2_mode = True self._dygraph = True self._accumulate_gradient = True + self._find_unused_parameters = False def test_transformer(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py index 5906114cd24..75fa6f7c71d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_unused_variables.py @@ -26,13 +26,13 @@ from parallel_dygraph_unused_variables import TestSparseEmbeddingUnusedVars flag_name = os.path.splitext(__file__)[0] -class TestParallelDygraphMnist(TestDistBase): +class TestParallelDygraphUnusedVar(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True - def test_mnist(self): + def test_net(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( 
"parallel_dygraph_unused_variables.py", @@ -41,6 +41,14 @@ class TestParallelDygraphMnist(TestDistBase): log_name=flag_name) +class TestFleetDygraphUnusedVar(TestParallelDygraphUnusedVar): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + self._use_fleet_api = True + + class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): def test_mnist_with_spawn(self): if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): @@ -48,17 +56,31 @@ class TestSparseEmbeddingUnusedVarsSpawn(TestDistSpawnRunner): test_class=TestSparseEmbeddingUnusedVars, delta=1e-5) -class TestFleetDygraphMnist(TestDistBase): +class TestParallelDygraphNoVar(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_net(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_none_var.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSharedUnusedVariables(TestDistBase): def _setup_config(self): self._sync_mode = False self._nccl2_mode = True self._dygraph = True - self._use_fleet_api = True def test_mnist(self): if fluid.core.is_compiled_with_cuda(): self.check_with_place( - "parallel_dygraph_unused_variables.py", + "parallel_dygraph_shared_unused_var.py", delta=1e-5, check_error_log=True, log_name=flag_name) -- GitLab From 1b6c1d39965068d856eb88e0a25fe7d2df2311b7 Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Thu, 1 Apr 2021 16:20:05 +0800 Subject: [PATCH 135/486] fix doc preblem (#32010) --- python/paddle/distributed/collective.py | 58 ++++++++++++------------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 8e5c35995b2..2756dea72e8 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -142,7 +142,7 @@ def get_group(id=0): Get group instance by group id. Args: - id (int): the group id + id (int): the group id. Default value is 0. Returns: Group: the group instance. @@ -163,26 +163,24 @@ def get_group(id=0): def new_group(ranks=None, backend=None): """ - Creates a new distributed comminication group. + Creates a new distributed communication group. Args: - ranks (list): The global ranks of group members, list as sorted. + ranks (list): The global ranks of group members. backend (str): The backend used to create group, only nccl is supported now. Returns: - Group: The group instance. Nerver return None. + Group: The group instance. Examples: .. code-block:: python - import numpy as np import paddle paddle.distributed.init_parallel_env() - tindata = np.random.random([10, 1000]).astype('float32') - tindata = paddle.to_tensor(tindata) - gid = paddle.distributed.new_group([2,4,6]) - paddle.distributed.all_reduce(tindata, group=gid, use_calc_stream=False) + tindata = paddle.randn(shape=[2, 3]) + gp = paddle.distributed.new_group([2,4,6]) + paddle.distributed.all_reduce(tindata, group=gp, use_calc_stream=False) """ @@ -221,7 +219,7 @@ def new_group(ranks=None, backend=None): place = core.CUDAPlace(genv.device_id) core.NCCLParallelContext(strategy, place).init_with_ring_id(ring_id) else: - assert False + assert False, ("no cuda device found") return gp @@ -234,8 +232,8 @@ def wait(tensor, group=None, use_calc_stream=True): Args: tensor (Tensor): The Tensor used before sync. group (Group): The Group instance to perform sync. 
- use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to False. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. @@ -243,13 +241,10 @@ def wait(tensor, group=None, use_calc_stream=True): Examples: .. code-block:: python - - import numpy as np import paddle paddle.distributed.init_parallel_env() - tindata = np.random.random([10, 1000]).astype('float32') - tindata = paddle.to_tensor(tindata) + tindata = paddle.randn(shape=[2, 3]) paddle.distributed.all_reduce(tindata, use_calc_stream=True) paddle.distributed.wait(tindata) @@ -306,8 +301,8 @@ def broadcast(tensor, src, group=None, use_calc_stream=True): should be float16, float32, float64, int32 or int64. src (int): The source rank. group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to True. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. @@ -339,6 +334,7 @@ def broadcast(tensor, src, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id gsrc = src if group is None else group.get_group_rank(src) + assert gsrc >= 0, ("src rank out of group, need global rank") if in_dygraph_mode(): return core.ops.c_broadcast(tensor, tensor, 'root', gsrc, @@ -370,10 +366,10 @@ def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): Args: tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type should be float16, float32, float64, int32 or int64. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM. group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to True. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. @@ -453,10 +449,10 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type should be float16, float32, float64, int32 or int64. dst (int): The destination rank id. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM. group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to True. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. 
@@ -487,6 +483,7 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id gdst = dst if group is None else group.get_group_rank(dst) + assert gdst >= 0, ("dst rank out of group, need global rank") if in_dygraph_mode(): if op == ReduceOp.SUM: @@ -548,8 +545,8 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32 or int64. group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to True. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. @@ -624,11 +621,11 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): tensor (Tensor): The output Tensor. Its data type should be float16, float32, float64, int32 or int64. tensor_list (list): A list of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32 or int64. - src (int): The source rank id. + should be float16, float32, float64, int32 or int64. Default value is None. + src (int): The source rank id. Default value is 0. group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False), - default to True. + use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). + Default to True. Returns: None. @@ -664,6 +661,7 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): ring_id = 0 if group is None else group.id gsrc = src if group is None else group.get_group_rank(src) + assert gsrc >= 0, ("src rank out of group, need global rank") rank = _get_global_group().rank if group is None else group.rank nranks = _get_global_group().nranks if group is None else group.nranks -- GitLab From 68e7de26c003e3404690d9e59d646a64350bc53f Mon Sep 17 00:00:00 2001 From: chajchaj <306536853@qq.com> Date: Thu, 1 Apr 2021 06:20:05 +0000 Subject: [PATCH 136/486] fix use_softmax=False does not work, test=develop --- python/paddle/nn/functional/loss.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 1dad1632e26..6c8a2d1cbce 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1388,6 +1388,8 @@ def cross_entropy(input, "should be '-100', but received %s, which is not allowed." 
% ignore_index) + softmax_switch = use_softmax + input_dims = len(list(input.shape)) label_dims = len(list(label.shape)) if input_dims - 1 != label_dims and input_dims != label_dims: @@ -1400,7 +1402,7 @@ def cross_entropy(input, _, out = core.ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + 'softmax_switch', softmax_switch) if weight is not None: @@ -1482,7 +1484,7 @@ def cross_entropy(input, 'ignore_index': ignore_index, 'numeric_stable_mode': True, 'axis': axis, - 'use_softmax': use_softmax + 'softmax_switch': softmax_switch } helper = LayerHelper('softmax_with_cross_entropy', **locals()) softmax = helper.create_variable_for_type_inference(dtype=input.dtype) -- GitLab From a4b30a1237fd3ed5c32be1a32f6cb880bc3aac1b Mon Sep 17 00:00:00 2001 From: Qi Li Date: Thu, 1 Apr 2021 18:53:38 +0800 Subject: [PATCH 137/486] [ROCM] fix depthwise conv failure on ROCM, test=develop (#31998) --- paddle/fluid/operators/math/depthwise_conv.cu | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 7439a959d38..d116b620dc1 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -613,6 +613,9 @@ class DepthwiseConvFunctor 512 && output_width <= 1024) thread = output_width; +#ifdef __HIPCC__ + thread = std::min(thread, 256); +#endif int blocks = std::min(std::max(thread / output_width, 1), output_height); dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); @@ -620,7 +623,13 @@ class DepthwiseConvFunctor Date: Thu, 1 Apr 2021 06:32:20 -0500 Subject: [PATCH 138/486] fix typo in spawn (#32017) --- python/paddle/distributed/spawn.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index 56e59ac88ef..bf49604a897 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -303,8 +303,8 @@ class MultiprocessContext(object): raise Exception("Process %d terminated with signal %s." % (error_index, name)) else: - raise Exception("Process %d terminated with exit code %d." & ( - error_index, exitcode)) + raise Exception("Process %d terminated with exit code %d." % + (error_index, exitcode)) original_trace = self.error_queues[error_index].get() msg = "\n\n----------------------------------------------\n" \ -- GitLab From 0e52cdfc02a9d1666b5b1b05fba941455c5f7015 Mon Sep 17 00:00:00 2001 From: yaoxuefeng Date: Thu, 1 Apr 2021 19:45:34 +0800 Subject: [PATCH 139/486] delete test_data_generator (#31987) --- .../data_generator/test_data_generator.py | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100644 python/paddle/distributed/fleet/data_generator/test_data_generator.py diff --git a/python/paddle/distributed/fleet/data_generator/test_data_generator.py b/python/paddle/distributed/fleet/data_generator/test_data_generator.py deleted file mode 100644 index 60cbaf0bd36..00000000000 --- a/python/paddle/distributed/fleet/data_generator/test_data_generator.py +++ /dev/null @@ -1,39 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -import paddle -import paddle.distributed.fleet as fleet - - -class SyntheticData(fleet.MultiSlotDataGenerator): - def generate_sample(self, line): - def data_iter(): - for i in range(10000): - yield ("words", [1, 2, 3, 4]), ("label", [0]) - - return data_iter - - -class SyntheticStringData(fleet.MultiSlotStringDataGenerator): - def generate_sample(self, line): - def data_iter(): - for i in range(10000): - yield [("words", ["1", "2", "3", "4"]), ("label", ["0"])] - - return data_iter - - -sd = SyntheticData() -sd.run_from_memory() - -sd2 = SyntheticStringData() -sd2.run_from_memory() -- GitLab From 0b42f4896efa0df43631cdf6ff6bb8d04857a519 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Fri, 2 Apr 2021 10:59:57 +0800 Subject: [PATCH 140/486] fix random compile failed on windows (#32032) --- paddle/scripts/paddle_build.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index b04c5f490c1..e939c712cbe 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -359,9 +359,9 @@ if %GENERATOR% == "Ninja" ( ninja -j %PARALLEL_PROJECT_COUNT% ) else ( if "%WITH_CLCACHE%"=="OFF" ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) else ( - MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% paddle.sln + MSBuild /m:%PARALLEL_PROJECT_COUNT% /p:PreferredToolArchitecture=x64 /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:%LOG_LEVEL% ALL_BUILD.vcxproj ) ) -- GitLab From 4490e8af4e29cc2cf8933226e5b2c2b577d155b9 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Fri, 2 Apr 2021 12:13:04 +0800 Subject: [PATCH 141/486] add leaky_relu forward and backward in activation_op.cu (#31841) * add leaky_relu forward and backward in activation_op.cu --- paddle/fluid/operators/activation_op.cu | 250 +++++++++++++++++------- 1 file changed, 181 insertions(+), 69 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index c6d2fbccd8e..04f329088fa 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -42,6 +42,10 @@ template class BaseGPUFunctor { public: using ELEMENT_TYPE = T; + + using AttrPair = std::vector>; + + AttrPair GetAttrs() { return AttrPair(); } }; /* ========================================================================== */ @@ -57,42 +61,35 @@ class ReluGPUFunctor : public BaseGPUFunctor { // for relu forward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* x); + const typename 
CudaVecType::type in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; + } // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T x) { - return x > zero_ ? x : zero_; + __device__ __forceinline__ T ComputeRemainder(const T in) { + // relu forward : out = max(x, 0) + return in > zero_ ? in : zero_; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* x) { -// relu forward : out = max(x, 0) -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(x) > zero_ ? __ldg(x) : zero_; -#else - return (*x) > zero_ ? (*x) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* xx) { - // relu forward : out = max(xx, 0) - return make_float4((xx->x > zero_) * (xx->x), (xx->y > zero_) * (xx->y), - (xx->z > zero_) * (xx->z), (xx->w > zero_) * (xx->w)); +ReluGPUFunctor::Compute(const CudaVecType::type in) { + // relu forward : out = max(in, 0) + return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), + (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type* in) { +ReluGPUFunctor::Compute(const CudaVecType::type in) { // relu forward : out = max(in, 0) #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(in), kzero), __ldg(in)); + return __hmul2(__hgt2(in, kzero), in); #else - const float2 xx = __half22float2(*in); + const float2 xx = __half22float2(in); return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), (xx.y > 0.0f) * static_cast(xx.y)); #endif @@ -112,8 +109,10 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { // for relu backward when T is double __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type* out, - const typename CudaVecType::type* dout); + const typename CudaVecType::type out, + const typename CudaVecType::type dout) { + return out > zero_ ? dout : zero_; + } // when num % vecsize != 0 this func will be used __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { @@ -124,44 +123,132 @@ class ReluGradGPUFunctor : public BaseGPUFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { -// relu backward : dx = out > 0 ? dout : 0; -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - return __ldg(out) > zero_ ? __ldg(dout) : zero_; -#else - return (*out) > zero_ ? (*dout) : zero_; -#endif -} - template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? 
dout : 0; - return make_float4((out->x > zero_) * (dout->x), (out->y > zero_) * (dout->y), - (out->z > zero_) * (dout->z), - (out->w > zero_) * (dout->w)); + return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), + (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); } template <> __device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type* out, - const CudaVecType::type* dout) { +ReluGradGPUFunctor::Compute(const CudaVecType::type out, + const CudaVecType::type dout) { // relu backward : dx = out > 0 ? dout : 0; #ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(__ldg(out), kzero), __ldg(dout)); + return __hmul2(__hgt2(out, kzero), dout); #else - const float2 xx = __half22float2(*out); - const float2 yy = __half22float2(*dout); + const float2 xx = __half22float2(out); + const float2 yy = __half22float2(dout); return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), (xx.y > 0.0f) * static_cast(yy.y)); #endif } +/* ========================================================================== */ +/* ======================== leaky relu forward ======================== + */ +template +class LeakyReluGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + // leakyrelu forward : out = x > 0 ? x : x * alpha + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in) { + return in > zero_ ? in : static_cast(alpha_) * in; + } + + __device__ __forceinline__ T ComputeRemainder(const T in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return in > zero_ ? in : static_cast(alpha_) * in; + } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + return make_float4((in.x > zero_) ? (in.x) : (in.x) * alpha_, + (in.y > zero_) ? (in.y) : (in.y) * alpha_, + (in.z > zero_) ? (in.z) : (in.z) * alpha_, + (in.w > zero_) ? (in.w) : (in.w) * alpha_); +} + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { + // leakyrelu forward : out = x > 0 ? x : x * alpha + const float2 xx = __half22float2(in); + return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, + (xx.y > 0.0f) ? xx.y : xx.y * alpha_); +} +/* ========================================================================== */ + +/* =========================== leaky relu backward ======================= + */ +template +class LeakyReluGradGPUFunctor : public BaseGPUFunctor { + private: + T zero_; + float alpha_; + + public: + LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha_}}; + } + + // for leaky relu backward when T is double + __device__ __forceinline__ typename CudaVecType::type Compute( + const typename CudaVecType::type in, + const typename CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? dout : static_cast(alpha_) * dout; + } + + // when num % vecsize != 0 this func will be used + __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return in > zero_ ? 
dout : static_cast(alpha_) * dout; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template <> +__device__ __forceinline__ CudaVecType::type +LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), + (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), + (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), + (in.w > zero_) ? (dout.w) : alpha_ * (dout.w)); +} + +template <> +__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< + float16>::Compute(const CudaVecType::type in, + const CudaVecType::type dout) { + // leakyrelu backward : dx = x > 0 ? dout : alpha * dout + const float2 xx = __half22float2(in); + const float2 yy = __half22float2(dout); + return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, + (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); +} + /* ========================================================================== */ template @@ -176,14 +263,23 @@ __global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, const VecType* in_forward = reinterpret_cast(forward_data); const VecType* in_dout = reinterpret_cast(dout); VecType* out = reinterpret_cast(dx); - + VecType forward_vec, dout_vec; + T in_data, dout_data; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in_forward + i), (in_dout + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + forward_vec = __ldg(in_forward + i); + dout_vec = __ldg(in_dout + i); +#else + forward_vec = in_forward[i]; + dout_vec = in_dout[i]; +#endif + out[i] = functor.Compute(forward_vec, dout_vec); } while (idx == loop && tail) { - dx[num - tail] = - functor.ComputeRemainder(forward_data[num - tail], dout[num - tail]); + in_data = forward_data[num - tail]; + dout_data = dout[num - tail]; + dx[num - tail] = functor.ComputeRemainder(in_data, dout_data); --tail; } } @@ -199,9 +295,14 @@ __global__ void ActivationkernelVec(const T* src, T* dst, int num, int tail = num % vecsize; const VecType* in = reinterpret_cast(src); VecType* out = reinterpret_cast(dst); - + VecType x_vec; for (int i = idx; i < loop; i += stride) { - out[i] = functor.Compute((in + i)); +#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 + x_vec = __ldg(in + i); +#else + x_vec = in[i]; +#endif + out[i] = functor.Compute(x_vec); } while (idx == loop && tail) { @@ -231,6 +332,10 @@ class ActivationGPUKernel block = 256; #endif Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((num / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -270,7 +375,12 @@ class ActivationGradGPUKernel #ifdef __HIPCC__ block = 256; #endif + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } constexpr int vecsize = CudaVecType::vecsize; int grid = max((numel / vecsize + block - 1) / block, 1); auto stream = context.cuda_device_context().stream(); @@ -300,12 +410,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); - FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + act_type, ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>, \ + ops::ActivationGPUKernel>); \ + 
REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>, \ + ops::ActivationGradGPUKernel>); + /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, LeakyReluFunctor, - LeakyReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, + LeakyReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -330,21 +456,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* =========================== relu register ============================ */ -REGISTER_OP_CUDA_KERNEL( - relu, ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>, - ops::ActivationGPUKernel>); - -REGISTER_OP_CUDA_KERNEL( - relu_grad, ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>, - ops::ActivationGradGPUKernel>); +REGISTER_ACTIVATION_GPU_KERNEL(relu, Relu, ReluGPUFunctor, ReluGradGPUFunctor); REGISTER_OP_CUDA_KERNEL( relu_grad_grad, -- GitLab From 9e06a6414878fdc287472bd8d93a633c58403381 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Fri, 2 Apr 2021 13:43:27 +0800 Subject: [PATCH 142/486] [ROCM] fix softmax_with_cross_entropy_op (#31982) --- paddle/fluid/operators/math/cross_entropy.cu | 11 ++++++++--- paddle/fluid/operators/math/softmax.cu | 10 ++++++---- .../fluid/operators/softmax_with_cross_entropy_op.cu | 4 ++++ paddle/fluid/platform/dynload/miopen.h | 2 ++ 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 84fa0d6af99..55662e1d0aa 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -66,18 +66,23 @@ class CrossEntropyFunctor { int batch_size = prob->dims()[0]; int class_num = prob->dims()[1]; +#ifdef __HIPCC__ + constexpr int kMaxBlockDim = 256; +#else + constexpr int kMaxBlockDim = 512; +#endif if (softLabel) { const T* label_data = labels->data(); - int block = class_num > 512 - ? 512 + int block = class_num > kMaxBlockDim + ? 
kMaxBlockDim : pow(2, static_cast(std::log2(class_num))); SoftCrossEntropyKernel<<>>( loss_data, prob_data, label_data, class_num); } else { const int64_t* label_data = labels->data(); - int block = 512; + int block = kMaxBlockDim; int grid = (batch_size + block - 1) / block; CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num, diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 879e367281c..9e9fe5b9c10 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -54,10 +54,11 @@ void SoftmaxCUDNNFunctor::operator()( xDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_y_desc = xDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_x_desc, X->data(), CudnnDataType::kZero(), cudnn_y_desc, - Y->mutable_data(context.GetPlace()))); + Y->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); #else cudnnTensorDescriptor_t cudnn_x_desc = xDesc.descriptor(layout, cudnn_tensor_dims); @@ -96,11 +97,12 @@ void SoftmaxGradCUDNNFunctor::operator()( dxDesc.descriptor(layout, cudnn_tensor_dims); miopenTensorDescriptor_t cudnn_ygrad_desc = dyDesc.descriptor(layout, cudnn_tensor_dims); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward( + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( context.cudnn_handle(), CudnnDataType::kOne(), cudnn_y_desc, Y->data(), cudnn_ygrad_desc, YGrad->data(), CudnnDataType::kZero(), cudnn_xgrad_desc, - XGrad->mutable_data(context.GetPlace()))); + XGrad->mutable_data(context.GetPlace()), MIOPEN_SOFTMAX_ACCURATE, + MIOPEN_SOFTMAX_MODE_INSTANCE)); #else cudnnTensorDescriptor_t cudnn_y_desc = yDesc.descriptor(layout, cudnn_tensor_dims); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 2257d816d89..140059256c3 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -672,7 +672,11 @@ template static void SoftmaxWithCrossEntropyFusedKernel( const T* logits_data, const T* labels_data, T* softmax_data, T* loss_data, int64_t n, int64_t d, int axis_dim, gpuStream_t stream) { +#ifdef __HIPCC__ + constexpr int kMaxBlockDim = 256; +#else constexpr int kMaxBlockDim = 512; +#endif int64_t block_dim = axis_dim >= kMaxBlockDim ? 
kMaxBlockDim : (1 << static_cast(std::log2(axis_dim))); diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 15de4c64e3e..05b1fc891a0 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -116,7 +116,9 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenPoolingForward); \ __macro(miopenPoolingBackward); \ __macro(miopenSoftmaxBackward); \ + __macro(miopenSoftmaxBackward_V2); \ __macro(miopenSoftmaxForward); \ + __macro(miopenSoftmaxForward_V2); \ __macro(miopenCreateDropoutDescriptor); \ __macro(miopenDestroyDropoutDescriptor); \ __macro(miopenRestoreDropoutDescriptor); \ -- GitLab From 94736d6072a3fd5551b696d559a77603cafbad08 Mon Sep 17 00:00:00 2001 From: seemingwang Date: Fri, 2 Apr 2021 13:58:45 +0800 Subject: [PATCH 143/486] graph engine (#31226) * graph engine demo * upload unsaved changes * fix dependency error * fix shard_num problem * py client * remove lock and graph-type * add load direct graph * add load direct graph * add load direct graph * batch random_sample * batch_sample_k * fix num_nodes size * batch brpc * batch brpc * add test * add test * add load_nodes; change add_node function * change sample return type to pair * resolve conflict * resolved conflict * resolved conflict * separate server and client * merge pair type * fix * resolved conflict * fixed segment fault; high-level VLOG for load edges and load nodes * random_sample return 0 * rm useless loop * test:load edge * fix ret -1 * test: rm sample * rm sample * random_sample return future * random_sample return int * test fake node * fixed here * memory leak * remove test code * fix return problem * add common_graph_table * random sample node &test & change data-structure from linkedList to vector * add common_graph_table * sample with srand * add node_types * optimize nodes sample * recover test * random sample * destruct weighted sampler * GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * pybind sample nodes api * pull nodes with step * fixed pull_graph_list bug; add test for pull_graph_list by step * add graph table;name * add graph table;name * add pybind * add pybind * add FeatureNode * add FeatureNode * add FeatureNode Serialize * add FeatureNode Serialize * get_feat_node * avoid local rpc * fix get_node_feat * fix get_node_feat * remove log * get_node_feat return py:bytes * merge develop with graph_engine * fix threadpool.h head * fix * fix typo * resolve conflict * fix conflict * recover lost content * fix pybind of FeatureNode * recover cmake * recover tools * resolve conflict * resolve linking problem * code style * change test_server port * fix code problems * remove shard_num config * remove redundent threads * optimize start server * remove logs * fix code problems by reviewers' suggestions Co-authored-by: Huang Zhengjie <270018958@qq.com> Co-authored-by: Weiyue Su Co-authored-by: suweiyue Co-authored-by: luobin06 Co-authored-by: liweibin02 --- .github/ISSUE_TEMPLATE/---document-issue-.md | 2 +- .../fluid/distributed/service/CMakeLists.txt | 10 +- .../distributed/service/brpc_ps_client.cc | 2 +- .../distributed/service/brpc_ps_client.h | 27 +- .../distributed/service/graph_brpc_client.cc | 331 +++++++++++ .../distributed/service/graph_brpc_client.h | 105 ++++ .../distributed/service/graph_brpc_server.cc | 347 +++++++++++ .../distributed/service/graph_brpc_server.h | 113 ++++ .../distributed/service/graph_py_service.cc | 325 ++++++++++ 
.../distributed/service/graph_py_service.h | 178 ++++++ paddle/fluid/distributed/service/ps_client.cc | 5 +- paddle/fluid/distributed/service/ps_client.h | 8 +- .../fluid/distributed/service/sendrecv.proto | 6 +- paddle/fluid/distributed/service/server.cc | 3 + paddle/fluid/distributed/table/CMakeLists.txt | 10 +- .../distributed/table/common_graph_table.cc | 506 ++++++++++++++++ .../distributed/table/common_graph_table.h | 144 +++++ paddle/fluid/distributed/table/graph_edge.cc | 29 + paddle/fluid/distributed/table/graph_edge.h | 46 ++ paddle/fluid/distributed/table/graph_node.cc | 117 ++++ paddle/fluid/distributed/table/graph_node.h | 127 ++++ .../table/graph_weighted_sampler.cc | 150 +++++ .../table/graph_weighted_sampler.h | 58 ++ paddle/fluid/distributed/table/table.cc | 4 +- paddle/fluid/distributed/table/table.h | 27 + paddle/fluid/distributed/test/CMakeLists.txt | 3 + .../fluid/distributed/test/graph_node_test.cc | 556 ++++++++++++++++++ paddle/fluid/inference/api/demo_ci/clean.sh | 14 + paddle/fluid/pybind/CMakeLists.txt | 4 + paddle/fluid/pybind/fleet_py.cc | 60 ++ paddle/fluid/pybind/fleet_py.h | 6 +- paddle/fluid/pybind/pybind.cc | 5 + paddle/scripts/build_docker_images.sh | 15 + .../docker/root/.scripts/git-completion.sh | 15 + paddle/scripts/fast_install.sh | 14 + python/paddle/fluid/dataloader/fetcher.py | 7 +- .../incubate/fleet/tests/cluster_train.sh | 14 + .../test_squared_mat_sub_fuse_pass.py | 6 +- .../unittests/ir/inference/test_trt_matmul.py | 23 +- .../fluid/tests/unittests/parallel_test.sh | 15 + .../fluid/tests/unittests/test_bce_loss.py | 12 +- .../unittests/test_bce_with_logits_loss.py | 6 +- .../tests/unittests/test_c_comm_init_op.sh | 15 + .../tests/unittests/test_dist_fleet_ps10.py | 1 - .../test_flatten_contiguous_range_op.py | 3 +- .../fluid/tests/unittests/test_l1_loss.py | 12 +- .../tests/unittests/test_listen_and_serv.sh | 15 + .../fluid/tests/unittests/test_mse_loss.py | 18 +- ...ess_dataloader_iterable_dataset_dynamic.py | 1 + .../tests/unittests/test_pixel_shuffle.py | 12 +- .../fluid/tests/unittests/test_prod_op.py | 6 +- .../fluid/tests/unittests/test_selu_op.py | 9 +- .../unittests/test_sigmoid_focal_loss.py | 6 +- .../tests/unittests/test_transpose_op.py | 8 +- scripts/paddle | 169 ++++++ tools/check_api_approvals.sh | 14 + tools/check_sequence_op.sh | 14 + tools/cudaError/start.sh | 15 + tools/diff_api.py | 15 + tools/diff_unittest.py | 15 + tools/dockerfile/icode.sh | 14 + tools/document_preview.sh | 15 + tools/get_cpu_info.sh | 14 + 63 files changed, 3765 insertions(+), 81 deletions(-) create mode 100644 paddle/fluid/distributed/service/graph_brpc_client.cc create mode 100644 paddle/fluid/distributed/service/graph_brpc_client.h create mode 100644 paddle/fluid/distributed/service/graph_brpc_server.cc create mode 100644 paddle/fluid/distributed/service/graph_brpc_server.h create mode 100644 paddle/fluid/distributed/service/graph_py_service.cc create mode 100644 paddle/fluid/distributed/service/graph_py_service.h create mode 100644 paddle/fluid/distributed/table/common_graph_table.cc create mode 100644 paddle/fluid/distributed/table/common_graph_table.h create mode 100644 paddle/fluid/distributed/table/graph_edge.cc create mode 100644 paddle/fluid/distributed/table/graph_edge.h create mode 100644 paddle/fluid/distributed/table/graph_node.cc create mode 100644 paddle/fluid/distributed/table/graph_node.h create mode 100644 paddle/fluid/distributed/table/graph_weighted_sampler.cc create mode 100644 
paddle/fluid/distributed/table/graph_weighted_sampler.h create mode 100644 paddle/fluid/distributed/test/graph_node_test.cc create mode 100644 scripts/paddle diff --git a/.github/ISSUE_TEMPLATE/---document-issue-.md b/.github/ISSUE_TEMPLATE/---document-issue-.md index 7c464ac584b..ffc2fcd7817 100644 --- a/.github/ISSUE_TEMPLATE/---document-issue-.md +++ b/.github/ISSUE_TEMPLATE/---document-issue-.md @@ -56,4 +56,4 @@ For example: no sample code; The sample code is not helpful; The sample code not For example:Chinese API in this doc is inconsistent with English API, including params, description, sample code, formula, etc. #### Other -For example: The doc link is broken; The doc page is missing; Dead link in docs. \ No newline at end of file +For example: The doc link is broken; The doc page is missing; Dead link in docs. diff --git a/paddle/fluid/distributed/service/CMakeLists.txt b/paddle/fluid/distributed/service/CMakeLists.txt index bb3f6f1174d..843dea9eea6 100644 --- a/paddle/fluid/distributed/service/CMakeLists.txt +++ b/paddle/fluid/distributed/service/CMakeLists.txt @@ -24,11 +24,12 @@ set_source_files_properties(heter_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUT set_source_files_properties(client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(ps_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - +set_source_files_properties(graph_brpc_server.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(graph_brpc_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_library(brpc_utils SRCS brpc_utils.cc DEPS tensor device_context ${COMMON_DEPS} ${RPC_DEPS}) -cc_library(downpour_server SRCS brpc_ps_server.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) -cc_library(downpour_client SRCS brpc_ps_client.cc DEPS boost eigen3 table brpc_utils ${RPC_DEPS}) +cc_library(downpour_server SRCS graph_brpc_server.cc brpc_ps_server.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) +cc_library(downpour_client SRCS graph_brpc_client.cc brpc_ps_client.cc DEPS boost eigen3 table brpc_utils simple_threadpool ${RPC_DEPS}) cc_library(client SRCS ps_client.cc DEPS downpour_client boost ${RPC_DEPS}) cc_library(server SRCS server.cc DEPS downpour_server boost ${RPC_DEPS}) @@ -38,3 +39,6 @@ cc_library(ps_service SRCS service.cc DEPS communicator client server boost ${RP cc_library(heter_server SRCS heter_server.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) cc_library(heter_client SRCS heter_client.cc DEPS brpc_utils ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_py_service.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_py_service SRCS graph_py_service.cc DEPS ps_service) diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 163526fe3b2..5c226e6a0dd 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -990,4 +990,4 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } } // namespace distributed -} // namespace paddle +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h b/paddle/fluid/distributed/service/brpc_ps_client.h index 8f9d2653864..84a31fdbd5d 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ 
b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -170,9 +170,22 @@ class BrpcPsClient : public PSClient { virtual int32_t recv_and_save_table(const uint64_t table_id, const std::string &path); - private: + protected: + virtual size_t get_server_nums() { return _server_channels.size(); } + inline brpc::Channel *get_sparse_channel(size_t server_id) { + return _server_channels[server_id][0].get(); + } + inline brpc::Channel *get_dense_channel(size_t server_id) { + return _server_channels[server_id][1].get(); + } + inline brpc::Channel *get_cmd_channel(size_t server_id) { + return _server_channels[server_id][2].get(); + } virtual int32_t initialize() override; + private: + // virtual int32_t initialize() override; + inline uint32_t dense_dim_per_shard(uint32_t dense_dim_total, uint32_t shard_num) { return dense_dim_total / shard_num + 1; @@ -184,16 +197,6 @@ class BrpcPsClient : public PSClient { std::future send_save_cmd(uint32_t table_id, int cmd_id, const std::vector ¶m); - inline brpc::Channel *get_sparse_channel(size_t server_id) { - return _server_channels[server_id][0].get(); - } - inline brpc::Channel *get_dense_channel(size_t server_id) { - return _server_channels[server_id][1].get(); - } - inline brpc::Channel *get_cmd_channel(size_t server_id) { - return _server_channels[server_id][2].get(); - } - bool _running = false; bool _flushing = false; std::atomic _async_call_num; //异步请求计数 @@ -220,8 +223,6 @@ class BrpcPsClient : public PSClient { size_t num, void *done) override; - virtual size_t get_server_nums() { return _server_channels.size(); } - private: int32_t start_client_service(); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc new file mode 100644 index 00000000000..a6271cac83c --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -0,0 +1,331 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include +#include +#include +#include +#include +#include +#include "Eigen/Dense" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +void GraphPsService_Stub::service( + ::google::protobuf::RpcController *controller, + const ::paddle::distributed::PsRequestMessage *request, + ::paddle::distributed::PsResponseMessage *response, + ::google::protobuf::Closure *done) { + if (graph_service != NULL && local_channel == channel()) { + // VLOG(0)<<"use local"; + task_pool->enqueue([this, controller, request, response, done]() -> int { + this->graph_service->service(controller, request, response, done); + return 0; + }); + } else { + // VLOG(0)<<"use server"; + PsService_Stub::service(controller, request, response, done); + } +} + +int GraphBrpcClient::get_server_index_by_id(uint64_t id) { + int shard_num = get_shard_num(); + int shard_per_server = shard_num % server_size == 0 + ? shard_num / server_size + : shard_num / server_size + 1; + return id % shard_num / shard_per_server; +} + +std::future GraphBrpcClient::get_node_feat( + const uint32_t &table_id, const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); + ++feat_idx) { + for (size_t node_idx = 0; + node_idx < query_idx_buckets.at(request_idx).size(); + ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + size_t feat_len = *(size_t *)(buffer); + buffer += sizeof(size_t); + auto feature = std::string(buffer, feat_len); + res[feat_idx][query_idx] = feature; + buffer += feat_len; + } + } + } + if (fail_num == request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto 
promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_GET_NODE_FEAT); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + std::string joint_feature_name = + paddle::string::join_strings(feature_names, '\t'); + closure->request(request_idx) + ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); + + PsService_Stub rpc_stub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +// char* &buffer,int &actual_size +std::future GraphBrpcClient::batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>> &res) { + std::vector request2server; + std::vector server2request(server_size, -1); + res.clear(); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + res.push_back(std::vector>()); + } + size_t request_call_num = request2server.size(); + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int query_idx = 0; query_idx < node_ids.size(); ++query_idx) { + int server_index = get_server_index_by_id(node_ids[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_ids[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + } + + DownpourBrpcClosure *closure = new DownpourBrpcClosure( + request_call_num, + [&, node_id_buckets, query_idx_buckets, request_call_num](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + int fail_num = 0; + for (int request_idx = 0; request_idx < request_call_num; + ++request_idx) { + if (closure->check_response(request_idx, + PS_GRAPH_SAMPLE_NEIGHBOORS) != 0) { + ++fail_num; + } else { + auto &res_io_buffer = + closure->cntl(request_idx)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = + buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + int query_idx = query_idx_buckets.at(request_idx).at(node_idx); + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[query_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + if (fail_num == 
request_call_num) { + ret = -1; + } + } + closure->set_promise_value(ret); + }); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < request_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(table_id); + closure->request(request_idx)->set_client_id(_client_id); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + closure->response(request_idx), closure); + } + + return fut; +} +std::future GraphBrpcClient::random_sample_nodes( + uint32_t table_id, int server_index, int sample_size, + std::vector &ids) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + auto size = io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + ids.push_back(*(uint64_t *)(buffer + index)); + index += GraphNode::id_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +std::future GraphBrpcClient::pull_graph_list( + uint32_t table_id, int server_index, int start, int size, int step, + std::vector &res) { + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_PULL_GRAPH_LIST) != 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + char buffer[bytes_size]; + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + int index = 0; + while (index < bytes_size) { + FeatureNode node; + node.recover_from_buffer(buffer + index); + index += node.get_size(false); + res.push_back(node); + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = 
promise->get_future(); + closure->request(0)->set_cmd_id(PS_PULL_GRAPH_LIST); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)&start, sizeof(int)); + closure->request(0)->add_params((char *)&size, sizeof(int)); + closure->request(0)->add_params((char *)&step, sizeof(int)); + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), closure->response(0), + closure); + return fut; +} +int32_t GraphBrpcClient::initialize() { + // set_shard_num(_config.shard_num()); + BrpcPsClient::initialize(); + server_size = get_server_nums(); + graph_service = NULL; + local_channel = NULL; + return 0; +} +} +} diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h new file mode 100644 index 00000000000..4e6775a4bed --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -0,0 +1,105 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include +#include "ThreadPool.h" +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace distributed { + +class GraphPsService_Stub : public PsService_Stub { + public: + GraphPsService_Stub(::google::protobuf::RpcChannel* channel, + ::google::protobuf::RpcChannel* local_channel = NULL, + GraphBrpcService* service = NULL, int thread_num = 1) + : PsService_Stub(channel) { + this->local_channel = local_channel; + this->graph_service = service; + task_pool.reset(new ::ThreadPool(thread_num)); + } + virtual ~GraphPsService_Stub() {} + + // implements PsService ------------------------------------------ + GraphBrpcService* graph_service; + std::shared_ptr<::ThreadPool> task_pool; + ::google::protobuf::RpcChannel* local_channel; + GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(GraphPsService_Stub); + void service(::google::protobuf::RpcController* controller, + const ::paddle::distributed::PsRequestMessage* request, + ::paddle::distributed::PsResponseMessage* response, + ::google::protobuf::Closure* done); +}; +class GraphBrpcClient : public BrpcPsClient { + public: + GraphBrpcClient() {} + virtual ~GraphBrpcClient() {} + // given a batch of nodes, sample graph_neighboors for each of them + virtual std::future batch_sample_neighboors( + uint32_t table_id, std::vector node_ids, int sample_size, + std::vector>>& res); + + virtual std::future pull_graph_list(uint32_t table_id, + int server_index, int start, + int size, int step, + std::vector& res); + virtual std::future random_sample_nodes(uint32_t table_id, + int server_index, + int sample_size, + std::vector& ids); + virtual std::future get_node_feat( + const uint32_t& table_id, const std::vector& node_ids, + const std::vector& feature_names, + std::vector>& res); + virtual int32_t initialize(); + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + int get_server_index_by_id(uint64_t id); + void set_local_channel(int index) { + this->local_channel = get_cmd_channel(index); + } + void set_local_graph_service(GraphBrpcService* graph_service) { + this->graph_service = graph_service; + } + GraphPsService_Stub getServiceStub(::google::protobuf::RpcChannel* channel, + int thread_num = 1) { + return GraphPsService_Stub(channel, local_channel, graph_service, + thread_num); + } + + private: + int shard_num; + size_t server_size; + ::google::protobuf::RpcChannel* local_channel; + GraphBrpcService* graph_service; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc new file mode 100644 index 00000000000..4f6cc1143e9 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -0,0 +1,347 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" + +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { + +int32_t GraphBrpcServer::initialize() { + auto &service_config = _config.downpour_server_param().service_param(); + if (!service_config.has_service_class()) { + LOG(ERROR) << "miss service_class in ServerServiceParameter"; + return -1; + } + auto *service = + CREATE_PSCORE_CLASS(PsBaseService, service_config.service_class()); + if (service == NULL) { + LOG(ERROR) << "service is unregistered, service_name:" + << service_config.service_class(); + return -1; + } + + _service.reset(service); + if (service->configure(this) != 0 || service->initialize() != 0) { + LOG(ERROR) << "service initialize failed, service_name:" + << service_config.service_class(); + return -1; + } + if (_server.AddService(service, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(ERROR) << "service add to brpc failed, service:" + << service_config.service_class(); + return -1; + } + return 0; +} + +uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { + std::unique_lock lock(mutex_); + + std::string ip_port = ip + ":" + std::to_string(port); + VLOG(3) << "server of rank " << _rank << " starts at " << ip_port; + brpc::ServerOptions options; + + int num_threads = std::thread::hardware_concurrency(); + auto trainers = _environment->get_trainers(); + options.num_threads = trainers > num_threads ? 
trainers : num_threads; + + if (_server.Start(ip_port.c_str(), &options) != 0) { + LOG(ERROR) << "GraphBrpcServer start failed, ip_port=" << ip_port; + return 0; + } + _environment->registe_ps_server(ip, port, _rank); + return 0; +} + +int32_t GraphBrpcServer::port() { return _server.listen_address().port; } + +int32_t GraphBrpcService::initialize() { + _is_initialize_shard_info = false; + _service_handler_map[PS_STOP_SERVER] = &GraphBrpcService::stop_server; + _service_handler_map[PS_LOAD_ONE_TABLE] = &GraphBrpcService::load_one_table; + _service_handler_map[PS_LOAD_ALL_TABLE] = &GraphBrpcService::load_all_table; + + _service_handler_map[PS_PRINT_TABLE_STAT] = + &GraphBrpcService::print_table_stat; + _service_handler_map[PS_BARRIER] = &GraphBrpcService::barrier; + _service_handler_map[PS_START_PROFILER] = &GraphBrpcService::start_profiler; + _service_handler_map[PS_STOP_PROFILER] = &GraphBrpcService::stop_profiler; + + _service_handler_map[PS_PULL_GRAPH_LIST] = &GraphBrpcService::pull_graph_list; + _service_handler_map[PS_GRAPH_SAMPLE_NEIGHBOORS] = + &GraphBrpcService::graph_random_sample_neighboors; + _service_handler_map[PS_GRAPH_SAMPLE_NODES] = + &GraphBrpcService::graph_random_sample_nodes; + _service_handler_map[PS_GRAPH_GET_NODE_FEAT] = + &GraphBrpcService::graph_get_node_feat; + + // shard初始化,server启动后才可从env获取到server_list的shard信息 + initialize_shard_info(); + + return 0; +} + +#define CHECK_TABLE_EXIST(table, request, response) \ + if (table == NULL) { \ + std::string err_msg("table not found with table_id:"); \ + err_msg.append(std::to_string(request.table_id())); \ + set_response_code(response, -1, err_msg.c_str()); \ + return -1; \ + } + +int32_t GraphBrpcService::initialize_shard_info() { + if (!_is_initialize_shard_info) { + std::lock_guard guard(_initialize_shard_mutex); + if (_is_initialize_shard_info) { + return 0; + } + size_t shard_num = _server->environment()->get_ps_servers().size(); + auto &table_map = *(_server->table()); + for (auto itr : table_map) { + itr.second->set_shard(_rank, shard_num); + } + _is_initialize_shard_info = true; + } + return 0; +} + +void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, + const PsRequestMessage *request, + PsResponseMessage *response, + google::protobuf::Closure *done) { + brpc::ClosureGuard done_guard(done); + std::string log_label("ReceiveCmd-"); + if (!request->has_table_id()) { + set_response_code(*response, -1, "PsRequestMessage.tabel_id is required"); + return; + } + + response->set_err_code(0); + response->set_err_msg(""); + auto *table = _server->table(request->table_id()); + brpc::Controller *cntl = static_cast(cntl_base); + auto itr = _service_handler_map.find(request->cmd_id()); + if (itr == _service_handler_map.end()) { + std::string err_msg( + "undefined cmd_id, should match PsCmdID in ps.proto, cmd_id:"); + err_msg.append(std::to_string(request->cmd_id())); + set_response_code(*response, -1, err_msg.c_str()); + return; + } + serviceFunc handler_func = itr->second; + int service_ret = (this->*handler_func)(table, *request, *response, cntl); + if (service_ret != 0) { + response->set_err_code(service_ret); + response->set_err_msg("server internal error"); + } +} + +int32_t GraphBrpcService::barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + + if (request.params_size() < 1) { + set_response_code(response, -1, + "PsRequestMessage.params is requeired at " + "least 1 for num of sparse_key"); + 
return 0; + } + + auto trainer_id = request.client_id(); + auto barrier_type = request.params(0); + table->barrier(trainer_id, barrier_type); + return 0; +} + +int32_t GraphBrpcService::print_table_stat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + std::pair ret = table->print_table_stat(); + paddle::framework::BinaryArchive ar; + ar << ret.first << ret.second; + std::string table_info(ar.Buffer(), ar.Length()); + response.set_data(table_info); + + return 0; +} + +int32_t GraphBrpcService::load_one_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "PsRequestMessage.datas is requeired at least 2 for path & load_param"); + return -1; + } + if (table->load(request.params(0), request.params(1)) != 0) { + set_response_code(response, -1, "table load failed"); + return -1; + } + return 0; +} + +int32_t GraphBrpcService::load_all_table(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + auto &table_map = *(_server->table()); + for (auto &itr : table_map) { + if (load_one_table(itr.second.get(), request, response, cntl) != 0) { + LOG(ERROR) << "load table[" << itr.first << "] failed"; + return -1; + } + } + return 0; +} + +int32_t GraphBrpcService::stop_server(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + GraphBrpcServer *p_server = (GraphBrpcServer *)_server; + std::thread t_stop([p_server]() { + p_server->stop(); + LOG(INFO) << "Server Stoped"; + }); + p_server->export_cv()->notify_all(); + t_stop.detach(); + return 0; +} + +int32_t GraphBrpcService::stop_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::DisableProfiler(platform::EventSortingKey::kDefault, + string::Sprintf("server_%s_profile", _rank)); + return 0; +} + +int32_t GraphBrpcService::start_profiler(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + platform::EnableProfiler(platform::ProfilerState::kCPU); + return 0; +} + +int32_t GraphBrpcService::pull_graph_list(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 3) { + set_response_code(response, -1, + "pull_graph_list request requires at least 3 arguments"); + return 0; + } + int start = *(int *)(request.params(0).c_str()); + int size = *(int *)(request.params(1).c_str()); + int step = *(int *)(request.params(2).c_str()); + std::unique_ptr buffer; + int actual_size; + table->pull_graph_list(start, size, buffer, actual_size, false, step); + cntl->response_attachment().append(buffer.get(), actual_size); + return 0; +} +int32_t GraphBrpcService::graph_random_sample_neighboors( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + int 
sample_size = *(uint64_t *)(request.params(1).c_str()); + std::vector> buffers(node_num); + std::vector actual_sizes(node_num, 0); + table->random_sample_neighboors(node_data, sample_size, buffers, + actual_sizes); + + cntl->response_attachment().append(&node_num, sizeof(size_t)); + cntl->response_attachment().append(actual_sizes.data(), + sizeof(int) * node_num); + for (size_t idx = 0; idx < node_num; ++idx) { + cntl->response_attachment().append(buffers[idx].get(), actual_sizes[idx]); + } + return 0; +} +int32_t GraphBrpcService::graph_random_sample_nodes( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + size_t size = *(uint64_t *)(request.params(0).c_str()); + std::unique_ptr buffer; + int actual_size; + if (table->random_sample_nodes(size, buffer, actual_size) == 0) { + cntl->response_attachment().append(buffer.get(), actual_size); + } else + cntl->response_attachment().append(NULL, 0); + + return 0; +} + +int32_t GraphBrpcService::graph_get_node_feat(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl) { + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_get_node_feat request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + std::vector node_ids(node_data, node_data + node_num); + + std::vector feature_names = + paddle::string::split_string(request.params(1), "\t"); + + std::vector> feature( + feature_names.size(), std::vector(node_num)); + + table->get_node_feat(node_ids, feature_names, feature); + + for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { + size_t feat_len = feature[feat_idx][node_idx].size(); + cntl->response_attachment().append(&feat_len, sizeof(size_t)); + cntl->response_attachment().append(feature[feat_idx][node_idx].data(), + feat_len); + } + } + + return 0; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h new file mode 100644 index 00000000000..af63bf5d99e --- /dev/null +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -0,0 +1,113 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
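+//
+// GraphBrpcServer hosts graph tables behind a brpc server. GraphBrpcService
+// extends the generic PS service handlers with the graph commands
+// PS_PULL_GRAPH_LIST, PS_GRAPH_SAMPLE_NEIGHBOORS, PS_GRAPH_SAMPLE_NODES and
+// PS_GRAPH_GET_NODE_FEAT, forwarding each request to the matching GraphTable
+// method and returning the serialized result via the response attachment.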
+ +#pragma once + +#include "brpc/channel.h" +#include "brpc/controller.h" +#include "brpc/server.h" + +#include +#include +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/server.h" + +namespace paddle { +namespace distributed { +class GraphBrpcServer : public PSServer { + public: + GraphBrpcServer() {} + virtual ~GraphBrpcServer() {} + PsBaseService *get_service() { return _service.get(); } + virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t stop() { + std::unique_lock lock(mutex_); + if (stoped_) return 0; + stoped_ = true; + // cv_.notify_all(); + _server.Stop(1000); + _server.Join(); + return 0; + } + virtual int32_t port(); + + std::condition_variable *export_cv() { return &cv_; } + + private: + virtual int32_t initialize(); + mutable std::mutex mutex_; + std::condition_variable cv_; + bool stoped_ = false; + brpc::Server _server; + std::shared_ptr _service; + std::vector> _pserver_channels; +}; + +class GraphBrpcService; + +typedef int32_t (GraphBrpcService::*serviceFunc)( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl); + +class GraphBrpcService : public PsBaseService { + public: + virtual int32_t initialize() override; + + virtual void service(::google::protobuf::RpcController *controller, + const PsRequestMessage *request, + PsResponseMessage *response, + ::google::protobuf::Closure *done) override; + + protected: + std::unordered_map _service_handler_map; + int32_t initialize_shard_info(); + int32_t pull_graph_list(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t graph_random_sample_neighboors(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_random_sample_nodes(Table *table, + const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t graph_get_node_feat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, + brpc::Controller *cntl); + int32_t barrier(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_one_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t load_all_table(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_server(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t start_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + int32_t stop_profiler(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + int32_t print_table_stat(Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + + private: + bool _is_initialize_shard_info; + std::mutex _initialize_shard_mutex; + std::unordered_map _msg_handler_map; + std::vector _ori_values; + const int sample_nodes_ranges = 23; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc new file mode 100644 index 00000000000..61e4e0cf7bb --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include // NOLINT +#include "butil/endpoint.h" +#include "iomanip" +#include "paddle/fluid/distributed/table/table.h" +#include "paddle/fluid/framework/archive.h" +#include "paddle/fluid/platform/profiler.h" +namespace paddle { +namespace distributed { +std::vector GraphPyService::split(std::string& str, + const char pattern) { + std::vector res; + std::stringstream input(str); + std::string temp; + while (std::getline(input, temp, pattern)) { + res.push_back(temp); + } + return res; +} + +void GraphPyService::add_table_feat_conf(std::string table_name, + std::string feat_name, + std::string feat_dtype, + int32_t feat_shape) { + if (this->table_id_map.count(table_name)) { + this->table_feat_conf_table_name.push_back(table_name); + this->table_feat_conf_feat_name.push_back(feat_name); + this->table_feat_conf_feat_dtype.push_back(feat_dtype); + this->table_feat_conf_feat_shape.push_back(feat_shape); + } +} + +void GraphPyService::set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types) { + set_shard_num(shard_num); + set_num_node_types(node_types.size()); + + for (size_t table_id = 0; table_id < node_types.size(); table_id++) { + this->table_id_map[node_types[table_id]] = this->table_id_map.size(); + } + for (size_t table_id = 0; table_id < edge_types.size(); table_id++) { + this->table_id_map[edge_types[table_id]] = this->table_id_map.size(); + } + std::istringstream stream(ips_str); + std::string ip; + server_size = 0; + std::vector ips_list = split(ips_str, ';'); + int index = 0; + for (auto ips : ips_list) { + auto ip_and_port = split(ips, ':'); + server_list.push_back(ip_and_port[0]); + port_list.push_back(ip_and_port[1]); + uint32_t port = stoul(ip_and_port[1]); + auto ph_host = paddle::distributed::PSHost(ip_and_port[0], port, index); + host_sign_list.push_back(ph_host.serialize_to_string()); + index++; + } +} +void GraphPyClient::start_client() { + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list, servers_); + worker_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr->configure(worker_proto, dense_regions, _ps_env, client_id); + worker_ptr->set_shard_num(get_shard_num()); +} +void GraphPyServer::start_server(bool block) { + std::string ip = server_list[rank]; + uint32_t port = std::stoul(port_list[rank]); + ::paddle::distributed::PSParameter server_proto = this->GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + 
_ps_env.set_ps_servers(&this->host_sign_list, + this->host_sign_list.size()); // test + pserver_ptr = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + VLOG(0) << "pserver-ptr created "; + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); + pserver_ptr->start(ip, port); + std::condition_variable* cv_ = pserver_ptr->export_cv(); + if (block) { + std::mutex mutex_; + std::unique_lock lock(mutex_); + cv_->wait(lock); + } +} +::paddle::distributed::PSParameter GraphPyServer::GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GraphPyClient::GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(worker_sparse_table_proto, tuple.second, + tuple.first, table_type, feat_name, feat_dtype, + feat_shape); + } + + ::paddle::distributed::ServerParameter* 
server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + for (auto& tuple : this->table_id_map) { + VLOG(0) << " make a new table " << tuple.second; + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + for (size_t i = 0; i < this->table_feat_conf_table_name.size(); i++) { + if (tuple.first == table_feat_conf_table_name[i]) { + feat_name.push_back(table_feat_conf_feat_name[i]); + feat_dtype.push_back(table_feat_conf_feat_dtype[i]); + feat_shape.push_back(table_feat_conf_feat_shape[i]); + } + } + std::string table_type; + if (tuple.second < this->num_node_types) { + table_type = "node"; + } else { + table_type = "edge"; + } + + GetDownpourSparseTableProto(sparse_table_proto, tuple.second, tuple.first, + table_type, feat_name, feat_dtype, feat_shape); + } + + return worker_fleet_desc; +} +void GraphPyClient::load_edge_file(std::string name, std::string filepath, + bool reverse) { + // 'e' means load edge + std::string params = "e"; + if (reverse) { + // 'e<' means load edges from $2 to $1 + params += "<"; + } else { + // 'e>' means load edges from $1 to $2 + params += ">"; + } + if (this->table_id_map.count(name)) { + VLOG(0) << "loadding data with type " << name << " from " << filepath; + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} + +void GraphPyClient::load_node_file(std::string name, std::string filepath) { + // 'n' means load nodes and 'node_type' follows + std::string params = "n" + name; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + get_ps_client()->load(table_id, std::string(filepath), params); + status.wait(); + } +} +std::vector>> +GraphPyClient::batch_sample_neighboors(std::string name, + std::vector node_ids, + int sample_size) { + std::vector>> v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->batch_sample_neighboors(table_id, node_ids, sample_size, v); + status.wait(); + } + return v; +} + +std::vector GraphPyClient::random_sample_nodes(std::string name, + int server_index, + int sample_size) { + std::vector v; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = + worker_ptr->random_sample_nodes(table_id, server_index, sample_size, v); + status.wait(); + } + return v; +} + +// (name, dtype, ndarray) +std::vector> GraphPyClient::get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names) { + std::vector> v( + feature_names.size(), std::vector(node_ids.size())); + if (this->table_id_map.count(node_type)) { + uint32_t table_id = this->table_id_map[node_type]; + auto status = + worker_ptr->get_node_feat(table_id, node_ids, feature_names, v); + status.wait(); + } + return v; +} + +std::vector 
GraphPyClient::pull_graph_list(std::string name, + int server_index, + int start, int size, + int step) { + std::vector res; + if (this->table_id_map.count(name)) { + uint32_t table_id = this->table_id_map[name]; + auto status = worker_ptr->pull_graph_list(table_id, server_index, start, + size, step, res); + status.wait(); + } + return res; +} + +void GraphPyClient::stop_server() { + VLOG(0) << "going to stop server"; + std::unique_lock lock(mutex_); + if (stoped_) return; + auto status = this->worker_ptr->stop_server(); + if (status.get() == 0) stoped_ = true; +} +void GraphPyClient::finalize_worker() { this->worker_ptr->finalize_worker(); } +} +} diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h new file mode 100644 index 00000000000..e185f23e3d2 --- /dev/null +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -0,0 +1,178 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" +namespace paddle { +namespace distributed { +class GraphPyService { + protected: + std::vector server_list, port_list, host_sign_list; + int server_size, shard_num; + int num_node_types; + std::unordered_map table_id_map; + std::vector table_feat_conf_table_name; + std::vector table_feat_conf_feat_name; + std::vector table_feat_conf_feat_dtype; + std::vector table_feat_conf_feat_shape; + + // std::thread *server_thread, *client_thread; + + // std::shared_ptr pserver_ptr; + + // std::shared_ptr worker_ptr; + + public: + // std::shared_ptr get_ps_server() { + // return pserver_ptr; + // } + // std::shared_ptr get_ps_client() { + // return worker_ptr; + // } + int get_shard_num() { return shard_num; } + void set_shard_num(int shard_num) { this->shard_num = shard_num; } + void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto, + uint32_t table_id, std::string table_name, std::string table_type, + std::vector feat_name, std::vector feat_dtype, + std::vector feat_shape) { + sparse_table_proto->set_table_id(table_id); + 
sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(shard_num); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + + ::paddle::distributed::CommonAccessorParameter* common_proto = + sparse_table_proto->mutable_common(); + + // Set GraphTable Parameter + common_proto->set_table_name(table_name); + common_proto->set_name(table_type); + for (size_t i = 0; i < feat_name.size(); i++) { + common_proto->add_params(feat_dtype[i]); + common_proto->add_dims(feat_shape[i]); + common_proto->add_attributes(feat_name[i]); + } + + accessor_proto->set_accessor_class("CommMergeAccessor"); + } + + void set_server_size(int server_size) { this->server_size = server_size; } + void set_num_node_types(int num_node_types) { + this->num_node_types = num_node_types; + } + int get_server_size(int server_size) { return server_size; } + std::vector split(std::string& str, const char pattern); + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types); + + void add_table_feat_conf(std::string node_type, std::string feat_name, + std::string feat_dtype, int32_t feat_shape); +}; +class GraphPyServer : public GraphPyService { + public: + GraphPyServer() {} + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int rank) { + set_rank(rank); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + int get_rank() { return rank; } + void set_rank(int rank) { this->rank = rank; } + + void start_server(bool block = true); + ::paddle::distributed::PSParameter GetServerProto(); + std::shared_ptr get_ps_server() { + return pserver_ptr; + } + + protected: + int rank; + std::shared_ptr pserver_ptr; + std::thread* server_thread; +}; +class GraphPyClient : public GraphPyService { + public: + void set_up(std::string ips_str, int shard_num, + std::vector node_types, + std::vector edge_types, int client_id) { + set_client_id(client_id); + GraphPyService::set_up(ips_str, shard_num, node_types, edge_types); + } + std::shared_ptr get_ps_client() { + return worker_ptr; + } + void bind_local_server(int local_channel_index, GraphPyServer& server) { + worker_ptr->set_local_channel(local_channel_index); + worker_ptr->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)server.get_ps_server() + ->get_service()); + } + void stop_server(); + void finalize_worker(); + void load_edge_file(std::string name, std::string filepath, bool reverse); + void load_node_file(std::string name, std::string filepath); + int get_client_id() { return client_id; } + void set_client_id(int client_id) { this->client_id = client_id; } + void start_client(); + std::vector>> batch_sample_neighboors( + std::string name, std::vector node_ids, int sample_size); + std::vector random_sample_nodes(std::string name, int server_index, + int sample_size); + std::vector> get_node_feat( + std::string node_type, std::vector node_ids, + std::vector feature_names); + std::vector pull_graph_list(std::string name, int server_index, + int start, int size, int step = 1); + ::paddle::distributed::PSParameter GetWorkerProto(); + + protected: + mutable std::mutex mutex_; + int client_id; + std::shared_ptr worker_ptr; + std::thread* client_thread; + bool stoped_ = false; +}; +} +} diff --git a/paddle/fluid/distributed/service/ps_client.cc b/paddle/fluid/distributed/service/ps_client.cc index 
d427ecfc538..3f78908baa3 100644 --- a/paddle/fluid/distributed/service/ps_client.cc +++ b/paddle/fluid/distributed/service/ps_client.cc @@ -15,12 +15,13 @@ #include "paddle/fluid/distributed/service/ps_client.h" #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { REGISTER_PSCORE_CLASS(PSClient, BrpcPsClient); - +REGISTER_PSCORE_CLASS(PSClient, GraphBrpcClient); int32_t PSClient::configure( const PSParameter &config, const std::map> ®ions, @@ -82,4 +83,4 @@ PSClient *PSClientFactory::create(const PSParameter &ps_config) { return client; } } // namespace distributed -} // namespace paddle +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 50f5802c63a..7b698afa726 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,16 +24,11 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph_node.h" namespace paddle { namespace distributed { -class PSEnvironment; -class PsRequestMessage; -class PsResponseMessage; -class ValueAccessor; -struct Region; - using paddle::distributed::PsRequestMessage; using paddle::distributed::PsResponseMessage; @@ -160,6 +155,7 @@ class PSClient { promise.set_value(-1); return fut; } + // client2client消息处理,std::function ret (msg_type, from_client_id, msg) typedef std::function MsgHandlerFunc; diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 6250f84c987..d908c26da98 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -48,6 +48,10 @@ enum PsCmdID { PS_START_PROFILER = 27; PS_STOP_PROFILER = 28; PS_PUSH_GLOBAL_STEP = 29; + PS_PULL_GRAPH_LIST = 30; + PS_GRAPH_SAMPLE_NEIGHBOORS = 31; + PS_GRAPH_SAMPLE_NODES = 32; + PS_GRAPH_GET_NODE_FEAT = 33; } message PsRequestMessage { @@ -111,4 +115,4 @@ message MultiVariableMessage { service PsService { rpc service(PsRequestMessage) returns (PsResponseMessage); rpc SendAndRecvVariable(MultiVariableMessage) returns (MultiVariableMessage); -}; \ No newline at end of file +}; diff --git a/paddle/fluid/distributed/service/server.cc b/paddle/fluid/distributed/service/server.cc index fc230a0b9c9..9324adad697 100644 --- a/paddle/fluid/distributed/service/server.cc +++ b/paddle/fluid/distributed/service/server.cc @@ -16,6 +16,7 @@ #include "glog/logging.h" #include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" #include "paddle/fluid/distributed/table/table.h" namespace paddle { @@ -23,6 +24,8 @@ namespace distributed { REGISTER_PSCORE_CLASS(PSServer, BrpcPsServer); REGISTER_PSCORE_CLASS(PsBaseService, BrpcPsService); +REGISTER_PSCORE_CLASS(PSServer, GraphBrpcServer); +REGISTER_PSCORE_CLASS(PsBaseService, GraphBrpcService); PSServer *PSServerFactory::create(const PSParameter &ps_config) { const auto &config = ps_config.server_param(); diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 1e98e193d54..33873abc5f7 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ 
b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,13 +1,19 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) - +set_source_files_properties(graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS graph_edge.cc) +set_source_files_properties(graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc DEPS ${TABLE_DEPS} device_context string_helper simple_threadpool xxhash generator) +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc new file mode 100644 index 00000000000..995a39a6543 --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -0,0 +1,506 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
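+//
+// GraphTable implementation. Nodes are assigned to shards by id % shard_num,
+// and each server owns the contiguous range [shard_start, shard_end) derived
+// from its shard index. load_edges / load_nodes parse tab-separated files,
+// random_sample_neighboors and get_node_feat fan work out across a set of
+// single-thread task pools keyed by node id, pull_graph_list serializes node
+// batches into a caller-owned buffer, and random_sample_nodes returns a
+// random subset of node ids.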
+ +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include +#include +#include +#include +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { + +std::vector GraphShard::get_batch(int start, int end, int step) { + if (start < 0) start = 0; + std::vector res; + for (int pos = start; pos < std::min(end, (int)bucket.size()); pos += step) { + res.push_back(bucket[pos]); + } + return res; +} + +size_t GraphShard::get_size() { return bucket.size(); } + +GraphNode *GraphShard::add_graph_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new GraphNode(id)); + } + return (GraphNode *)bucket[node_location[id]]; +} + +FeatureNode *GraphShard::add_feature_node(uint64_t id) { + if (node_location.find(id) == node_location.end()) { + node_location[id] = bucket.size(); + bucket.push_back(new FeatureNode(id)); + } + return (FeatureNode *)bucket[node_location[id]]; +} + +void GraphShard::add_neighboor(uint64_t id, uint64_t dst_id, float weight) { + find_node(id)->add_edge(dst_id, weight); +} + +Node *GraphShard::find_node(uint64_t id) { + auto iter = node_location.find(id); + return iter == node_location.end() ? nullptr : bucket[iter->second]; +} + +int32_t GraphTable::load(const std::string &path, const std::string ¶m) { + bool load_edge = (param[0] == 'e'); + bool load_node = (param[0] == 'n'); + if (load_edge) { + bool reverse_edge = (param[1] == '<'); + return this->load_edges(path, reverse_edge); + } + if (load_node) { + std::string node_type = param.substr(1); + return this->load_nodes(path, node_type); + } + return 0; +} + +int32_t GraphTable::get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res) { + int start = 0, end, index = 0, total_size = 0; + res.clear(); + std::vector>> tasks; + // std::string temp = ""; + // for(int i = 0;i < shards.size();i++) + // temp+= std::to_string((int)shards[i].get_size()) + " "; + // VLOG(0)<<"range distribution "<= end) { + break; + } else { + int first = std::max(ranges[index].first, start); + int second = std::min(ranges[index].second, end); + start = second; + first -= total_size; + second -= total_size; + // VLOG(0)<<" FIND RANGE "<enqueue( + [this, first, second, i]() -> std::vector { + return shards[i].get_ids_by_range(first, second); + })); + } + } + total_size += shards[i].get_size(); + } + for (int i = 0; i < tasks.size(); i++) { + auto vec = tasks[i].get(); + for (auto &id : vec) { + res.push_back(id); + std::swap(res[rand() % res.size()], res[(int)res.size() - 1]); + } + } + return 0; +} + +int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { + auto paths = paddle::string::split_string(path, ";"); + int64_t count = 0; + int64_t valid_count = 0; + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + count++; + auto values = paddle::string::split_string(line, "\t"); + if (values.size() < 2) continue; + auto id = std::stoull(values[1]); + + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + VLOG(4) << "will not load " << id << " from " << path + << ", please check id distribution"; + continue; + } + + if (count % 1000000 == 0) { + VLOG(0) << count << " nodes are loaded from filepath"; + } + + std::string nt = values[0]; + if (nt != 
node_type) { + continue; + } + + size_t index = shard_id - shard_start; + + auto node = shards[index].add_feature_node(id); + + node->set_feature_size(feat_name.size()); + + for (size_t slice = 2; slice < values.size(); slice++) { + auto feat = this->parse_feature(values[slice]); + if (feat.first >= 0) { + node->set_feature(feat.first, feat.second); + } else { + VLOG(4) << "Node feature: " << values[slice] + << " not in feature_map."; + } + } + valid_count++; + } + } + + VLOG(0) << valid_count << "/" << count << " nodes in type " << node_type + << " are loaded successfully in " << path; + return 0; +} + +int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { + auto paths = paddle::string::split_string(path, ";"); + int count = 0; + std::string sample_type = "random"; + bool is_weighted = false; + int valid_count = 0; + + for (auto path : paths) { + std::ifstream file(path); + std::string line; + while (std::getline(file, line)) { + auto values = paddle::string::split_string(line, "\t"); + count++; + if (values.size() < 2) continue; + auto src_id = std::stoull(values[0]); + auto dst_id = std::stoull(values[1]); + if (reverse_edge) { + std::swap(src_id, dst_id); + } + float weight = 1; + if (values.size() == 3) { + weight = std::stof(values[2]); + sample_type = "weighted"; + is_weighted = true; + } + + size_t src_shard_id = src_id % shard_num; + + if (src_shard_id >= shard_end || src_shard_id < shard_start) { + VLOG(4) << "will not load " << src_id << " from " << path + << ", please check id distribution"; + continue; + } + if (count % 1000000 == 0) { + VLOG(0) << count << " edges are loaded from filepath"; + } + + size_t index = src_shard_id - shard_start; + shards[index].add_graph_node(src_id)->build_edges(is_weighted); + shards[index].add_neighboor(src_id, dst_id, weight); + valid_count++; + } + } + VLOG(0) << valid_count << "/" << count << " edges are loaded successfully in " + << path; + + // Build Sampler j + + for (auto &shard : shards) { + auto bucket = shard.get_bucket(); + for (int i = 0; i < bucket.size(); i++) { + bucket[i]->build_sampler(sample_type); + } + } + return 0; +} + +Node *GraphTable::find_node(uint64_t id) { + size_t shard_id = id % shard_num; + if (shard_id >= shard_end || shard_id < shard_start) { + return nullptr; + } + size_t index = shard_id - shard_start; + Node *node = shards[index].find_node(id); + return node; +} +uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { + return node_id % shard_num % shard_num_per_table % task_pool_size_; +} +int32_t GraphTable::random_sample_nodes(int sample_size, + std::unique_ptr &buffer, + int &actual_size) { + bool need_feature = false; + int total_size = 0; + for (int i = 0; i < shards.size(); i++) { + total_size += shards[i].get_size(); + } + if (sample_size > total_size) sample_size = total_size; + int range_num = random_sample_nodes_ranges; + if (range_num > sample_size) range_num = sample_size; + if (sample_size == 0 || range_num == 0) return 0; + std::vector ranges_len, ranges_pos; + int remain = sample_size, last_pos = -1, num; + std::set separator_set; + for (int i = 0; i < range_num - 1; i++) { + while (separator_set.find(num = rand() % (sample_size - 1)) != + separator_set.end()) + ; + separator_set.insert(num); + } + for (auto p : separator_set) { + ranges_len.push_back(p - last_pos); + last_pos = p; + } + ranges_len.push_back(sample_size - 1 - last_pos); + remain = total_size - sample_size + range_num; + separator_set.clear(); + for (int i = 0; i < range_num; i++) { + while 
(separator_set.find(num = rand() % remain) != separator_set.end()) + ; + separator_set.insert(num); + } + int used = 0, index = 0; + last_pos = -1; + for (auto p : separator_set) { + used += p - last_pos - 1; + last_pos = p; + ranges_pos.push_back(used); + used += ranges_len[index++]; + } + std::vector> first_half, second_half; + int start_index = rand() % total_size; + for (int i = 0; i < ranges_len.size() && i < ranges_pos.size(); i++) { + if (ranges_pos[i] + ranges_len[i] - 1 + start_index < total_size) + first_half.push_back({ranges_pos[i] + start_index, + ranges_pos[i] + ranges_len[i] + start_index}); + else if (ranges_pos[i] + start_index >= total_size) { + second_half.push_back( + {ranges_pos[i] + start_index - total_size, + ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } else { + first_half.push_back({ranges_pos[i] + start_index, total_size}); + second_half.push_back( + {0, ranges_pos[i] + ranges_len[i] + start_index - total_size}); + } + } + for (auto &pair : first_half) second_half.push_back(pair); + std::vector res; + get_nodes_ids_by_ranges(second_half, res); + actual_size = res.size() * sizeof(uint64_t); + buffer.reset(new char[actual_size]); + char *pointer = buffer.get(); + memcpy(pointer, res.data(), actual_size); + return 0; +} +int32_t GraphTable::random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + size_t node_num = buffers.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t &node_id = node_ids[idx]; + std::unique_ptr &buffer = buffers[idx]; + int &actual_size = actual_sizes[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + actual_size = 0; + return 0; + } + std::vector res = node->sample_k(sample_size); + actual_size = res.size() * (Node::id_size + Node::weight_size); + int offset = 0; + uint64_t id; + float weight; + char *buffer_addr = new char[actual_size]; + buffer.reset(buffer_addr); + for (int &x : res) { + id = node->get_neighbor_id(x); + weight = node->get_neighbor_weight(x); + memcpy(buffer_addr + offset, &id, Node::id_size); + offset += Node::id_size; + memcpy(buffer_addr + offset, &weight, Node::weight_size); + offset += Node::weight_size; + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +int32_t GraphTable::get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + size_t node_num = node_ids.size(); + std::vector> tasks; + for (size_t idx = 0; idx < node_num; ++idx) { + uint64_t node_id = node_ids[idx]; + tasks.push_back(_shards_task_pool[get_thread_pool_index(node_id)]->enqueue( + [&, idx, node_id]() -> int { + Node *node = find_node(node_id); + + if (node == nullptr) { + return 0; + } + for (int feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { + const std::string &feature_name = feature_names[feat_idx]; + if (feat_id_map.find(feature_name) != feat_id_map.end()) { + // res[feat_idx][idx] = + // node->get_feature(feat_id_map[feature_name]); + auto feat = node->get_feature(feat_id_map[feature_name]); + res[feat_idx][idx] = feat; + } + } + return 0; + })); + } + for (size_t idx = 0; idx < node_num; ++idx) { + tasks[idx].get(); + } + return 0; +} + +std::pair GraphTable::parse_feature( + std::string feat_str) { + // Return (feat_id, btyes) if name are in this->feat_name, else return (-1, + // "") + auto 
fields = paddle::string::split_string(feat_str, " "); + if (this->feat_id_map.count(fields[0])) { + int32_t id = this->feat_id_map[fields[0]]; + std::string dtype = this->feat_dtype[id]; + int32_t shape = this->feat_shape[id]; + std::vector values(fields.begin() + 1, fields.end()); + if (dtype == "feasign") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "string") { + return std::make_pair( + int32_t(id), paddle::string::join_strings(values, ' ')); + } else if (dtype == "float32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "float64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int32") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } else if (dtype == "int64") { + return std::make_pair( + int32_t(id), FeatureNode::parse_value_to_bytes(values)); + } + } + return std::make_pair(-1, ""); +} + +int32_t GraphTable::pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step) { + if (start < 0) start = 0; + int size = 0, cur_size; + std::vector>> tasks; + for (size_t i = 0; i < shards.size() && total_size > 0; i++) { + cur_size = shards[i].get_size(); + if (size + cur_size <= start) { + size += cur_size; + continue; + } + int count = std::min(1 + (size + cur_size - start - 1) / step, total_size); + int end = start + (count - 1) * step + 1; + tasks.push_back(_shards_task_pool[i % task_pool_size_]->enqueue( + [this, i, start, end, step, size]() -> std::vector { + + return this->shards[i].get_batch(start - size, end - size, step); + })); + start += count * step; + total_size -= count; + size += cur_size; + } + for (size_t i = 0; i < tasks.size(); ++i) { + tasks[i].wait(); + } + size = 0; + std::vector> res; + for (size_t i = 0; i < tasks.size(); i++) { + res.push_back(tasks[i].get()); + for (size_t j = 0; j < res.back().size(); j++) { + size += res.back()[j]->get_size(need_feature); + } + } + char *buffer_addr = new char[size]; + buffer.reset(buffer_addr); + int index = 0; + for (size_t i = 0; i < res.size(); i++) { + for (size_t j = 0; j < res[i].size(); j++) { + res[i][j]->to_buffer(buffer_addr + index, need_feature); + index += res[i][j]->get_size(need_feature); + } + } + actual_size = size; + return 0; +} +int32_t GraphTable::initialize() { + _shards_task_pool.resize(task_pool_size_); + for (size_t i = 0; i < _shards_task_pool.size(); ++i) { + _shards_task_pool[i].reset(new ::ThreadPool(1)); + } + server_num = _shard_num; + // VLOG(0) << "in init graph table server num = " << server_num; + /* + _shard_num is actually server number here + when a server initialize its tables, it sets tables' _shard_num to server_num, + and _shard_idx to server + rank + */ + auto common = _config.common(); + + this->table_name = common.table_name(); + this->table_type = common.name(); + VLOG(0) << " init graph table type " << this->table_type << " table name " + << this->table_name; + int feat_conf_size = static_cast(common.attributes().size()); + for (int i = 0; i < feat_conf_size; i++) { + auto &f_name = common.attributes()[i]; + auto &f_shape = common.dims()[i]; + auto &f_dtype = common.params()[i]; + this->feat_name.push_back(f_name); + this->feat_shape.push_back(f_shape); + this->feat_dtype.push_back(f_dtype); + this->feat_id_map[f_name] = i; + VLOG(0) << "init graph table feat conf name:" << f_name + << " shape:" << f_shape 
<< " dtype:" << f_dtype; + } + + shard_num = _config.shard_num(); + VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" + << _shard_idx; + shard_num_per_table = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_table; + shard_end = shard_start + shard_num_per_table; + VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " + << shard_start << " shard_end " << shard_end; + // shards.resize(shard_num_per_table); + shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + return 0; +} +} +}; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h new file mode 100644 index 00000000000..de3cac134cd --- /dev/null +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -0,0 +1,144 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/common_table.h" +#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/string/string_helper.h" +namespace paddle { +namespace distributed { +class GraphShard { + public: + // static int bucket_low_bound; + // static int gcd(int s, int t) { + // if (s % t == 0) return t; + // return gcd(t, s % t); + // } + size_t get_size(); + GraphShard() {} + GraphShard(int shard_num) { + this->shard_num = shard_num; + // bucket_size = init_bucket_size(shard_num); + // bucket.resize(bucket_size); + } + std::vector &get_bucket() { return bucket; } + std::vector get_batch(int start, int end, int step); + // int init_bucket_size(int shard_num) { + // for (int i = bucket_low_bound;; i++) { + // if (gcd(i, shard_num) == 1) return i; + // } + // return -1; + // } + std::vector get_ids_by_range(int start, int end) { + std::vector res; + for (int i = start; i < end && i < bucket.size(); i++) { + res.push_back(bucket[i]->get_id()); + } + return res; + } + GraphNode *add_graph_node(uint64_t id); + FeatureNode *add_feature_node(uint64_t id); + Node *find_node(uint64_t id); + void add_neighboor(uint64_t id, uint64_t dst_id, float weight); + // std::unordered_map::iterator> + std::unordered_map get_node_location() { + return node_location; + } + + private: + std::unordered_map node_location; + int shard_num; + std::vector bucket; +}; +class GraphTable : public SparseTable { + public: + GraphTable() {} + virtual ~GraphTable() {} + virtual int32_t pull_graph_list(int start, int size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step); + + virtual int32_t random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes); + + int32_t random_sample_nodes(int sample_size, std::unique_ptr &buffers, + int 
&actual_sizes); + + virtual int32_t get_nodes_ids_by_ranges( + std::vector> ranges, std::vector &res); + virtual int32_t initialize(); + + int32_t load(const std::string &path, const std::string ¶m); + + int32_t load_edges(const std::string &path, bool reverse); + + int32_t load_nodes(const std::string &path, std::string node_type); + + Node *find_node(uint64_t id); + + virtual int32_t pull_sparse(float *values, const uint64_t *keys, size_t num) { + return 0; + } + virtual int32_t push_sparse(const uint64_t *keys, const float *values, + size_t num) { + return 0; + } + virtual void clear() {} + virtual int32_t flush() { return 0; } + virtual int32_t shrink(const std::string ¶m) { return 0; } + //指定保存路径 + virtual int32_t save(const std::string &path, const std::string &converter) { + return 0; + } + virtual int32_t initialize_shard() { return 0; } + virtual uint32_t get_thread_pool_index(uint64_t node_id); + virtual std::pair parse_feature(std::string feat_str); + + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res); + + protected: + std::vector shards; + size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + const int task_pool_size_ = 11; + const int random_sample_nodes_ranges = 3; + + std::vector feat_name; + std::vector feat_dtype; + std::vector feat_shape; + std::unordered_map feat_id_map; + std::string table_name; + std::string table_type; + + std::vector> _shards_task_pool; +}; +} +}; diff --git a/paddle/fluid/distributed/table/graph_edge.cc b/paddle/fluid/distributed/table/graph_edge.cc new file mode 100644 index 00000000000..cc90f4c6516 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph_edge.h b/paddle/fluid/distributed/table/graph_edge.h new file mode 100644 index 00000000000..3dfe5a6f357 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_node.cc b/paddle/fluid/distributed/table/graph_node.cc new file mode 100644 index 00000000000..27a2cafaf4f --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), 
feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph_node.h b/paddle/fluid/distributed/table/graph_node.h new file mode 100644 index 00000000000..c3e8e3ce5b5 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void 
recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph_weighted_sampler.cc new file mode 100644 index 00000000000..059a1d64bc3 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
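The graph_weighted_sampler.cc file that follows implements the weighted neighbour sampling behind GraphNode::sample_k. As a quick orientation, here is a minimal usage sketch assembled only from the public interfaces declared in graph_edge.h above and graph_weighted_sampler.h below; the main() scaffolding, the example ids/weights, and the assumption that sample_k returns edge indices as std::vector<int> are illustrative rather than part of the patch.

#include <iostream>
#include <vector>

#include "paddle/fluid/distributed/table/graph_edge.h"
#include "paddle/fluid/distributed/table/graph_weighted_sampler.h"

int main() {
  using paddle::distributed::WeightedGraphEdgeBlob;
  using paddle::distributed::WeightedSampler;

  // Out-edges of one node as (neighbour id, weight), mirroring the
  // "37 -> 45 / 145 / 112" rows in the test data added later in this patch.
  WeightedGraphEdgeBlob edges;
  edges.add_edge(45, 0.34f);
  edges.add_edge(145, 0.31f);
  edges.add_edge(112, 0.21f);

  // build() constructs a binary tree of partial weight sums over the edges;
  // sample_k(k) then draws k distinct edges with probability proportional
  // to their weights.
  WeightedSampler sampler;
  sampler.build(&edges);
  std::vector<int> picked = sampler.sample_k(2);  // indices into `edges`

  for (int idx : picked) {
    std::cout << edges.get_id(idx) << " w=" << edges.get_weight(idx) << "\n";
  }
  return 0;
}

Because the weight tree caches partial sums, each of the k draws costs O(log n), and an already drawn edge is excluded from later draws by subtracting its weight along the path back to the root, which is what the subtract_weight_map / subtract_count_map bookkeeping in the implementation below does.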
+ +#include "paddle/fluid/distributed/table/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } + return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph_weighted_sampler.h new file mode 100644 index 
00000000000..cfc341d27c6 --- /dev/null +++ b/paddle/fluid/distributed/table/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/table.cc b/paddle/fluid/distributed/table/table.cc index dfaaa6ffc12..600be954cb5 100644 --- a/paddle/fluid/distributed/table/table.cc +++ b/paddle/fluid/distributed/table/table.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/common/registerer.h" #include "paddle/fluid/distributed/table/common_dense_table.h" +#include "paddle/fluid/distributed/table/common_graph_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/tensor_accessor.h" @@ -25,7 +26,7 @@ namespace paddle { namespace distributed { - +REGISTER_PSCORE_CLASS(Table, GraphTable); REGISTER_PSCORE_CLASS(Table, CommonDenseTable); REGISTER_PSCORE_CLASS(Table, CommonSparseTable); REGISTER_PSCORE_CLASS(Table, SparseGeoTable); @@ -75,5 +76,6 @@ int32_t Table::initialize_accessor() { _value_accesor.reset(accessor); return 0; } + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 65c99d2bbd4..d64e805af40 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,7 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -86,6 +87,31 @@ class Table { return 0; } + // only for graph table + virtual int32_t pull_graph_list(int start, int total_size, + std::unique_ptr &buffer, + int &actual_size, bool need_feature, + int step = 1) { + return 0; + } + // only for graph table + virtual int32_t 
random_sample_neighboors( + uint64_t *node_ids, int sample_size, + std::vector> &buffers, + std::vector &actual_sizes) { + return 0; + } + + virtual int32_t random_sample_nodes(int sample_size, + std::unique_ptr &buffers, + int &actual_sizes) { + return 0; + } + virtual int32_t get_node_feat(const std::vector &node_ids, + const std::vector &feature_names, + std::vector> &res) { + return 0; + } virtual int32_t pour() { return 0; } virtual void clear() = 0; @@ -141,5 +167,6 @@ class TableManager { TableManager() {} ~TableManager() {} }; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index adedd049023..b756c740ac7 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -15,3 +15,6 @@ cc_test(brpc_service_sparse_sgd_test SRCS brpc_service_sparse_sgd_test.cc DEPS s set_source_files_properties(brpc_utils_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_function ${COMMON_DEPS} ${RPC_DEPS}) + +set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc new file mode 100644 index 00000000000..79ab2795963 --- /dev/null +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -0,0 +1,556 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include // NOLINT +#include +#include +#include +#include // NOLINT +#include +#include +#include "google/protobuf/text_format.h" + +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/service/brpc_ps_client.h" +#include "paddle/fluid/distributed/service/brpc_ps_server.h" +#include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_brpc_server.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" +#include "paddle/fluid/distributed/service/ps_client.h" +#include "paddle/fluid/distributed/service/sendrecv.pb.h" +#include "paddle/fluid/distributed/service/service.h" +#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; +namespace distributed = paddle::distributed; + +void testSampleNodes( + std::shared_ptr& worker_ptr_) { + std::vector ids; + auto pull_status = worker_ptr_->random_sample_nodes(0, 0, 6, ids); + std::unordered_set s; + std::unordered_set s1 = {37, 59}; + pull_status.wait(); + for (auto id : ids) s.insert(id); + ASSERT_EQ(true, s.size() == s1.size()); + for (auto id : s) { + ASSERT_EQ(true, s1.find(id) != s1.end()); + } +} + +void testFeatureNodeSerializeInt() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeInt64() { + std::string out = + distributed::FeatureNode::parse_value_to_bytes({"123", "345"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + ASSERT_EQ(out2[0], 123); + ASSERT_EQ(out2[1], 345); +} + +void testFeatureNodeSerializeFloat32() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + std::cout << "Float " << out2[0] << " " << 123.123 << std::endl; + eps = out2[0] - 123.123; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testFeatureNodeSerializeFloat64() { + std::string out = distributed::FeatureNode::parse_value_to_bytes( + {"123.123", "345.123"}); + std::vector out2 = + distributed::FeatureNode::parse_bytes_to_array(out); + float eps; + eps = out2[0] - 123.123; + std::cout << "Float64 " << out2[0] << " " << 123.123 << std::endl; + ASSERT_LE(eps * eps, 1e-5); + eps = out2[1] - 345.123; + ASSERT_LE(eps * eps, 1e-5); +} + +void testSingleSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + auto pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 37), 4, vs); + pull_status.wait(); + + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, 
s1.find(g) != s1.end()); + } + VLOG(0) << "test single done"; + s.clear(); + s1.clear(); + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 96), 4, vs); + pull_status.wait(); + s1 = {111, 48, 247}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testBatchSampleNeighboor( + std::shared_ptr& worker_ptr_) { + std::vector>> vs; + std::vector v = {37, 96}; + auto pull_status = worker_ptr_->batch_sample_neighboors(0, v, 4, vs); + pull_status.wait(); + std::unordered_set s; + std::unordered_set s1 = {112, 45, 145}; + for (auto g : vs[0]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } + s.clear(); + s1.clear(); + s1 = {111, 48, 247}; + for (auto g : vs[1]) { + s.insert(g.first); + } + ASSERT_EQ(s.size(), 3); + for (auto g : s) { + ASSERT_EQ(true, s1.find(g) != s1.end()); + } +} + +void testGraphToBuffer(); +// std::string nodes[] = {std::string("37\taa\t45;0.34\t145;0.31\t112;0.21"), +// std::string("96\tfeature\t48;1.4\t247;0.31\t111;1.21"), +// std::string("59\ttreat\t45;0.34\t145;0.31\t112;0.21"), +// std::string("97\tfood\t48;1.4\t247;0.31\t111;1.21")}; + +std::string edges[] = { + std::string("37\t45\t0.34"), std::string("37\t145\t0.31"), + std::string("37\t112\t0.21"), std::string("96\t48\t1.4"), + std::string("96\t247\t0.31"), std::string("96\t111\t1.21"), + std::string("59\t45\t0.34"), std::string("59\t145\t0.31"), + std::string("59\t122\t0.21"), std::string("97\t48\t0.34"), + std::string("97\t247\t0.31"), std::string("97\t111\t0.21")}; +char edge_file_name[] = "edges.txt"; + +std::string nodes[] = { + std::string("user\t37\ta 0.34\tb 13 14\tc hello\td abc"), + std::string("user\t96\ta 0.31\tb 15 10\tc 96hello\td abcd"), + std::string("user\t59\ta 0.11\tb 11 14"), + std::string("user\t97\ta 0.11\tb 12 11"), + std::string("item\t45\ta 0.21"), + std::string("item\t145\ta 0.21"), + std::string("item\t112\ta 0.21"), + std::string("item\t48\ta 0.21"), + std::string("item\t247\ta 0.21"), + std::string("item\t111\ta 0.21"), + std::string("item\t46\ta 0.21"), + std::string("item\t146\ta 0.21"), + std::string("item\t122\ta 0.21"), + std::string("item\t49\ta 0.21"), + std::string("item\t248\ta 0.21"), + std::string("item\t113\ta 0.21")}; +char node_file_name[] = "nodes.txt"; + +void prepare_file(char file_name[], bool load_edge) { + std::ofstream ofile; + ofile.open(file_name); + if (load_edge) { + for (auto x : edges) { + ofile << x << std::endl; + } + } else { + for (auto x : nodes) { + ofile << x << std::endl; + } + } + ofile.close(); +} +void GetDownpourSparseTableProto( + ::paddle::distributed::TableParameter* sparse_table_proto) { + sparse_table_proto->set_table_id(0); + sparse_table_proto->set_table_class("GraphTable"); + sparse_table_proto->set_shard_num(127); + sparse_table_proto->set_type(::paddle::distributed::PS_SPARSE_TABLE); + ::paddle::distributed::TableAccessorParameter* accessor_proto = + sparse_table_proto->mutable_accessor(); + accessor_proto->set_accessor_class("CommMergeAccessor"); +} + +::paddle::distributed::PSParameter GetServerProto() { + // Generate server proto desc + ::paddle::distributed::PSParameter server_fleet_desc; + ::paddle::distributed::ServerParameter* server_proto = + server_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + 
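+  // The graph-specific brpc classes added by this patch (GraphBrpcService,
+  // GraphBrpcServer, GraphBrpcClient) are wired into the service config
+  // below, so the test exercises the new graph RPC path end to end.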
::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(sparse_table_proto); + return server_fleet_desc; +} + +::paddle::distributed::PSParameter GetWorkerProto() { + ::paddle::distributed::PSParameter worker_fleet_desc; + ::paddle::distributed::WorkerParameter* worker_proto = + worker_fleet_desc.mutable_worker_param(); + + ::paddle::distributed::DownpourWorkerParameter* downpour_worker_proto = + worker_proto->mutable_downpour_worker_param(); + + ::paddle::distributed::TableParameter* worker_sparse_table_proto = + downpour_worker_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(worker_sparse_table_proto); + + ::paddle::distributed::ServerParameter* server_proto = + worker_fleet_desc.mutable_server_param(); + ::paddle::distributed::DownpourServerParameter* downpour_server_proto = + server_proto->mutable_downpour_server_param(); + ::paddle::distributed::ServerServiceParameter* server_service_proto = + downpour_server_proto->mutable_service_param(); + server_service_proto->set_service_class("GraphBrpcService"); + server_service_proto->set_server_class("GraphBrpcServer"); + server_service_proto->set_client_class("GraphBrpcClient"); + server_service_proto->set_start_server_port(0); + server_service_proto->set_server_thread_num(12); + + ::paddle::distributed::TableParameter* server_sparse_table_proto = + downpour_server_proto->add_downpour_table_param(); + GetDownpourSparseTableProto(server_sparse_table_proto); + + return worker_fleet_desc; +} + +/*-------------------------------------------------------------------------*/ + +std::string ip_ = "127.0.0.1", ip2 = "127.0.0.1"; +uint32_t port_ = 5209, port2 = 5210; + +std::vector host_sign_list_; + +std::shared_ptr pserver_ptr_, + pserver_ptr2; + +std::shared_ptr worker_ptr_; + +void RunServer() { + LOG(INFO) << "init first server"; + ::paddle::distributed::PSParameter server_proto = GetServerProto(); + + auto _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto)); + std::vector empty_vec; + framework::ProgramDesc empty_prog; + empty_vec.push_back(empty_prog); + pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); + LOG(INFO) << "first server, run start(ip,port)"; + pserver_ptr_->start(ip_, port_); + LOG(INFO) << "init first server Done"; +} + +void RunServer2() { + LOG(INFO) << "init second server"; + ::paddle::distributed::PSParameter server_proto2 = GetServerProto(); + + auto _ps_env2 = paddle::distributed::PaddlePSEnvironment(); + _ps_env2.set_ps_servers(&host_sign_list_, 2); // test + pserver_ptr2 = std::shared_ptr( + (paddle::distributed::GraphBrpcServer*) + paddle::distributed::PSServerFactory::create(server_proto2)); + std::vector empty_vec2; + framework::ProgramDesc empty_prog2; + empty_vec2.push_back(empty_prog2); + pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); + pserver_ptr2->start(ip2, port2); +} + +void RunClient( + 
std::map>& dense_regions, + int index, paddle::distributed::PsBaseService* service) { + ::paddle::distributed::PSParameter worker_proto = GetWorkerProto(); + paddle::distributed::PaddlePSEnvironment _ps_env; + auto servers_ = host_sign_list_.size(); + _ps_env = paddle::distributed::PaddlePSEnvironment(); + _ps_env.set_ps_servers(&host_sign_list_, servers_); + worker_ptr_ = std::shared_ptr( + (paddle::distributed::GraphBrpcClient*) + paddle::distributed::PSClientFactory::create(worker_proto)); + worker_ptr_->configure(worker_proto, dense_regions, _ps_env, 0); + worker_ptr_->set_shard_num(127); + worker_ptr_->set_local_channel(index); + worker_ptr_->set_local_graph_service( + (paddle::distributed::GraphBrpcService*)service); +} + +void RunBrpcPushSparse() { + setenv("http_proxy", "", 1); + setenv("https_proxy", "", 1); + prepare_file(edge_file_name, 1); + prepare_file(node_file_name, 0); + auto ph_host = paddle::distributed::PSHost(ip_, port_, 0); + host_sign_list_.push_back(ph_host.serialize_to_string()); + + // test-start + auto ph_host2 = paddle::distributed::PSHost(ip2, port2, 1); + host_sign_list_.push_back(ph_host2.serialize_to_string()); + // test-end + // Srart Server + std::thread* server_thread = new std::thread(RunServer); + std::thread* server_thread2 = new std::thread(RunServer2); + sleep(1); + + std::map> dense_regions; + dense_regions.insert( + std::pair>(0, {})); + auto regions = dense_regions[0]; + + RunClient(dense_regions, 0, pserver_ptr_->get_service()); + + /*-----------------------Test Server Init----------------------------------*/ + auto pull_status = + worker_ptr_->load(0, std::string(edge_file_name), std::string("e>")); + srand(time(0)); + pull_status.wait(); + std::vector>> vs; + testSampleNodes(worker_ptr_); + sleep(5); + testSingleSampleNeighboor(worker_ptr_); + testBatchSampleNeighboor(worker_ptr_); + pull_status = worker_ptr_->batch_sample_neighboors( + 0, std::vector(1, 10240001024), 4, vs); + pull_status.wait(); + ASSERT_EQ(0, vs[0].size()); + + std::vector nodes; + pull_status = worker_ptr_->pull_graph_list(0, 0, 0, 1, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 37); + nodes.clear(); + pull_status = worker_ptr_->pull_graph_list(0, 0, 1, 4, 1, nodes); + pull_status.wait(); + ASSERT_EQ(nodes.size(), 1); + ASSERT_EQ(nodes[0].get_id(), 59); + for (auto g : nodes) { + std::cout << g.get_id() << std::endl; + } + distributed::GraphPyServer server1, server2; + distributed::GraphPyClient client1, client2; + std::string ips_str = "127.0.0.1:5211;127.0.0.1:5212"; + std::vector edge_types = {std::string("user2item")}; + std::vector node_types = {std::string("user"), + std::string("item")}; + VLOG(0) << "make 2 servers"; + server1.set_up(ips_str, 127, node_types, edge_types, 0); + server2.set_up(ips_str, 127, node_types, edge_types, 1); + + server1.add_table_feat_conf("user", "a", "float32", 1); + server1.add_table_feat_conf("user", "b", "int32", 2); + server1.add_table_feat_conf("user", "c", "string", 1); + server1.add_table_feat_conf("user", "d", "string", 1); + server1.add_table_feat_conf("item", "a", "float32", 1); + + server2.add_table_feat_conf("user", "a", "float32", 1); + server2.add_table_feat_conf("user", "b", "int32", 2); + server2.add_table_feat_conf("user", "c", "string", 1); + server2.add_table_feat_conf("user", "d", "string", 1); + server2.add_table_feat_conf("item", "a", "float32", 1); + + client1.set_up(ips_str, 127, node_types, edge_types, 0); + + client1.add_table_feat_conf("user", "a", "float32", 1); + 
client1.add_table_feat_conf("user", "b", "int32", 2); + client1.add_table_feat_conf("user", "c", "string", 1); + client1.add_table_feat_conf("user", "d", "string", 1); + client1.add_table_feat_conf("item", "a", "float32", 1); + + client2.set_up(ips_str, 127, node_types, edge_types, 1); + + client2.add_table_feat_conf("user", "a", "float32", 1); + client2.add_table_feat_conf("user", "b", "int32", 2); + client2.add_table_feat_conf("user", "c", "string", 1); + client2.add_table_feat_conf("user", "d", "string", 1); + client2.add_table_feat_conf("item", "a", "float32", 1); + + server1.start_server(false); + std::cout << "first server done" << std::endl; + server2.start_server(false); + std::cout << "second server done" << std::endl; + client1.start_client(); + std::cout << "first client done" << std::endl; + client2.start_client(); + std::cout << "first client done" << std::endl; + std::cout << "started" << std::endl; + VLOG(0) << "come to set local server"; + client1.bind_local_server(0, server1); + VLOG(0) << "first bound"; + client2.bind_local_server(1, server2); + VLOG(0) << "second bound"; + client1.load_node_file(std::string("user"), std::string(node_file_name)); + client1.load_node_file(std::string("item"), std::string(node_file_name)); + client1.load_edge_file(std::string("user2item"), std::string(edge_file_name), + 0); + nodes.clear(); + + nodes = client1.pull_graph_list(std::string("user"), 0, 1, 4, 1); + + ASSERT_EQ(nodes[0].get_id(), 59); + nodes.clear(); + + // Test Pull by step + + std::unordered_set count_item_nodes; + // pull by step 2 + for (int test_step = 1; test_step < 4; test_step++) { + count_item_nodes.clear(); + std::cout << "check pull graph list by step " << test_step << std::endl; + for (int server_id = 0; server_id < 2; server_id++) { + for (int start_step = 0; start_step < test_step; start_step++) { + nodes = client1.pull_graph_list(std::string("item"), server_id, + start_step, 12, test_step); + for (auto g : nodes) { + count_item_nodes.insert(g.get_id()); + } + nodes.clear(); + } + } + ASSERT_EQ(count_item_nodes.size(), 12); + } + + vs = client1.batch_sample_neighboors(std::string("user2item"), + std::vector(1, 96), 4); + ASSERT_EQ(vs[0].size(), 3); + std::vector node_ids; + node_ids.push_back(96); + node_ids.push_back(37); + vs = client1.batch_sample_neighboors(std::string("user2item"), node_ids, 4); + + ASSERT_EQ(vs.size(), 2); + std::vector nodes_ids = client2.random_sample_nodes("user", 0, 6); + ASSERT_EQ(nodes_ids.size(), 2); + ASSERT_EQ(true, (nodes_ids[0] == 59 && nodes_ids[1] == 37) || + (nodes_ids[0] == 37 && nodes_ids[1] == 59)); + + // Test get node feat + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + std::vector feature_names; + feature_names.push_back(std::string("c")); + feature_names.push_back(std::string("d")); + auto node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0]; + VLOG(0) << "get_node_feat: " << node_feat[0][1]; + VLOG(0) << "get_node_feat: " << node_feat[1][0]; + VLOG(0) << "get_node_feat: " << node_feat[1][1]; + + // Test string + node_ids.clear(); + node_ids.push_back(37); + node_ids.push_back(96); + // std::vector feature_names; + feature_names.clear(); + feature_names.push_back(std::string("a")); + feature_names.push_back(std::string("b")); + node_feat = + client1.get_node_feat(std::string("user"), node_ids, feature_names); + ASSERT_EQ(node_feat.size(), 2); + 
ASSERT_EQ(node_feat[0].size(), 2); + VLOG(0) << "get_node_feat: " << node_feat[0][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[0][1].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][0].size(); + VLOG(0) << "get_node_feat: " << node_feat[1][1].size(); + + std::remove(edge_file_name); + std::remove(node_file_name); + LOG(INFO) << "Run stop_server"; + worker_ptr_->stop_server(); + LOG(INFO) << "Run finalize_worker"; + worker_ptr_->finalize_worker(); + testFeatureNodeSerializeInt(); + testFeatureNodeSerializeInt64(); + testFeatureNodeSerializeFloat32(); + testFeatureNodeSerializeFloat64(); + testGraphToBuffer(); + client1.stop_server(); +} + +void testGraphToBuffer() { + ::paddle::distributed::GraphNode s, s1; + s.set_feature_size(1); + s.set_feature(0, std::string("hhhh")); + s.set_id(65); + int size = s.get_size(true); + char str[size]; + s.to_buffer(str, true); + s1.recover_from_buffer(str); + ASSERT_EQ(s.get_id(), s1.get_id()); + VLOG(0) << s.get_feature(0); + VLOG(0) << s1.get_feature(0); +} + +TEST(RunBrpcPushSparse, Run) { RunBrpcPushSparse(); } diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 0d9f3d2aa23..c265721db57 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,3 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -x cd `dirname $0` rm -rf build/ data/ diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 97ebd64a07e..10c79933546 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -7,6 +7,10 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator) +if (WITH_PSCORE) + set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) + set(PYBIND_DEPS ${PYBIND_DEPS} graph_py_service) +endif() if (WITH_GPU OR WITH_ROCM) set(PYBIND_DEPS ${PYBIND_DEPS} dynload_cuda) set(PYBIND_DEPS ${PYBIND_DEPS} cuda_device_guard) diff --git a/paddle/fluid/pybind/fleet_py.cc b/paddle/fluid/pybind/fleet_py.cc index ba716fb3b55..0a2159667f3 100644 --- a/paddle/fluid/pybind/fleet_py.cc +++ b/paddle/fluid/pybind/fleet_py.cc @@ -32,6 +32,8 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/fleet.h" #include "paddle/fluid/distributed/service/communicator.h" #include "paddle/fluid/distributed/service/env.h" +#include "paddle/fluid/distributed/service/graph_brpc_client.h" +#include "paddle/fluid/distributed/service/graph_py_service.h" #include "paddle/fluid/distributed/service/heter_client.h" namespace py = pybind11; @@ -39,6 +41,11 @@ using paddle::distributed::CommContext; using paddle::distributed::Communicator; using paddle::distributed::FleetWrapper; using paddle::distributed::HeterClient; +using paddle::distributed::GraphPyService; +using paddle::distributed::GraphNode; +using paddle::distributed::GraphPyServer; +using paddle::distributed::GraphPyClient; +using paddle::distributed::FeatureNode; namespace paddle { namespace pybind { @@ -152,5 +159,58 @@ void BindHeterClient(py::module* m) { .def("stop", &HeterClient::Stop); } +void BindGraphNode(py::module* m) { + py::class_(*m, "GraphNode") + .def(py::init<>()) + .def("get_id", &GraphNode::get_id) + .def("get_feature", &GraphNode::get_feature); +} +void BindGraphPyFeatureNode(py::module* m) { + py::class_(*m, "FeatureNode") + .def(py::init<>()) + .def("get_id", &GraphNode::get_id) + .def("get_feature", &GraphNode::get_feature); +} + +void BindGraphPyService(py::module* m) { + py::class_(*m, "GraphPyService").def(py::init<>()); +} + +void BindGraphPyServer(py::module* m) { + py::class_(*m, "GraphPyServer") + .def(py::init<>()) + .def("start_server", &GraphPyServer::start_server) + .def("set_up", &GraphPyServer::set_up) + .def("add_table_feat_conf", &GraphPyServer::add_table_feat_conf); +} +void BindGraphPyClient(py::module* m) { + py::class_(*m, "GraphPyClient") + .def(py::init<>()) + .def("load_edge_file", &GraphPyClient::load_edge_file) + .def("load_node_file", &GraphPyClient::load_node_file) + .def("set_up", &GraphPyClient::set_up) + .def("add_table_feat_conf", &GraphPyClient::add_table_feat_conf) + .def("pull_graph_list", &GraphPyClient::pull_graph_list) + .def("start_client", &GraphPyClient::start_client) + .def("batch_sample_neighboors", &GraphPyClient::batch_sample_neighboors) + .def("random_sample_nodes", &GraphPyClient::random_sample_nodes) + .def("stop_server", &GraphPyClient::stop_server) + .def("get_node_feat", + [](GraphPyClient& self, std::string node_type, + std::vector node_ids, + std::vector feature_names) { + auto feats = + self.get_node_feat(node_type, node_ids, feature_names); + std::vector> bytes_feats(feats.size()); + for (int i = 0; i < feats.size(); ++i) { + for (int j = 0; j < feats[i].size(); ++j) { + bytes_feats[i].push_back(py::bytes(feats[i][j])); + } + } + return bytes_feats; + }) + .def("bind_local_server", &GraphPyClient::bind_local_server); +} + } // end namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/fleet_py.h b/paddle/fluid/pybind/fleet_py.h index 7f471598ad2..11b430cd208 100644 --- a/paddle/fluid/pybind/fleet_py.h +++ b/paddle/fluid/pybind/fleet_py.h @@ -27,6 +27,10 @@ void BindPSHost(py::module* m); void BindCommunicatorContext(py::module* m); void BindDistCommunicator(py::module* m); void BindHeterClient(py::module* m); - +void BindGraphNode(py::module* m); +void BindGraphPyService(py::module* m); +void BindGraphPyFeatureNode(py::module* m); +void BindGraphPyServer(py::module* m); +void BindGraphPyClient(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d8ee80c0070..29c7f00142d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ 
b/paddle/fluid/pybind/pybind.cc @@ -2896,6 +2896,11 @@ All parameter, weight, gradient are variables in Paddle. BindCommunicatorContext(&m); BindDistCommunicator(&m); BindHeterClient(&m); + BindGraphPyFeatureNode(&m); + BindGraphNode(&m); + BindGraphPyService(&m); + BindGraphPyServer(&m); + BindGraphPyClient(&m); #endif } } // namespace pybind diff --git a/paddle/scripts/build_docker_images.sh b/paddle/scripts/build_docker_images.sh index a90f0885294..2b584cdca6b 100644 --- a/paddle/scripts/build_docker_images.sh +++ b/paddle/scripts/build_docker_images.sh @@ -1,4 +1,19 @@ #!/bin/sh + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -xe REPO="${REPO:-paddlepaddle}" diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh index bdddef5ac2f..c43e88a4acd 100755 --- a/paddle/scripts/docker/root/.scripts/git-completion.sh +++ b/paddle/scripts/docker/root/.scripts/git-completion.sh @@ -1,4 +1,19 @@ #!bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + # # bash/zsh completion support for core Git. # diff --git a/paddle/scripts/fast_install.sh b/paddle/scripts/fast_install.sh index 1034b1c5c10..cacec55d3bc 100644 --- a/paddle/scripts/fast_install.sh +++ b/paddle/scripts/fast_install.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ ## purple to echo function purple(){ echo -e "\033[35m$1\033[0m" diff --git a/python/paddle/fluid/dataloader/fetcher.py b/python/paddle/fluid/dataloader/fetcher.py index 9382a704223..41e12fbc68e 100644 --- a/python/paddle/fluid/dataloader/fetcher.py +++ b/python/paddle/fluid/dataloader/fetcher.py @@ -27,8 +27,8 @@ class _DatasetFetcher(object): class _IterableDatasetFetcher(_DatasetFetcher): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): - super(_IterableDatasetFetcher, self).__init__(dataset, auto_collate_batch, - collate_fn, drop_last) + super(_IterableDatasetFetcher, self).__init__( + dataset, auto_collate_batch, collate_fn, drop_last) self.dataset_iter = iter(dataset) def fetch(self, batch_indices): @@ -53,7 +53,8 @@ class _IterableDatasetFetcher(_DatasetFetcher): class _MapDatasetFetcher(_DatasetFetcher): def __init__(self, dataset, auto_collate_batch, collate_fn, drop_last): - super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch, collate_fn, drop_last) + super(_MapDatasetFetcher, self).__init__(dataset, auto_collate_batch, + collate_fn, drop_last) def fetch(self, batch_indices): if self.auto_collate_batch: diff --git a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh b/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh index 1df6b0618de..cac2f7234bd 100644 --- a/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh +++ b/python/paddle/fluid/incubate/fleet/tests/cluster_train.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ # start pserver0 python fleet_deep_ctr.py \ --role pserver \ diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py index 95cff4de6f6..69a9ae3c0ad 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_squared_mat_sub_fuse_pass.py @@ -40,9 +40,11 @@ class SquaredMatSubFusePassTest(InferencePassTest): matmul_ab_square = paddle.square(matmul_ab) matmul_square_ab = paddle.matmul(data_a_square, data_b_square) - scale = paddle.fluid.layers.fill_constant(shape=[1], value=0.5, dtype='float32') + scale = paddle.fluid.layers.fill_constant( + shape=[1], value=0.5, dtype='float32') - sub_val = paddle.fluid.layers.elementwise_sub(matmul_ab_square, matmul_square_ab) + sub_val = paddle.fluid.layers.elementwise_sub(matmul_ab_square, + matmul_square_ab) squared_mat_sub_out = fluid.layers.elementwise_mul(sub_val, scale) self.feeds = { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py index 94434f40434..080d1ccc905 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -25,19 +25,16 @@ class TensorRTMatMulDims2Test(InferencePassTest): def setUp(self): self.set_params() with fluid.program_guard(self.main_program, self.startup_program): - data = fluid.data( - name="data", shape=[24, 24], dtype="float32") + data = fluid.data(name="data", shape=[24, 24], dtype="float32") matmul_out = fluid.layers.matmul( x=data, y=data, - transpose_x = self.transpose_x, - transpose_y = self.transpose_y, - alpha = self.alpha) + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) - self.feeds = { - "data": np.ones([24, 24]).astype("float32"), - } + self.feeds = {"data": np.ones([24, 24]).astype("float32"), } self.enable_trt = True self.trt_parameters = TensorRTMatMulDims2Test.TensorRTParam( 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) @@ -65,14 +62,12 @@ class TensorRTMatMulTest(InferencePassTest): matmul_out = fluid.layers.matmul( x=data, y=data, - transpose_x = self.transpose_x, - transpose_y = self.transpose_y, - alpha = self.alpha) + transpose_x=self.transpose_x, + transpose_y=self.transpose_y, + alpha=self.alpha) out = fluid.layers.batch_norm(matmul_out, is_test=True) - self.feeds = { - "data": np.ones([1, 6, 24, 24]).astype("float32"), - } + self.feeds = {"data": np.ones([1, 6, 24, 24]).astype("float32"), } self.enable_trt = True self.trt_parameters = TensorRTMatMulTest.TensorRTParam( 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) diff --git a/python/paddle/fluid/tests/unittests/parallel_test.sh b/python/paddle/fluid/tests/unittests/parallel_test.sh index 9da4f035345..551b7cdb7a4 100644 --- a/python/paddle/fluid/tests/unittests/parallel_test.sh +++ b/python/paddle/fluid/tests/unittests/parallel_test.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + unset https_proxy http_proxy export FLAGS_rpc_disable_reuse_port=1 diff --git a/python/paddle/fluid/tests/unittests/test_bce_loss.py b/python/paddle/fluid/tests/unittests/test_bce_loss.py index 4b39436842b..ea1a22780f0 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_loss.py @@ -27,8 +27,10 @@ def test_static_layer(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data( + name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') @@ -58,8 +60,10 @@ def test_static_functional(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=input_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + input = paddle.fluid.data( + name='input', shape=input_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') if weight_np is not None: weight = paddle.fluid.data( name='weight', shape=weight_np.shape, dtype='float64') diff --git a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py index a6175aa471d..153b8fd3e7f 100644 --- a/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py +++ b/python/paddle/fluid/tests/unittests/test_bce_with_logits_loss.py @@ -48,8 +48,10 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data( + name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} pos_weight = None diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh index a9d450e223f..aba95a68ab7 100644 --- a/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh +++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_op.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + set -e # use default values # FIXME: random fails on Unknown command lines -c (or -m). diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py index 16584ee5008..a82866a797d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py @@ -23,7 +23,6 @@ import os paddle.enable_static() - # For Net base_lr = 0.2 emb_lr = base_lr * 3 diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index aa85eb3df35..28803f5ac62 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -170,7 +170,8 @@ class TestFlatten2OpError(unittest.TestCase): x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x2 = x2.astype('float16') - x2_var = paddle.fluid.data(name='x2', shape=[3, 2, 4, 5], dtype='float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') paddle.flatten(x2_var) self.assertRaises(TypeError, test_type) diff --git a/python/paddle/fluid/tests/unittests/test_l1_loss.py b/python/paddle/fluid/tests/unittests/test_l1_loss.py index fba16959901..c35188623b4 100644 --- a/python/paddle/fluid/tests/unittests/test_l1_loss.py +++ b/python/paddle/fluid/tests/unittests/test_l1_loss.py @@ -44,8 +44,10 @@ class TestFunctionalL1Loss(unittest.TestCase): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data( + name='label', shape=[10, 10, 5], dtype='float32') result0 = paddle.nn.functional.l1_loss(input, label) result1 = paddle.nn.functional.l1_loss(input, label, reduction='sum') result2 = paddle.nn.functional.l1_loss(input, label, reduction='none') @@ -127,8 +129,10 @@ class TestClassL1Loss(unittest.TestCase): self.assertTrue(dy_result.shape, [10, 10, 5]) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') - label = paddle.fluid.data(name='label', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') + label = paddle.fluid.data( + name='label', shape=[10, 10, 5], dtype='float32') l1_loss = paddle.nn.loss.L1Loss() result0 = l1_loss(input, label) l1_loss = paddle.nn.loss.L1Loss(reduction='sum') diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh index bee230fba5a..d9d64e4dfa6 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + unset https_proxy http_proxy nohup python -u test_listen_and_serv_op.py > test_listen_and_serv_op.log 2>&1 & diff --git a/python/paddle/fluid/tests/unittests/test_mse_loss.py b/python/paddle/fluid/tests/unittests/test_mse_loss.py index bc5d35d3254..89eef6ca242 100644 --- a/python/paddle/fluid/tests/unittests/test_mse_loss.py +++ b/python/paddle/fluid/tests/unittests/test_mse_loss.py @@ -191,8 +191,10 @@ class TestNNFunctionalMseLoss(unittest.TestCase): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'mean') exe = paddle.static.Executor(place) @@ -225,8 +227,10 @@ class TestNNFunctionalMseLoss(unittest.TestCase): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'sum') exe = paddle.static.Executor(place) @@ -259,8 +263,10 @@ class TestNNFunctionalMseLoss(unittest.TestCase): place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( ) else paddle.CPUPlace() with paddle.static.program_guard(prog, startup_prog): - input = paddle.fluid.data(name='input', shape=dim, dtype='float32') - target = paddle.fluid.data(name='target', shape=dim, dtype='float32') + input = paddle.fluid.data( + name='input', shape=dim, dtype='float32') + target = paddle.fluid.data( + name='target', shape=dim, dtype='float32') mse_loss = paddle.nn.functional.mse_loss(input, target, 'none') exe = paddle.static.Executor(place) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py index 0533a0d09fa..3bb3e843b1b 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_iterable_dataset_dynamic.py @@ -160,5 +160,6 @@ class TestDygraphDataLoaderWithBatchedDataset(TestDygraphDataLoader): print("time cost", ret['time'], 'step_list', ret['step']) return ret + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py 
index f75d6e9df54..f1a409c712f 100644 --- a/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py +++ b/python/paddle/fluid/tests/unittests/test_pixel_shuffle.py @@ -97,8 +97,10 @@ class TestPixelShuffleAPI(unittest.TestCase): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data( + name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64") out_1 = F.pixel_shuffle(x_1, 3) out_2 = F.pixel_shuffle(x_2, 3, "NHWC") @@ -123,8 +125,10 @@ class TestPixelShuffleAPI(unittest.TestCase): place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() paddle.enable_static() - x_1 = paddle.fluid.data(name="x", shape=[2, 9, 4, 4], dtype="float64") - x_2 = paddle.fluid.data(name="x2", shape=[2, 4, 4, 9], dtype="float64") + x_1 = paddle.fluid.data( + name="x", shape=[2, 9, 4, 4], dtype="float64") + x_2 = paddle.fluid.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64") # init instance ps_1 = paddle.nn.PixelShuffle(3) ps_2 = paddle.nn.PixelShuffle(3, "NHWC") diff --git a/python/paddle/fluid/tests/unittests/test_prod_op.py b/python/paddle/fluid/tests/unittests/test_prod_op.py index 15fd79542d6..cdfcbb4e4e7 100644 --- a/python/paddle/fluid/tests/unittests/test_prod_op.py +++ b/python/paddle/fluid/tests/unittests/test_prod_op.py @@ -55,7 +55,8 @@ class TestProdOp(unittest.TestCase): self.assertTrue(np.allclose(dy_result.numpy(), expected_result)) def run_static(self, use_gpu=False): - input = paddle.fluid.data(name='input', shape=[10, 10, 5], dtype='float32') + input = paddle.fluid.data( + name='input', shape=[10, 10, 5], dtype='float32') result0 = paddle.prod(input) result1 = paddle.prod(input, axis=1) result2 = paddle.prod(input, axis=-1) @@ -114,7 +115,8 @@ class TestProdOpError(unittest.TestCase): with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()): x = paddle.fluid.data(name='x', shape=[2, 2, 4], dtype='float32') - bool_x = paddle.fluid.data(name='bool_x', shape=[2, 2, 4], dtype='bool') + bool_x = paddle.fluid.data( + name='bool_x', shape=[2, 2, 4], dtype='bool') # The argument x shoule be a Tensor self.assertRaises(TypeError, paddle.prod, [1]) diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index 95ae1eecc66..e71adae8d9b 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -128,15 +128,18 @@ class TestSeluAPI(unittest.TestCase): # The input type must be Variable. self.assertRaises(TypeError, F.selu, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.selu, x_int32) # The scale must be greater than 1.0 - x_fp32 = paddle.fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[12, 10], dtype='float32') self.assertRaises(ValueError, F.selu, x_fp32, -1.0) # The alpha must be no less than 0 self.assertRaises(ValueError, F.selu, x_fp32, 1.6, -1.0) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.selu(x_fp16) diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py index 85f9501e53f..2ef04d9cbfa 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_focal_loss.py @@ -42,8 +42,10 @@ def test_static(place, prog = paddle.static.Program() startup_prog = paddle.static.Program() with paddle.static.program_guard(prog, startup_prog): - logit = paddle.fluid.data(name='logit', shape=logit_np.shape, dtype='float64') - label = paddle.fluid.data(name='label', shape=label_np.shape, dtype='float64') + logit = paddle.fluid.data( + name='logit', shape=logit_np.shape, dtype='float64') + label = paddle.fluid.data( + name='label', shape=label_np.shape, dtype='float64') feed_dict = {"logit": logit_np, "label": label_np} normalizer = None diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index f72df8cbe46..59b4afdf8b0 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -23,6 +23,7 @@ from paddle.fluid import Program, program_guard paddle.enable_static() + class TestTransposeOp(OpTest): def setUp(self): self.init_op_type() @@ -151,6 +152,7 @@ class TestTransposeOpError(unittest.TestCase): self.assertRaises(ValueError, test_each_elem_value_check) + class TestTransposeApi(unittest.TestCase): def test_static_out(self): paddle.enable_static() @@ -161,10 +163,11 @@ class TestTransposeApi(unittest.TestCase): place = paddle.CPUPlace() exe = paddle.static.Executor(place) x_np = np.random.random([2, 3, 4]).astype("float32") - result1, result2 = exe.run(feed={"x": x_np}, fetch_list=[x_trans1, x_trans2]) + result1, result2 = exe.run(feed={"x": x_np}, + fetch_list=[x_trans1, x_trans2]) expected_result1 = np.transpose(x_np, [1, 0, 2]) expected_result2 = np.transpose(x_np, (2, 1, 0)) - + np.testing.assert_array_equal(result1, expected_result1) np.testing.assert_array_equal(result2, expected_result2) @@ -185,6 +188,7 @@ class TestTransposeApi(unittest.TestCase): # dygraph test paddle.enable_static() + class TestTAPI(unittest.TestCase): def test_out(self): with fluid.program_guard(fluid.Program()): diff --git a/scripts/paddle b/scripts/paddle new file mode 100644 index 00000000000..5f256ccf157 --- /dev/null +++ b/scripts/paddle @@ -0,0 +1,169 @@ +#!/bin/bash + +function version(){ + echo "PaddlePaddle , compiled with" + echo " with_avx: ON" + echo " with_gpu: OFF" + echo " with_mkl: ON" + echo " with_mkldnn: " + echo " with_python: ON" +} + +function ver2num() { + set -e + # convert version to number. 
+ if [ -z "$1" ]; then # empty argument + printf "%03d%03d%03d%03d%03d" 0 + else + local VERN=$(echo $1 | sed 's#v##g' | sed 's#\.# #g' \ + | sed 's#a# 0 #g' | sed 's#b# 1 #g' | sed 's#rc# 2 #g') + if [ `echo $VERN | wc -w` -eq 3 ] ; then + printf "%03d%03d%03d%03d%03d" $VERN 999 999 + else + printf "%03d%03d%03d%03d%03d" $VERN + fi + fi + set +e +} + +function cpu_config() { + # auto set KMP_AFFINITY and OMP_DYNAMIC from Hyper Threading Status + # only when MKL enabled + if [ "ON" == "OFF" ]; then + return 0 + fi + platform="`uname -s`" + ht=0 + if [ $platform == "Linux" ]; then + ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs` + elif [ $platform == "Darwin" ]; then + if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then + # HT is OFF + ht=1 + fi + else + return 0 + fi + if [ $ht -eq 1 ]; then # HT is OFF + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,0,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="FALSE" + fi + else # HT is ON + if [ -z "$KMP_AFFINITY" ]; then + export KMP_AFFINITY="granularity=fine,compact,1,0" + fi + if [ -z "$OMP_DYNAMIC" ]; then + export OMP_DYNAMIC="True" + fi + fi +} + +function threads_config() { + # auto set OMP_NUM_THREADS and MKL_NUM_THREADS + # according to trainer_count and total processors + # only when MKL enabled + # auto set OPENBLAS_NUM_THREADS when do not use MKL + platform="`uname -s`" + processors=0 + if [ $platform == "Linux" ]; then + processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l` + elif [ $platform == "Darwin" ]; then + processors=`sysctl -n hw.logicalcpu` + else + return 0 + fi + trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs` + if [ -z $trainers ]; then + trainers=1 + fi + threads=$((processors / trainers)) + if [ $threads -eq 0 ]; then + threads=1 + fi + if [ "ON" == "ON" ]; then + if [ -z "$OMP_NUM_THREADS" ]; then + export OMP_NUM_THREADS=$threads + fi + if [ -z "$MKL_NUM_THREADS" ]; then + export MKL_NUM_THREADS=$threads + fi + else + if [ -z "$OPENBLAS_NUM_THREADS" ]; then + export OPENBLAS_NUM_THREADS=$threads + fi + if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then + export OPENBLAS_MAIN_FREE=1 + fi + fi + +} + +PADDLE_CONF_HOME="$HOME/.config/paddle" +mkdir -p ${PADDLE_CONF_HOME} + +if [ -z "${PADDLE_NO_STAT+x}" ]; then + SERVER_VER=`curl -m 5 -X POST --data content="{ \"version\": \"\" }"\ + -b ${PADDLE_CONF_HOME}/paddle.cookie \ + -c ${PADDLE_CONF_HOME}/paddle.cookie \ + http://api.paddlepaddle.org/version 2>/dev/null` + if [ $? -eq 0 ] && [ "$(ver2num )" -lt $(ver2num $SERVER_VER) ]; then + echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org" + fi +fi + +PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +if [ ! -z "${DEBUGGER}" ]; then + echo "Using debug command ${DEBUGGER}" +fi + +CUDNN_LIB_PATH="" + +if [ ! -z "${CUDNN_LIB_PATH}" ]; then + export LD_LIBRARY_PATH=${CUDNN_LIB_PATH}:${LD_LIBRARY_PATH} +fi + +export PYTHONPATH=${PWD}:${PYTHONPATH} + + +# Check python lib installed or not. +pip --help > /dev/null +if [ $? -ne 0 ]; then + echo "pip should be installed to run paddle." 
+ exit 1 +fi + +if [ "OFF" == "ON" ]; then + PADDLE_NAME="paddlepaddle-gpu" +else + PADDLE_NAME="paddlepaddle" +fi + +INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'` + +if [ -z "${INSTALLED_VERSION}" ]; then + INSTALLED_VERSION="0.0.0" # not installed +fi +cat <#RUN apt-get update \ diff --git a/tools/document_preview.sh b/tools/document_preview.sh index 10f486f8fd4..83c758d0aa8 100755 --- a/tools/document_preview.sh +++ b/tools/document_preview.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + PADDLE_ROOT=/home mkdir ${PADDLE_ROOT} cd ${PADDLE_ROOT} diff --git a/tools/get_cpu_info.sh b/tools/get_cpu_info.sh index 81eb19dc066..bce338a8619 100755 --- a/tools/get_cpu_info.sh +++ b/tools/get_cpu_info.sh @@ -1,5 +1,19 @@ #!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + if [ "`uname -s`" != "Linux" ]; then echo "Current scenario only support in Linux yet!" exit 0 -- GitLab From d9187869d60049525e1bb6dcab0cb1f9869e55cb Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 2 Apr 2021 14:05:16 +0800 Subject: [PATCH 144/486] update trt engine addplugin name. (#32018) * update trt engine addplugin name. 
* update --- paddle/fluid/inference/tensorrt/convert/elementwise_op.cc | 2 +- .../inference/tensorrt/convert/emb_eltwise_layernorm.cc | 2 +- paddle/fluid/inference/tensorrt/convert/gelu_op.cc | 2 +- .../fluid/inference/tensorrt/convert/multihead_matmul_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/prelu_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc | 2 +- paddle/fluid/inference/tensorrt/convert/slice_op.cc | 6 +++--- paddle/fluid/inference/tensorrt/convert/split_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/stack_op.cc | 2 +- paddle/fluid/inference/tensorrt/convert/swish_op.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +++--- .../fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu | 1 + 13 files changed, 17 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index dfadb28a652..74057addecd 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -251,7 +251,7 @@ class ElementwiseTensorOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::ElementwisePluginDynamic* plugin = new plugin::ElementwisePluginDynamic(op_type_, axis); - layer = engine_->AddPluginV2(itensors.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 7f8843a3f67..957dfe03698 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -169,7 +169,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { plugin = new plugin::EmbEltwiseLayernormPluginDynamic( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, eps, with_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index 4c9996ca02c..ca5b6a8b52e 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -64,7 +64,7 @@ class GeluOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::GeluPluginDynamic* plugin = new plugin::GeluPluginDynamic(with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index 8ce46a19d4b..20086465491 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -227,7 +227,7 @@ class MultiheadMatMulOpConverter : public OpConverter { plugin::DynamicPluginTensorRT* plugin = new 
plugin::QkvToContextPluginDynamic(hidden_in, head_number, head_size, scale, with_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index aa4e54b5845..c10072602d7 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -147,7 +147,7 @@ class Pool2dOpConverter : public OpConverter { plugin::PoolPluginDynamic *plugin = new plugin::PoolPluginDynamic(ceil_mode, pool_type, adaptive, ksize, strides, paddings, global_pooling); - layer = engine_->AddPluginV2(&input1, 1, plugin); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); #endif } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 5e881ecbbc4..74d77d8be44 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -65,7 +65,7 @@ class PReluOpConverter : public OpConverter { #if IS_TRT_VERSION_GE(6000) plugin::PReluPluginDynamic* plugin = new plugin::PReluPluginDynamic( alpha_data, alpha_tensor_temp->numel(), mode); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 2e4a4e6120d..3db7709acc2 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -90,7 +90,7 @@ class SkipLayerNormOpConverter : public OpConverter { plugin::SkipLayerNormPluginDynamic* plugin = new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, scale_size, eps, with_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 0bd2b8c9bf5..38521d25641 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -90,14 +90,14 @@ class SliceOpConverter : public OpConverter { // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = new plugin::SpecialSlicePluginDynamic(); - layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), - plugin); + layer = engine_->AddDynamicPlugin(plugin_inputs.data(), + plugin_inputs.size(), plugin); } else { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, with_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + layer = engine_->AddDynamicPlugin(&input, 1, plugin); } #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 5d494c2093b..75b317e7bfd 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ 
b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -90,7 +90,7 @@ class SplitOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SplitPluginDynamic* plugin = new plugin::SplitPluginDynamic(axis, output_lengths, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index 1c971fa12e2..d538c58879d 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -59,7 +59,7 @@ class StackOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::StackPluginDynamic* plugin = new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddPluginV2(inputs, input_num, plugin); + layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); assert(layer != nullptr); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 25944a2fead..b2e394d14eb 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -65,7 +65,7 @@ class SwishOpConverter : public OpConverter { engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPluginDynamic* plugin = new plugin::SwishPluginDynamic(beta, with_fp16); - layer = engine_->AddPluginV2(&input, input_num, plugin); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index de2924824f0..2358e1ef976 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -378,9 +378,9 @@ class TensorRTEngine { bool with_dynamic_shape() { return with_dynamic_shape_; } #if IS_TRT_VERSION_GE(6000) - nvinfer1::IPluginV2Layer* AddPluginV2(nvinfer1::ITensor* const* inputs, - int num_inputs, - plugin::DynamicPluginTensorRT* plugin) { + nvinfer1::IPluginV2Layer* AddDynamicPlugin( + nvinfer1::ITensor* const* inputs, int num_inputs, + plugin::DynamicPluginTensorRT* plugin) { owned_pluginv2_.emplace_back(plugin); return network()->addPluginV2(inputs, num_inputs, *plugin); } diff --git a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu index 42c0df41a1b..6e7ed0054f5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/roi_align_op_plugin.cu @@ -364,6 +364,7 @@ RoiAlignPluginDynamicCreator::getFieldNames() { nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::createPlugin( const char* name, const nvinfer1::PluginFieldCollection* fc) { const nvinfer1::PluginField* fields = fc->fields; + return nullptr; } nvinfer1::IPluginV2Ext* RoiAlignPluginDynamicCreator::deserializePlugin( -- GitLab From ed49b4181987a44ccaf7d76eade58daa194bc884 Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 2 Apr 2021 14:11:09 +0800 Subject: [PATCH 145/486] update plugin creator name (#32021) --- 
.../inference/tensorrt/plugin/elementwise_op_plugin.h | 6 +++--- .../tensorrt/plugin/emb_eltwise_layernorm_plugin.h | 7 ++++--- paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h | 6 +++--- .../inference/tensorrt/plugin/qkv_to_context_plugin.h | 6 +++--- .../inference/tensorrt/plugin/skip_layernorm_op_plugin.h | 6 +++--- paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h | 6 +++--- paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h | 6 +++--- paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h | 6 +++--- 8 files changed, 25 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 49212aae9aa..75a1dd85f0f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -144,9 +144,9 @@ class ElementwisePluginDynamic : public DynamicPluginTensorRT { int axis_; }; -class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { +class ElementwisePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - ElementwisePluginV2Creator() {} + ElementwisePluginDynamicCreator() {} const char* getPluginName() const override { return "elementwise_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -182,7 +182,7 @@ class ElementwisePluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(ElementwisePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(ElementwisePluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index 6c8381a750c..7de84a8fc49 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -306,9 +306,10 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { } }; -class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { +class EmbEltwiseLayernormPluginDynamicCreator + : public nvinfer1::IPluginCreator { public: - EmbEltwiseLayernormPluginV2Creator() {} + EmbEltwiseLayernormPluginDynamicCreator() {} const char* getPluginName() const override { return "fused_embedding_eltwise_layernorm_plugin"; } @@ -345,7 +346,7 @@ class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index 979f600a3a9..23e507ee477 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -115,9 +115,9 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { void destroy() override { delete this; } }; -class GeluPluginV2Creator : public nvinfer1::IPluginCreator { +class GeluPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - GeluPluginV2Creator() {} + GeluPluginDynamicCreator() {} const char* getPluginName() const override { return "gelu_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -153,7 +153,7 @@ class GeluPluginV2Creator : public nvinfer1::IPluginCreator { std::vector 
plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(GeluPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index b852f5a454c..7147d985575 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -118,9 +118,9 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; }; -class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { +class QkvToContextPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - QkvToContextPluginV2Creator() {} + QkvToContextPluginDynamicCreator() {} const char* getPluginName() const override { return "qkv_to_context_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index 0e457fdc8f4..ac621784550 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -119,9 +119,9 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; }; -class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { +class SkipLayerNormPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SkipLayerNormPluginV2Creator() {} + SkipLayerNormPluginDynamicCreator() {} const char* getPluginName() const override { return "skip_layernorm_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -156,7 +156,7 @@ class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h index 340406c5e7f..9d4f9a35c3b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h @@ -121,9 +121,9 @@ class SlicePluginDynamic : public DynamicPluginTensorRT { cudaStream_t copy_stream_; }; -class SlicePluginV2Creator : public nvinfer1::IPluginCreator { +class SlicePluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SlicePluginV2Creator() {} + SlicePluginDynamicCreator() {} const char* getPluginName() const override { return "slice_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -155,7 +155,7 @@ class SlicePluginV2Creator : public nvinfer1::IPluginCreator { nvinfer1::PluginFieldCollection field_collection_; }; -REGISTER_TRT_PLUGIN_V2(SlicePluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SlicePluginDynamicCreator); #endif diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index e43b57357fb..1ee895154d6 100644 --- 
a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -193,9 +193,9 @@ class SplitPluginDynamic : public DynamicPluginTensorRT { std::vector output_length_; }; -class SplitPluginV2Creator : public nvinfer1::IPluginCreator { +class SplitPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SplitPluginV2Creator() {} + SplitPluginDynamicCreator() {} const char* getPluginName() const override { return "split_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -231,7 +231,7 @@ class SplitPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SplitPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SplitPluginDynamicCreator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index 85cc6916238..11579aadcc4 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -127,9 +127,9 @@ class SwishPluginDynamic : public DynamicPluginTensorRT { float beta_; }; -class SwishPluginV2Creator : public nvinfer1::IPluginCreator { +class SwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: - SwishPluginV2Creator() {} + SwishPluginDynamicCreator() {} const char* getPluginName() const override { return "swish_plugin"; } const char* getPluginVersion() const override { return "1"; } @@ -165,7 +165,7 @@ class SwishPluginV2Creator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; -REGISTER_TRT_PLUGIN_V2(SwishPluginV2Creator); +REGISTER_TRT_PLUGIN_V2(SwishPluginDynamicCreator); #endif } // namespace plugin -- GitLab From cd74b20759d9f41c5a0064d6c9ecdaf160594ff0 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Fri, 2 Apr 2021 14:46:14 +0800 Subject: [PATCH 146/486] Add more ops to calculate output scales (#32036) --- .../slim/quantization/imperative/qat.py | 2 +- .../slim/quantization/imperative/utils.py | 34 +++++++------------ 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index f4620ff0001..66b11d1f17a 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -468,7 +468,7 @@ class ImperativeQuantizeOutputs(object): """ Whether the layer needs to calculate output scales. 
""" - return isinstance(layer, tuple(utils.quant_output_layers_map.values())) \ + return isinstance(layer, utils.quant_output_layers) \ or ('quantized' in layer.full_name() and \ 'quantized_noweight' not in layer.full_name()) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py index f45eb8c97f4..004e1c1aa9b 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/utils.py @@ -43,28 +43,18 @@ fake_quantize_dequantize_types = [ "fake_quantize_dequantize_moving_average_abs_max" ] -quant_output_layers_map = { - 'Conv2D': paddle.nn.Conv2D, - 'Conv2DTranspose': paddle.nn.Conv2DTranspose, - 'Linear': paddle.nn.Linear, - 'AdaptiveAvgPool2D': paddle.nn.AdaptiveAvgPool2D, - 'AdaptiveMaxPool2D': paddle.nn.AdaptiveMaxPool2D, - 'AvgPool2D': paddle.nn.AvgPool2D, - 'MaxPool2D': paddle.nn.MaxPool2D, - 'BatchNorm': paddle.nn.BatchNorm, - 'BatchNorm2D': paddle.nn.BatchNorm2D, - 'SyncBatchNorm': paddle.nn.SyncBatchNorm, - 'ELU': paddle.nn.ELU, - 'GELU': paddle.nn.GELU, - 'LeakyReLU': paddle.nn.LeakyReLU, - 'PReLU': paddle.nn.PReLU, - 'ReLU': paddle.nn.ReLU, - 'ReLU6': paddle.nn.ReLU6, - 'Sigmoid': paddle.nn.Sigmoid, - 'Softmax': paddle.nn.Softmax, - 'Tanh': paddle.nn.Tanh, - 'Swish': paddle.nn.Swish, -} +quant_output_layers = ( + paddle.nn.Conv2D, paddle.nn.Conv2DTranspose, paddle.nn.Linear, + paddle.nn.AdaptiveAvgPool2D, paddle.nn.AdaptiveMaxPool2D, + paddle.nn.AvgPool2D, paddle.nn.MaxPool2D, paddle.nn.BatchNorm, + paddle.nn.BatchNorm2D, paddle.nn.LayerNorm, paddle.nn.SyncBatchNorm, + paddle.nn.ELU, paddle.nn.GELU, paddle.nn.Hardshrink, paddle.nn.Hardsigmoid, + paddle.nn.Hardswish, paddle.nn.Hardtanh, paddle.nn.LeakyReLU, + paddle.nn.LogSigmoid, paddle.nn.LogSoftmax, paddle.nn.Maxout, + paddle.nn.PReLU, paddle.nn.ReLU, paddle.nn.ReLU6, paddle.nn.SELU, + paddle.nn.Sigmoid, paddle.nn.Softmax, paddle.nn.Softplus, + paddle.nn.Softshrink, paddle.nn.Softsign, paddle.nn.Swish, paddle.nn.Tanh, + paddle.nn.Tanhshrink, paddle.nn.ThresholdedReLU, paddle.nn.Upsample) weight_op_types = [ "conv2d", "depthwise_conv2d", "matmul", "conv2d_transpose", -- GitLab From bf10d5634ff4b20b3c1ff1f956510c6d72ce2be0 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Fri, 2 Apr 2021 17:22:49 +0800 Subject: [PATCH 147/486] fix decorator in py2 (#32043) --- python/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index e89b3ede94f..609a4b34e8f 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -7,5 +7,5 @@ gast>=0.3.3 ; platform_system != "Windows" gast==0.3.3 ; platform_system == "Windows" Pillow six -decorator +decorator==4.4.2 astor -- GitLab From 43367e4b68e8cd9e10f374c867c408409d4060cd Mon Sep 17 00:00:00 2001 From: WeiXin Date: Fri, 2 Apr 2021 17:42:30 +0800 Subject: [PATCH 148/486] support save/load single tensor (#31756) * support save/load single tensor * compatibility modification according to unnittest * Some python2.7 don't have 'copyreg' modules * Handle a syntax error. * Dealing with compatibility problems on Mac. * Dealing with compatibility problems on Mac. * edit unittest to improve coverage. * Modify the code according to the review comments * Reduce redundant code. 
* support for static graph loading dygraph state_dict * edit code according to CI * edit unittest * edit unnittest * delete redundant file * edit code according to Comments * edit english doc * edit english doc * edit English DOC. * get/set_tensor->get/set_value; return_numpy=False * get/set_tensor->get/set_value; return_numpy=False * edit unnittest * edit unnittest * polish code. --- python/paddle/fluid/dygraph/layers.py | 8 +- python/paddle/fluid/framework.py | 352 +++++++++++++++-- python/paddle/fluid/io.py | 82 +++- .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/test_imperative_save_load_v2.py | 2 +- .../tests/unittests/test_paddle_save_load.py | 254 +++++++++++- .../unittests/test_static_save_load_large.py | 17 +- .../fluid/tests/unittests/test_variable.py | 1 - python/paddle/framework/io.py | 362 +++++++++++++++++- .../static_mode_white_list.cpython-37.pyc | Bin 0 -> 20217 bytes tools/static_mode_white_list.pyc | Bin 21803 -> 0 bytes 11 files changed, 998 insertions(+), 82 deletions(-) create mode 100644 tools/__pycache__/static_mode_white_list.cpython-37.pyc delete mode 100644 tools/static_mode_white_list.pyc diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 3df0c608527..36637abc6d0 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -22,6 +22,8 @@ import copy import weakref import warnings from copy import deepcopy +import inspect + import paddle from . import parallel_helper @@ -1294,10 +1296,12 @@ class Layer(core.Layer): if state is None: raise ValueError("{} is not found in the provided dict.".format( key)) - if list(state.shape) != list(param.shape): + state_shape = state.shape() if inspect.ismethod( + state.shape) else state.shape + if list(state_shape) != list(param.shape): raise ValueError( "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state.shape), list(param.shape))) + format(key, list(state_shape), list(param.shape))) return param, state matched_param_state = [] diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index be795b9e59c..d5c01d20a91 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -24,6 +24,7 @@ import re import traceback import six import copy +from types import MethodType, FunctionType import numpy as np import subprocess @@ -1183,37 +1184,6 @@ class Variable(object): """ pass - @fake_interface_only - def set_value(self, value): - """ - **Notes**: - **This API is ONLY available in Dygraph mode** - - Set a new value for this Variable. - - Args: - value (Variable|np.ndarray): the new value. - - Examples: - .. code-block:: python - - import paddle.fluid as fluid - from paddle.fluid.dygraph.base import to_variable - from paddle.fluid.dygraph import Linear - import numpy as np - - data = np.ones([3, 1024], dtype='float32') - with fluid.dygraph.guard(): - linear = fluid.dygraph.Linear(1024, 4) - t = to_variable(data) - linear(t) # call with default weight - custom_weight = np.random.randn(1024, 4).astype("float32") - linear.weight.set_value(custom_weight) # change existing weight - out = linear(t) # call with different weight - - """ - pass - @fake_interface_only def backward(self, retain_graph=False): """ @@ -2011,6 +1981,159 @@ class Variable(object): return self + def get_value(self, scope=None): + """ + Get the value of variable in given scope. 
+ + Args: + scope(Scope, optional) : If `scope` is None, it will be set to global scope + obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. + Default: None + + Returns: + Tensor: the value in given scope. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + import numpy as np + + paddle.enable_static() + + x = static.data(name="x", shape=[10, 10], dtype='float32') + + y = static.nn.fc(x, 10, name='fc') + place = paddle.CPUPlace() + exe = static.Executor(place) + prog = paddle.static.default_main_program() + exe.run(static.default_startup_program()) + inputs = np.ones((10, 10), dtype='float32') + exe.run(prog, feed={'x': inputs}, fetch_list=[y, ]) + path = 'temp/tensor_' + for var in prog.list_vars(): + if var.persistable: + t = var.get_value() + paddle.save(t, path+var.name+'.pdtensor') + + for var in prog.list_vars(): + if var.persistable: + t_load = paddle.load(path+var.name+'.pdtensor') + var.set_value(t_load) + """ + # The 'framework' is a low-level module, and 'executor' + # can not be imported at the begainning of this file. + # Therefore, the above two modules are dynamically imported. + from .executor import global_scope + if scope is not None and not isinstance(scope, core._Scope): + raise TypeError( + "`scope` should be None or `paddle.static.Scope` type, but received {}.". + format(type(scope))) + + if scope is None: + scope = global_scope() + var_temp = scope.find_var(self.name) + if var_temp is None: + raise ValueError("Can not find Variable '{}' in the Scope.".format( + self.name)) + t = var_temp.get_tensor() + return t + + def set_value(self, value, scope=None): + ''' + Set the value to the tensor in given scope. + + Args: + value(Tensor/ndarray) : The value to be set. + scope(Scope, optional) : If `scope` is None, it will be set to global scope + obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + import numpy as np + + paddle.enable_static() + + x = static.data(name="x", shape=[10, 10], dtype='float32') + + y = static.nn.fc(x, 10, name='fc') + place = paddle.CPUPlace() + exe = static.Executor(place) + prog = paddle.static.default_main_program() + exe.run(static.default_startup_program()) + inputs = np.ones((10, 10), dtype='float32') + exe.run(prog, feed={'x': inputs}, fetch_list=[y, ]) + path = 'temp/tensor_' + for var in prog.list_vars(): + if var.persistable: + t = var.get_value() + paddle.save(t, path+var.name+'.pdtensor') + + for var in prog.list_vars(): + if var.persistable: + t_load = paddle.load(path+var.name+'.pdtensor') + var.set_value(t_load) + ''' + + # The 'framework' is a low-level module, and 'executor' + # can not be imported at the begainning of this file. + # Therefore, the above two modules are dynamically imported. + from .executor import global_scope + + if not (isinstance(value, np.ndarray) or hasattr(value, '__array__')): + raise TypeError( + "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}.". + format(type(value))) + + if scope is not None and not isinstance(scope, core._Scope): + raise TypeError( + "`scope` should be None or `paddle.static.Scope` type, but received {}.". 
+ format(type(scope))) + + if scope is None: + scope = global_scope() + + var_temp = scope.find_var(self.name) + if var_temp is None: + raise ValueError("Can not find Variable '{}' in the Scope.".format( + self.name)) + + t = var_temp.get_tensor() + + if hasattr(value, 'shape'): + if isinstance(value.shape, (MethodType, FunctionType)): + value_shape = value.shape() + else: + value_shape = value.shape + if list(t.shape()) != list(value_shape): + raise ValueError( + "{} expected a shape {}, but the received shape is {}.". + format(self.name, list(t.shape()), list(value_shape))) + + p = t._place() + if p.is_cpu_place(): + place = core.CPUPlace() + elif p.is_cuda_pinned_place(): + place = core.CUDAPinnedPlace() + elif p.is_xpu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.XPUPlace(p.xpu_device_id()) + else: + p = core.Place() + p.set_place(t._place()) + place = core.CUDAPlace(p.gpu_device_id()) + + t.set(value, place) + def get_all_op_protos(): """ @@ -5319,6 +5442,173 @@ class Program(object): parameters.extend(each_block.all_parameters()) return parameters + def state_dict(self, mode='all', scope=None): + """ + Get parameters and persistable buffers of program as a dict. The key is the name of the parameter or the name of the buffer. + The value is the tensor of this variable in the given scope. + + .. note:: + This function MUST called after run start_up_program + + Args: + mode(str, optional): Source of the obtained parameters and buffers. + 'opt' : The return value only contains the variable in the optimizer. + 'param' : The return value only contains the variable in the network, not the variable in the optimizer. + 'all' : The return value contains the variable in the network and optimizer. + Default: 'all' + scope(Scope, optional) : If scope is None, state_dict will be set to global scope + obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. + Default: None + + Retruns: + dict: a dict contains the parameters and persistable buffers. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + x = static.data(name="x", shape=[10, 10], dtype='float32') + y = static.nn.fc(x, 10) + z = static.nn.fc(y, 10) + + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + prog = static.default_main_program() + + path = "./temp/model.pdparams" + paddle.save(prog.state_dict(), path) + """ + # The 'framework' is a low-level module, and 'executor' + # can not be imported at the begainning of this file. + # Therefore, the above two modules are dynamically imported. + from .executor import global_scope + if scope is not None and not isinstance(scope, core._Scope): + raise TypeError( + "`scope` should be None or `paddle.static.Scope'` type, but received {}.". + format(type(scope))) + + if scope is None: + scope = global_scope() + + if not isinstance(mode, str): + raise TypeError("Type of `mode` should be string, but received {}.". 
+ format(type(mode))) + + def is_parameter(var): + return isinstance(var, Parameter) + + def is_persistable(var): + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.READER: + return False + return var.persistable + + def is_belong_to_optimizer(var): + if not (isinstance(var, Parameter) or var.desc.need_check_feed()): + return is_persistable(var) + return False + + def condition(var): + + if mode == 'param': + return is_parameter(var) + elif mode == 'opt': + return is_belong_to_optimizer(var) + elif mode == 'all': + return is_parameter(var) or is_belong_to_optimizer(var) + else: + raise ValueError( + "`mode` string should be 'param', 'opt' or 'all', but received {}.". + format(mode)) + + var_list = filter(condition, self.list_vars()) + + state_dict = dict() + for var in var_list: + var_temp = scope.find_var(var.name) + if var_temp is None: + raise ValueError( + "Can not find Variable '{}' in the scope. Make sure it is initialized". + format(var.name)) + state_dict[var.name] = var_temp.get_tensor() + + return state_dict + + def set_state_dict(self, state_dict, scope=None): + """ + Set parameters and persistable buffers in state_dict to program. + An exception will throw if shape or dtype of the parameters is not match. + + .. note:: + This function MUST called after run start_up_program + + Args: + state_dict(dict): the dict store parameters and persistable buffers. + The key is the name of the parameter or the name of the buffer. + The value is the tensor of this variable in the given scope. + scope(Scope, optional) : If scope is None, state_dict will be set to global scope + obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. + Default: None + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + x = static.data(name="x", shape=[10, 10], dtype='float32') + y = static.nn.fc(x, 10) + z = static.nn.fc(y, 10) + + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + prog = static.default_main_program() + + path = "./temp/model.pdparams" + paddle.save(prog.state_dict(), path) + state_dict_load = paddle.load(path) + prog.set_state_dict(state_dict_load) + """ + + if not isinstance(state_dict, dict): + raise TypeError( + "Type of `state_dict` should be dict, but received {}.".format( + type(state_dict))) + + vars_dict = {var.name: var for var in self.list_vars()} + condition = True if 'StructuredToParameterName@@' in state_dict else False + for name, value in state_dict.items(): + if condition: + if name == "StructuredToParameterName@@": + continue + if name in state_dict['StructuredToParameterName@@']: + name = state_dict['StructuredToParameterName@@'][name] + if name in vars_dict: + try: + vars_dict[name].set_value(value, scope) + except ValueError as err: + warnings.warn( + ("Skip loading for '{}'. ".format(name) + str(err))) + except TypeError as err: + warnings.warn( + ("Skip loading for '{}'. ".format(name) + str(err))) + else: + warnings.warn(( + "Skip loading for '{0}'. Because '{0}' not in the program.". 
+ format(name))) + @six.add_metaclass(ParameterMetaClass) class Parameter(Variable): diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 9cca3e16de5..cfb4b125993 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1765,7 +1765,30 @@ def _pack_loaded_dict(load_obj): @static_only -def save(program, model_path, pickle_protocol=2): +def _legacy_save(param_dict, model_path, protocol=2): + def get_tensor(var): + if isinstance(var, core.VarBase): + return var.numpy() + elif isinstance(var, core.LoDTensor): + return np.array(var) + return var + + param_dict = {name: get_tensor(param_dict[name]) for name in param_dict} + + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + pickle_bytes = pickle.dumps(param_dict, protocol=protocol) + with open(model_path, 'wb') as f: + max_bytes = 2**30 + for i in range(0, len(pickle_bytes), max_bytes): + f.write(pickle_bytes[i:i + max_bytes]) + else: + with open(model_path, 'wb') as f: + pickle.dump(param_dict, f, protocol=protocol) + + +@static_only +def save(program, model_path, protocol=2, **configs): """ :api_attr: Static Graph @@ -1778,8 +1801,9 @@ def save(program, model_path, pickle_protocol=2): Args: program(Program) : The program to saved. model_path(str): the file prefix to save the program. The format is "dirname/file_prefix". If file_prefix is empty str. A exception will be raised - pickle_protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. + protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 2 + configs(dict, optional) : optional keyword arguments. Returns: None @@ -1807,14 +1831,19 @@ def save(program, model_path, pickle_protocol=2): base_name = os.path.basename(model_path) assert base_name != "", \ "The input model_path MUST be format of dirname/filename [dirname\\filename in Windows system], but received model_path is empty string." + if 'pickle_protocol' in configs: + protocol = configs['pickle_protocol'] + warnings.warn( + "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead." + ) - if not isinstance(pickle_protocol, int): + if not isinstance(protocol, int): raise ValueError("The 'protocol' MUST be `int`, but received {}".format( - type(pickle_protocol))) + type(protocol))) - if pickle_protocol < 2 or pickle_protocol > 4: + if protocol < 2 or protocol > 4: raise ValueError("Expected 1<'protocol'<5, but received protocol={}". 
- format(pickle_protocol)) + format(protocol)) dir_name = os.path.dirname(model_path) if dir_name and not os.path.exists(dir_name): @@ -1827,26 +1856,25 @@ def save(program, model_path, pickle_protocol=2): parameter_list = list(filter(is_parameter, program.list_vars())) param_dict = {p.name: get_tensor(p) for p in parameter_list} - param_dict = _unpack_saved_dict(param_dict, pickle_protocol) + param_dict = _unpack_saved_dict(param_dict, protocol) - # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3.5/6' - if sys.platform == 'darwin' and sys.version_info.major == 3 and ( - sys.version_info.minor == 5 or sys.version_info.minor == 6): - pickle_bytes = pickle.dumps(param_dict, protocol=pickle_protocol) + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + pickle_bytes = pickle.dumps(param_dict, protocol=protocol) with open(model_path + ".pdparams", 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: with open(model_path + ".pdparams", 'wb') as f: - pickle.dump(param_dict, f, protocol=pickle_protocol) + pickle.dump(param_dict, f, protocol=protocol) optimizer_var_list = list( filter(is_belong_to_optimizer, program.list_vars())) opt_dict = {p.name: get_tensor(p) for p in optimizer_var_list} with open(model_path + ".pdopt", 'wb') as f: - pickle.dump(opt_dict, f, protocol=pickle_protocol) + pickle.dump(opt_dict, f, protocol=protocol) main_program = program.clone() program.desc.flush() @@ -1857,6 +1885,17 @@ def save(program, model_path, pickle_protocol=2): f.write(program.desc.serialize_to_string()) +def _pickle_loads_mac(path, f): + pickle_bytes = bytearray(0) + file_size = os.path.getsize(path) + max_bytes = 2**30 + for _ in range(0, file_size, max_bytes): + pickle_bytes += f.read(max_bytes) + load_result = pickle.loads(pickle_bytes) if six.PY2 else pickle.loads( + pickle_bytes, encoding='latin1') + return load_result + + @static_only def load(program, model_path, executor=None, var_list=None): """ @@ -2016,8 +2055,13 @@ def load(program, model_path, executor=None, var_list=None): global_scope(), executor._default_executor) with open(parameter_file_name, 'rb') as f: - load_dict = pickle.load(f) if six.PY2 else pickle.load( - f, encoding='latin1') + + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + load_dict = _pickle_loads_mac(parameter_file_name, f) + else: + load_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') load_dict = _pack_loaded_dict(load_dict) for v in parameter_list: assert v.name in load_dict, \ @@ -2196,8 +2240,12 @@ def load_program_state(model_path, var_list=None): "Parameter file [{}] not exits".format(parameter_file_name) with open(parameter_file_name, 'rb') as f: - para_dict = pickle.load(f) if six.PY2 else pickle.load( - f, encoding='latin1') + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + para_dict = _pickle_loads_mac(parameter_file_name, f) + else: + para_dict = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') para_dict = _pack_loaded_dict(para_dict) opt_file_name = model_prefix + ".pdopt" diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 28f5177c204..add3bbee41d 100644 --- 
a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -726,7 +726,7 @@ if (WIN32) set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250) else() set_tests_properties(test_static_save_load_large PROPERTIES TIMEOUT 600) - set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 150) + set_tests_properties(test_paddle_save_load PROPERTIES TIMEOUT 250) endif() set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 120) set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 672ffa9d394..9f0dcdb4d8f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -930,7 +930,7 @@ class TestDygraphPtbRnn(unittest.TestCase): paddle.save(state_dict, os.path.join('saved_dy', 'emb_dy.pdparams')) para_state_dict = paddle.load( - os.path.join('saved_dy', 'emb_dy.pdparams')) + os.path.join('saved_dy', 'emb_dy.pdparams'), return_numpy=True) para_state_dict['weight'] = np.expand_dims( para_state_dict['weight'], axis=-1) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index 06f63d1416b..b58d63969a5 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -18,10 +18,15 @@ import unittest import numpy as np import os import sys +import six import paddle import paddle.nn as nn import paddle.optimizer as opt +import paddle.fluid as fluid +from paddle.fluid.optimizer import Adam +import paddle.fluid.framework as framework +from test_imperative_base import new_program_scope BATCH_SIZE = 16 BATCH_NUM = 4 @@ -31,7 +36,10 @@ SEED = 10 IMAGE_SIZE = 784 CLASS_NUM = 10 -LARGE_PARAM = 2**26 +if six.PY2: + LARGE_PARAM = 2**2 +else: + LARGE_PARAM = 2**26 def random_batch_reader(): @@ -95,15 +103,22 @@ class TestSaveLoadLargeParameters(unittest.TestCase): path = os.path.join("test_paddle_save_load_large_param_save", "layer.pdparams") - paddle.save(layer.state_dict(), path) + if six.PY2: + protocol = 2 + else: + protocol = 4 + paddle.save(save_dict, path, protocol=protocol) dict_load = paddle.load(path) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue(np.array_equal(dict_load[key], value.numpy())) + self.assertTrue( + np.array_equal(dict_load[key].numpy(), value.numpy())) class TestSaveLoadPickle(unittest.TestCase): def test_pickle_protocol(self): + # enable dygraph mode + paddle.disable_static() # create network layer = LinearNet() save_dict = layer.state_dict() @@ -124,11 +139,236 @@ class TestSaveLoadPickle(unittest.TestCase): if sys.version_info.major >= 3 and sys.version_info.minor >= 4: protocols += [3, 4] for protocol in protocols: - paddle.save(save_dict, path, protocol) + paddle.save(save_dict, path, pickle_protocol=protocol) dict_load = paddle.load(path) # compare results before and after saving for key, value in save_dict.items(): - self.assertTrue(np.array_equal(dict_load[key], value.numpy())) + self.assertTrue( + np.array_equal(dict_load[key].numpy(), value.numpy())) + + +class TestSaveLoadAny(unittest.TestCase): + def set_zero(self, prog, place, scope=None): + if scope is None: + scope = fluid.global_scope() + for var in 
prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + ten = scope.find_var(var.name).get_tensor() + if ten is not None: + ten.set(np.zeros_like(np.array(ten)), place) + new_t = np.array(scope.find_var(var.name).get_tensor()) + self.assertTrue(np.sum(np.abs(new_t)) == 0) + + def replace_static_save(self, program, model_path, pickle_protocol=2): + with self.assertRaises(TypeError): + program.state_dict(1) + with self.assertRaises(TypeError): + program.state_dict(scope=1) + with self.assertRaises(ValueError): + program.state_dict('x') + state_dict_param = program.state_dict('param') + paddle.save(state_dict_param, model_path + '.pdparams') + state_dict_opt = program.state_dict('opt') + paddle.save(state_dict_opt, model_path + '.pdopt') + state_dict_all = program.state_dict() + paddle.save(state_dict_opt, model_path + '.pdall') + + def replace_static_load(self, program, model_path): + with self.assertRaises(TypeError): + program.set_state_dict(1) + state_dict_param = paddle.load(model_path + '.pdparams') + state_dict_param['fake_var_name.@@'] = np.random.randn(1, 2) + state_dict_param['static_x'] = 'UserWarning' + program.set_state_dict(state_dict_param) + state_dict_param['static_x'] = np.random.randn(1, 2) + program.set_state_dict(state_dict_param) + program.set_state_dict(state_dict_param) + state_dict_opt = paddle.load(model_path + '.pdopt') + program.set_state_dict(state_dict_opt) + + def test_replace_static_save_load(self): + paddle.enable_static() + with new_program_scope(): + x = paddle.static.data( + name="static_x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 10) + z = paddle.static.nn.fc(z, 10, bias_attr=False) + loss = fluid.layers.reduce_mean(z) + opt = Adam(learning_rate=1e-3) + opt.minimize(loss) + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + fake_inputs = np.random.randn(2, IMAGE_SIZE).astype('float32') + exe.run(prog, feed={'static_x': fake_inputs}, fetch_list=[loss]) + base_map = {} + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_map[var.name] = t + path = os.path.join("test_replace_static_save_load", "model") + # paddle.save, legacy paddle.fluid.load + self.replace_static_save(prog, path) + self.set_zero(prog, place) + paddle.fluid.io.load(prog, path) + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, np.array(base_t))) + # legacy paddle.fluid.save, paddle.load + paddle.fluid.io.save(prog, path) + self.set_zero(prog, place) + self.replace_static_load(prog, path) + for var in prog.list_vars(): + if isinstance(var, framework.Parameter) or var.persistable: + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, base_t)) + # test for return tensor + path_vars = 'test_replace_save_load_return_tensor_static/model' + for var in prog.list_vars(): + if var.persistable: + tensor = var.get_value(fluid.global_scope()) + paddle.save(tensor, os.path.join(path_vars, var.name)) + with self.assertRaises(TypeError): + var.get_value('fluid.global_scope()') + with self.assertRaises(ValueError): + x.get_value() + with 
self.assertRaises(TypeError): + x.set_value('1') + fake_data = np.zeros([3, 2, 1, 2, 3]) + with self.assertRaises(TypeError): + x.set_value(fake_data, '1') + with self.assertRaises(ValueError): + x.set_value(fake_data) + with self.assertRaises(ValueError): + var.set_value(fake_data) + # set var to zero + self.set_zero(prog, place) + for var in prog.list_vars(): + if var.persistable: + tensor = paddle.load( + os.path.join(path_vars, var.name), return_numpy=False) + var.set_value(tensor) + new_t = np.array(fluid.global_scope().find_var(var.name) + .get_tensor()) + base_t = base_map[var.name] + self.assertTrue(np.array_equal(new_t, base_t)) + + def test_paddle_save_load_v2(self): + paddle.disable_static() + layer = LinearNet() + state_dict = layer.state_dict() + path = 'paddle_save_load_v2/model.pdparams' + with self.assertRaises(TypeError): + paddle.save(state_dict, path, use_binary_format='False') + # legacy paddle.save, paddle.load + paddle.framework.io._legacy_save(state_dict, path) + load_dict_tensor = paddle.load(path, return_numpy=False) + # legacy paddle.load, paddle.save + paddle.save(state_dict, path) + load_dict_np = paddle.framework.io._legacy_load(path) + for k, v in state_dict.items(): + self.assertTrue( + np.array_equal(v.numpy(), load_dict_tensor[k].numpy())) + self.assertTrue(np.array_equal(v.numpy(), load_dict_np[k])) + + def test_single_pickle_var_dygraph(self): + # enable dygraph mode + paddle.disable_static() + layer = LinearNet() + path = 'paddle_save_load_v2/var_dygraph' + tensor = layer._linear.weight + with self.assertRaises(ValueError): + paddle.save(tensor, path, pickle_protocol='3') + with self.assertRaises(ValueError): + paddle.save(tensor, path, pickle_protocol=5) + paddle.save(tensor, path) + t_dygraph = paddle.load(path) + np_dygraph = paddle.load(path, return_numpy=True) + self.assertTrue(isinstance(t_dygraph, paddle.fluid.core.VarBase)) + self.assertTrue(np.array_equal(tensor.numpy(), np_dygraph)) + self.assertTrue(np.array_equal(tensor.numpy(), t_dygraph.numpy())) + paddle.enable_static() + lod_static = paddle.load(path) + np_static = paddle.load(path, return_numpy=True) + self.assertTrue(isinstance(lod_static, paddle.fluid.core.LoDTensor)) + self.assertTrue(np.array_equal(tensor.numpy(), np_static)) + self.assertTrue(np.array_equal(tensor.numpy(), np.array(lod_static))) + + def test_single_pickle_var_static(self): + # enable static mode + paddle.enable_static() + with new_program_scope(): + # create network + x = paddle.static.data( + name="x", shape=[None, IMAGE_SIZE], dtype='float32') + z = paddle.static.nn.fc(x, 128) + loss = fluid.layers.reduce_mean(z) + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [IMAGE_SIZE, 128]: + tensor = var.get_value() + break + scope = fluid.global_scope() + origin_tensor = np.array(tensor) + path = 'test_single_pickle_var_static/var' + paddle.save(tensor, path) + self.set_zero(prog, place, scope) + # static load + lod_static = paddle.load(path) + np_static = paddle.load(path, return_numpy=True) + # set_tensor(np.ndarray) + var.set_value(np_static, scope) + self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + # set_tensor(LoDTensor) + self.set_zero(prog, place, scope) + var.set_value(lod_static, scope) + self.assertTrue(np.array_equal(origin_tensor, np.array(tensor))) + 
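
The assertions above cover the static half of the new single-variable path: paddle.save pickles the LoDTensor itself, paddle.load hands it back either as a LoDTensor or (with return_numpy=True) as an ndarray, and set_value writes it back into the parameter. A minimal self-contained sketch of that round trip, assuming the get_value/set_value helpers added earlier in this patch (layer sizes and the temp/ path are illustrative, not taken from the test):

.. code-block:: python

    import numpy as np
    import paddle

    paddle.enable_static()
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    y = paddle.static.nn.fc(x, 8)
    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())
    prog = paddle.static.default_main_program()

    # pick the fc weight variable and save its tensor to disk
    weight = next(v for v in prog.list_vars() if list(v.shape) == [16, 8])
    paddle.save(weight.get_value(), 'temp/weight.pdtensor')

    # load it back as a LoDTensor or as numpy, then write it into the variable again
    loaded_lod = paddle.load('temp/weight.pdtensor')
    loaded_np = paddle.load('temp/weight.pdtensor', return_numpy=True)
    weight.set_value(loaded_np)
    assert np.array_equal(np.array(loaded_lod), loaded_np)
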
# enable dygraph mode + paddle.disable_static() + var_dygraph = paddle.load(path) + np_dygraph = paddle.load(path, return_numpy=True) + self.assertTrue(np.array_equal(np.array(tensor), np_dygraph)) + self.assertTrue(np.array_equal(np.array(tensor), var_dygraph.numpy())) + + def test_dygraph_save_static_load(self): + inps = np.random.randn(1, IMAGE_SIZE).astype('float32') + path = 'test_dygraph_save_static_load/dy-static.pdparams' + paddle.disable_static() + with paddle.utils.unique_name.guard(): + layer = LinearNet() + state_dict_dy = layer.state_dict() + paddle.save(state_dict_dy, path) + paddle.enable_static() + with new_program_scope(): + layer = LinearNet() + data = paddle.static.data( + name='x_static_save', shape=(None, IMAGE_SIZE), dtype='float32') + y_static = layer(data) + program = paddle.static.default_main_program() + place = fluid.CPUPlace( + ) if not paddle.fluid.core.is_compiled_with_cuda( + ) else fluid.CUDAPlace(0) + exe = paddle.static.Executor(paddle.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + state_dict = paddle.load(path, keep_name_table=True) + program.set_state_dict(state_dict) + state_dict_param = program.state_dict("param") + for name, tensor in state_dict_dy.items(): + self.assertTrue( + np.array_equal(tensor.numpy(), + np.array(state_dict_param[tensor.name]))) class TestSaveLoad(unittest.TestCase): @@ -158,7 +398,9 @@ class TestSaveLoad(unittest.TestCase): def check_load_state_dict(self, orig_dict, load_dict): for var_name, value in orig_dict.items(): - self.assertTrue(np.array_equal(value.numpy(), load_dict[var_name])) + load_value = load_dict[var_name].numpy() if hasattr( + load_dict[var_name], 'numpy') else np.array(load_dict[var_name]) + self.assertTrue(np.array_equal(value.numpy(), load_value)) def test_save_load(self): layer, opt = self.build_and_train_model() diff --git a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py index 08413d711be..c5dc98af5c8 100644 --- a/python/paddle/fluid/tests/unittests/test_static_save_load_large.py +++ b/python/paddle/fluid/tests/unittests/test_static_save_load_large.py @@ -25,12 +25,17 @@ import six import pickle import os +# Python2.x no longer supports saving and loading large parameters. 
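
The comment above points at the real constraint: pickle protocols below 4 cannot serialize a single object larger than 4 GB, and Python 2 tops out at protocol 2, which is why the test shrinks LARGE_PARAM on Python 2 and switches to protocol 4 on Python 3 further down. A small sketch of the same protocol choice on the paddle.save side (the layer is deliberately tiny and only stands in for a genuinely large parameter; the path is illustrative):

.. code-block:: python

    import sys
    import paddle

    # protocol 4 (Python 3.4+) lifts the 4GB-per-object limit of protocol 2;
    # fall back to protocol 2 on Python 2, which cannot use protocol 4
    protocol = 4 if sys.version_info.major == 3 else 2

    paddle.disable_static()
    layer = paddle.nn.Linear(1024, 64)
    paddle.save(layer.state_dict(), 'temp/big.pdparams', protocol=protocol)
    state = paddle.load('temp/big.pdparams')
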
+if six.PY2: + LARGE_PARAM = 2 +else: + LARGE_PARAM = 2**26 + class TestStaticSaveLoadLargeParameters(unittest.TestCase): def test_large_parameters_static_save(self): # enable static mode paddle.enable_static() - LARGE_PARAM = 2**26 with new_program_scope(): # create network x = paddle.static.data( @@ -54,7 +59,11 @@ class TestStaticSaveLoadLargeParameters(unittest.TestCase): path = os.path.join("test_static_save_load_large_param", "static_save") - paddle.fluid.save(prog, path) + if six.PY2: + protocol = 2 + else: + protocol = 4 + paddle.fluid.save(prog, path, pickle_protocol=protocol) # set var to zero for var in prog.list_vars(): if isinstance(var, framework.Parameter) or var.persistable: @@ -92,3 +101,7 @@ class TestStaticSaveLoadLargeParameters(unittest.TestCase): .get_tensor()) base_t = base_map[var.name] self.assertTrue(np.array_equal(new_t, base_t)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 8d5ab0a5be7..690ac46e563 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -190,7 +190,6 @@ class TestVariable(unittest.TestCase): with fluid.dygraph.guard(): self.assertRaises(AssertionError, var.detach) self.assertRaises(AssertionError, var.numpy) - self.assertRaises(AssertionError, var.set_value, None) self.assertRaises(AssertionError, var.backward) self.assertRaises(AssertionError, var.gradient) self.assertRaises(AssertionError, var.clear_gradient) diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 3d93bed32ec..3b953efab71 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -22,13 +22,18 @@ import warnings import sys import numpy as np +if not six.PY2: + import copyreg + import paddle # deprecated module import from paddle import fluid from paddle.fluid import core -from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict -from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer +from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac +from paddle.fluid.io import _legacy_save as _legacy_static_save + +from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, _current_expected_place from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX @@ -181,7 +186,9 @@ def _build_load_path_and_config(path, config): def _parse_load_config(configs): - supported_configs = ['model_filename', 'params_filename', 'keep_name_table'] + supported_configs = [ + 'model_filename', 'params_filename', 'keep_name_table', 'return_numpy' + ] # input check for key in configs: @@ -195,16 +202,158 @@ def _parse_load_config(configs): inner_config.model_filename = configs.get('model_filename', None) inner_config.params_filename = configs.get('params_filename', None) inner_config.keep_name_table = configs.get('keep_name_table', None) + inner_config.return_numpy = configs.get('return_numpy', False) return inner_config -def save(obj, path, pickle_protocol=2): +def _parse_save_config(configs): + supported_configs = ['use_binary_format', 'pickle_protocol'] + + # input check + for key in configs: + if key not in supported_configs: + raise ValueError( + "The additional 
config (%s) of `paddle.save` is not supported." + % key) + + # construct inner config + inner_config = _SaveLoadConfig() + inner_config.use_binary_format = configs.get('use_binary_format', False) + inner_config.pickle_protocol = configs.get('pickle_protocol', None) + + return inner_config + + +def _pickle_save(obj, f, protocol): + # TODO(weixin):add support for BytesIO. + if not isinstance(protocol, int): + raise ValueError("The 'protocol' MUST be `int`, but received {}".format( + type(protocol))) + + if protocol < 2 or protocol > 4: + raise ValueError("Expected 1<'protocol'<5, but received protocol={}". + format(protocol)) + + if not isinstance(obj, (core.LoDTensor, core.VarBase)): + raise NotImplementedError( + "Support 'paddle.Tensor' or 'paddle.core.LoDTensor', but received {}.". + format(type(obj))) + + def reudce_varbase(self): + data = self.numpy() + name = self.name + + return (tuple, ((name, data), )) + + def reduce_LoDTensor(self): + data = np.array(self) + + return (eval, ('data', {'data': data})) + + def add_dispatch_table(): + # This is not a good method, because the pickle module has been modified. + pickle.dispatch_table[core.VarBase] = reudce_varbase + pickle.dispatch_table[ParamBase] = reudce_varbase + pickle.dispatch_table[core.LoDTensor] = reduce_LoDTensor + + def pop_dispatch_table(): + pickle.dispatch_table.pop(core.VarBase) + pickle.dispatch_table.pop(core.LoDTensor) + pickle.dispatch_table.pop(ParamBase) + + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + add_dispatch_table() + pickle_bytes = pickle.dumps(obj) + pop_dispatch_table() + + max_bytes = 2**30 + for i in range(0, len(pickle_bytes), max_bytes): + f.write(pickle_bytes[i:i + max_bytes]) + else: + if six.PY2: + add_dispatch_table() + pickle_bytes = pickle.dump(obj, f, protocol) + pop_dispatch_table() + else: + pickler = pickle.Pickler(f, protocol) + pickler.dispatch_table = copyreg.dispatch_table.copy() + + pickler.dispatch_table[core.VarBase] = reudce_varbase + pickler.dispatch_table[core.LoDTensor] = reduce_LoDTensor + pickler.dispatch_table[ParamBase] = reudce_varbase + + pickler.dump(obj) + + +def _use_legacy(obj): + # TODO(weixin):If `obj` is any object, the judgment condition should be more precise. + if not isinstance(obj, dict): + return False + return True + + +def _transformed_from_varbase(obj): + # In paddle2.1 version, VarBase is saved as tuple(tensor.name, tensor.numpy()). + # When executing paddle.load, use this function to determine whether to restore to VarBase/LoDTensor. + if isinstance(obj, tuple) and len(obj) == 2: + if six.PY2: + name_types = (str, unicode) + else: + name_types = str + if isinstance(obj[0], name_types) and isinstance(obj[1], np.ndarray): + return True + return False + + +def _transformed_from_lodtensor(obj): + # In paddle2.1 version, LoDTensor is saved as np.array(tensor). + # When executing paddle.load, use this function to determine whether to restore to VarBase/LoDTensor. + if isinstance(obj, np.ndarray): + return True + return False + + +def _to_LodTensor(ndarray): + if not isinstance(ndarray, np.ndarray): + raise TypeError( + 'Type of `ndarray` should be numpy.ndarray, but received {}.'. 
+ format(type(ndarray))) + t = core.LoDTensor() + place = _current_expected_place() + t.set(ndarray, place) + return t + + +def _tuple_to_tensor(obj, return_numpy): + if return_numpy: + return obj[1] + if in_dygraph_mode(): + t = paddle.to_tensor(obj[1]) + # This function does modify the name of return value. + # Loading the same variable multiple times may cause the same name. + t.name = obj[0] + return t + else: + return _to_LodTensor(obj[1]) + + +def _ndarray_to_tensor(obj, return_numpy): + if return_numpy: + return obj + if in_dygraph_mode(): + return paddle.to_tensor(obj) + else: + return _to_LodTensor(obj) + + +def save(obj, path, protocol=2, **configs): ''' Save an object to the specified path. .. note:: - Now only supports save ``state_dict`` of Layer or Optimizer. + Now supports saving ``state_dict`` of Layer or Optimizer, Tensor. .. note:: Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, @@ -219,8 +368,12 @@ def save(obj, path, pickle_protocol=2): obj(Object) : The object to be saved. path(str) : The path of the object to be saved. If saved in the current directory, the input path string will be used as the file name. - pickle_protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. + protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 2 + **configs(dict, optional): optional keyword arguments. The following options are currently supported: + use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. + If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. + Default: False Returns: None @@ -228,20 +381,91 @@ def save(obj, path, pickle_protocol=2): Examples: .. code-block:: python + # example 1: dynamic graph import paddle - emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() + + # save state_dict of emb paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=emb.parameters()) opt_state_dict = adam.state_dict() + + # save state_dict of optimizer paddle.save(opt_state_dict, "adam.pdopt") + # save weight of emb + paddle.save(emb.weight, "emb.weight.pdtensor") + + # example 2: static graph + import paddle + import paddle.static as static + + paddle.enable_static() + + # create network + x = paddle.static.data(name="x", shape=[None, 224], dtype='float32') + z = paddle.static.nn.fc(x, 10) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [224, 10]: + tensor = var.get_tensor() + break + + # save/load tensor + path_tensor = 'temp/tensor.pdtensor' + paddle.save(tensor, path_tensor) + + # save/load state_dict + path_state_dict = 'temp/model.pdparams' + paddle.save(prog.state_dict("param"), path_tensor) ''' + # 1. input check + filename = os.path.basename(path) + if filename == "": + raise ValueError("The input path MUST be format of dirname/filename " + "[dirname\\filename in Windows system], but received " + "filename is empty string.") + + # 2. 
save object + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + + config = _parse_save_config(configs) + + if not isinstance(config.use_binary_format, bool): + raise TypeError( + "Type of `use_binary_format` should be bool, but received {}.". + format(type(config.use_binary_format))) + + # `protocol` need to be used, `pickle_protocol` is a deprecated arg. + if config.pickle_protocol is not None: + protocol = config.pickle_protocol + warnings.warn( + "'pickle_protocol' is a deprecated argument. Please use 'protocol' instead." + ) + + if _use_legacy(obj): + if in_dygraph_mode(): + _legacy_save(obj, path, protocol) + else: + _legacy_static_save(obj, path, protocol) + else: + # save single variable + with open(path, 'wb') as f: + _pickle_save(obj, f, protocol) + +def _legacy_save(obj, path, protocol=2): # 1. input check if not isinstance(obj, dict): raise NotImplementedError( @@ -257,13 +481,13 @@ def save(obj, path, pickle_protocol=2): "[dirname\\filename in Windows system], but received " "filename is empty string.") - if not isinstance(pickle_protocol, int): + if not isinstance(protocol, int): raise ValueError("The 'protocol' MUST be `int`, but received {}".format( - type(pickle_protocol))) + type(protocol))) - if pickle_protocol < 2 or pickle_protocol > 4: + if protocol < 2 or protocol > 4: raise ValueError("Expected 1<'protocol'<5, but received protocol={}". - format(pickle_protocol)) + format(protocol)) # 2. save object dirname = os.path.dirname(path) @@ -274,19 +498,18 @@ def save(obj, path, pickle_protocol=2): if isinstance(obj, dict): saved_obj = _build_saved_state_dict(obj) - saved_obj = _unpack_saved_dict(saved_obj, pickle_protocol) + saved_obj = _unpack_saved_dict(saved_obj, protocol) - # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3.5/6' - if sys.platform == 'darwin' and sys.version_info.major == 3 and ( - sys.version_info.minor == 5 or sys.version_info.minor == 6): - pickle_bytes = pickle.dumps(saved_obj, protocol=pickle_protocol) + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + pickle_bytes = pickle.dumps(saved_obj, protocol=protocol) with open(path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): f.write(pickle_bytes[i:i + max_bytes]) else: with open(path, 'wb') as f: - pickle.dump(saved_obj, f, protocol=pickle_protocol) + pickle.dump(saved_obj, f, protocol=protocol) def load(path, **configs): @@ -294,7 +517,7 @@ def load(path, **configs): Load an object can be used in paddle from specified path. .. note:: - Now only supports load ``state_dict`` of Layer or Optimizer. + Now supports load ``state_dict`` of Layer or Optimizer, Tensor. .. note:: In order to use the model parameters saved by paddle more efficiently, @@ -331,7 +554,9 @@ def load(path, **configs): ``save_inference_model`` save format. Default file name is :code:`__model__` . (2) params_filename (str): The persistable variables file name of the paddle 1.x ``save_inference_model`` save format. No default file name, save variables separately - by default. + by default. + (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. + Default False. 
Returns: Object(Object): a target object can be used in paddle @@ -341,20 +566,115 @@ def load(path, **configs): import paddle + # example 1: dynamic graph + import paddle emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() + + # save state_dict of emb paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=emb.parameters()) opt_state_dict = adam.state_dict() + + # save state_dict of optimizer paddle.save(opt_state_dict, "adam.pdopt") + # save weight of emb + paddle.save(emb.weight, "emb.weight.pdtensor") + # load state_dict of emb load_layer_state_dict = paddle.load("emb.pdparams") + # load state_dict of optimizer load_opt_state_dict = paddle.load("adam.pdopt") + # load weight of emb + load_weight = paddle.load("emb.weight.pdtensor") + + + # example 2: static graph + import paddle + import paddle.static as static + + paddle.enable_static() + + # create network + x = paddle.static.data(name="x", shape=[None, 224], dtype='float32') + z = paddle.static.nn.fc(x, 10) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + prog = paddle.static.default_main_program() + for var in prog.list_vars(): + if list(var.shape) == [224, 10]: + tensor = var.get_tensor() + break + + # save/load tensor + path_tensor = 'temp/tensor.pdtensor' + paddle.save(tensor, path_tensor) + load_tensor = paddle.load(path_tensor) + + # save/load state_dict + path_state_dict = 'temp/model.pdparams' + paddle.save(prog.state_dict("param"), path_tensor) + load_state_dict = paddle.load(path_tensor) + ''' + + if os.path.isfile(path): + config = _parse_load_config(configs) + with open(path, 'rb') as f: + # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' + if sys.platform == 'darwin' and sys.version_info.major == 3: + load_result = _pickle_loads_mac(path, f) + else: + load_result = pickle.load(f) if six.PY2 else pickle.load( + f, encoding='latin1') + + # TODO(weixin):If `obj` is any object, the judgment condition should be more precise. + if isinstance(load_result, dict): + if isinstance(load_result, dict): + load_result = _pack_loaded_dict(load_result) + # paddle2.0: paddle.save/load + if "StructuredToParameterName@@" in load_result: + + for key in load_result["StructuredToParameterName@@"]: + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) + + if not config.keep_name_table and "StructuredToParameterName@@" in load_result: + del load_result["StructuredToParameterName@@"] + else: + # paddle2.1 static.save/load + for key in load_result: + load_result[key] = _ndarray_to_tensor( + load_result[key], config.return_numpy) + + else: + # TODO(weixin): support complex objects such as layer. + # If `obj` is any object, the judgment condition should be more precise. + if _transformed_from_lodtensor(load_result): + load_result = _ndarray_to_tensor(load_result, + config.return_numpy) + elif _transformed_from_varbase(load_result): + load_result = _tuple_to_tensor(load_result, + config.return_numpy) + else: + raise NotImplementedError( + 'Only support tensor and state_dict, but received {}.'. 
+ format(type(load_result))) + + else: + load_result = _legacy_load(path, **configs) + + return load_result + + +def _legacy_load(path, **configs): load_result = None config = _parse_load_config(configs) diff --git a/tools/__pycache__/static_mode_white_list.cpython-37.pyc b/tools/__pycache__/static_mode_white_list.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..937267ff7180abf32341455e5d21fcac417782ae GIT binary patch literal 20217 zcmeI4cbqImwZ|`4P(+a+B0(jH2q@sa7ZDT{R8&+z1=EVQy>ohI+wGYix@UIpV$NAH z=bUrSIp>@)=bUqX-&5yQchAh?$M^nyazDR&w@+19SDiY!YPr##d+xD?|K73Nd3Ss9 zIa{`TPaoocuim<4%c(rnZ`ys!7TF@Z$X2<8>?*s-CFN4GyIflKkjuzrWly=B>?M24 zKC-V|UiOnK$o_IgIY17SgXCbjl3ZD?B8SMKa+n-0SCu2=YI3ApU5=8YCJW?JdkCw;Ch4NT=oIGBhAWxJh$&=+N@>F@6JYAk4&y;7$v*kJRTzQ^6UtSY49yj|WQ@054RyX8Id zUU{FqUp^ooln=>=RJ=z9wIn zZ^$?0Tk>uBj(k_XC*PMJ$PeX5@?-gl{8WA>KbK#~FXdPAYx#}*R(>bHmp{lKBs2`&Y82bTtWfXjf(f<3|Iz+PZ)un*W5 zTpsKPt^oE2R|E%u1HnPyU~naHWpEX62sjiR1`Y>T1xJ9Zfg{1y!BOC7a16KxxF$Fj z90!gE*8J6Ht;7=jTf!5EwmDo}$7n1VI14t9bKZ~?dnxF@(5xHq^DxG%ULxIcIR zcp!KXcrbVfcqn)ncsO_jcqDifcrD)yac=yybQb?yaK!uyb8P;yav1$ybin`yaBusya~J+ zyal`!ybZh^yaT)wybHV=ya&7&ybrt|d;ok9dZt{s{gA{tW&C{tEsE{to^D{t5nd;a2p2EBe0`{l^c0 zUBPbPlHgKccW`O22e^!S2L4s-iFddh|0?zZd-G>c{@n-ct3HE&75fFs75Hg?a7Azc zI1n5J4hB~OR|Z!Bhk!%DVc>9ZRd58j8aNVM9UKLY2FHMFfNO$d!ExYta4oP6YzI5Q z3EkhoCD4Uw*$8acK~+;cLGb`&fq+77jRdQfV+VdEQ1~pkbyoJfEAE~yMt9w zfFT%x5{$w5paM0RfGJo5>tH9?02hFJfO~>_fqR4dfct{`f%}68fCqvHfd_+!fQN#I zfro=ffJcHyfk%VKfD6H6!Q;T=!4tp}!IQv~!BfCf!PCIg!85=!!Lz`#!E?ZK!SlfL z!3)3(FWfqK_0kr8deJ#sj(RaM%+mgotezxkzn_n?q_>ieMp==Re@VQT$Y9Np?N#oIZj8Cr1ku;)-X?HRyqY^dNas$I>=Fz*2Q(7lwBH4JdqTLR*)tx_0t4xw&dm?!~Np<-&?j0J_UT<2Zy$z+< zW~aSLzLrk%a+Hil$)HLlb|qZQib<+njlHR)+OCAl+Gvsv_36spr%67{5rw2b-m#s( zPCTJ~dzkJ_(sAx$$8bvEecMcZUe`xs5RYS~h6t_=65W+;GT)-FdrO>xK&bPu95%j` zd^FG@5AqqR$g!+SnU;-&E*s6)Wf&H7dFyJrGN|&fUMChsuPD)qBHhTUrNgb~aEy3o zz10Nm$|uYhQDIX}gncce-bz^|gKU&l>4cw{(q-)JDjf~7B_F>UBt!JX+2UNU26b5l zE%CvsD%}X{NY*R;AcL7Ar*)lK&*nKFjI`aPcXMPi@-M zS6xhas9`3n8mn~CaEL8nt%OAw3&c#~h*$T~90VH7=)h-b*TEZIli( z*S_U+(pyPJWfeYug)s08Em_+hhvPf!LG4?*oFRfWCWiMVW5dJ3jGf&!=4-26&U;h9 zT(J~mU58njIr>_#VYc?=yh!Ps7D=OmX2NJN^QJ;=BG!<6d>hAD)wIEHwiHH|GVOH? 
    uoSy@*$VX) zwLq!AxmE$gPL}RO& zi;g2U=uCW4^}ZV^x1+h}R9Q41vM<5=Wd`0f+LALFfppm*9?MjBvs;%GOs%B%%FAil z*;DF!t%q@v2}W;ewY7NYZob#>SZqUFXsh5xlBUbs_^X%#9!5_q+Bx%Tor-;%qglpL zs)bXw0;fiMGk8wFbeGOxz9WSJe(4RkPO+%v^ob@N=Co0v(PqHGo%aP+ObB$vTdfa{ zl!q>0BH5&2eE{k2mY^KxFS|B%(e3s(Y%1ZXF5Gx;cHhjYVap`W#?A`e&RzL6wvk=bX8Nh6UhLYJjwlk@ zxz|hjHHr2k(6s8F!>GxchbEUtRVHHxqUbC*n{~hYOl(Ef+<45#7JF4rB!}6swZ?KC z@7c}PE7p?w0*Y!`^<6l}s87YL0Z6q0DHHL{l$THTmK6v%cRxp~0E< z3|qO5F>0eA(P;aS*~deuLlpuO)6vi_!nBgzHs;V8rn5&QnNp(Dvm}u{$1*}!6SlDo z`JyZt-y_F$=@*T>*;$`;2eY=9(R!QXQYh=v$WV#>qMD&6+3Ybp8P%wV;KnIUKVm^n zruM`tTO~)}VV(tt_$*Nyp@VgKoa6(JBAd zd+0jpL0|Tybh=Teg3dPOlZpb3`>cnL&~@EtZpFhoE36hvro~d#U@S_GuJw*YnL;+y zv}A`Ltq>hdH?YH3Of1g!I8zQbQETyT77g6T21jC&bvg8_s4_+6xH#~-qyUHQxsv8+#cn4P|R@MBU*Czaf-+2vKw8D5OmCjbToycW20JwzVa9@pW{;mW`0O;%q$!!&tTvu$?kz z=WbV?@-e1-ZH>}; z38H;nNDiya5Yo;?*50f_Xb&o(MK~B2W>ia&EvO7`>cB(!DzqRu=dh5`3+kUp6%C1A@ z)qT+6IHP^-pL;mgm}4-_jxzl+ptg=R*wAj|BPm9sqxKigrPK$h2(PM7jDw(9jHmrCi}(Jb}C^~?HE z!=&q))E~{FS-^g z`8w=VuVx-7rM$fdaiorg_FezA#tFVy2kUBxR*o>W*kO75^znKZTBSf^6c!%thf0jl zM@L37Hf?V@j3bAm&5~p%OHCs;w6y58JGX9QmF|1q)qKsS6tDd1`p$u;bzLk~nHo$; z9h0e4rL>kdRcp#8T`OvOC+D(?Uz#_}H@P=r%l5qF@8Lx6ztwh+yhB-kJqwH{JxS0p z68U>k5?O4f7 zf=*^vcgOnafxrAPd-nf|n05C5IY`$@#(|mu>GS+U_TYR|%HuvBS7{l)O0QD`#GvlF z?2JotQd)fd)BrS^*pD;K+c({1r+6Zx=Ew>$lcp#eKcv?bXjawANt?@zT=VT^`tQ`! zcw447Y5iE^^0CaQith2|ETnICIG0yTC7#Vi``-S3lI>-xJdHM$)Pqy9H(~7Jy^2Xd zi<=kT{FM;)rhPJIJjtlrg$X#XEfNl-JvM|@|H z?s)0hL0L^5mCG)F*{XV; z_9-;ySrOyBr|@;=#d$5W&;)lS4$V7eOcrX!nf2gvR|l!n>_WoXM*1)%Z*$2tMa~FU U|8q43uBO1%6u6oK|6dCHFWA)*QUCw| -- GitLab From 69c874fdc49b115e5c58784ff318fc0f5c45d265 Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Fri, 2 Apr 2021 17:46:46 +0800 Subject: [PATCH 149/486] [3D-Parallel:Sharding] Optimizations for supporting ERNIE 3.0 training (#31884) --- .../framework/distributed_strategy.proto | 9 +- .../fleet/meta_optimizers/amp_optimizer.py | 1 + .../meta_optimizers/sharding/fp16_helper.py | 47 +- .../sharding/gradient_clip_helper.py | 56 +- .../fleet/meta_optimizers/sharding/utils.py | 233 +++-- .../meta_optimizers/sharding_optimizer.py | 828 +++++++++++++++--- python/paddle/fluid/backward.py | 35 +- .../tests/unittests/dist_sharding_save.py | 6 +- .../unittests/fleet_meta_optimizer_base.py | 6 +- .../fluid/tests/unittests/test_dist_base.py | 1 + .../test_fleet_sharding_meta_optimizer.py | 278 +++++- 11 files changed, 1224 insertions(+), 276 deletions(-) mode change 100644 => 100755 paddle/fluid/framework/distributed_strategy.proto mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py mode change 100644 => 100755 python/paddle/fluid/backward.py mode change 100644 => 100755 python/paddle/fluid/tests/unittests/test_dist_base.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100644 new mode 100755 index 04dc51f1b94..805ef1c3e91 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,9 +29,14 @@ message RecomputeConfig { } message ShardingConfig { - optional float fuse_broadcast_MB = 1 [ default = 32.0 ]; + optional float segment_broadcast_MB = 1 [ default = 32.0 ]; optional bool hybrid_dp = 2 [ default = false ]; - optional int32 
sharding_group_size = 3 [ default = 8 ]; + optional int32 sharding_degree = 3 [ default = 8 ]; + optional int32 mp_degree = 4 [ default = 1 ]; + optional string sharding_segment_strategy = 5 + [ default = 'segment_broadcast_MB' ]; + repeated string segment_anchors = 6; + optional int32 gradient_merge_acc_step = 7 [ default = 1 ]; } message AMPConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py old mode 100644 new mode 100755 index dba3c944f70..02505e01197 --- a/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/amp_optimizer.py @@ -59,6 +59,7 @@ class AMPOptimizer(MetaOptimizerBase): is_distributed = self.role_maker._worker_num() > 1 if self.user_defined_strategy.sharding: # FIXME(wangxi). sharding failed when split check_finite_and_unscale + # FIXME(JZ-LIANG). To support Sharding-Megatron-AMP, Megatron should follow Sharding's behavior that to disable is_distributed. is_distributed = False self.wrapped_opt._set_distributed(is_distributed) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py old mode 100644 new mode 100755 index 03b36262a4f..cf399f66946 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -73,7 +73,7 @@ class FP16Utils(object): @staticmethod def prune_fp16(block, shard, reduced_grads_to_param, ring_id): """ - 1. prune all cast_fp32_to_fp16 ops if the param not belongs to this shard + 1. prune all cast_fp16_to_fp32 ops if the param not belongs to this shard 2. 
revise amp inifine grad checking for sharding """ # remove cast @@ -103,6 +103,7 @@ class FP16Utils(object): op._rename_input(inf_var_name, inf_var_name + "@sharding") if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: reversed_x = [] + reversed_x_paramname = [] for input_name in op.desc.input('X'): param_name = input_name.strip("@GRAD") if param_name not in shard.global_params: @@ -111,12 +112,24 @@ class FP16Utils(object): "be grads, but {} is not a grad".format(input_name)) if shard.has_param(param_name): reversed_x.append(input_name) + reversed_x_paramname.append(param_name) op.desc.set_input('X', reversed_x) op.desc.set_output('Out', reversed_x) + + # the grad checking should take the all and only param in the current shard + to_check_param = set(reversed_x_paramname) + should_check_param = set(shard.global_params).intersection( + set([param for param, worker_idx in shard.global_param2device.items() \ + if worker_idx == shard.worker_idx])) + assert to_check_param == should_check_param, "amp \ + check_finite_and_unscale checking miss [{}] and got unexpected [{}]".format( + should_check_param - to_check_param, + to_check_param - should_check_param) + if update_loss_scaling_op_idx == -1: return inf_var = block.var(inf_var_name) - inf_var_fp32 = block.create_var( + inf_var_int32 = block.create_var( name=inf_var_name + "@cast_int32", shape=inf_var.shape, dtype=core.VarDesc.VarType.INT32) @@ -128,32 +141,30 @@ class FP16Utils(object): update_loss_scaling_op_idx, type='cast', inputs={'X': inf_var}, - outputs={'Out': inf_var_fp32}, + outputs={'Out': inf_var_int32}, attrs={ "in_dtype": inf_var.dtype, - "out_dtype": inf_var_fp32.dtype, + "out_dtype": inf_var_int32.dtype, OP_ROLE_KEY: OpRole.Optimize }) - insert_sync_calc_op(block, update_loss_scaling_op_idx + 1, - [inf_var_fp32]) + # this allreduce communication should not overlap with calc block._insert_op_without_sync( - update_loss_scaling_op_idx + 2, + update_loss_scaling_op_idx + 1, type='c_allreduce_max', - inputs={'X': inf_var_fp32}, - outputs={'Out': inf_var_fp32}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Optimize}) - - comm_op_num = insert_sync_comm_op(block, update_loss_scaling_op_idx + 3, - ring_id, [inf_var_fp32]) - + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) block._insert_op_without_sync( - update_loss_scaling_op_idx + 3 + comm_op_num, + update_loss_scaling_op_idx + 2, type='cast', - inputs={'X': inf_var_fp32}, + inputs={'X': inf_var_int32}, outputs={'Out': inf_var_sharding}, attrs={ - "in_dtype": inf_var_fp32.dtype, + "in_dtype": inf_var_int32.dtype, "out_dtype": inf_var_sharding.dtype, OP_ROLE_KEY: OpRole.Optimize }) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py old mode 100644 new mode 100755 index c6aee792fcf..5082bc33830 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py @@ -16,14 +16,14 @@ from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole class GradientClipHelper(object): - def __init__(self, sharding_ring_id): - self.sharding_ring_id = sharding_ring_id + def __init__(self, mp_ring_id): + self.mp_ring_id = mp_ring_id def _is_gradient_clip_op(self, op): return op.desc.has_attr("op_namescope") \ and 
op.desc.attr("op_namescope").startswith("/gradient_clip") - def prune_gradient_clip(self, block, shard): + def prune_gradient_clip(self, block, shard, pure_dp_degree=1): """ prune gradient_clip related ops for params that not belong to cur shard prune: square, reduce_sum, elementwise_mul @@ -31,6 +31,7 @@ class GradientClipHelper(object): """ deperated_vars = set() deperate_op_idx = set() + reversed_x_paramname = [] for idx, op in enumerate(block.ops): if not self._is_gradient_clip_op(op): continue @@ -44,6 +45,8 @@ class GradientClipHelper(object): if shard.is_param(param_name) and \ not shard.has_param(param_name): deperate_op = True + elif shard.is_param(param_name): + reversed_x_paramname.append(param_name) if deperate_op: deperate_op_idx.add(idx) @@ -65,31 +68,48 @@ class GradientClipHelper(object): for input_name in op.desc.input_arg_names(): if input_name not in deperated_vars: reversed_inputs.append(input_name) + op.desc.set_input("X", reversed_inputs) assert (len(op.desc.output_arg_names()) == 1) sum_res = op.desc.output_arg_names()[0] - block._insert_op_without_sync( - idx + 1, - type='c_sync_comm_stream', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={'ring_id': 0, - OP_ROLE_KEY: OpRole.Optimize}) + + # this allreduce should not overlap with calc and should be scheduled in calc stream block._insert_op_without_sync( idx + 1, type='c_allreduce_sum', inputs={'X': sum_res}, outputs={'Out': sum_res}, attrs={ - 'ring_id': self.sharding_ring_id, - OP_ROLE_KEY: OpRole.Optimize + 'ring_id': self.mp_ring_id, + 'op_namescope': "/gradient_clip_model_parallelism", + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, }) - block._insert_op_without_sync( - idx + 1, - type='c_sync_calc_stream', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={OP_ROLE_KEY: OpRole.Optimize}) + + # global norm should only be sum within each model parallelism word size when use global group + if pure_dp_degree > 1: + block._insert_op_without_sync( + idx + 2, + type='scale', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'scale': 1.0 / float(pure_dp_degree), + 'op_namescope': "/gradient_clip_model_parallelism", + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + + # the grad sum here should take the all and only param in the current shard + to_check_param = set(reversed_x_paramname) + should_check_param = set(shard.global_params).intersection(set( + [param for param, worker_idx in shard.global_param2device.items() \ + if worker_idx == shard.worker_idx])) + assert to_check_param == should_check_param, "amp check_finite_and_unscale \ + checking miss [{}] and got unexpected [{}]".format( + should_check_param - to_check_param, + to_check_param - should_check_param) for var_name in deperated_vars: block._remove_var(var_name, sync=False) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index ad1cd4f6082..8b111026bdb 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -28,21 +28,24 @@ def check_broadcast(block): if the broadcasted var has a fill_constant op, the fill_constant op should stay forward before the broadcast op, and before a sync_calc op. Otherwise, raise error. + + should ignore and skip broadcast_op of inner_parallelism (e.g. 
Megatron) """ broadcast_vars = {} for idx, op in enumerate(block.ops): if op.type == "c_broadcast": - var_name = op.desc.input_arg_names()[0] - if "@BroadCast" in var_name: - if var_name in broadcast_vars: - raise ValueError("var_name areadly exist: {}" - "the old pos is {}, the new pos is {}". - format(var_name, broadcast_vars[var_name][ - "broadcast_pos"], idx)) - broadcast_vars[var_name] = { - "fill_constant_pos": -1, - "broadcast_pos": idx, - } + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + if "@BroadCast" in var_name: + if var_name in broadcast_vars: + raise ValueError("var_name areadly exist: {}" + "the old pos is {}, the new pos is {}". + format(var_name, broadcast_vars[ + var_name]["broadcast_pos"], idx)) + broadcast_vars[var_name] = { + "fill_constant_pos": -1, + "broadcast_pos": idx, + } for idx, op in enumerate(block.ops): if op.type == "fill_constant": @@ -61,14 +64,15 @@ def check_broadcast(block): last_sync_calc_op_idx = idx continue if op.type == "c_broadcast": - var_name = op.desc.input_arg_names()[0] - if "@BroadCast" in var_name: - if broadcast_vars[var_name]["fill_constant_pos"] != -1: - assert (last_sync_calc_op_idx != -1) - assert (broadcast_vars[var_name]["fill_constant_pos"] < - last_sync_calc_op_idx) - assert (last_sync_calc_op_idx < idx) - continue + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + if "@BroadCast" in var_name: + if broadcast_vars[var_name]["fill_constant_pos"] != -1: + assert (last_sync_calc_op_idx != -1) + assert (broadcast_vars[var_name]["fill_constant_pos"] < + last_sync_calc_op_idx) + assert (last_sync_calc_op_idx < idx) + continue for input_name in op.desc.input_arg_names(): if input_name in broadcast_vars: assert (broadcast_vars[input_name]["broadcast_pos"] != -1) @@ -78,43 +82,48 @@ def check_broadcast(block): return -def check_allreduce_sum(block, shard, dp_ring_id=-1): +def check_allreduce_sum(block, shard, sharding_ring_id, dp_ring_id=-1): """ the op order should be: grad: - 0: op that generate Var - 1: sync_calc - - 2: allreduce_sum_sharding + - 2: reduce_sum_sharding (allreduce --> reduce) - 3: sync_comm - 4: allreuce_sum_dp (dp_grads) - 5: sync_comm (dp_grads) - 6: op that use Var (dp_grads & sum) + + should ignore and skip allreduce_op of inner_parallelism (e.g. 
Megatron) """ vars_status = {} dp_grads_status = {} idx_last_grad_allreduce = -1 idx_amp_allreduce = -1 idx_gradient_clip_allreduce = -1 + for idx, op in enumerate(block.ops): - if op.type == "c_allreduce_sum": - ring_id = op.desc.attr("ring_id") - var_name = op.desc.input_arg_names()[0] - param = var_name.split("@")[0] + # sharding use both allreduce and reduce to sync grad + if op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": + if op.all_attrs()["use_calc_stream"] == False: + ring_id = op.desc.attr("ring_id") + var_name = op.desc.input_arg_names()[0] + param = var_name.split("@")[0] - assert 'sum' in var_name or ("@GRAD" in var_name) - if 'sum' in var_name or (not shard.has_param(param)): - vars_status[var_name] = -1 - else: - dp_grads_status[var_name] = -1 + assert 'sum' in var_name or ("@GRAD" in var_name) + if 'sum' in var_name or (not shard.has_param(param)): + vars_status[var_name] = -1 + else: + dp_grads_status[var_name] = -1 - if ring_id != 0: - assert shard.has_param(param) - assert ring_id == dp_ring_id + if ring_id != sharding_ring_id: + assert shard.has_param(param) + assert ring_id == dp_ring_id - if "sum" in var_name: - idx_amp_allreduce = idx - elif "@GRAD": - idx_last_grad_allreduce = idx + if "sum" in var_name: + idx_amp_allreduce = idx + elif "@GRAD": + idx_last_grad_allreduce = idx if op.type == "c_allreduce_max": idx_gradient_clip_allreduce = idx @@ -128,38 +137,41 @@ def check_allreduce_sum(block, shard, dp_ring_id=-1): if var_name in dp_grads_status and dp_grads_status[ var_name] == 0: dp_grads_status[var_name] = 1 - - elif op.type == "c_allreduce_sum": - var_name = op.desc.input_arg_names()[0] - ring_id = op.desc.attr("ring_id") - if ring_id == 0: - if var_name in vars_status: - _status = vars_status[var_name] - else: - _status = dp_grads_status[var_name] - if _status == -1: - raise ValueError("{} is not generated, but you are" - "trying to all-reduce it".format(var_name)) - if _status == 0: - raise ValueError("There should be a sync_calc op " - "after generate Var: {} and before the" - "c_allreduce_sum op".format(var_name)) - assert (_status == 1) - if var_name in vars_status: - vars_status[var_name] = 2 + # check sharding allreduce and reduce but skip megatron allreduce + elif op.type == "c_allreduce_sum" or op.type == "c_reduce_sum": + if op.all_attrs()["use_calc_stream"] == False: + var_name = op.desc.input_arg_names()[0] + ring_id = op.desc.attr("ring_id") + if ring_id == sharding_ring_id: + assert op.type == "c_reduce_sum", "Grad in Sharding group should be reduce rather than allreduce" + if var_name in vars_status: + _status = vars_status[var_name] + else: + _status = dp_grads_status[var_name] + if _status == -1: + raise ValueError("{} is not generated, but you are" + "trying to all-reduce it".format( + var_name)) + if _status == 0: + raise ValueError("There should be a sync_calc op " + "after generate Var: {} and before the" + "c_allreduce_sum op".format(var_name)) + assert (_status == 1) + if var_name in vars_status: + vars_status[var_name] = 2 + else: + dp_grads_status[var_name] = 2 else: - dp_grads_status[var_name] = 2 - else: - assert ring_id == dp_ring_id - param = var_name.split("@")[0] - assert shard.has_param(param) - assert dp_grads_status[var_name] == 3 - dp_grads_status[var_name] = 4 + assert ring_id == dp_ring_id + param = var_name.split("@")[0] + assert shard.has_param(param) + assert dp_grads_status[var_name] == 3 + dp_grads_status[var_name] = 4 elif op.type == "c_sync_comm_stream": var_name = op.desc.input_arg_names()[0] ring_id = 
op.desc.attr("ring_id") - if ring_id == 0: + if ring_id == sharding_ring_id: for var_name in op.desc.input_arg_names(): if var_name in vars_status: assert vars_status[var_name] == 2 @@ -181,6 +193,9 @@ def check_allreduce_sum(block, shard, dp_ring_id=-1): raise ValueError("There should be a sync_comm op " "after allreduce the Var: {}".format( input_name)) + raise ValueError( + "The reduce output grad [{}] should NOT be be used in Non-root rank.". + format(input_name)) if input_name in dp_grads_status: if dp_ring_id == -1: if dp_grads_status[input_name] != 3: @@ -325,6 +340,27 @@ def insert_allreduce_ops(block, insert_idx, ring_id, allreduce_vars): return +def insert_reduce_ops(block, insert_idx, ring_id, reduce_vars, shard): + """ + _add_allreduce_ops + """ + for var in reduce_vars: + root_id = get_grad_device(var, shard) + assert root_id >= 0, "root id should be a positive int".format(var) + block._insert_op_without_sync( + insert_idx, + type='c_reduce_sum', + inputs={'X': var}, + outputs={'Out': var}, + attrs={ + 'ring_id': ring_id, + 'root_id': root_id, + OP_ROLE_KEY: OpRole.Backward + }) + + return + + def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): """ _add_broadcast_ops @@ -428,7 +464,7 @@ def comm_analyse(main_program): count)) -def add_sync_comm(program, dist_strategy): +def add_sync_comm(program, sharding_ring_id): """ When clone a test prog by clone from the sharding main prog, part of the sync_comm op maybe be pruned by mistake, this function @@ -438,6 +474,7 @@ def add_sync_comm(program, dist_strategy): #NOTE (liangjianzhong): only support one comm stream by now, use more than one # comm streams will cause error. should be revise in future. + assert sharding_ring_id >= 0, "sharding_ring_id should larger than zero" block = program.global_block() not_sync_vars = set([]) for op in block.ops: @@ -448,15 +485,14 @@ def add_sync_comm(program, dist_strategy): for input_name in op.desc.input_arg_names(): not_sync_vars.remove(input_name) if not_sync_vars: - for nccl_id in range(dist_strategy.nccl_comm_num): - block.append_op( - type='c_sync_comm_stream', - inputs={'X': list(not_sync_vars)}, - outputs={'Out': list(not_sync_vars)}, - attrs={ - 'ring_id': nccl_id, - 'op_role': core.op_proto_and_checker_maker.OpRole.Forward - }) + block.append_op( + type='c_sync_comm_stream', + inputs={'X': list(not_sync_vars)}, + outputs={'Out': list(not_sync_vars)}, + attrs={ + 'ring_id': sharding_ring_id, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + }) return @@ -468,7 +504,7 @@ def save_persistables(exe, dirname, main_program, filename=None): """ def is_opt_vars(var): - # NOTE(liangjianzhong): The checks should be updated when add new compatible optimizer + # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer # now only Momentum and adam are compatible with sharding checks = [ "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", "_beta2_pow_acc_0", @@ -479,12 +515,18 @@ def save_persistables(exe, dirname, main_program, filename=None): return True return False + def is_gradient_merge_vars(var): + # NOTE(JZ-LIANG): to revise save/load logic in framework instead of write this naive rule + + return var.name.endswith("@GradiantMerge") + def is_trainable(var): return isinstance(var, paddle.fluid.framework.Parameter) and var.trainable def sharding_predicate(var): - return is_trainable(var) or is_opt_vars(var) + return is_trainable(var) or is_opt_vars(var) or is_gradient_merge_vars( + var) if int(os.environ.get('PADDLE_TRAINER_ID', 0)) == 0: 
paddle.fluid.io.save_persistables( @@ -498,3 +540,42 @@ def save_persistables(exe, dirname, main_program, filename=None): filename=None) return + + +def get_grad_device(grad_name, shard): + assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( + grad_name) + base_name = None + # mind the traversal order + possible_suffixes = ['.cast_fp16@GRAD', '@GRAD'] + for suffix in possible_suffixes: + if suffix in grad_name: + base_name = re.sub(suffix, '', grad_name) + break + + assert base_name in shard.global_param2device, "[{}] should be a param variable.".format( + base_name) + + return shard.global_param2device[base_name] + + +def append_naive_sync(block, sync_var, ring_id): + # NOTE (JZ-LIANG) update this to use barrier sync for more elegent logic + # sync within global + block.append_op( + type="fill_constant", + outputs={"Out": sync_var}, + attrs={ + "shape": sync_var.shape, + "dtype": sync_var.dtype, + "value": int(1), + }) + block.append_op( + type='c_allreduce_sum', + inputs={'X': sync_var}, + outputs={'Out': sync_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index a7f704361d3..cf3f75740ee 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -12,9 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle from paddle.fluid import unique_name, core import paddle.fluid as fluid - from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper from paddle.distributed.fleet.meta_optimizers.common import is_backward_op from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase @@ -24,7 +24,14 @@ from paddle.distributed.fleet.meta_optimizers.sharding.weight_decay_helper impor from paddle.distributed.fleet.meta_optimizers.sharding.gradient_clip_helper import GradientClipHelper from paddle.distributed.fleet.meta_optimizers.sharding.prune import ProgramDeps from paddle.distributed.fleet.meta_optimizers.sharding.utils import * +from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard + +from paddle.fluid import layers + import logging +logging.basicConfig( + format='%(asctime)s %(levelname)-8s %(message)s', + datefmt='%Y-%m-%d %H:%M:%S') from functools import reduce __all__ = ["ShardingOptimizer"] @@ -39,6 +46,7 @@ class ShardingOptimizer(MetaOptimizerBase): "AMPOptimizer", "LarsOptimizer", "LambOptimizer", + "ModelParallelOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] self._main_program = None @@ -50,6 +58,10 @@ class ShardingOptimizer(MetaOptimizerBase): # reduced grads to param name self._reduced_grads_to_param = {} self._shard = Shard() + self._verbose = False + + # use sharding as outer parallelism (e.g. 
inner:Megatron & outer sharding) + self.mp_degree = 1 def _can_apply(self): if not self.role_maker._is_collective: @@ -64,7 +76,7 @@ class ShardingOptimizer(MetaOptimizerBase): def _enable_strategy(self, dist_strategy, context): dist_strategy.sharding = True - dist_strategy.sharding_configs = {"fuse_broadcast_MB": 32} + dist_strategy.sharding_configs = {"segment_broadcast_MB": 32} def minimize_impl(self, loss, @@ -75,11 +87,53 @@ class ShardingOptimizer(MetaOptimizerBase): # self._nrings = self.user_defined_strategy.nccl_comm_num self._nrings_sharding = 1 self._nrings_dp = 1 - self._fuse_broadcast_MB = self.user_defined_strategy.sharding_configs[ - "fuse_broadcast_MB"] + + # parallelism + self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ + "sharding_degree"]) + assert self.sharding_degree > 1, "sharding degree must be larger than zero" + self.mp_degree = int(self.user_defined_strategy.sharding_configs[ + "mp_degree"]) self.hybrid_dp = self.user_defined_strategy.sharding_configs[ "hybrid_dp"] + self.pp_degree = 1 + + # dp here is the pure dp as the outest parallelism + self.dp_degree = int(self.role_maker._worker_num() // self.mp_degree // + self.sharding_degree) + assert self.role_maker._worker_num( + ) == self.dp_degree * self.mp_degree * self.sharding_degree * self.pp_degree + if self.hybrid_dp: + assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( + self.dp_degree) + + # segment + self._sharding_segment_strategy = str( + self.user_defined_strategy.sharding_configs[ + "sharding_segment_strategy"]) + if self._sharding_segment_strategy == "segment_broadcast_MB": + self._broadcast_MB = self.user_defined_strategy.sharding_configs[ + "segment_broadcast_MB"] + assert self._broadcast_MB > 0, "segment size should larger than zero !" + elif self._sharding_segment_strategy == "segment_anchors": + self._sharding_segment_anchors = self.user_defined_strategy.sharding_configs[ + "segment_anchors"] + assert len(self._sharding_segment_anchors + ) > 0, "you should set the sharding segment anchors !" 
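For reference, the two segment strategies parsed above are chosen from user code through sharding_configs. A minimal sketch is given below, following the style of the unit tests later in this patch; the anchor variable name used for segment_anchors is a hypothetical placeholder, not a variable from any real model.

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.sharding = True

    # Strategy A: start a new broadcast segment once the accumulated
    # parameter memory of the current segment exceeds the given size (MB).
    strategy.sharding_configs = {
        "sharding_segment_strategy": "segment_broadcast_MB",
        "segment_broadcast_MB": 32,
        "sharding_degree": 2,
    }

    # Strategy B: cut segments at user-chosen forward/backward anchors instead.
    # strategy.sharding_configs = {
    #     "sharding_segment_strategy": "segment_anchors",
    #     "segment_anchors": ["fc_1.tmp_1"],  # hypothetical variable name
    #     "sharding_degree": 2,
    # }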
+ self._backward_remain_anchors = self._sharding_segment_anchors[:] + self._forward_remain_anchors = [] + else: + raise NotImplementedError( + "the sharding segment strategy [{}] is not implemented".format( + str(self._sharding_segment_strategy))) + + # gradient merge + self._gradient_merge_acc_step = int( + self.user_defined_strategy.sharding_configs[ + "gradient_merge_acc_step"]) + self._grad2merged_grad = dict() + if self.inner_opt is None: raise ValueError( "self.inner_opt of ShardingOptimizer should not be None.") @@ -93,8 +147,11 @@ class ShardingOptimizer(MetaOptimizerBase): self._main_program = main_block.program self._startup_program = startup_program - # step1: set_up - self._set_up(params_grads) + # step0: _init_comm + self._init_comm() + + # step1: _build_shard + self._build_shard(params_grads) # step2: split_program self._split_program(main_block) @@ -104,75 +161,166 @@ class ShardingOptimizer(MetaOptimizerBase): main_block._sync_with_cpp() startup_block._sync_with_cpp() - # step4: insert reduce_sum for grad - insert_scale_loss_grad_ops( - main_block, scale=1.0 / self.role_maker._worker_num()) + # step4: scale the loss by the num of dp degree + # sharding is also a senario of dp + scale_ = self.dp_degree * self.sharding_degree + if scale_ > 1: + insert_scale_loss_grad_ops(main_block, scale=1.0 / scale_) + main_block._sync_with_cpp() # step5: remove unneeded ops and vars from block self._prune_main_program(main_block) self._prune_startup_program(startup_block) + if self.hybrid_dp: + self._initialization_broadcast(startup_program) - # check op dependecy - check_broadcast(main_block) - check_allreduce_sum(main_block, self._shard, self.dp_ring_id) + # step6: optional gradient merge + if self._gradient_merge_acc_step > 1: + self._sharding_gradient_merge(main_block) + + # # check op dependecy + # FIXME (JZ-LIANG) enable checking in future. 
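As a quick cross-check of the loss-gradient scale inserted in step 4 above: sharding is treated as one more flavour of data parallelism, so the scale becomes 1 / (dp_degree * sharding_degree). A small worked example, using the same configuration as the hybrid-dp unit test later in this patch (4 workers, mp_degree=1, sharding_degree=2):

    worker_num = 4
    mp_degree, sharding_degree, pp_degree = 1, 2, 1

    # dp here is the pure data parallelism left after mp / sharding / pp
    dp_degree = worker_num // (mp_degree * sharding_degree * pp_degree)  # 2

    scale = 1.0 / (dp_degree * sharding_degree)
    assert scale == 0.25  # the value checked by test_sharding_hybrid_dp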
+ # check_broadcast(main_block) + # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, + # self.dp_ring_id) self._wait() + return optimize_ops, params_grads - def _set_up(self, params_grads): - # step 1: initialize nccl - self.global_word_size = self.role_maker._worker_num() - self.global_rank = self.role_maker._worker_index() - self.endpoints = self.role_maker._get_trainer_endpoints() - self.current_endpoint = self.endpoints[self.global_rank] - self._collective_helper = CollectiveHelper(self.role_maker, - self._nrings_sharding) + def _init_comm(self): # config sharding & dp groups - self._init_comm() - # sharding + self._build_group() + + startup_block = self._startup_program.global_block() + self.startup_prog_sync_var = startup_block.create_var( + name="startup_prog_sync_var", + shape=[1], + dtype=core.VarDesc.VarType.INT32, + persistable=False) + + # global self._collective_helper._init_communicator( - self._startup_program, self.current_endpoint, - self.sharding_group_endpoints, self.sharding_rank, - self.sharding_ring_id, True) + self._startup_program, + self.current_endpoint, + self.global_endpoints, + self.global_rank, + self.global_ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + append_naive_sync(startup_block, self.startup_prog_sync_var, + self.global_ring_id) + + # mp + if self.mp_degree > 1: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.mp_group_endpoints, + self.mp_rank, + self.mp_ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + append_naive_sync(startup_block, self.startup_prog_sync_var, + self.global_ring_id) + + # sharding + if self.sharding_degree > 1: + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.sharding_group_endpoints, + self.sharding_rank, + self.sharding_ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + append_naive_sync(startup_block, self.startup_prog_sync_var, + self.global_ring_id) + # dp - if self.hybrid_dp: + if self.dp_degree > 1: self._collective_helper._init_communicator( - self._startup_program, self.current_endpoint, - self.dp_group_endpoints, self.dp_rank, self.dp_ring_id, True) + self._startup_program, + self.current_endpoint, + self.dp_group_endpoints, + self.dp_rank, + self.dp_ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + append_naive_sync(startup_block, self.startup_prog_sync_var, + self.global_ring_id) - startup_block = self._startup_program.global_block() startup_block._sync_with_cpp() + def _build_shard(self, params_grads): # step 2: split params self._params = set([x[0].name for x in params_grads]) self._shard.setup(params_grads, self.sharding_rank, - self.sharding_group_size) + self.sharding_degree) # step 3: get broadcast vars self._broadcast_vars = self._shard.find_broadcast_params( self._main_program.global_block()) def _wait(self, ): - endpoints = self.role_maker._get_trainer_endpoints() - current_endpoint = endpoints[self.role_maker._worker_index()] - if self.role_maker._worker_index() == 0: + endpoints = self.global_endpoints[:] + current_endpoint = endpoints[self.global_rank] + if self.global_rank == 0: self._collective_helper._wait(current_endpoint, endpoints) + def collect_segment(self, segment, op_idx, block): + segment._start_idx = op_idx + 1 + self._segments.insert(0, segment) + new_segment = ProgramSegment(block) + new_segment._end_idx = op_idx + 1 + + return new_segment + def _split_program(self, block): for 
op_idx, op in reversed(list(enumerate(block.ops))): if int(op.attr('op_role')) != int(OpRole.Optimize): last_backward_op_idx = op_idx + 1 break + + var2broadcast_time = dict() segment = ProgramSegment(block) segment._end_idx = last_backward_op_idx for op_idx in reversed(range(last_backward_op_idx)): op = block.ops[op_idx] assert (int(op.attr('op_role')) != int(OpRole.Optimize)) - if segment._param_mem >= self._fuse_broadcast_MB: - segment._start_idx = op_idx + 1 - self._segments.insert(0, segment) - segment = ProgramSegment(block) - segment._end_idx = op_idx + 1 + if self._sharding_segment_strategy == "segment_broadcast_MB": + if segment._param_mem >= self._broadcast_MB: + segment = self.collect_segment(segment, op_idx, block) + + elif self._sharding_segment_strategy == "segment_anchors": + if int(op.attr('op_role')) == int(OpRole.Backward): + for input_name in op.desc.input_arg_names(): + + # NOTE (JZ-LIANG) naive rule to support amp, if amp change, should modify here accordingly + if self.user_defined_strategy.amp: + if ".cast_fp16@GRAD" not in input_name: + continue + else: + input_name = input_name[:input_name.find( + ".cast_fp16@GRAD")] + + if input_name in self._backward_remain_anchors: + segment = self.collect_segment(segment, op_idx, + block) + assert input_name not in self._forward_remain_anchors, "segment anchor [{}] met twice !".format( + input_name) + self._backward_remain_anchors.remove(input_name) + self._forward_remain_anchors.append(input_name) + elif int(op.attr('op_role')) == int(OpRole.Forward): + for output_name in op.desc.output_arg_names(): + if output_name in self._forward_remain_anchors: + segment = self.collect_segment(segment, op_idx, + block) + self._forward_remain_anchors.remove(output_name) # find broadcast vars for input_name in op.desc.input_arg_names(): @@ -190,6 +338,21 @@ class ShardingOptimizer(MetaOptimizerBase): broadcast_var_name = unique_name.generate(input_name + "@BroadCast") segment._fill_constant_vars.append(broadcast_var_name) + + # (JZ-LIANG) should use Param base name ? + broadcast_var_base_name = input_name + if "subprog" in broadcast_var_base_name: + # remove suffix + broadcast_var_base_name = broadcast_var_base_name[: + broadcast_var_base_name. 
+ find( + ".subprog" + )] + + var2broadcast_time[ + broadcast_var_base_name] = var2broadcast_time.get( + broadcast_var_base_name, 0) + 1 + segment._param2broadcast[input_name] = broadcast_var_name segment._broadcast_vars.append((broadcast_var_name, self._shard.device(input_name))) @@ -219,6 +382,30 @@ class ShardingOptimizer(MetaOptimizerBase): if segment._param_mem > 0: segment._start_idx = 0 self._segments.insert(0, segment) + + if self._sharding_segment_strategy == "segment_anchors": + assert len( + self._forward_remain_anchors) == 0, "remain anchors {}".format( + self._forward_remain_anchors) + assert len( + self._backward_remain_anchors) == 0, "remain anchors {}".format( + self._backward_remain_anchors) + + if self._verbose: + for varname in sorted( + var2broadcast_time, key=var2broadcast_time.get, + reverse=True): + logging.info("Sharding broadcast: [{}] times [{}]".format( + var2broadcast_time[varname], varname)) + for idx_ in range(len(self._segments)): + logging.info("segment [{}] :".format(idx_)) + logging.info("start op: [{}] [{}]".format(block.ops[ + self._segments[idx_]._start_idx].desc.type(), block.ops[ + self._segments[idx_]._start_idx].desc.input_arg_names( + ))) + logging.info("end op: [{}] [{}]".format(block.ops[ + self._segments[idx_]._end_idx].desc.type(), block.ops[ + self._segments[idx_]._end_idx].desc.input_arg_names())) return def _prune_main_program(self, block): @@ -234,10 +421,21 @@ class ShardingOptimizer(MetaOptimizerBase): """ weightdecay_helper = WeightDecayHelper() weightdecay_helper.prune_weight_decay(block, self._shard) + # NOTE (JZ-LIANG) the sync of FoundInfinite should among one entire Model Parallelism + # group. and each Data Parallelism group should have its own sync of FoundInfinite + # amp could use global group for sync FP16Utils.prune_fp16(block, self._shard, self._reduced_grads_to_param, - self.sharding_ring_id) - gradientclip_helper = GradientClipHelper(self.sharding_ring_id) - gradientclip_helper.prune_gradient_clip(block, self._shard) + self.global_ring_id) + # clipbyglobalnorm should only use the Model paramllelism group (mp-sharding-pp) + if self.mp_degree * self.pp_degree == 1: + # separate the sharding-hybrid senario to keep the accuracy + gradientclip_helper = GradientClipHelper(self.sharding_ring_id) + gradientclip_helper.prune_gradient_clip( + block, self._shard, pure_dp_degree=1) + else: + gradientclip_helper = GradientClipHelper(self.global_ring_id) + gradientclip_helper.prune_gradient_clip( + block, self._shard, pure_dp_degree=self.dp_degree) # build prog deps reduced_grads = [] @@ -307,7 +505,8 @@ class ShardingOptimizer(MetaOptimizerBase): def _add_broadcast_allreduce(self, block): """ - _add_broadcast_allreduce + add broadcast allreduce op + if enable gradient_merge, insert related ops """ if len(self._segments) < 1: return @@ -315,17 +514,27 @@ class ShardingOptimizer(MetaOptimizerBase): if self._segments[-1]._allreduce_vars: shard_allredue_vars = self._shard.filter_grads(self._segments[-1] ._allreduce_vars) - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) - insert_allreduce_ops(block, self._segments[-1]._end_idx, - self.dp_ring_id, shard_allredue_vars) + if self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and len(shard_allredue_vars) >= 1: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, shard_allredue_vars) + insert_allreduce_ops(block, self._segments[-1]._end_idx, + self.dp_ring_id, 
shard_allredue_vars) + # gradient merge + else: + self.create_persistable_gradients_and_insert_merge_ops( + block, + self._startup_program.global_block(), + self._segments[-1]._end_idx, shard_allredue_vars, + self._shard) + insert_sync_comm_ops(block, self._segments[-1]._end_idx, self.sharding_ring_id, self._segments[-1]._allreduce_vars) - insert_allreduce_ops(block, self._segments[-1]._end_idx, - self.sharding_ring_id, - self._segments[-1]._allreduce_vars) + # allreduce --> reduce + insert_reduce_ops(block, self._segments[-1]._end_idx, + self.sharding_ring_id, + self._segments[-1]._allreduce_vars, self._shard) for idx, segment in reversed(list(enumerate(self._segments))): allreduce_vars = self._segments[ @@ -364,19 +573,31 @@ class ShardingOptimizer(MetaOptimizerBase): # step2: add Sync ops shard_allredue_vars = self._shard.filter_grads(allreduce_vars) - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_sync_comm_ops(block, segment._end_idx, self.dp_ring_id, - shard_allredue_vars) + if self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and len(shard_allredue_vars) >= 1: + insert_sync_comm_ops(block, segment._end_idx, + self.dp_ring_id, shard_allredue_vars) + + broad_cast_vars = [x[0] for x in broadcast_vars] + if len(broad_cast_vars) > 0: + insert_sync_comm_ops(block, segment._end_idx, + self.sharding_ring_id, + broad_cast_vars) + else: + comm_dep_vars = allreduce_vars + [ + x[0] for x in broadcast_vars + ] + if len(comm_dep_vars) > 0: + insert_sync_comm_ops(block, segment._end_idx, + self.sharding_ring_id, + comm_dep_vars) + # gradient merge + else: broad_cast_vars = [x[0] for x in broadcast_vars] if len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, self.sharding_ring_id, broad_cast_vars) - else: - comm_dep_vars = allreduce_vars + [x[0] for x in broadcast_vars] - if len(comm_dep_vars) > 0: - insert_sync_comm_ops(block, segment._end_idx, - self.sharding_ring_id, comm_dep_vars) calc_dep_vars = fill_constant_vars + [ k for k, v in cast_ops.items() @@ -394,18 +615,32 @@ class ShardingOptimizer(MetaOptimizerBase): insert_cast_ops(block, segment._end_idx, cast_ops) # step5: add broadcast ops + # gradient merge + if self._gradient_merge_acc_step > 1: + self.create_persistable_gradients_and_insert_merge_ops( + block, + self._startup_program.global_block(), segment._start_idx, + shard_allredue_vars, self._shard) + insert_broadcast_ops(block, segment._start_idx, self.sharding_ring_id, broadcast_vars) + # step6: add all_reduce ops # dp - if self.hybrid_dp and len(shard_allredue_vars) >= 1: - insert_allreduce_ops(block, segment._start_idx, self.dp_ring_id, - shard_allredue_vars) + if self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and len(shard_allredue_vars) >= 1: + insert_allreduce_ops(block, segment._start_idx, + self.dp_ring_id, shard_allredue_vars) + insert_sync_comm_ops(block, segment._start_idx, + self.sharding_ring_id, allreduce_vars) + # gradient merge + else: insert_sync_comm_ops(block, segment._start_idx, self.sharding_ring_id, allreduce_vars) # sharding - insert_allreduce_ops(block, segment._start_idx, - self.sharding_ring_id, allreduce_vars) + # allreduce --> reduce + insert_reduce_ops(block, segment._start_idx, self.sharding_ring_id, + allreduce_vars, self._shard) block._sync_with_cpp() @@ -456,59 +691,440 @@ class ShardingOptimizer(MetaOptimizerBase): block._remove_var(var_name, sync=False) block._sync_with_cpp() - def _init_comm(self): - - if self.hybrid_dp: - self.sharding_group_size = self.user_defined_strategy.sharding_configs[ - 
"sharding_group_size"] - self.sharding_ring_id = 0 - self.sharding_rank = self.global_rank % self.sharding_group_size - - self.dp_group_size = self.global_word_size // self.sharding_group_size - self.dp_rank = self.global_rank // self.sharding_group_size - self.dp_ring_id = self.sharding_rank + 1 - - self.sharding_group_endpoints = [ - ep for idx, ep in enumerate(self.endpoints) - if (idx // self.sharding_group_size) == self.dp_rank - ] - self.dp_group_endpoints = [ - ep for idx, ep in enumerate(self.endpoints) - if (idx % self.sharding_group_size) == self.sharding_rank + def _build_group(self): + """ + pre-assign ring ids + mp: 0 + sharding: 1 + pure-dp: 2 + global: 3 + pp: >= 20 + if one parallelism is not enable: -1 + and only support parallelism hierarchy: mp --> sharding --> pp --> dp + """ + # step 1: initialize nccl + self.global_word_size = self.role_maker._worker_num() + self.global_rank = self.role_maker._worker_index() + self.global_endpoints = self.role_maker._get_trainer_endpoints() + self.current_endpoint = self.global_endpoints[self.global_rank] + self._collective_helper = CollectiveHelper( + self.role_maker, nrings=self._nrings_sharding) + assert self.global_word_size % self.mp_degree == 0, \ + "global_word_size: {} should be divisible to the mp_degree: {}".format(self.global_word_size, self.mp_degree) + assert self.global_word_size % self.sharding_degree == 0, \ + "global_word_size: {} should be divisible to the sharding_degree: {}".format(self.global_word_size, self.sharding_degree) + assert self.global_word_size % self.pp_degree == 0, \ + "global_word_size: {} should be divisible to the pp_degree: {}".format(self.global_word_size, self.pp_degree) + assert self.global_word_size % self.dp_degree == 0, \ + "global_word_size: {} should be divisible to the dp_degree: {}".format(self.global_word_size, self.dp_degree) + + # mp group + if self.mp_degree > 1: + self.mp_ring_id = 0 + self.mp_rank = self.global_rank % self.mp_degree + self.mp_group_id = self.global_rank // self.mp_degree + self.mp_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if idx // self.mp_degree == self.mp_group_id ] - assert self.global_word_size > self.sharding_group_size, \ - "global_word_size: {} should be larger than sharding_group_size: {}".format(self.global_word_size, self.sharding_group_size) - assert self.global_word_size % self.sharding_group_size == 0, \ - "global_word_size: {} should be divisible to the sharding_group_size: {}".format(self.global_word_size, self.sharding_group_size) - assert self.dp_group_size * self.sharding_group_size == self.global_word_size, \ - "global_word_size: {} should be equal to the product of sharding_group_size: {} and dp_group_size: {}".format( - self.global_word_size, - self.sharding_group_size, - self.dp_group_size) - - logging.info("Using Sharing&DP mode !") + assert self.current_endpoint in self.mp_group_endpoints + assert len( + self.mp_group_endpoints + ) == self.mp_degree, "num of mp worker in group is [{}], but mp group size is [{}]".format( + len(self.mp_group_endpoints), self.mp_degree) + else: + self.mp_degree = 1 + self.mp_ring_id = -1 + self.mp_rank = -1 + self.mp_group_id = -1 + self.mp_group_endpoints = [] + + # sharding + if self.sharding_degree > 1: + self.sharding_ring_id = 1 + self.sharding_rank = (self.global_rank // + self.mp_degree) % self.sharding_degree + self.sharding_group_id = self.global_rank // (self.mp_degree * + self.sharding_degree) + # mp + sharding + ... 
+ if self.mp_degree > 1: + self.sharding_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if (idx // (self.mp_degree * self.sharding_degree)) == self. + sharding_group_id and idx % self.mp_degree == self.mp_rank + ] + # sharding + ... + else: + self.sharding_group_endpoints = [ + ep for idx, ep in enumerate(self.global_endpoints) + if (idx // (self.mp_degree * self.sharding_degree) + ) == self.sharding_group_id + ] + assert self.current_endpoint in self.sharding_group_endpoints + else: + self.sharding_degree = 1 + self.sharding_ring_id = -1 + self.sharding_rank = -1 + self.sharding_group_id = -1 + self.sharding_group_endpoints = [] + + # outter-pure-dp group + # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism + # e.g. mp-sharding-pp-dp + # sharding-hybrid-dp as one senario of outter-pure-dp + assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( + self.mp_degree, self.sharding_degree, self.pp_degree, + self.dp_degree, self.global_word_size) + if self.dp_degree > 1: + self.dp_ring_id = 2 + self.dp_rank = self.global_rank // (self.sharding_degree * + self.mp_degree * self.pp_degree) + dp_first_rank_idx = self.global_rank % ( + self.sharding_degree * self.mp_degree * self.pp_degree) + dp_offset = (self.sharding_degree * self.mp_degree * self.pp_degree) + self.dp_group_endpoints = [] + for i in range(self.dp_degree): + self.dp_group_endpoints.append(self.global_endpoints[ + dp_first_rank_idx + dp_offset * i]) + assert self.current_endpoint in self.dp_group_endpoints + logging.info("Hybrid DP mode turn on !") else: - self.sharding_ring_id = 0 - self.sharding_rank = self.global_rank - self.sharding_group_size = self.role_maker._worker_num() - self.sharding_group_endpoints = self.endpoints self.dp_ring_id = -1 self.dp_rank = -1 - self.dp_group_size = None - self.dp_group_endpoints = None + self.dp_group_endpoints = [] - logging.info("Using Sharing alone mode !") + # global group + self.global_ring_id = 3 logging.info("global word size: {}".format(self.global_word_size)) logging.info("global rank: {}".format(self.global_rank)) - logging.info("sharding group_size: {}".format(self.sharding_group_size)) + logging.info("global endpoints: {}".format(self.global_endpoints)) + logging.info("global ring id: {}".format(self.global_ring_id)) + logging.info("#####" * 6) + + logging.info("mp group size: {}".format(self.mp_degree)) + logging.info("mp rank: {}".format(self.mp_rank)) + logging.info("mp group id: {}".format(self.mp_group_id)) + logging.info("mp group endpoints: {}".format(self.mp_group_endpoints)) + logging.info("mp ring id: {}".format(self.mp_ring_id)) + logging.info("#####" * 6) + + logging.info("sharding group size: {}".format(self.sharding_degree)) logging.info("sharding rank: {}".format(self.sharding_rank)) - logging.info("dp group size: {}".format(self.dp_group_size)) - logging.info("dp rank: {}".format(self.dp_rank)) - logging.info("current endpoint: {}".format(self.current_endpoint)) + logging.info("sharding group id: {}".format(self.sharding_group_id)) logging.info("sharding group endpoints: {}".format( self.sharding_group_endpoints)) - logging.info("dp group endpoints: {}".format(self.dp_group_endpoints)) - logging.info("global word endpoints: {}".format(self.endpoints)) + logging.info("sharding ring id: {}".format(self.sharding_ring_id)) + logging.info("#####" * 6) + + 
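To make the bookkeeping in _build_group() above easier to follow, here is a small standalone sketch of how each worker's mp / sharding / pure-dp coordinates would be derived from its global rank. The 8-worker layout (mp=2, sharding=2, pp=1, hence dp=2) is only a hypothetical example, and the formulas simply restate the ones used in the code above.

    mp_degree, sharding_degree, pp_degree = 2, 2, 1
    world_size = 8
    dp_degree = world_size // (mp_degree * sharding_degree * pp_degree)  # 2

    # pre-assigned ring ids, as documented above:
    # mp -> 0, sharding -> 1, pure-dp -> 2, global -> 3 (-1 when disabled)
    for global_rank in range(world_size):
        mp_rank = global_rank % mp_degree
        sharding_rank = (global_rank // mp_degree) % sharding_degree
        dp_rank = global_rank // (mp_degree * sharding_degree * pp_degree)
        print("rank {}: mp {}, sharding {}, dp {}".format(
            global_rank, mp_rank, sharding_rank, dp_rank))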
logging.info("outter pure dp group size: {}".format(self.dp_degree)) + logging.info("outter pure dp rank: {}".format(self.dp_rank)) + logging.info("outter pure dp group endpoints: {}".format( + self.dp_group_endpoints)) + logging.info("outter pure dp ring id: {}".format(self.dp_ring_id)) + logging.info("#####" * 6) return + + def _initialization_broadcast(self, startup_prog): + """ + this funtion is to ensure the initialization between dp group to be + identical when hybrid-dp is used. + """ + block = startup_prog.global_block() + params = [] + for param in block.iter_parameters(): + params.append(param) + block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': self.dp_ring_id, + 'root': 0, + OP_ROLE_KEY: OpRole.Forward + }) + block.append_op( + type='c_sync_comm_stream', + inputs={'X': params}, + outputs={'Out': params}, + attrs={'ring_id': self.dp_ring_id, + OP_ROLE_KEY: OpRole.Forward}) + + # sync within global group + append_naive_sync(block, self.startup_prog_sync_var, + self.global_ring_id) + + # sharding gradient merge + def create_persistable_gradients_and_insert_merge_ops( + self, main_block, startup_block, insert_idx, grad_names, shard): + + for grad_name in grad_names: + assert get_grad_device( + grad_name, shard + ) == shard.worker_idx, "try to merge gradient not belong to current shard: [{}]".format( + grad_name) + persistable_grad_name = grad_name + '@GradiantMerge' + assert grad_name not in self._grad2merged_grad, "grad [{}] already in grad2merged_grad, maybe you meet sharing weight case !".format( + grad_name) + self._grad2merged_grad[grad_name] = persistable_grad_name + grad_var = main_block.var(grad_name) + # create var + gradient_merge_var = main_block.create_var( + name=persistable_grad_name, + shape=grad_var.shape, + dtype=grad_var.dtype, + persistable=True) + startup_gradient_merge_var = startup_block.create_var( + name=persistable_grad_name, + shape=grad_var.shape, + dtype=grad_var.dtype, + persistable=True) + + # merge gradient + main_block._insert_op_without_sync( + insert_idx, + type="elementwise_add", + inputs={'X': grad_name, + 'Y': gradient_merge_var}, + outputs={'Out': gradient_merge_var}, + attrs={ + 'axis': -1, + 'use_mkldnn': False, + OP_ROLE_KEY: OpRole.Backward + }) + + # startup initialization + startup_block.append_op( + type="fill_constant", + outputs={"Out": startup_gradient_merge_var}, + attrs={ + "shape": grad_var.shape, + "dtype": grad_var.dtype, + "value": float(0), + }) + + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + def _create_gm_cond(self, main_block): + # Add const var + acc_step_var = layers.create_global_var( + name="gradient_merge_acc_step", + shape=[1], + value=int(self._gradient_merge_acc_step), + dtype='int32', + persistable=True, + force_cpu=True) + + zero_var = layers.create_global_var( + name="gradient_merge_zero", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + # Add step var & cond var + current_step_var = layers.create_global_var( + name="gradient_merge_current_step", + shape=[1], + value=int(0), + dtype='int32', + persistable=True, + force_cpu=True) + + cond_var = layers.create_global_var( + name="gradient_merge_cond", + shape=[1], + value=bool(0), + dtype='bool', + persistable=False, + force_cpu=True) + + with device_guard("cpu"): + # step_var = (step_var + 1) % k_step + main_block.append_op( + type='increment', + inputs={'X': [current_step_var]}, + outputs={'Out': [current_step_var]}, + attrs={'step': float(1), + 
OP_ROLE_KEY: OpRole.Optimize}) + + main_block.append_op( + type='elementwise_mod', + inputs={'X': current_step_var, + 'Y': acc_step_var}, + outputs={'Out': current_step_var}, + attrs={ + 'axis': -1, + OP_ROLE_KEY: OpRole.Optimize, + 'use_mkldnn': False + }) + + # cond_var = (step_var == 0) + main_block.append_op( + type='equal', + inputs={'X': current_step_var, + 'Y': zero_var}, + outputs={'Out': cond_var}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + # paddle.static.Print(current_step_var, message="in FWBW last conditional") + return cond_var + + def _true_apply_gradient(self): + """ + allreduce grad@gradientmerge in dp group + grad@gradientmerge / acc_step + re-create all optimize ops of origin main block and rename them + cast(backward) + amp + clip + opt + # fill constant grad@gradientmerge + + """ + # current conditional block + main_block = self._main_program.global_block() + cur_block_idx = self._main_program.current_block_idx + cur_block = self._main_program.current_block() + self.cond_block = self._main_program.current_block() + + # cur_block's forward_block & backward_block is itself + cur_block._set_forward_block_idx(cur_block_idx) + + # allreduce grad@gradientmerge + if self.hybrid_dp: + assert self.dp_ring_id >= 0, "dp_ring_id should larger than 0 when in sharding&DP mode" + for grad, merged_grad in self._grad2merged_grad.items(): + merged_grad_var = main_block.var(merged_grad) + cur_block.append_op( + type='c_allreduce_sum', + inputs={'X': merged_grad_var}, + outputs={'Out': merged_grad_var}, + attrs={ + 'ring_id': self.dp_ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + + # grad@gradientmerge / acc_step + for grad, merged_grad in self._grad2merged_grad.items(): + # grad /= k_steps + merged_grad_var = main_block.var(merged_grad) + cur_block.append_op( + type='scale', + inputs={'X': merged_grad_var}, + outputs={'Out': merged_grad_var}, + attrs={ + 'scale': 1.0 / float(self._gradient_merge_acc_step), + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + + # re-create optimize ops + already_moved_var_names = [] + for op_desc in self.original_optimize_ops_desc: + new_op_desc = cur_block.desc.append_op() + new_op_desc.copy_from(op_desc) + + for input_name in new_op_desc.input_arg_names(): + if input_name in self._grad2merged_grad: + new_op_desc._rename_input( + input_name, self._grad2merged_grad[input_name]) + + for output_name in new_op_desc.output_arg_names(): + if output_name in self._grad2merged_grad: + new_op_desc._rename_output( + output_name, self._grad2merged_grad[output_name]) + + # move non temp optimize vars from block0 to cond block + if output_name not in already_moved_var_names and output_name not in self._grad2merged_grad.keys( + ): + var_ = self._main_program.global_block().var(output_name) + if not var_.persistable: + # move + name_ = var_.name + shape_ = var_.shape + type_ = var_.dtype + self._main_program.global_block()._remove_var( + var_.name, sync=False) + self.cond_block.create_var( + name=name_, + shape=shape_, + dtype=type_, + persistable=False) + already_moved_var_names.append(name_) + + self._main_program.global_block()._sync_with_cpp() + cur_block._sync_with_cpp() + + # fill zero to grad@gradientmerge + for grad, merged_grad in self._grad2merged_grad.items(): + merged_grad_var = main_block.var(merged_grad) + cur_block.append_op( + type='fill_constant', + outputs={'Out': merged_grad_var}, + attrs={ + "shape": merged_grad_var.shape, + "dtype": merged_grad_var.dtype, + "value": float(0), + OP_ROLE_KEY: 
OpRole.Optimize + }) + + # lr_var = main_block.var("gradient_merge_current_step") + # paddle.static.Print(lr_var, message="in OPTIMIZE last conditional") + + def _sharding_gradient_merge(self, main_block): + """ + copy all optimize ops in origin main block + remove all optimize ops in origin main block + create cond block + + """ + # copy original optimize ops to temp ops desc list + # remove them from block 0 + tmp_copy_block = self._main_program._create_block() + + self.original_optimize_ops_desc = [] + for op_idx, op in reversed(list(enumerate(main_block.ops))): + if int(op.attr('op_role')) != int(OpRole.Optimize): + continue + else: + tmp_op_desc = tmp_copy_block.desc.append_op() + tmp_op_desc.copy_from(op.desc) + self.original_optimize_ops_desc.append(tmp_op_desc) + main_block._remove_op(op_idx, sync=False) + tmp_copy_block._sync_with_cpp() + self.original_optimize_ops_desc = list( + reversed(self.original_optimize_ops_desc)) + + # back to block 0 + self._main_program._rollback() + + # create cond vars and ops at the end of block 0 + cond = self._create_gm_cond(main_block) + + # create cond block + cond_block = self._main_program._create_block() + self._true_apply_gradient() + + # back to block 0 + self._main_program._rollback() + + # cond op + step_scope = self._main_program.global_block().create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + conditional_block_op = self._main_program.global_block().append_op( + type='conditional_block', + inputs={ + 'Cond': cond, + 'Input': [], + }, + outputs={'Out': [], + 'Scope': [step_scope]}, + attrs={ + 'sub_block': cond_block, + 'is_scalar_condition': True, + }) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py old mode 100644 new mode 100755 index 33e2e387a82..b3a1834d49d --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -115,7 +115,7 @@ class ProgramStats(object): updated_min_idx = min_idx while idx_ > pre_segment_end_idx: if is_amp_cast(self.ops[idx_]): - _logger.debug("found amp-cast op: {}, : {}".format(self.ops[ + _logger.info("found amp-cast op: {}, : {}".format(self.ops[ idx_].desc.type(), self.ops[idx_].desc.input_arg_names()[ 0])) updated_min_idx = idx_ @@ -155,7 +155,7 @@ class ProgramStats(object): sorted_checkpoints = [] for name in checkpoints_name: if name not in self.var_op_deps: - _logger.debug( + _logger.info( "Recompute Optimizer: deleted %s from checkpoints, because it is not used in paddle program." 
% name) elif self.var_op_deps[name]["var_as_output_ops"] == []: @@ -784,7 +784,6 @@ def _append_backward_ops_with_checkpoints_( start_idx = 0 pre_segment_end_idx = -1 while True: - _logger.debug("FW op range[0] - [{}]".format(len(ops))) if start_idx >= len(checkpoints_name) - 1: break # min_idx: checkpoint_1' s input op @@ -797,6 +796,9 @@ def _append_backward_ops_with_checkpoints_( min_idx = program_stat._update_segment_start( min_idx, pre_segment_end_idx) segments.append([min_idx, max_idx + 1]) + else: + _logger.info("Could not recompute op range [{}] - [{}] ".format( + min_idx, max_idx + 1)) start_idx += 1 @@ -806,15 +808,15 @@ def _append_backward_ops_with_checkpoints_( recompute_segments = segments for i, (idx1, idx2) in enumerate(recompute_segments): - _logger.debug("recompute segment[{}]".format(i)) - _logger.debug("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( + _logger.info("recompute segment[{}]".format(i)) + _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( ), ops[idx1].desc.input_arg_names())) - _logger.debug("segment end op: [{}]: [{}]".format(ops[ + _logger.info("segment end op: [{}]: [{}]".format(ops[ idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names())) - _logger.debug("recompute segment[{}]".format(i)) - _logger.debug("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( + _logger.info("recompute segment[{}]".format(i)) + _logger.info("segment start op: [{}]: [{}]".format(ops[idx1].desc.type( ), ops[idx1].desc.input_arg_names())) - _logger.debug("segment end op: [{}]: [{}]".format(ops[ + _logger.info("segment end op: [{}]: [{}]".format(ops[ idx2 - 1].desc.type(), ops[idx2 - 1].desc.input_arg_names())) # 2) go through all forward ops and induct all variables that will be hold in memory @@ -825,9 +827,7 @@ def _append_backward_ops_with_checkpoints_( program_stat.get_out_of_subgraph_vars(segment[0], segment[1])) cross_vars = set(vars_should_be_hold) - set(checkpoints_name) - _logger.debug("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ - len(cross_vars), cross_vars)) - _logger.debug("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ + _logger.info("found [{}] vars which cross recompute segment: [{}], better checkpoints might be set to reduce those vars".format( \ len(cross_vars), cross_vars)) # b. output of seed op should be kept in memory @@ -888,6 +888,17 @@ def _append_backward_ops_with_checkpoints_( continue if name not in var_name_dict: var_name_dict[name] = name + var_suffix + + # we should create the rename var in subprog, otherwise its VarType will be BOOL + ref_var = block.program.global_block().var(name) + block.create_var( + name=var_name_dict[name], + shape=ref_var.shape, + dtype=ref_var.dtype, + type=ref_var.type, + persistable=ref_var.persistable, + stop_gradient=ref_var.stop_gradient) + # 3.a. 
add ops in current recompute_segment as forward recomputation ops buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block, vars_in_memory) diff --git a/python/paddle/fluid/tests/unittests/dist_sharding_save.py b/python/paddle/fluid/tests/unittests/dist_sharding_save.py index 22c930bf894..676b15c0d93 100755 --- a/python/paddle/fluid/tests/unittests/dist_sharding_save.py +++ b/python/paddle/fluid/tests/unittests/dist_sharding_save.py @@ -59,7 +59,11 @@ def runtime_main(): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.sharding = True - strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "sharding_degree": 2, + } optimizer = paddle.fluid.optimizer.Momentum( learning_rate=0.01, momentum=0.9) diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 1c74a11cc4d..549975f5d3f 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -146,7 +146,11 @@ class TestFleetMetaOptimizer(unittest.TestCase): strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} elif name == "sharding": strategy.sharding = True - strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "sharding_degree": 2, + } elif name == "recompute-offload": strategy.recompute = True strategy.recompute_configs = { diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py old mode 100644 new mode 100755 index fa5ce283985..37494294418 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1125,6 +1125,7 @@ class TestDistBase(unittest.TestCase): if check_error_log: print("outs[0]:", outs[0]) print("outs[1]:", outs[1]) + return pickle.loads(outs[0]), pickle.loads(outs[1]) def _run_pipeline(self, model, envs, check_error_log, log_name): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 5da7e627f87..4d6744f2b6f 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -45,6 +45,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -55,9 +56,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'momentum', + 'momentum', 'momentum' ]) def 
test_sharding_amp_optimizer(self): @@ -82,6 +83,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" ])) + self.assertEqual(ops, [ 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', @@ -94,11 +96,10 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', - 'c_sync_calc_stream', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_sync_comm_stream', 'cast', 'cast', 'cast', - 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', - 'c_allreduce_max', 'c_sync_comm_stream', 'cast', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', 'momentum', 'momentum' ]) @@ -124,6 +125,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -134,10 +136,9 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', - 'mul_grad', 'c_sync_calc_stream', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_sync_comm_stream', - 'momentum', 'momentum', 'momentum' + 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' ]) def test_sharding_amp_recompute_optimizer(self): @@ -167,29 +168,27 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" ])) - self.assertEqual(ops, [ - 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'cast', 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', - 'cast', 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', - 'cast', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', - 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', - 'mean', 'elementwise_mul', 'fill_constant', 'scale', - 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'cast', - 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', - 'cast', 'mul', 'cast', 'elementwise_add', 'cast', 'tanh_grad', - 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', - 'cast', 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 
'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', + 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', + 'fill_constant', 'scale', 'elementwise_mul_grad', 'mean_grad', + 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'cast', 'cast', 'cast', - 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', - 'c_allreduce_max', 'c_sync_comm_stream', 'cast', - 'update_loss_scaling', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'cast', + 'cast', 'cast', 'check_finite_and_unscale', 'cast', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'momentum', 'momentum' ]) def test_sharding_weight_decay(self): @@ -227,10 +226,10 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'scale', 'sum', 'scale', 'sum', 'scale', - 'sum', 'momentum', 'momentum', 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'scale', + 'sum', 'scale', 'sum', 'scale', 'sum', 'momentum', 'momentum', + 'momentum' ]) def test_sharding_gradient_clip(self): @@ -253,6 +252,7 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" ])) + self.assertEqual(ops, [ 'fill_constant', 'fill_constant', 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', @@ -263,14 +263,12 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', - 'c_sync_comm_stream', 'square', 'reduce_sum', 'square', - 'reduce_sum', 'square', 'reduce_sum', 'sum', 'c_sync_calc_stream', - 'c_allreduce_sum', 'c_sync_comm_stream', 'sqrt', 'fill_constant', - 'elementwise_max', 'elementwise_div', 'elementwise_mul', - 'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', - 'momentum' + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'square', + 'reduce_sum', 'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', + 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', + 'elementwise_div', 'elementwise_mul', 'elementwise_mul', + 'elementwise_mul', 'momentum', 'momentum', 'momentum' ]) def test_sharding_clone_for_test(self): @@ -281,7 +279,8 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): 
self.optimizer(avg_cost, strategy, train_prog, startup_prog) sharding.utils.comm_analyse(train_prog) test_prog = train_prog.clone(for_test=True) - sharding.utils.add_sync_comm(test_prog, strategy) + # assume sharding_ring_id = 1 + sharding.utils.add_sync_comm(test_prog, 1) ops = [op.type for op in test_prog.global_block().ops] self.assertEqual(ops, [ @@ -293,5 +292,200 @@ class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): ]) +class TestFleetMetaOptimizer(TestFleetMetaOptimizer): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "3" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002,127.0.0.1:36003,127.0.0.1:36004" + + def test_sharding_with_mp(self): + # NOTE(JZ-LIANG) MP parallelism need user to build model with MP API + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + "sharding_degree": 2, + "hybrid_dp": False, + "gradient_merge_acc_step": 1, + "mp_degree": 2 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # should has ring id for MP + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(0, created_ring_ids) + + # check correctness of MP group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + def test_sharding_hybrid_dp(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + "sharding_degree": 2, + "hybrid_dp": True, + "gradient_merge_acc_step": 1, + "mp_degree": 1 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check ring id for outter dp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(2, created_ring_ids) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and 
op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + # check loss scale for sharding hybrid dp + scale_ = -1 + for op in main_prog_ops: + if op.type == "scale": + scale_ = float(op.desc.attr("scale")) + self.assertEqual(scale_, 0.25) + + # check program (allreudce) + ops = [op.type for op in main_prog_ops] + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_hybrid_dp_gm(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, _ = self.net(train_prog, startup_prog) + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.2, + "segment_anchors": None, + "sharding_degree": 2, + "hybrid_dp": True, + "gradient_merge_acc_step": 4, + "mp_degree": 1 + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check ring id for outter dp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(2, created_ring_ids) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of dp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + + # check program + fw_bw_ops = [op.type for op in train_prog.blocks[0].ops] + opt_ops = [op.type for op in train_prog.blocks[2].ops] + self.assertEqual(fw_bw_ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', + 'cross_entropy2', 'mean', 'fill_constant', 'scale', 'mean_grad', + 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', + 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 
'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_sync_comm_stream', 'elementwise_add', 'elementwise_add', + 'elementwise_add', 'increment', 'elementwise_mod', 'equal', + 'conditional_block' + ]) + self.assertEqual(opt_ops, [ + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', + 'scale', 'scale', 'momentum', 'momentum', 'momentum', + 'fill_constant', 'fill_constant', 'fill_constant' + ]) + + # # check loss scale for gradient merge + scale_ = -1 + for op in train_prog.blocks[2].ops: + if op.type == "scale": + scale_ = float(op.desc.attr("scale")) + self.assertEqual(scale_, 0.25) + + if __name__ == "__main__": unittest.main() -- GitLab From 290be88db7f9b5d03936cf01e492becbdceac546 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 2 Apr 2021 20:50:56 +0800 Subject: [PATCH 150/486] use busybox run test on windows openblas (#31728) * use busybox run test on windows openblas * fix error * fix disable_quick and nightly lable issue * add retry on windows openblas * fix bug * use one file to run cpu and gpu tests * fix with grep warning * fix syntax error * change run_unittest to run_unittest_gpu * Update run_unittests.sh fix error --- paddle/scripts/paddle_build.bat | 6 +- tools/windows/run_unittests.sh | 103 +++++++++++++++++++------------- 2 files changed, 65 insertions(+), 44 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index e939c712cbe..0fc8b7097a0 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -52,6 +52,8 @@ if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_UNITY_BUILD set WITH_UNITY_BUILD=OFF if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo if not defined LOG_LEVEL set LOG_LEVEL=normal +if not defined PRECISION_TEST set PRECISION_TEST=OFF +if not defined NIGHTLY_MODE set PRECISION_TEST=OFF rem -------set cache build directory----------- rmdir build\python /s/q @@ -501,7 +503,7 @@ setlocal enabledelayedexpansion set CUDA_DEVICE_COUNT=1 set FLAGS_fraction_of_gpu_memory_to_use=0.92 -%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU% goto:eof @@ -510,7 +512,7 @@ echo ======================================== echo Running CPU unit tests in parallel way ... 
echo ======================================== -ctest.exe -E "(%disable_ut_quickly%)" -LE %nightly_label% --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 +%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU% goto:eof diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index db3f3648ce2..6da2401fbe4 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -16,6 +16,7 @@ set -e set +x NIGHTLY_MODE=$1 PRECISION_TEST=$2 +WITH_GPU=$3 export PADDLE_ROOT="$(cd "$PWD/../" && pwd )" if [ ${NIGHTLY_MODE:-OFF} == "ON" ]; then @@ -204,47 +205,50 @@ long_time_test="^best_fit_allocator_test$|\ ^test_strided_slice_op$|\ ^test_transpose_op$" -export FLAGS_call_stack_level=2 -export FLAGS_fraction_of_gpu_memory_to_use=0.92 -export CUDA_VISIBLE_DEVICES=0 +if [ ${WITH_GPU:-OFF} == "ON" ];then + export FLAGS_call_stack_level=2 + export FLAGS_fraction_of_gpu_memory_to_use=0.92 + export CUDA_VISIBLE_DEVICES=0 -UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') -num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l) -echo "Windows 1 card TestCases count is $num" -if [ ${PRECISION_TEST:-OFF} == "ON" ]; then - python ${PADDLE_ROOT}/tools/get_pr_ut.py - if [[ -f "ut_list" ]]; then - echo "PREC length: "`wc -l ut_list` - precision_cases=`cat ut_list` + UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') + num=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d' | wc -l) + echo "Windows 1 card TestCases count is $num" + if [ ${PRECISION_TEST:-OFF} == "ON" ]; then + python ${PADDLE_ROOT}/tools/get_pr_ut.py + if [[ -f "ut_list" ]]; then + set +x + echo "PREC length: "`wc -l ut_list` + precision_cases=`cat ut_list` + set -x + fi fi -fi -set +e -if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then - UT_list_prec='' - re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') - for case in $UT_list; do - flag=$(echo $case|grep -oE $re) - if [ -n "$flag" ];then - if [ -z "$UT_list_prec" ];then - UT_list_prec=$case + set +e + if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then + UT_list_prec='' + re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') + for case in $UT_list; do + flag=$(echo $case|grep -oE $re) + if [ -n "$flag" ];then + if [ -z "$UT_list_prec" ];then + UT_list_prec=$case + else + UT_list_prec=$UT_list_prec'\n'$case + fi else - UT_list_prec=$UT_list_prec'\n'$case + echo $case "won't run in PRECISION_TEST mode." fi - else - echo $case "won't run in PRECISION_TEST mode." 
- fi - done - UT_list=$UT_list_prec -fi -set -e - -output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") -cpu_parallel_job=$(echo $output | cut -d ";" -f 1) -tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) -two_parallel_job=$(echo $output | cut -d ";" -f 3) -non_parallel_job=$(echo $output | cut -d ";" -f 4) + done + UT_list=$UT_list_prec + fi + set -e + output=$(python ${PADDLE_ROOT}/tools/parallel_UT_rule.py "${UT_list}") + cpu_parallel_job=$(echo $output | cut -d ";" -f 1) + tetrad_parallel_job=$(echo $output | cut -d ";" -f 2) + two_parallel_job=$(echo $output | cut -d ";" -f 3) + non_parallel_job=$(echo $output | cut -d ";" -f 4) +fi failed_test_lists='' tmp_dir=`mktemp -d` @@ -264,7 +268,13 @@ function collect_failed_tests() { set -e } -function run_unittest() { +function run_unittest_cpu() { + tmpfile=$tmp_dir/$RANDOM + (ctest -E "${disable_ut_quickly}" -LE "${nightly_label}" --output-on-failure -C Release -j 8 | tee $tmpfile) & + wait; +} + +function run_unittest_gpu() { test_case=$1 parallel_job=$2 parallel_level_base=${CTEST_PARALLEL_LEVEL:-1} @@ -283,7 +293,11 @@ function run_unittest() { } function unittests_retry(){ - parallel_job=1 + if [ "${WITH_GPU:-OFF}" == "ON" ];then + parallel_job=1 + else + parallel_job=4 + fi is_retry_execuate=0 wintest_error=1 retry_time=3 @@ -334,7 +348,7 @@ function unittests_retry(){ function show_ut_retry_result() { if [[ "$is_retry_execuate" != "0" ]];then - failed_test_lists_ult=`echo "${failed_test_lists}" | grep -o '[^ ].*$'` + failed_test_lists_ult=`echo "${failed_test_lists}"` echo "=========================================" echo "There are more than 10 failed unit tests, so no unit test retry!!!" echo "=========================================" @@ -363,10 +377,15 @@ function show_ut_retry_result() { } set +e -run_unittest $cpu_parallel_job 12 -run_unittest $tetrad_parallel_job 4 -run_unittest $two_parallel_job 2 -run_unittest $non_parallel_job + +if [ "${WITH_GPU:-OFF}" == "ON" ];then + run_unittest_gpu $cpu_parallel_job 12 + run_unittest_gpu $tetrad_parallel_job 4 + run_unittest_gpu $two_parallel_job 2 + run_unittest_gpu $non_parallel_job +else + run_unittest_cpu +fi collect_failed_tests set -e rm -f $tmp_dir/* -- GitLab From 36687d7ab0459d3931adb2ce1fb2efc76fe8a416 Mon Sep 17 00:00:00 2001 From: WeiXin Date: Sat, 3 Apr 2021 11:04:53 +0800 Subject: [PATCH 151/486] delete temporary files (#32055) --- .../static_mode_white_list.cpython-37.pyc | Bin 20217 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tools/__pycache__/static_mode_white_list.cpython-37.pyc diff --git a/tools/__pycache__/static_mode_white_list.cpython-37.pyc b/tools/__pycache__/static_mode_white_list.cpython-37.pyc deleted file mode 100644 index 937267ff7180abf32341455e5d21fcac417782ae..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 20217 zcmeI4cbqImwZ|`4P(+a+B0(jH2q@sa7ZDT{R8&+z1=EVQy>ohI+wGYix@UIpV$NAH z=bUrSIp>@)=bUqX-&5yQchAh?$M^nyazDR&w@+19SDiY!YPr##d+xD?|K73Nd3Ss9 zIa{`TPaoocuim<4%c(rnZ`ys!7TF@Z$X2<8>?*s-CFN4GyIflKkjuzrWly=B>?M24 zKC-V|UiOnK$o_IgIY17SgXCbjl3ZD?B8SMKa+n-0SCu2=YI3ApU5=8YCJW?JdkCw;Ch4NT=oIGBhAWxJh$&=+N@>F@6JYAk4&y;7$v*kJRTzQ^6UtSY49yj|WQ@054RyX8Id zUU{FqUp^ooln=>=RJ=z9wIn zZ^$?0Tk>uBj(k_XC*PMJ$PeX5@?-gl{8WA>KbK#~FXdPAYx#}*R(>bHmp{lKBs2`&Y82bTtWfXjf(f<3|Iz+PZ)un*W5 zTpsKPt^oE2R|E%u1HnPyU~naHWpEX62sjiR1`Y>T1xJ9Zfg{1y!BOC7a16KxxF$Fj z90!gE*8J6Ht;7=jTf!5EwmDo}$7n1VI14t9bKZ~?dnxF@(5xHq^DxG%ULxIcIR zcp!KXcrbVfcqn)ncsO_jcqDifcrD)yac=yybQb?yaK!uyb8P;yav1$ybin`yaBusya~J+ 
zyal`!ybZh^yaT)wybHV=ya&7&ybrt|d;ok9dZt{s{gA{tW&C{tEsE{to^D{t5nd;a2p2EBe0`{l^c0 zUBPbPlHgKccW`O22e^!S2L4s-iFddh|0?zZd-G>c{@n-ct3HE&75fFs75Hg?a7Azc zI1n5J4hB~OR|Z!Bhk!%DVc>9ZRd58j8aNVM9UKLY2FHMFfNO$d!ExYta4oP6YzI5Q z3EkhoCD4Uw*$8acK~+;cLGb`&fq+77jRdQfV+VdEQ1~pkbyoJfEAE~yMt9w zfFT%x5{$w5paM0RfGJo5>tH9?02hFJfO~>_fqR4dfct{`f%}68fCqvHfd_+!fQN#I zfro=ffJcHyfk%VKfD6H6!Q;T=!4tp}!IQv~!BfCf!PCIg!85=!!Lz`#!E?ZK!SlfL z!3)3(FWfqK_0kr8deJ#sj(RaM%+mgotezxkzn_n?q_>ieMp==Re@VQT$Y9Np?N#oIZj8Cr1ku;)-X?HRyqY^dNas$I>=Fz*2Q(7lwBH4JdqTLR*)tx_0t4xw&dm?!~Np<-&?j0J_UT<2Zy$z+< zW~aSLzLrk%a+Hil$)HLlb|qZQib<+njlHR)+OCAl+Gvsv_36spr%67{5rw2b-m#s( zPCTJ~dzkJ_(sAx$$8bvEecMcZUe`xs5RYS~h6t_=65W+;GT)-FdrO>xK&bPu95%j` zd^FG@5AqqR$g!+SnU;-&E*s6)Wf&H7dFyJrGN|&fUMChsuPD)qBHhTUrNgb~aEy3o zz10Nm$|uYhQDIX}gncce-bz^|gKU&l>4cw{(q-)JDjf~7B_F>UBt!JX+2UNU26b5l zE%CvsD%}X{NY*R;AcL7Ar*)lK&*nKFjI`aPcXMPi@-M zS6xhas9`3n8mn~CaEL8nt%OAw3&c#~h*$T~90VH7=)h-b*TEZIli( z*S_U+(pyPJWfeYug)s08Em_+hhvPf!LG4?*oFRfWCWiMVW5dJ3jGf&!=4-26&U;h9 zT(J~mU58njIr>_#VYc?=yh!Ps7D=OmX2NJN^QJ;=BG!<6d>hAD)wIEHwiHH|GVOH? z?{Yp)tBG#uFffG4*;Xu@XqpZ$mphYQk%zhaJe0!HgIrz3xChDjQ3ZkYt!G3ZWLyO{ zDg(?aBC)-o|2fid(WS#Ey|m6;*ZDEb)L!q=htGwLaLP(qWc6yWEI#1MbhMgeNSl7d zbi5a6Jeoj;Wh`$rJ8xzsF3xniR~AJEm-^$8at!rOi&{st{p^)Pn3kar@G@+^iI39r z0!if^tk!VJS5%Iqd2=MeQZT{gqU^24NtLW^=Y%~Nr32L5c#H>G+lEyQns1iYNfT7K zW?IoKwEgi-*5A}o`KH+40ptv!8m z&1iR*kfKfyHX1iNDoN*-Zer-$KGVQBE_Z>*3*vZ>#ga7D}khw z)H$v{7G-cCn4ChbFaeCKey4FY%UD^3MlM|`C{m{lrD$7ic9Jbm2P$f2Z{QSCM!eDb zTeJMi!s$2&wK)J@xvGu7*plcs%`?RbW_>Jd2(DB*lC^EhebX{&+s1WO=E1MBTW%fk zlz7z*lqtt>sRiQAWBk)N1OhK6Jj{k=r53Ntb4V;7}dAC1dtRS(1^PsaSL-160^1)GL9 z%wfm|)@i(w45!5;PuQT~D@{RUW2ByP$uyd$bm+dbo}ermjH$7X+q*GRw;HLw^)5Nq zMmRVLzHdC8EL|<|BU2lPau!(#0T7HiqiDWsu3PAzsfO3C8gm^6)ifEQ8Ycd8jUmp^ z)=kGIOb%h2uH5!)?sdd%T2(Au*}CZm)>kn7+G<82V@Wf?u<2Ph&yjqsGg>DB>K7%7 zd~vM}v-#HIpdQC<(m;^6`5Pofn>S2@7q#8&PCQ!a#gB~35KZrH+dNDow^bPegNfT_ z<;X03HXNhrn#bu>Ol;KxZzN<_mB$Y)VY}-=X33p%d?$OPfvyI_h_;)ltV^EaG2J;Q zJv2J=@V@EfQFVsynvrT+`p^g(M(!_`u3>A8qgUBvTE#~CAt-BErNOdVH1mH=PN$2s zXrv`s)MmcIsq~t8(Yjc8mTd%9*%tIu7}qgoyl6ZL4UPSSyhoUIKSbH431MBW-NxUh zV{*KWapqdq&oofh)Y&~{s>y(RP-~oyhFLX$XEuE!sw?(cXsVu$+70fIQGQJrgI<{h z3W+^a^`*_G`N}yHNV;h_NFT3dEhrkO?J!-GbM>3P0ngU zbeDd6VLHhRm;RPedtBKkAdk#o8rgbTts({Zb@d^xBKlk6;#@!sy}60qRx2- zCBXx8kNxS0OuZZtSE=*f9{t$@y^Y{De{;vhyx9#F-Rt*NMN?+s!S$FdA}&&!qN693 z(H6^x!%J0GOj$mKiBLt-8C##jOy+r85;bF1WNK>=tk?Xa4nvpWGTGhvAsSI$xSGT^ zH)N<#OkG__rnMF{)CuvD#g`1IwAis3Nr@HJfDpkjQkXh#HLKAM%pt5n3nEm+VjnY0 zJ~Dl@7~!=h{cy8bAHtduT*mT>o4NKlf|qW{{t~aR+1v-|WCi`2R9VN+4ALq2e9G)G zvSo-~%^7-`L9@tul?-G#vxLd?uW60RRHuirexXN5w=6hnOBiiP>zX-0FUN&Od%AXu zE|lh$s$Vh3nmzAns92Tjp_}bj<3=_0bKwi6mpUC1?*02UU8VwP~JO5Ph<7S8y6ub5NS8>k2VqNGm(trpulYE6w&TA8v2z zd=O3M@*yD?5v``={Tw+6Vj1#En{+-LYne!EU!hv!dfXuk8APw|jOx()(eyOmZ$=uV zqfLl}SSF;MK{Ua^XoX0Tq>%4TLENKQ?1XW~+i#LQ5G?x=9N9F6*nJ1U8@ zVbSumua9+6PU^^L?GVb}LyAL1l9%qjnmQdW(FJuBG?%O_1g2IG$RklicR8j)zFFQ_ zm;H&xxq&4!X}LR3Mj_v+$63!KkPsqhRXW!DEQXjTtF~J3nU9Fxg1d3;%b87zcBrIA zF$^fS*>szJlPV8ZOv+JOM_KT>S0-0?cQE0Qv6*rY^WVI@A+Yf+6ki#`7F^t-OY(~ zx`$Jew%yE1r=8>x?kPYYWwIV(V6VSxG@taKHw6Dp8^uZ8)|BKJxfym`hw5uO)?&VL!l$6LP$e(2 z(ZCpNavlzyQp%Fyj45(pE}Ukd+V@&35piE__m$!@?-4c>ibtdgo{N$&wx?L02<0(Xrw!C3l4M-qy`*?R3Iuk?!2X$QRA!&k74;l%W4sEW> zpY7AlYgN88^3}dM%&>ELIrTWmOFLDjB9iUw6;mqhAzjhb!bi69j~U`fA{9w;2^(e% z9A@=OJXW!-=g1=?YOe_Qb_RbRdiyl;5eY)Wmtk(6*$fnB4G)U(Tbc2_{+&_}F|Dd+ zOdXbZd4hT$nT3S(J~B+7&50BR*!b zP>sNG;%^eRUd$`aj4v4Xv(=TOLu2KjE*7T+0;7_;yOBOvW4i-kSx*dT=K3du_ zJ8oU48gWr6{dZHfs7gm4W>h4ay|LA_X;(9gu*$jmVkd0#@Q49JM?WMm+onMRVOL&g zncpOGobO~4>lI?LB6F!9m&L|_SU`*B&B(6)c6c51%Vb;$90|3D+YV&vvmGGN 
z!OULBF}?8Uk`oTZ371A0d8B3)LDzW0jGC73^N<^B9p7pWrhv4N#?vuAVvAZwGIJIX zdz6%dvg#tU1@Y+3`DF)jXobMbuZQ< zAnqiwqO{c>nl#<5#mex$QB^$S(Dhg$E=(eBWOD|h?aum*U`>@^erp^LVOY{|IUj{g z&V#YGO+$_%TMKf5MR%*)wvY9|KX^`$5IRj$yxn+{?|Ff*)JW-sl85H_ocrjxi%C<3 zNO)4?gkg;khcM6AjbrU)=%XsTuM?6-Vk=nzV|5zdn=`v`q^Ts!eS(Pd5mK+9_Ko$? zF1|m0(x(*DS=?Iixz0V#5U`qSO?P|L?mi<<<;jY*&ZG4VyKUkEfdhy-O#dT;Jx9?w z9JEU^encOh2`N1=GEePNOl+M`Li}eDUr75bo7Lb)Ptw$@*<_nbH@5un$h~(W8m19$ zTC`D{nSprs%900TK&{zy!=O4FXc!409&}m7057Czt!-(`JsN|-^x@&kvX%tH!HGpg zej4aAj-q57Ly!o9tp>mh%na5mRgRkI(Vm+L=a6|Btfn3b(A@;rosZtQi?{ra4xDFm zhI&eQ%dC85mgtfOhw#)_Hbd0T z7C2vgNkRMg>Q$)qN=}S7{nasfq8&QvAxKAk^;Of9I`r|J#P=bP=;3=b4r=JL3`N-_ z!((udvBcs!qrF(mJ-avuAB^LJ)y_<9s?5q5r(>FFA^p$SPFvK+TJM%%W|31yBlTylz+JWS=f2TfWa zvuC#!9EpToww3FlbXd7Ra?>!hn$Zr{Sp!kuhhUDvy_sffLV;WDO}kqh#TOX%JgAik zos#Bj4?frV7E2Ax9!#)3c28lG%DGK(I>L)2n+Y9P>4ISo zUJxs>ceSS!+yGT$z-rCnPc>UGQL*Vy$Hd!SYF6u|ewyCKP3)Y8zGeHE2WFZbUeEHu zik^#Uz7;RvnLMqt_j)&v%A&t(6h^iV8)Z?(I~^uQN?F^{Xuesq_I__WDWfrx%|fqm z(&(2NCM%ltI#-&@Z=TQcj`4IZ&u!>4HQZWn3z!F4pf8kGO=&v)EK#u&iFxwE+hzhg z-KZ1b<40Tx^R&9tR`xjJj-`MkIHtu;mTqgV#`Vv{CS%ujIwQ??3W-qk>{*YSS6c6c zH?HPHPPJZdQ!THZk+{<#vWJLcsOpVT&o{*{&}rb`B(X7qUC2ARq8%g3g*Losi0<7? zZ<535w5;Ck#q)pLCrP;?KvGw(PgZ=b;p{t_Q`^T-l%y=pMP;i^nmbz@nY*vHO?qm= z0n2#0bgkJOGRC}|Sj4WOgyP|(zH#z@_C}ag=VVl0HB%6(to6m_^>!SuIks;>b3T-f zD0hTHYoh{fTcs<4?p`-uVDIM@6{32QHQqnAaJu9TF%(_(X^UDGwh+VB3v;loWG6Ar5AMBV(Yg>w&) z8vSfa?iK`S!Ps4`I`Oa-+}|*l*}3N49@v-<^3UQhPl^)6CuZ6Y@w#|)e-<^-7+X2j&>!DUlvmJNsXvsDYC(nJxaxL9~xpkW- z3}-B1+3j%iEi7W@RI_~t4e!_tn}!!$=e{=6HaA6EmA}_c!)A^tv-{3%gY8JwUWIO( zOopO^ZRuunB7N@r?%>HM+ZQ}Vg*MmOq_Ne>xpmGr&%GV1?&xc^}uxM*dbirm@2N z#H$c(v!zaEz43+^X5%?`)rrHAR;@!=DXleh*h@wTwZPb4BVPR_Nc~rjOQ=h_aS4|i zImW2+cymI(jsYhYHmBwBmyv^5hqK9sQSthSCXRN$qZ7dlaa?WbzZ1*qaPFKIgY5CZ zvyD)@ZBrlCZ%M6PjnoL-$1s8&-NXPqPCPDAf5|ZZ-e7pu)1$?k8DUpF#W223_>H%j zz$FF=lO!l#@V78Np2X|&EZAz_x8Lg;7gNsF{X9*FZAIu}0}U&Rg~M`KU|JB)gqr&@ zamVc{TG#-Wl#<^Otf`Q+;e(ClBH@f~8S1})=h3gxo4c)??yP%;Z1Mn4x}LC&VwtD0 zh<+-H7rRP2Hj%Jbo#Qqoqaqf|Mr)cEdZ<6n$C=hX!=;dMiP3W5(#YLb(P6I%qHdrczIqu9!hChqB197hGrsy7cP~v| z!*2bD`R3sNo;~WT|GQLG-*6jGqVF}}m^Ytb)-c>i@aqSybxCslRYpAyj+VzN4_>wi zR)h@=mlis2c_*`aesd?;$t;X(9b-@VN9oox7KD^yeyTYo=)B#DfoINZ6GXvqYP)ZDBTbk>9pja9?Y;T+d zs%?-L7rNbzXWn3E%AH}{U2TsGF>$tYgV4?VLp+A+lKdCO1Ihf?CU<$QCh9wB7tX>9 zHeEYh37n@mqyGP_qj$OJfOF10?c6hNoSb>~O-@hFJ^h?>&p7L5$*s;f=iHZWsdnT4 zkQDy<>nX=q+-Eyp4|wb2n}YZ8w`Nz0?Dz@APsmmd*UuzWxwPEL`P^~ijf-|a1tRy$ xZm_rhFPF|^*sZ(#_s{=l4(z(ifB*cK1B)C`olt80xBV$ntbOs#|573c{s#-?Bv=3d -- GitLab From 1e52f324318bfe31e6dc43bcf5eeb682a44ec5d7 Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Sat, 3 Apr 2021 11:07:07 +0800 Subject: [PATCH 152/486] Optimize elementwise_add_grad op, test=develop (#32051) --- .../elementwise/elementwise_add_op.cu | 45 ++++++++++++++----- 1 file changed, 33 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 68fd81f8264..313607d975e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -112,18 +112,39 @@ elementwise_add_grad(const framework::ExecutionContext& ctx, const framework::Tensor* out, const framework::Tensor* dout, framework::Tensor* dx, framework::Tensor* dy) { - auto size = x->numel(); - int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); - dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); - dim3 grid_size = - dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) / - 
PADDLE_CUDA_THREAD_SIZE, - 1); - SimpleElemwiseAddGradCUDAKernel< - T><<().stream()>>>( - dout->data(), size, vec_size, dx->mutable_data(ctx.GetPlace()), - dy->mutable_data(ctx.GetPlace())); + auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* dy_data = dy->mutable_data(ctx.GetPlace()); + auto* dout_data = dout->data(); + if (dx_data == dout_data && dy_data != dout_data) { + VLOG(4) << "Special case when dx_data is the same as dout_data, " + "only need copy dout to dy"; + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dy); + } else if (dx_data != dout_data && dy_data == dout_data) { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "only need copy dout to dx"; + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), dx); + } else if (dx_data != dout_data && dy_data != dout_data) { + auto size = x->numel(); + int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); + dim3 block_size = dim3(PADDLE_CUDA_THREAD_SIZE, 1); + dim3 grid_size = + dim3(((size + vec_size - 1) / vec_size + PADDLE_CUDA_THREAD_SIZE - 1) / + PADDLE_CUDA_THREAD_SIZE, + 1); + SimpleElemwiseAddGradCUDAKernel< + T><<().stream()>>>( + dout->data(), size, vec_size, dx->mutable_data(ctx.GetPlace()), + dy->mutable_data(ctx.GetPlace())); + } else { + VLOG(4) << "Special case when dy_data is the same as dout_data, " + "and dx_data is the same as dout_data, do not need " + "any operator"; + } } } // namespace operators -- GitLab From a3b08bad7e1df1110c75b19bd6117a555ae35d2d Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Tue, 6 Apr 2021 10:24:09 +0800 Subject: [PATCH 153/486] [ROCM] fix the backward maxpool (#32030) --- paddle/fluid/operators/pool_cudnn_op.cu.cc | 30 +++++++ python/paddle/fluid/dygraph/nn.py | 5 ++ .../fluid/tests/unittests/test_conv2d_op.py | 78 ++++++++++++------- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index 8ceb22d8cc4..1bdb3728f53 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -20,6 +20,8 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/cudnn_helper.h" #endif #ifdef PADDLE_WITH_HIP +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/miopen_helper.h" #endif @@ -264,6 +266,34 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { std::string padding_algorithm = ctx.Attr("padding_algorithm"); const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); +#ifdef PADDLE_WITH_HIP + if (pooling_type == "max") { + using OpKernelMap = paddle::framework::OperatorWithKernel::OpKernelMap; + using OpKernelFunc = paddle::framework::OperatorWithKernel::OpKernelFunc; + auto &all_op_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + std::string op_type = "pool2d_grad"; + auto kernels_iter = all_op_kernels.find(op_type); + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in the %s operator.", + op_type)); + OpKernelMap &kernels = kernels_iter->second; + paddle::framework::OpKernelType expected_kernel_key( + paddle::framework::ToDataType(typeid(T)), ctx.GetPlace()); + auto kernel_iter = kernels.find(expected_kernel_key); + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", + op_type, KernelTypeToString(expected_kernel_key))); + std::unique_ptr kernel_func_( + new OpKernelFunc(kernel_iter->second)); + (*kernel_func_)(ctx); + return; + } +#endif + // update paddings auto in_x_dims = input->dims(); framework::DDim data_dims; diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 6decff69ad6..ce728f1121d 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -174,6 +174,11 @@ class Conv2D(layers.Layer): dtype='float32'): assert param_attr is not False, "param_attr should not be False here." super(Conv2D, self).__init__() + + if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( + "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + use_cudnn = False + self._num_channels = num_channels self._groups = groups self._stride = utils.convert_to_list(stride, 2, 'stride') diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 83bba0b0ca1..bbb0f5b1039 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -1470,35 +1470,59 @@ class TestConv2DAPI_Error(unittest.TestCase): not (core.is_compiled_with_cuda() or core.is_compiled_with_rocm()), "core is not compiled with CUDA or ROCM") class TestConv2DEnviron(unittest.TestCase): - def run_conv2d_api(self): - inputs = fluid.layers.data( - shape=[2, 3, 5, 5], - append_batch_size=False, - name="inputs", - dtype="float32") - fluid.layers.conv2d( - input=inputs, - num_filters=4, - filter_size=[3, 3], - stride=[1, 1], - padding=0, - dilation=[1, 1], - groups=1, - data_format="NCHW") - - x_var = paddle.uniform((2, 3, 5, 5), dtype="float32", min=-1., max=1.) 
- conv = paddle.nn.Conv2D( - in_channels=3, - out_channels=4, - kernel_size=(3, 3), - data_format="NCHW") - y_var = conv(x_var) + def run1(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + inputs = fluid.layers.data( + shape=[2, 3, 5, 5], + append_batch_size=False, + name="inputs", + dtype="float32") + result = fluid.layers.conv2d( + input=inputs, + num_filters=4, + filter_size=[3, 3], + stride=[1, 1], + padding=0, + dilation=[1, 1], + groups=1, + data_format="NCHW") + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + fetches = exe.run(fluid.default_main_program(), + feed={"inputs": self.input_np}, + fetch_list=[result]) + + def run2(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.nn.Conv2D( + in_channels=3, + out_channels=4, + kernel_size=(3, 3), + data_format="NCHW") + result = conv(inputs) + + def run3(self, place): + with fluid.dygraph.guard(place): + inputs = fluid.dygraph.to_variable(self.input_np) + conv = paddle.fluid.dygraph.nn.Conv2D( + num_channels=3, + num_filters=4, + filter_size=(3, 3), ) + result = conv(inputs) + + def run_all(self, place): + self.run1(place) + self.run2(place) + self.run3(place) def test_environ(self): - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) - self.run_conv2d_api() - fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) - self.run_conv2d_api() + self.input_np = np.random.random([2, 3, 5, 5]).astype("float32") + for place in [paddle.CPUPlace(), paddle.CUDAPlace(0)]: + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': False}) + self.run_all(place) + fluid.set_flags({'FLAGS_conv2d_disable_cudnn': True}) + self.run_all(place) if __name__ == '__main__': -- GitLab From 2e82b6c8eccdd5d9c0663c0db1bc25af0a83a885 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Tue, 6 Apr 2021 11:28:30 +0800 Subject: [PATCH 154/486] [Hybrid Parallel] Add Topology for hybrid communicate (#32011) * support hyparallel, add topology * fix utest --- python/paddle/distributed/fleet/__init__.py | 13 +- .../paddle/distributed/fleet/base/topology.py | 176 ++++++++++++++++++ .../fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/hybrid_communicate_group.py | 101 ++++++++++ .../test_hybrid_parallel_topology.py | 84 +++++++++ .../fluid/tests/unittests/test_new_group.sh | 1 + 6 files changed, 368 insertions(+), 9 deletions(-) create mode 100644 python/paddle/distributed/fleet/base/topology.py create mode 100644 python/paddle/fluid/tests/unittests/hybrid_communicate_group.py create mode 100644 python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index bd8492ecfa7..ddbf8cbbe3f 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -20,16 +20,13 @@ from .base.util_factory import UtilBase from .dataset import * from .data_generator import MultiSlotDataGenerator, MultiSlotStringDataGenerator from . 
import metrics +from .base.topology import CommunicateTopology, HybridCommunicateGroup __all__ = [ - "DistributedStrategy", - "UtilBase", - "UserDefinedRoleMaker", - "PaddleCloudRoleMaker", - "Fleet", - "MultiSlotDataGenerator", - "MultiSlotStringDataGenerator", - "Role", + "DistributedStrategy", "UtilBase", "UserDefinedRoleMaker", + "PaddleCloudRoleMaker", "Fleet", "MultiSlotDataGenerator", + "MultiSlotStringDataGenerator", "Role", "CommunicateTopology", + "HybridCommunicateGroup" ] fleet = Fleet() diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py new file mode 100644 index 00000000000..4e20ad50611 --- /dev/null +++ b/python/paddle/distributed/fleet/base/topology.py @@ -0,0 +1,176 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import collections +import numpy as np +from itertools import product +from functools import reduce +__all__ = ['CommunicateTopology', 'HybridCommunicateGroup'] + + +class CommunicateTopology(object): + def __init__(self, hybrid_group_names, dims): + self._parallel_names = hybrid_group_names + self._dims = dims + self.coordinate = collections.namedtuple('Coordinate', + self._parallel_names) + self._world_size = reduce(lambda x, y: x * y, self._dims) + + ranges = [range(d) for d in self._dims] + all_coordinate = [self.coordinate(*x) for x in product(*ranges)] + + self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate)))) + self._rank2coord = dict( + zip(self._coord2rank.values(), self._coord2rank.keys())) + + def get_hybrid_group_names(self): + return self._parallel_names + + def get_dim(self, axis_name): + return self._dims[self._parallel_names.index(axis_name)] + + def world_size(self): + return self._world_size + + def get_rank(self, **args): + assert len(args) == len(self._dims) + key = self.coordinate(**args) + assert key in self._coord2rank.keys() + return self._coord2rank[key] + + def get_coord(self, rank): + assert rank < self._world_size + assert rank in self._rank2coord.keys() + return self._rank2coord[rank] + + def get_axis_list(self, axis_name, index): + axis = self._parallel_names.index(axis_name) + ranks = [ + self._coord2rank[coord] for coord in self._coord2rank.keys() + if coord[axis] == index + ] + ranks.sort() + return ranks + + def get_dim_size(self, axis_name): + assert axis_name in self._parallel_names + return self._dims[self._parallel_names.index(axis_name)] + + def get_comm_list(self, axis_name): + assert axis_name in self._parallel_names + other_axis_names = [ + name for name in self._parallel_names if name != axis_name + ] + + ranges = [] + for name in other_axis_names: + dim_num = self.get_dim_size(name) + ranges.append(range(dim_num)) + + all_result = [] + for x in product(*ranges): + key_coord = {} + for other_name in other_axis_names: + key_coord[other_name] = x[other_axis_names.index(other_name)] + + result = [] + for i in range(0, self.get_dim_size(axis_name)): + 
key_coord[axis_name] = i + result.append(self._coord2rank[self.coordinate(**key_coord)]) + all_result.append(result) + + return all_result + + +class HybridCommunicateGroup(object): + def __init__(self, topology): + self.nranks = paddle.distributed.get_world_size() + self.global_rank = paddle.distributed.get_rank() + self._topo = topology + + self._num_data_parallel = self._topo.get_dim('data') + self._num_model_parallel = self._topo.get_dim('model') + self._num_pipe_parallel = self._topo.get_dim('pipe') + + self._data_parallel_id = self._get_data_parallel_id() + self._model_parallel_id = self._get_model_parallel_id() + + assert self._check_vaild_topo( + ), "Here is an unreasonable topogy setting" + + # create comm group for data parallel + self._dp_group, self._dp_comm_group = self._set_comm_group("data") + print("data parallel group", self._dp_group) + + # create comm group for model parallel + self._mp_group, self._mp_comm_group = self._set_comm_group("model") + print("model parallel group", self._mp_group) + + def _check_vaild_topo(self): + return self._num_data_parallel * self._num_model_parallel * self._num_pipe_parallel == self.nranks + + def _set_comm_group(self, parallel_method="data"): + parallel_group = [] + parallel_comm_group = None + parallel_groups = self._topo.get_comm_list(parallel_method) + + for group in parallel_groups: + comm_group = paddle.distributed.new_group(ranks=group) + if self.global_rank in group: + parallel_group = group + parallel_comm_group = comm_group + + assert len(parallel_group) > 0 + assert parallel_comm_group is not None + + return parallel_group, parallel_comm_group + + def topology(self): + return self._topo + + def get_global_rank(self): + return self.global_rank + + # data parallel message: + def _get_data_parallel_id(self): + return self._topo.get_coord(self.global_rank).data + + def get_data_parallel_rank(self): + return self._data_parallel_id + + def get_data_parallel_world_size(self): + return self._num_data_parallel + + def get_data_parallel_group(self): + return self._dp_comm_group + + def get_data_parallel_group_src_rank(self): + return self._dp_comm_group.ranks[0] + + # model parallel message: + def _get_model_parallel_id(self): + return self._topo.get_coord(self.global_rank).model + + def get_model_parallel_rank(self): + return self._model_parallel_id + + def get_model_parallel_world_size(self): + return self._num_model_parallel + + def get_model_parallel_group(self): + return self._mp_comm_group + + def get_model_parallel_group_src_rank(self): + return self._mp_comm_group.ranks[0] diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index add3bbee41d..f4c2318750c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -524,7 +524,6 @@ if(WITH_DISTRIBUTE) if(WITH_GPU OR WITH_ROCM) bash_test_modules(test_c_comm_init_op START_BASH test_c_comm_init_op.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) py_test_modules(test_launch_coverage MODULES test_launch_coverage) - bash_test_modules(test_new_group START_BASH test_new_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) endif() bash_test_modules(test_fleetrun START_BASH test_fleetrun.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) @@ -543,6 +542,7 @@ if(WITH_DISTRIBUTE) endif() endforeach(TEST_OP) bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" 
PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() diff --git a/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py b/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py new file mode 100644 index 00000000000..0a9785475b5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_communicate_group.py @@ -0,0 +1,101 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import os +import paddle +from paddle.distributed import fleet + + +class TestNewGroupAPI(object): + def __init__(self): + paddle.distributed.init_parallel_env() + topo = fleet.CommunicateTopology(["data", "model", "pipe"], [2, 1, 1]) + self.hcg = fleet.HybridCommunicateGroup(topo) + + d1 = np.array([1, 2, 3]) + d2 = np.array([2, 3, 4]) + self.tensor1 = paddle.to_tensor(d1) + self.tensor2 = paddle.to_tensor(d2) + + def test_all(self): + topo = self.hcg.topology() + global_rank = self.hcg.get_data_parallel_rank() + + dp_rank = self.hcg.get_data_parallel_rank() + dp_gp = self.hcg.get_data_parallel_group() + dp_world_size = self.hcg.get_data_parallel_world_size() + dp_src_rank = self.hcg.get_data_parallel_group_src_rank() + np.testing.assert_array_equal(dp_world_size, 2) + np.testing.assert_array_equal(dp_src_rank, 0) + + mp_rank = self.hcg.get_model_parallel_rank() + mp_gp = self.hcg.get_model_parallel_group() + mp_world_size = self.hcg.get_model_parallel_world_size() + mp_src_rank = self.hcg.get_model_parallel_group_src_rank() + np.testing.assert_array_equal(mp_world_size, 1) + + tmp = np.array([0, 0, 0]) + result = paddle.to_tensor(tmp) + paddle.distributed.scatter( + result, [self.tensor2, self.tensor1], + src=dp_src_rank, + group=dp_gp, + use_calc_stream=True) + if dp_rank == 0: + assert np.array_equal(result, self.tensor2) + elif dp_rank == 1: + assert np.array_equal(result, self.tensor1) + print("test scatter api ok") + + paddle.distributed.broadcast( + result, src=1, group=dp_gp, use_calc_stream=True) + assert np.array_equal(result, self.tensor1) + print("test broadcast api ok") + + paddle.distributed.reduce( + result, dst=dp_src_rank, group=dp_gp, use_calc_stream=True) + if dp_rank == 0: + assert np.array_equal(result, + paddle.add(self.tensor1, self.tensor1)) + elif dp_rank == 1: + assert np.array_equal(result, self.tensor1) + print("test reduce api ok") + + paddle.distributed.all_reduce(result, use_calc_stream=True) + assert np.array_equal( + result, + paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) + print("test all_reduce api ok") + + paddle.distributed.wait(result, dp_gp, use_calc_stream=True) + paddle.distributed.wait(result, dp_gp, use_calc_stream=False) + print("test wait api ok") + + result = [] + paddle.distributed.all_gather( + result, self.tensor1, group=dp_gp, use_calc_stream=True) + 
assert np.array_equal(result[0], self.tensor1) + assert np.array_equal(result[1], self.tensor1) + print("test all_gather api ok") + + paddle.distributed.barrier(group=dp_gp) + print("test barrier api ok") + + return + + +if __name__ == "__main__": + gpt = TestNewGroupAPI() + gpt.test_all() diff --git a/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py new file mode 100644 index 00000000000..e4c469599d7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hybrid_parallel_topology.py @@ -0,0 +1,84 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import paddle +import paddle.nn as nn +import unittest +from paddle.distributed import fleet +import numpy as np + + +class TestCommunicateTopology(unittest.TestCase): + def test_topology(self): + topo = fleet.CommunicateTopology(["dp", "mp", "pp"], [2, 2, 2]) + + # test get_comm_list + dp_comm_list = [[0, 4], [1, 5], [2, 6], [3, 7]] + mp_comm_list = [[0, 2], [1, 3], [4, 6], [5, 7]] + pp_comm_list = [[0, 1], [2, 3], [4, 5], [6, 7]] + + np.testing.assert_array_equal(dp_comm_list, topo.get_comm_list("dp")) + np.testing.assert_array_equal(mp_comm_list, topo.get_comm_list("mp")) + np.testing.assert_array_equal(pp_comm_list, topo.get_comm_list("pp")) + + # test get_hybrid_group_names + parallel_names = ["dp", "mp", "pp"] + np.testing.assert_array_equal(parallel_names, + topo.get_hybrid_group_names()) + + # test get_dims + np.testing.assert_array_equal(2, topo.get_dim("dp")) + np.testing.assert_array_equal(2, topo.get_dim("mp")) + np.testing.assert_array_equal(2, topo.get_dim("pp")) + + # test world size + self.assertEqual(topo.world_size(), 8) + + # test get_rank + self.assertEqual(topo.get_rank(dp=0, mp=0, pp=0), 0) + self.assertEqual(topo.get_rank(dp=0, mp=0, pp=1), 1) + self.assertEqual(topo.get_rank(dp=0, mp=1, pp=0), 2) + self.assertEqual(topo.get_rank(dp=0, mp=1, pp=1), 3) + self.assertEqual(topo.get_rank(dp=1, mp=0, pp=0), 4) + self.assertEqual(topo.get_rank(dp=1, mp=0, pp=1), 5) + self.assertEqual(topo.get_rank(dp=1, mp=1, pp=0), 6) + self.assertEqual(topo.get_rank(dp=1, mp=1, pp=1), 7) + + # test get_coord + self.assertEqual(topo.get_coord(0), topo.coordinate(0, 0, 0)) + self.assertEqual(topo.get_coord(1), topo.coordinate(0, 0, 1)) + self.assertEqual(topo.get_coord(2), topo.coordinate(0, 1, 0)) + self.assertEqual(topo.get_coord(3), topo.coordinate(0, 1, 1)) + self.assertEqual(topo.get_coord(4), topo.coordinate(1, 0, 0)) + self.assertEqual(topo.get_coord(5), topo.coordinate(1, 0, 1)) + self.assertEqual(topo.get_coord(6), topo.coordinate(1, 1, 0)) + self.assertEqual(topo.get_coord(7), topo.coordinate(1, 1, 1)) + + # test get_axis_list + self.assertEqual(topo.get_axis_list("dp", 0), [0, 1, 2, 3]) + self.assertEqual(topo.get_axis_list("dp", 1), [4, 5, 6, 7]) + self.assertEqual(topo.get_axis_list("mp", 0), [0, 1, 4, 5]) + 
self.assertEqual(topo.get_axis_list("mp", 1), [2, 3, 6, 7]) + self.assertEqual(topo.get_axis_list("pp", 0), [0, 2, 4, 6]) + self.assertEqual(topo.get_axis_list("pp", 1), [1, 3, 5, 7]) + + # test get_dim_size + self.assertEqual(topo.get_dim_size("dp"), 2) + self.assertEqual(topo.get_dim_size("mp"), 2) + self.assertEqual(topo.get_dim_size("pp"), 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_new_group.sh b/python/paddle/fluid/tests/unittests/test_new_group.sh index 998ead8db32..d0b29a64145 100755 --- a/python/paddle/fluid/tests/unittests/test_new_group.sh +++ b/python/paddle/fluid/tests/unittests/test_new_group.sh @@ -17,3 +17,4 @@ set -e CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 new_group.py +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 hybrid_communicate_group.py -- GitLab From 9e8f9037c0bb1db4a2f50145016d5a10261e0476 Mon Sep 17 00:00:00 2001 From: Kqnonrime <36952116+Kqnonrime@users.noreply.github.com> Date: Tue, 6 Apr 2021 13:37:07 +0800 Subject: [PATCH 155/486] fix two error message (#32039) * fix two error message * fix two error message * fix error * fix error * fix error * fix error --- paddle/fluid/operators/scatter.h | 20 ++++++++++++++------ paddle/fluid/operators/unstack_op.cc | 12 ++++++++---- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/scatter.h b/paddle/fluid/operators/scatter.h index cfa88b9808d..864a94a4235 100644 --- a/paddle/fluid/operators/scatter.h +++ b/paddle/fluid/operators/scatter.h @@ -102,9 +102,13 @@ void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src, // check src shape and dst shape should match for (int i = 1; i < src_dims.size(); i++) - PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i], - platform::errors::InvalidArgument( - "src shape and dst shape should match")); + PADDLE_ENFORCE_EQ( + src_dims[i], dst_dims[i], + platform::errors::InvalidArgument( + "The dimensions of the source tensor and target tensor should" + " match, but received source tensor's %d-th dimension is %d," + "target tensor's %d-th dimension is %d.", + i, src_dims[i], i, dst_dims[i])); // slice size size_t slice_size = 1; @@ -146,9 +150,13 @@ void ScatterAssignAdd(const framework::ExecutionContext& ctx, const Tensor& src, // check src shape and dst shape should match for (int i = 1; i < src_dims.size(); i++) - PADDLE_ENFORCE_EQ(src_dims[i], dst_dims[i], - platform::errors::InvalidArgument( - "src shape and dst shape should match")); + PADDLE_ENFORCE_EQ( + src_dims[i], dst_dims[i], + platform::errors::InvalidArgument( + "The dimensions of the source tensor and target tensor should" + " match, but received source tensor's %d-th dimension is %d," + "target tensor's %d-th dimension is %d.", + i, src_dims[i], i, dst_dims[i])); // slice size size_t slice_size = 1; diff --git a/paddle/fluid/operators/unstack_op.cc b/paddle/fluid/operators/unstack_op.cc index 2f71f10a1c4..71cc586cb59 100644 --- a/paddle/fluid/operators/unstack_op.cc +++ b/paddle/fluid/operators/unstack_op.cc @@ -101,14 +101,18 @@ class UnStackGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE_GT(ctx->Inputs(framework::GradVarName("Y")).size(), 0, platform::errors::InvalidArgument( - "Number of Inputs(Y@Grad) must be larger than 0")); + "The Inputs(Y@Grad) of unstack operator are empty.")); OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", "X", "UnStackGrad"); 
auto input_dims = ctx->GetInputsDim(framework::GradVarName("Y")); for (size_t i = 1; i < input_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(input_dims[i], input_dims[0], - platform::errors::InvalidArgument( - "Dims of all Inputs(Y@Grad) must be the same")); + PADDLE_ENFORCE_EQ( + input_dims[i], input_dims[0], + platform::errors::InvalidArgument( + "The dimensions of all Inputs(Y@Grad) must be the same," + "but received Inputs(Y@Grad)'s %d-th dimension is %d, " + "Inputs(Y@Grad)'s 0-th to %d-th dimension is %d.", + i, input_dims[i], i - 1, input_dims[0])); } int axis = ctx->Attrs().Get("axis"); -- GitLab From 6d6ea569dc1e9ff15fdc774c79276b0f79444f5e Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Tue, 6 Apr 2021 13:45:34 +0800 Subject: [PATCH 156/486] remove pass restrictions for skip-ln pass (#32081) --- paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index ada20113077..0e63320f2f7 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -141,14 +141,6 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); - // check if is in ernie or not - if (!graph->Has(kEmbEltwiseLayernormPass) || - !graph->Has(kMultiheadMatmulPass)) { - LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " - << "Ernie/Bert model. Just skip this pass."; - return; - } - std::unordered_set del_node_set; // Create an SkipLayerNorm op node -- GitLab From b17e36a49851558da99b3ced7d9c6337b314639c Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Tue, 6 Apr 2021 14:46:24 +0800 Subject: [PATCH 157/486] [PaddleTRT] Yolov3 bugfix (#32064) * fix yolobox teller condition * fix cuda double free bug --- paddle/fluid/inference/tensorrt/op_teller.cc | 2 +- paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index f4e7c334632..b681b098c8c 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -206,7 +206,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, (desc.HasAttr("class_num") && desc.HasAttr("anchors") && desc.HasAttr("downsample_ratio") && desc.HasAttr("conf_thresh") && desc.HasAttr("clip_bbox") && desc.HasAttr("scale_x_y")); - return has_attrs; + if (!has_attrs) return false; } if (op_type == "affine_channel") { diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index e1b4c898d21..13d07e77403 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -283,10 +283,7 @@ void YoloBoxPlugin::serialize(void* buffer) const { SerializeValue(&buffer, input_w_); } -void YoloBoxPlugin::destroy() { - cudaFree(anchors_device_); - delete this; -} +void YoloBoxPlugin::destroy() {} void YoloBoxPlugin::setPluginNamespace(const char* lib_namespace) { namespace_ = std::string(lib_namespace); -- GitLab From 78af100c94888ad695e2edbd49f8fc683a2bca87 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Tue, 6 Apr 2021 15:51:35 +0800 Subject: [PATCH 158/486] fix test of 
affine_grid with rocm (#32047) * fix test of affine_grid with rocm * fix test of affine_grid with rocm --- python/paddle/fluid/layers/nn.py | 3 +++ python/paddle/fluid/tests/unittests/test_affine_grid_op.py | 2 ++ python/paddle/nn/functional/vision.py | 2 ++ 3 files changed, 7 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6bc69ffd5cd..34dc1e9b346 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -9260,6 +9260,9 @@ def affine_grid(theta, out_shape, name=None): 'affine_grid') else: attrs['output_shape'] = out_shape + if core.is_compiled_with_rocm(): + # ROCM platform do not have MIOPEN kernel for affine_grid + attrs['use_cudnn'] = False helper.append_op( type='affine_grid', diff --git a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py index e4336ab05d5..8277256009e 100644 --- a/python/paddle/fluid/tests/unittests/test_affine_grid_op.py +++ b/python/paddle/fluid/tests/unittests/test_affine_grid_op.py @@ -83,6 +83,8 @@ class TestAffineGridOpCase1(TestAffineGridOp): self.output_shape = np.array([20, 2, 5, 7]).astype("int32") self.dynamic_shape = True self.use_cudnn = True + if paddle.fluid.core.is_compiled_with_rocm(): + self.use_cudnn = False # ROCM platform do not have MIOPEN kernel for affine_grid self.align_corners = True diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 9e04095e7b7..032d5b47eda 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -119,6 +119,8 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): use_cudnn = True else: use_cudnn = False + if core.is_compiled_with_rocm(): + use_cudnn = False # ROCM platform do not have MIOPEN kernel for affine_grid if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ isinstance(out_shape, Variable)): -- GitLab From 187bf412692d9f36bf50bd0e65024b8314063884 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Tue, 6 Apr 2021 17:21:54 +0800 Subject: [PATCH 159/486] optimize compilation of operators using eigen (#31851) --- paddle/fluid/operators/CMakeLists.txt | 4 +- paddle/fluid/operators/addmm_op.h | 8 +- paddle/fluid/operators/eigen/CMakeLists.txt | 10 +++ paddle/fluid/operators/eigen/broadcast.cc | 86 ++++++++++++++++++ paddle/fluid/operators/eigen/broadcast.cu | 87 +++++++++++++++++++ paddle/fluid/operators/eigen/eigen_function.h | 52 +++++++++++ paddle/fluid/operators/expand_as_op.h | 19 ++-- paddle/fluid/operators/expand_as_v2_op.h | 19 ++-- paddle/fluid/operators/expand_op.h | 22 ++--- paddle/fluid/operators/expand_v2_op.h | 22 ++--- paddle/fluid/operators/meshgrid_op.h | 18 ++-- paddle/fluid/operators/tile_op.h | 22 ++--- 12 files changed, 309 insertions(+), 60 deletions(-) create mode 100644 paddle/fluid/operators/eigen/CMakeLists.txt create mode 100644 paddle/fluid/operators/eigen/broadcast.cc create mode 100644 paddle/fluid/operators/eigen/broadcast.cu create mode 100644 paddle/fluid/operators/eigen/eigen_function.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 467a5ff9063..ed878727532 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -10,6 +10,7 @@ file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists copy_if_different(${pybind_file} ${pybind_file_final}) add_subdirectory(math) +add_subdirectory(eigen) 
add_subdirectory(controlflow) add_subdirectory(detection) add_subdirectory(elementwise) @@ -110,8 +111,9 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_fun set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} box_wrapper boost ps_gpu_wrapper) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} common_infer_shape_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} eigen_cc_function) if (WITH_GPU OR WITH_ROCM) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu bert_encoder_functor eigen_cu_function) endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h index 97e3ed9c1ad..ecfd10d2fa6 100644 --- a/paddle/fluid/operators/addmm_op.h +++ b/paddle/fluid/operators/addmm_op.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" @@ -32,8 +33,8 @@ template using EigenTensor = framework::EigenTensor; -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; using Tensor = framework::Tensor; @@ -105,7 +106,8 @@ class AddMMKernel : public framework::OpKernel { auto eigen_out = EigenTensor::From(*out); auto& place = *context.template device_context().eigen_device(); - eigen_out.device(place) = eigen_input.broadcast(bcast_dims); + EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, x->data(), x_dims[1], y->data(), y_dims[1], beta, diff --git a/paddle/fluid/operators/eigen/CMakeLists.txt b/paddle/fluid/operators/eigen/CMakeLists.txt new file mode 100644 index 00000000000..848bf2433c5 --- /dev/null +++ b/paddle/fluid/operators/eigen/CMakeLists.txt @@ -0,0 +1,10 @@ +file(GLOB EIGEN_CC_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc") +cc_library(eigen_cc_function SRCS ${EIGEN_CC_SOURCES} DEPS eigen3) +if(WITH_GPU OR WITH_ROCM) + file(GLOB EIGEN_CU_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cu") + if(WITH_GPU) + nv_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) + elseif(WITH_ROCM) + hip_library(eigen_cu_function SRCS ${EIGEN_CU_SOURCES} DEPS eigen3) + endif() +endif() diff --git a/paddle/fluid/operators/eigen/broadcast.cc b/paddle/fluid/operators/eigen/broadcast.cc new file mode 100644 index 00000000000..dab25f95493 --- /dev/null +++ b/paddle/fluid/operators/eigen/broadcast.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in, + const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } + + static void Eval(const Eigen::DefaultDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::DefaultDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims) { + out.device(dev) = + in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions()); + } +}; + +#define INSTANTIATION(FUNCTOR, T) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenBroadcast, bool); +INSTANTIATION(EigenBroadcast, platform::float16); +INSTANTIATION(EigenBroadcast, float); +INSTANTIATION(EigenBroadcast, double); +INSTANTIATION(EigenBroadcast, int); +INSTANTIATION(EigenBroadcast, int64_t); +INSTANTIATION(EigenBroadcastGrad, bool); +INSTANTIATION(EigenBroadcastGrad, float); +INSTANTIATION(EigenBroadcastGrad, platform::float16); +INSTANTIATION(EigenBroadcastGrad, double); +INSTANTIATION(EigenBroadcastGrad, int); +INSTANTIATION(EigenBroadcastGrad, int64_t); +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/broadcast.cu b/paddle/fluid/operators/eigen/broadcast.cu new file mode 100644 index 00000000000..63e244d393a --- /dev/null +++ b/paddle/fluid/operators/eigen/broadcast.cu @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + + static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in, + const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } + + static void Eval(const Eigen::GpuDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast) { + out.device(dev) = in.broadcast(bcast); + } +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const Eigen::GpuDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims) { + out.device(dev) = + in.reshape(reshape_dims).sum(reduce_dims).reshape(out.dimensions()); + } +}; + +#define INSTANTIATION(FUNCTOR, T) \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR; \ + template struct FUNCTOR +INSTANTIATION(EigenBroadcast, bool); +INSTANTIATION(EigenBroadcast, platform::float16); +INSTANTIATION(EigenBroadcast, float); +INSTANTIATION(EigenBroadcast, double); +INSTANTIATION(EigenBroadcast, int); +INSTANTIATION(EigenBroadcast, int64_t); +INSTANTIATION(EigenBroadcastGrad, bool); +INSTANTIATION(EigenBroadcastGrad, float); +INSTANTIATION(EigenBroadcastGrad, platform::float16); +INSTANTIATION(EigenBroadcastGrad, double); +INSTANTIATION(EigenBroadcastGrad, int); +INSTANTIATION(EigenBroadcastGrad, int64_t); +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +template struct EigenBroadcastGrad; +#undef INSTANTIATION + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigen/eigen_function.h b/paddle/fluid/operators/eigen/eigen_function.h new file mode 100644 index 00000000000..59669505959 --- /dev/null +++ b/paddle/fluid/operators/eigen/eigen_function.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +template +struct EigenBroadcast { + using Array = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using InType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + using OutType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType32BitIndex = + Eigen::TensorMap, + Eigen::Aligned>; + static void Eval(const EigenDevice& dev, OutType out, InType in, + const Array& bcast); + static void Eval(const EigenDevice& dev, OutType32BitIndex out, + InType32BitIndex in, const Array& bcast); +}; + +template +struct EigenBroadcastGrad { + using Array = Eigen::DSizes; + using Array2 = Eigen::DSizes; + using InType = Eigen::TensorMap< + Eigen::Tensor>; + using OutType = + Eigen::TensorMap>; + static void Eval(const EigenDevice& dev, OutType out, InType in, + const Array& reduce_dims, const Array2& reshape_dims); +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h index cbaeb0c4e42..4cefadb24ec 100644 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -75,7 +76,7 @@ class ExpandAsKernel : public framework::OpKernel { auto in_dims = in0->dims(); auto* target_tensor = context.Input("target_tensor"); auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; int bcast_dims_remainder = 0; auto x_dims = in0->dims(); auto y_dims = target_tensor->dims(); @@ -104,7 +105,8 @@ class ExpandAsKernel : public framework::OpKernel { auto y = EigenTensor::From(*out0); auto& place = *context.template device_context().eigen_device(); - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } }; @@ -165,20 +167,19 @@ class ExpandAsGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h index c36e461926f..441dd353804 100644 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -108,7 +109,7 @@ class ExpandAsV2Kernel : public framework::OpKernel { } } auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < repeat_times.size(); ++i) { bcast_dims[i] = repeat_times[i]; } @@ -122,7 +123,8 @@ class ExpandAsV2Kernel : public framework::OpKernel { auto y = EigenTensor::From(*out0, out_dims); auto& place = *context.template device_context().eigen_device(); - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } }; @@ -191,20 +193,19 @@ class ExpandAsV2GradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h index 8b79a1feb8c..abd525497d6 100644 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -25,6 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -141,7 +142,7 @@ class ExpandKernel : public framework::OpKernel { "of dimensions (%d) of the input.", expand_times.size(), static_cast(in_dims.size()))); auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < expand_times.size(); ++i) { bcast_dims[i] = expand_times[i]; } @@ -160,9 +161,11 @@ class ExpandKernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -241,20 +244,19 @@ class ExpandGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h index ec9c6e62f27..af5fdf22cd9 100644 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -174,7 +175,7 @@ class ExpandV2Kernel : public framework::OpKernel { } auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < repeat_times.size(); ++i) { bcast_dims[i] = repeat_times[i]; } @@ -194,9 +195,11 @@ class ExpandV2Kernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -275,20 +278,19 @@ class ExpandV2GradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h index 162622c7d01..345e007de4a 100644 --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/platform/errors.h" #define MAX_RANK_SUPPORTED 6 @@ -106,19 +107,21 @@ class MeshgridKernel : public framework::OpKernel { reshape_ins_tensor.Resize(out_dims_reshape); framework::DDim out_dims = framework::make_ddim(shape); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (int64_t j = 0; j < size; j++) { bcast_dims[j] = shape[j]; } bcast_dims[i] = 1; outs[i]->Resize(out_dims); - auto x = framework::EigenTensor::From(reshape_ins_tensor); + auto x = framework::EigenTensor::From( + static_cast(reshape_ins_tensor)); outs[i]->mutable_data(context.GetPlace()); auto y = framework::EigenTensor::From(*outs[i]); auto& place = *context.template device_context().eigen_device(); - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -169,21 +172,20 @@ class MeshgridGradKernel : public framework::OpKernel { } } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (int k = 0; k < n; k++) { reduce_dims[k] = reduce_dims_vec[k]; } - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (int k = 0; k < n * 2; k++) { reshape_dims[k] = reshape_dims_vec[k]; } - auto tensor_reduce_tmp = - out_grad_tmp.reshape(reshape_dims).sum(reduce_dims); auto& place = *context.template 
device_context().eigen_device(); - in_grad.device(place) = tensor_reduce_tmp.reshape(in_grad.dimensions()); + EigenBroadcastGrad, T, Rank>::Eval( + place, in_grad, out_grad_tmp, reduce_dims, reshape_dims); } } }; diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h index dffd3e58641..4bbde8d08e0 100644 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 @@ -155,7 +156,7 @@ class TileKernel : public framework::OpKernel { "'repeat_times' for tile op must match after promotion.", vec_in_dims.size(), repeat_times.size())); auto* out0 = context.Output("Out"); - Eigen::DSizes bcast_dims; + Eigen::DSizes bcast_dims; for (size_t i = 0; i < repeat_times.size(); ++i) { bcast_dims[i] = repeat_times[i]; } @@ -175,9 +176,11 @@ class TileKernel : public framework::OpKernel { // use 32-bit index to speed up bool use_32bit_index = y.size() < Eigen::NumTraits::highest(); if (use_32bit_index) { - To32BitIndex(y).device(place) = To32BitIndex(x).broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval( + place, To32BitIndex(y), To32BitIndex(x), bcast_dims); } else { - y.device(place) = x.broadcast(bcast_dims); + EigenBroadcast, T, Rank>::Eval(place, y, x, + bcast_dims); } } }; @@ -255,21 +258,20 @@ class TileGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); - Eigen::DSizes reshape_dims; + Eigen::DSizes reshape_dims; for (size_t i = 0; i < reshape_size; ++i) { reshape_dims[i] = reshape_dims_vec[i]; } - Eigen::DSizes reduce_dims; + Eigen::DSizes reduce_dims; for (size_t i = 0; i < reduce_size; ++i) { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device( - *context.template device_context().eigen_device()) = - out_grad.reshape(reshape_dims) - .sum(reduce_dims) - .reshape(x_grad.dimensions()); + auto& place = + *context.template device_context().eigen_device(); + EigenBroadcastGrad, T, Dims>::Eval( + place, x_grad, out_grad, reduce_dims, reshape_dims); } }; -- GitLab From a17c3691185dbccb086a2959f2148e3ce2557a5f Mon Sep 17 00:00:00 2001 From: joejiong Date: Tue, 6 Apr 2021 17:40:55 +0800 Subject: [PATCH 160/486] fix fc doc (#32084) --- python/paddle/static/nn/common.py | 1 + 1 file changed, 1 insertion(+) mode change 100644 => 100755 python/paddle/static/nn/common.py diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py old mode 100644 new mode 100755 index 0806d2c2914..f917b4fa09a --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -106,6 +106,7 @@ def fc(x, weight_attr (ParamAttr, optional): The attribute for the learnable weight. The default value is None, and the weight will be initialized to zero. For detailed information, please refer to :attr:`paddle.ParamAttr`. + Warning: if x is a list of Tensors, weight_attr should also be a list of the same length. bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. If it is set to False, no bias will be added to the output.
If it is set to None or one kind of ParamAttr, a bias parameter will -- GitLab From b8b82b72f7664ee42651830c11dbc1205ca6aaf4 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 6 Apr 2021 19:47:56 +0800 Subject: [PATCH 161/486] Del cudnn6 code2 (#31986) --- cmake/configure.cmake | 5 + paddle/fluid/platform/dynload/cudnn.cc | 21 +-- paddle/fluid/platform/dynload/cudnn.h | 190 +++++++++++-------------- 3 files changed, 88 insertions(+), 128 deletions(-) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 851520328f3..2a1e6897c02 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -100,6 +100,11 @@ if(WITH_GPU) if(NOT CUDNN_FOUND) message(FATAL_ERROR "Paddle needs cudnn to compile") endif() + + if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + message(FATAL_ERROR "Paddle needs CUDNN >= 7.0 to compile") + endif() + if(CUPTI_FOUND) include_directories(${CUPTI_INCLUDE_DIR}) add_definitions(-DPADDLE_WITH_CUPTI) diff --git a/paddle/fluid/platform/dynload/cudnn.cc b/paddle/fluid/platform/dynload/cudnn.cc index 4c59fe5e9ba..366762401c7 100644 --- a/paddle/fluid/platform/dynload/cudnn.cc +++ b/paddle/fluid/platform/dynload/cudnn.cc @@ -24,26 +24,9 @@ void* cudnn_dso_handle = nullptr; #define DEFINE_WRAP(__name) DynLoad__##__name __name CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); -CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8 -CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R5 -CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); -#endif - -#ifdef CUDNN_DNN_ROUTINE_EACH_R6 -CUDNN_DNN_ROUTINE_EACH_R6(DEFINE_WRAP); +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8 +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DEFINE_WRAP); #endif #ifdef CUDNN_DNN_ROUTINE_EACH_R7 diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index db84b8731f9..f5045ff004e 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,121 +48,93 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreateSpatialTransformerDescriptor); \ - __macro(cudnnSetSpatialTransformerNdDescriptor); \ - 
__macro(cudnnDestroySpatialTransformerDescriptor); \ - __macro(cudnnSpatialTfGridGeneratorForward); \ - __macro(cudnnSpatialTfGridGeneratorBackward); \ - __macro(cudnnSpatialTfSamplerForward); \ - __macro(cudnnSpatialTfSamplerBackward); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnActivationBackward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ - __macro(cudnnFindConvolutionForwardAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ - __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ - __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ - __macro(cudnnGetErrorString); \ - __macro(cudnnCreateDropoutDescriptor); \ - __macro(cudnnDropoutGetStatesSize); \ - __macro(cudnnSetDropoutDescriptor); \ - __macro(cudnnRestoreDropoutDescriptor); \ - __macro(cudnnCreateRNNDescriptor); \ - __macro(cudnnGetRNNParamsSize); \ - __macro(cudnnGetRNNWorkspaceSize); \ - __macro(cudnnGetRNNTrainingReserveSize); \ - __macro(cudnnRNNForwardTraining); \ - __macro(cudnnRNNBackwardData); \ - __macro(cudnnRNNBackwardWeights); \ - __macro(cudnnRNNForwardInference); \ - __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetTensorNdDescriptorEx); - -CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor); \ - __macro(cudnnConvolutionBackwardData); \ - __macro(cudnnConvolutionBackwardFilter); -CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) - -// APIs available after R3: -#if CUDNN_VERSION >= 3000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnActivationBackward); \ + 
__macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnFindConvolutionForwardAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithmEx); \ + __macro(cudnnFindConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnFindConvolutionBackwardDataAlgorithmEx); \ + __macro(cudnnGetErrorString); \ + __macro(cudnnCreateDropoutDescriptor); \ + __macro(cudnnDropoutGetStatesSize); \ + __macro(cudnnSetDropoutDescriptor); \ + __macro(cudnnRestoreDropoutDescriptor); \ + __macro(cudnnCreateRNNDescriptor); \ + __macro(cudnnGetRNNParamsSize); \ + __macro(cudnnGetRNNWorkspaceSize); \ + __macro(cudnnGetRNNTrainingReserveSize); \ + __macro(cudnnRNNForwardTraining); \ + __macro(cudnnRNNBackwardData); \ + __macro(cudnnRNNBackwardWeights); \ + __macro(cudnnRNNForwardInference); \ + __macro(cudnnDestroyDropoutDescriptor); \ + __macro(cudnnDestroyRNNDescriptor); \ + __macro(cudnnSetTensorNdDescriptorEx); \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); \ __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ - __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); \ + __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -// APIs available after R3: -#if CUDNN_VERSION >= 3000 && CUDNN_VERSION < 8000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(__macro) \ +#if CUDNN_VERSION >= 7000 && CUDNN_VERSION < 8000 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(__macro) \ __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ __macro(cudnnGetConvolutionForwardAlgorithm); \ __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ __macro(cudnnSetRNNDescriptor); -CUDNN_DNN_ROUTINE_EACH_AFTER_R3_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs available after R4: -#if CUDNN_VERSION >= 4007 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining); \ - __macro(cudnnBatchNormalizationForwardInference); \ - __macro(cudnnBatchNormalizationBackward); -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs in R5 -#if CUDNN_VERSION >= 5000 -#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor); \ - __macro(cudnnSetActivationDescriptor); \ - __macro(cudnnGetActivationDescriptor); \ - __macro(cudnnDestroyActivationDescriptor); -CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) -#endif - -// APIs in R6 -#if CUDNN_VERSION >= 6000 -#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6); -CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +CUDNN_DNN_ROUTINE_EACH_AFTER_R7_LESS_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif #if CUDNN_VERSION >= 7001 -- GitLab From a881b4d576f3d28e03d1070b16200589dbe5b87c Mon Sep 17 00:00:00 
2001 From: tangwei12 Date: Wed, 7 Apr 2021 13:02:38 +0800 Subject: [PATCH 162/486] Struct SparseValue && Bug Fix (#31721) * add PullSparseValue for pull sparse * fix bug for PullSparseValue * add test mode in lookuptable * revert API change * add comment for is_training --- paddle/fluid/distributed/fleet.cc | 46 +++--------- paddle/fluid/distributed/fleet.h | 13 +--- .../distributed/service/brpc_ps_client.cc | 17 ++++- .../distributed/service/brpc_ps_client.h | 3 +- .../distributed/service/brpc_ps_server.cc | 33 +++++---- .../fluid/distributed/service/communicator.cc | 4 +- paddle/fluid/distributed/service/ps_client.h | 5 +- .../distributed/table/common_graph_table.h | 9 ++- .../distributed/table/common_sparse_table.cc | 52 +++++++------ .../distributed/table/common_sparse_table.h | 3 +- paddle/fluid/distributed/table/common_table.h | 8 +- .../table/depends/large_scale_kv.h | 9 ++- .../distributed/table/depends/sparse_utils.h | 74 +++++++++++++++++++ .../distributed/table/sparse_geo_table.cc | 11 ++- paddle/fluid/distributed/table/table.h | 5 +- paddle/fluid/distributed/table/tensor_table.h | 12 +-- .../test/brpc_service_sparse_sgd_test.cc | 8 +- .../fluid/distributed/test/geo_table_test.cc | 7 +- .../distributed/test/sparse_table_test.cc | 16 +++- .../pscore/distributed_lookup_table_op.cc | 5 ++ .../pscore/distributed_lookup_table_op.h | 7 +- .../paddle/distributed/fleet/utils/ps_util.py | 7 +- 22 files changed, 232 insertions(+), 122 deletions(-) create mode 100644 paddle/fluid/distributed/table/depends/sparse_utils.h diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index b638af49730..9aafdd769ed 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,41 +146,6 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } -std::future FleetWrapper::PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, std::vector* fea_keys, - std::vector>* fea_values, int fea_value_dim) { - fea_keys->clear(); - fea_keys->resize(0); - fea_keys->reserve(MAX_FEASIGN_NUM); - for (auto name : var_names) { - Variable* var = scope.FindVar(name); - if (var == nullptr) { - continue; - } - LoDTensor* tensor = var->GetMutable(); - CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; - int64_t* ids = tensor->data(); - size_t len = tensor->numel(); - for (auto i = 0u; i < len; ++i) { - if (ids[i] == 0u) { - continue; - } - fea_keys->push_back(static_cast(ids[i])); - } - } - fea_values->resize(fea_keys->size() + 1); - for (auto& t : *fea_values) { - t.resize(fea_value_dim); - } - std::vector pull_result_ptr; - for (auto& t : *fea_values) { - pull_result_ptr.push_back(t.data()); - } - return pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); -} - void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, @@ -224,8 +189,10 @@ void FleetWrapper::PullSparseVarsSync( for (auto& t : *fea_values) { pull_result_ptr.push_back(t.data()); } + bool training = true; auto status = pserver_ptr_->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size()); + pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size(), + training); pull_sparse_status.push_back(std::move(status)); for (auto& t : pull_sparse_status) { t.wait(); @@ -238,9 +205,13 @@ void FleetWrapper::PullSparseVarsSync( } } +// 
is_training is true means training, false means inference, the behavior is +// different on pserver + void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, std::vector* outputs) { std::vector fea_keys; @@ -279,7 +250,8 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim, } auto* communicator = Communicator::GetInstance(); auto status = communicator->_worker_ptr->pull_sparse( - pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size()); + pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(), + is_training); status.wait(); auto ret = status.get(); if (ret != 0) { diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index ac566606ddc..863440180a8 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,19 +84,14 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); - // Pull sparse variables from server in async mode - // Param: scope, table_id, var_names, fea_keys, fea_dim - // Param: fea_values std::future - std::future PullSparseVarsAsync( - const Scope& scope, const uint64_t table_id, - const std::vector& var_names, - std::vector* fea_keys, - std::vector>* fea_values, int fea_dim); - // Pull sparse variables from server in sync mode // pull immediately to tensors + // is_training is true means training, false means inference, the behavior is + // different on pserver + void PullSparseToTensorSync(const uint64_t table_id, int fea_dim, uint64_t padding_id, platform::Place place, + bool is_training, std::vector* inputs, // NOLINT std::vector* outputs); // NOLINT diff --git a/paddle/fluid/distributed/service/brpc_ps_client.cc b/paddle/fluid/distributed/service/brpc_ps_client.cc index 5c226e6a0dd..b49a71ab0c1 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.cc +++ b/paddle/fluid/distributed/service/brpc_ps_client.cc @@ -768,8 +768,8 @@ std::future BrpcPsClient::push_global_step(int table_id, std::future BrpcPsClient::pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) { + const uint64_t *keys, size_t num, + bool is_training) { size_t request_call_num = _server_channels.size(); auto shard_sorted_kvs = std::make_shared< @@ -837,16 +837,27 @@ std::future BrpcPsClient::pull_sparse(float **select_values, uint32_t kv_request_count = 0; size_t sorted_kv_size = sorted_kvs.size(); auto &request_buffer = closure->cntl(i)->request_attachment(); + + request_buffer.append((void *)&is_training, sizeof(bool)); + std::vector keys_counter; + keys_counter.reserve(sorted_kv_size); + for (size_t kv_idx = 0; kv_idx < sorted_kv_size; ++kv_idx) { ++kv_request_count; + uint32_t keys = 1; last_key = sorted_kvs[kv_idx].first; request_buffer.append((void *)&last_key, sizeof(uint64_t)); while (kv_idx < sorted_kv_size - 1 && last_key == sorted_kvs[kv_idx + 1].first) { ++kv_idx; + ++keys; } + keys_counter.push_back(keys); } + request_buffer.append((void *)keys_counter.data(), + sizeof(uint32_t) * keys_counter.size()); + if (kv_request_count == 0) { closure->Run(); } else { @@ -956,7 +967,7 @@ int32_t BrpcPsClient::recv_and_save_table(const uint64_t table_id, } auto status = pull_sparse((float **)save_vec.data(), table_id, - save_key.data(), save_key.size()); + save_key.data(), save_key.size(), true); status.wait(); // create lod tensor diff --git a/paddle/fluid/distributed/service/brpc_ps_client.h 
b/paddle/fluid/distributed/service/brpc_ps_client.h index 84a31fdbd5d..5192356e4b5 100644 --- a/paddle/fluid/distributed/service/brpc_ps_client.h +++ b/paddle/fluid/distributed/service/brpc_ps_client.h @@ -148,7 +148,8 @@ class BrpcPsClient : public PSClient { virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, size_t num); + const uint64_t *keys, size_t num, + bool is_training); virtual std::future print_table_stat(uint32_t table_id); diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index d7ff0ecd95a..a9370561a54 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" @@ -337,33 +338,39 @@ int32_t BrpcPsService::pull_sparse(Table *table, brpc::Controller *cntl) { platform::RecordEvent record_event("PsService->pull_sparse"); CHECK_TABLE_EXIST(table, request, response) - thread_local std::string push_sparse_request_buffer; + auto &req_io_buffer = cntl->request_attachment(); auto req_buffer_size = req_io_buffer.size(); + if (req_buffer_size < 1) { set_response_code(response, -1, "req attachment is empty"); return 0; } + if (request.params_size() < 1) { set_response_code(response, -1, "PsRequestMessage.params is requeired at " "least 1 for num of sparse_key"); return 0; } + uint32_t num = *(uint32_t *)(request.params(0).c_str()); - push_sparse_request_buffer.resize(0); - push_sparse_request_buffer.reserve(req_buffer_size); - const char *data = (const char *)cntl->request_attachment().fetch( - const_cast(push_sparse_request_buffer.data()), req_buffer_size); - /* - Attachment Content: - |---keysData---| - |---8*{num}B---| - */ - const uint64_t *keys = (const uint64_t *)data; + auto dim = table->value_accesor()->select_dim(); + + thread_local std::string req_buffer; + req_buffer.reserve(req_buffer_size); + + const void *data = cntl->request_attachment().fetch( + const_cast(req_buffer.data()), req_buffer_size); + + auto value = PullSparseValue(num, dim); + + value.DeserializeFromBytes(const_cast(data)); + std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / sizeof(float)); - table->pull_sparse(res_data.data(), keys, num); + res_data.resize(num * dim); + table->pull_sparse(res_data.data(), value); + cntl->response_attachment().append((char *)res_data.data(), res_data.size() * sizeof(float)); return 0; diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 8699719e5cd..3d5ab8e16d9 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -320,9 +320,11 @@ void Communicator::RpcRecvSparse(const std::string &varname, int table_id, push_g_vec.push_back(tensor->data() + i * dim); } + bool training = true; + auto status = _worker_ptr->pull_sparse( (float **)push_g_vec.data(), table_id, // NOLINT - sparse_push_keys.data(), sparse_push_keys.size()); + sparse_push_keys.data(), sparse_push_keys.size(), training); status.wait(); return; } diff --git a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 7b698afa726..3ff4b9d063f 100644 --- 
a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -112,10 +112,11 @@ class PSClient { // future结束前keys和values缓冲区不能再次使用 // 整合多个线程请求的keys,聚集并分散发送到server // 返回结果后,遍历buffer并对values赋值 + // is_training 用于区分请求是训练/预测,server端对于特征和准入会有不同的处理. virtual std::future pull_sparse(float **select_values, size_t table_id, - const uint64_t *keys, - size_t num) = 0; + const uint64_t *keys, size_t num, + bool is_training) = 0; virtual std::future print_table_stat(uint32_t table_id) = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index de3cac134cd..ab289618462 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -103,13 +103,16 @@ class GraphTable : public SparseTable { Node *find_node(uint64_t id); - virtual int32_t pull_sparse(float *values, const uint64_t *keys, size_t num) { + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) { return 0; } + virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) { return 0; } + virtual void clear() {} virtual int32_t flush() { return 0; } virtual int32_t shrink(const std::string ¶m) { return 0; } @@ -140,5 +143,5 @@ class GraphTable : public SparseTable { std::vector> _shards_task_pool; }; -} -}; +} // namespace distributed +}; // namespace paddle diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index ffedbea14a0..a25a90aa9a7 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -254,7 +254,6 @@ int32_t CommonSparseTable::initialize_value() { } auto accessor = _config.accessor(); - std::vector feasigns; for (size_t x = 0; x < accessor.fea_dim(); ++x) { @@ -271,9 +270,14 @@ int32_t CommonSparseTable::initialize_value() { std::vector ids(bucket_feasigns); std::copy(feasigns.begin() + buckets[x], feasigns.begin() + buckets[x + 1], ids.begin()); + + std::vector fres; + fres.resize(ids.size(), 1); + + auto pull_value = PullSparseValue(ids, fres, param_dim_); std::vector pulls; pulls.resize(bucket_feasigns * param_dim_); - pull_sparse(pulls.data(), ids.data(), bucket_feasigns); + pull_sparse(pulls.data(), pull_value); } return 0; @@ -399,32 +403,36 @@ int32_t CommonSparseTable::pour() { return 0; } -int32_t CommonSparseTable::pull_sparse(float* pull_values, const uint64_t* keys, - size_t num) { +int32_t CommonSparseTable::pull_sparse(float* pull_values, + const PullSparseValue& pull_value) { rwlock_->RDLock(); - std::vector> offset_bucket; - offset_bucket.resize(task_pool_size_); - - for (int x = 0; x < num; ++x) { - auto y = keys[x] % task_pool_size_; - offset_bucket[y].push_back(x); - } - - std::vector> tasks(task_pool_size_); + auto shard_num = task_pool_size_; + std::vector> tasks(shard_num); - for (int shard_id = 0; shard_id < task_pool_size_; ++shard_id) { + for (int shard_id = 0; shard_id < shard_num; ++shard_id) { tasks[shard_id] = _shards_task_pool[shard_id]->enqueue( - [this, shard_id, &keys, &offset_bucket, &pull_values]() -> int { + [this, shard_id, shard_num, &pull_value, &pull_values]() -> int { auto& block = shard_values_[shard_id]; - auto& offsets = offset_bucket[shard_id]; - for (int i = 0; i < offsets.size(); ++i) { - auto offset = offsets[i]; - auto id = keys[offset]; - auto* value = block->Init(id); - std::copy_n(value + param_offset_, param_dim_, - pull_values 
+ param_dim_ * offset); + std::vector offsets; + pull_value.Fission(shard_id, shard_num, &offsets); + + if (pull_value.is_training_) { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto frequencie = pull_value.frequencies_[offset]; + auto* value = block->Init(feasign, true, frequencie); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } + } else { + for (auto& offset : offsets) { + auto feasign = pull_value.feasigns_[offset]; + auto* value = block->Init(feasign, false); + std::copy_n(value + param_offset_, param_dim_, + pull_values + param_dim_ * offset); + } } return 0; diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index 98cbf2b4a21..31f4dabcdfd 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -61,8 +61,7 @@ class CommonSparseTable : public SparseTable { int32_t save(const std::string& path, const std::string& param); virtual std::pair print_table_stat(); - virtual int32_t pull_sparse(float* pull_values, const uint64_t* keys, - size_t num); + virtual int32_t pull_sparse(float* values, const PullSparseValue& pull_value); virtual int32_t push_sparse(const uint64_t* keys, const float* values, size_t num); diff --git a/paddle/fluid/distributed/table/common_table.h b/paddle/fluid/distributed/table/common_table.h index dc3cfa75ff6..bc7f17f5f24 100644 --- a/paddle/fluid/distributed/table/common_table.h +++ b/paddle/fluid/distributed/table/common_table.h @@ -98,8 +98,8 @@ class DenseTable : public Table { virtual ~DenseTable() {} virtual void *get_shard(size_t shard_idx) { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -123,8 +123,8 @@ class BarrierTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index ba79a381a6d..cb077033cad 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -155,7 +155,8 @@ class ValueBlock { } // pull - float *Init(const uint64_t &id, const bool with_update = true) { + float *Init(const uint64_t &id, const bool with_update = true, + const int counter = 1) { if (!Has(id)) { values_[id] = std::make_shared(value_length_); } @@ -163,16 +164,16 @@ class ValueBlock { auto &value = values_.at(id); if (with_update) { - AttrUpdate(value); + AttrUpdate(value, counter); } return value->data_.data(); } - void AttrUpdate(std::shared_ptr value) { + void AttrUpdate(std::shared_ptr value, const int counter) { // update state value->unseen_days_ = 0; - ++value->count_; + value->count_ += counter; if (!value->is_entry_) { value->is_entry_ = entry_func_(value); diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h new file mode 100644 index 00000000000..c185dd17d79 --- /dev/null +++ 
b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +struct PullSparseValue { + explicit PullSparseValue(int numel, int dim) + : numel_(numel), + dim_(dim), + is_training_(true), + feasigns_(nullptr), + frequencies_(nullptr) {} + + explicit PullSparseValue(std::vector feasigns, + std::vector frequencies, int dim) { + numel_ = feasigns.size(); + dim_ = dim; + is_training_ = true; + feasigns_ = feasigns.data(); + frequencies_ = frequencies.data(); + } + + void DeserializeFromBytes(void* bytes) { + /* + |---isTraining--------------| + |---8*{num}B(keysData)------| + |---4*{num}B(Frequencies)---| + */ + auto* begin = reinterpret_cast(bytes); + is_training_ = reinterpret_cast(begin)[0]; + feasigns_ = reinterpret_cast(begin + sizeof(bool)); + frequencies_ = reinterpret_cast(begin + sizeof(bool) + + sizeof(uint64_t) * numel_); + } + + void Fission(const int shard_id, const int shard_num, + std::vector* offset_shard) const { + offset_shard->reserve(numel_ / shard_num + 1); + for (int x = 0; x < numel_; ++x) { + if (feasigns_[x] % shard_num == shard_id) { + offset_shard->push_back(x); + } + } + } + + int numel_; + int dim_; + bool is_training_; + uint64_t* feasigns_; + uint32_t* frequencies_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/sparse_geo_table.cc b/paddle/fluid/distributed/table/sparse_geo_table.cc index 9b276e7de5c..04cd1136382 100644 --- a/paddle/fluid/distributed/table/sparse_geo_table.cc +++ b/paddle/fluid/distributed/table/sparse_geo_table.cc @@ -22,8 +22,17 @@ int32_t SparseGeoTable::pull_geo_param(const uint32_t trainer_id, std::vector* ids) { geo_recorder->GetAndClear(trainer_id, ids); auto dim = _config.common().dims()[0]; + + std::vector frequencies; + frequencies.resize(ids->size(), 1); + + auto pull_value = PullSparseValue(ids->size(), dim); + pull_value.is_training_ = true; + pull_value.feasigns_ = ids->data(); + pull_value.frequencies_ = frequencies.data(); + values->resize(ids->size() * dim); - CommonSparseTable::pull_sparse(values->data(), ids->data(), ids->size()); + CommonSparseTable::pull_sparse(values->data(), pull_value); return 0; } diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index d64e805af40..8f014ac98ba 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -21,6 +21,7 @@ #include #include #include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -47,8 +48,8 @@ class Table { return 0; } - virtual int32_t pull_sparse(float *values, const uint64_t *keys, - 
size_t num) = 0; + virtual int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) = 0; virtual int32_t push_sparse(const uint64_t *keys, const float *values, size_t num) = 0; virtual int32_t push_sparse_param(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/table/tensor_table.h b/paddle/fluid/distributed/table/tensor_table.h index 1a8f1a9cd9a..080682d1314 100644 --- a/paddle/fluid/distributed/table/tensor_table.h +++ b/paddle/fluid/distributed/table/tensor_table.h @@ -52,8 +52,8 @@ class TensorTable : public Table { int32_t push_dense(const float *values, size_t num) override { return 0; } - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -102,8 +102,8 @@ class DenseTensorTable : public TensorTable { DenseTensorTable() {} virtual ~DenseTensorTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, @@ -158,8 +158,8 @@ class GlobalStepTable : public DenseTensorTable { GlobalStepTable() {} virtual ~GlobalStepTable() {} - int32_t pull_sparse(float *values, const uint64_t *keys, - size_t num) override { + int32_t pull_sparse(float *values, + const PullSparseValue &pull_value) override { return 0; } int32_t push_sparse(const uint64_t *keys, const float *values, diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc index fbd236012f5..8fb3434af6e 100644 --- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc +++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc @@ -212,8 +212,8 @@ void RunBrpcPushSparse() { /*-----------------------Test Server Init----------------------------------*/ LOG(INFO) << "Run pull_sparse_param"; - auto pull_status = worker_ptr_->pull_sparse(fea_value_ptr.data(), 0, - fea_keys.data(), fea_keys.size()); + auto pull_status = worker_ptr_->pull_sparse( + fea_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { fea_values.data()[idx] *= 2.0; @@ -241,7 +241,7 @@ void RunBrpcPushSparse() { push_status.wait(); auto pull_param_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_param_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { @@ -275,7 +275,7 @@ void RunBrpcPushSparse() { push_grad_status.wait(); auto pull_update_status = worker_ptr_->pull_sparse( - fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size()); + fea_temp_value_ptr.data(), 0, fea_keys.data(), fea_keys.size(), true); pull_update_status.wait(); for (size_t idx = 0; idx < tensor->numel(); ++idx) { diff --git a/paddle/fluid/distributed/test/geo_table_test.cc b/paddle/fluid/distributed/test/geo_table_test.cc index 22e11acf658..c9f15db3f78 100644 --- a/paddle/fluid/distributed/test/geo_table_test.cc +++ b/paddle/fluid/distributed/test/geo_table_test.cc @@ -23,6 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/ps.pb.h" #include "paddle/fluid/distributed/table/common_dense_table.h" #include "paddle/fluid/distributed/table/common_sparse_table.h" +#include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/sparse_geo_table.h" #include "paddle/fluid/distributed/table/table.h" @@ -53,14 +54,18 @@ TEST(SparseGeoTable, SSUM) { // test push_sparse_param, and create params std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; std::vector init_values; for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { init_values.push_back(0.0); } table->push_sparse_param(init_keys.data(), init_values.data(), init_keys.size()); + std::vector pull_values(init_values.size()); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(pull_values.data(), value); + for (size_t i = 0; i < init_keys.size() * emb_dim; i++) { ASSERT_TRUE(abs(pull_values[i] - init_values[i]) < 1e-5); } diff --git a/paddle/fluid/distributed/test/sparse_table_test.cc b/paddle/fluid/distributed/test/sparse_table_test.cc index 6db95c5fac2..26bede392d6 100644 --- a/paddle/fluid/distributed/test/sparse_table_test.cc +++ b/paddle/fluid/distributed/test/sparse_table_test.cc @@ -55,9 +55,14 @@ TEST(CommonSparseTable, SGD) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + std::vector pull_values(init_values.size()); + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // for check std::vector total_gradients; @@ -100,7 +105,8 @@ TEST(CommonSparseTable, SGD) { std::vector pull_values; pull_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(pull_values.data(), init_keys.data(), init_keys.size()); + table->pull_sparse(init_values.data(), value); + for (size_t i = 0; i < init_values.size(); ++i) { auto update_val = init_values[i] - 1.0 * total_gradients[i]; ASSERT_TRUE(abs(update_val - pull_values[i]) < 1e-5); @@ -148,9 +154,13 @@ TEST(CommonSparseTable, Adam) { // pull parameters for create and check std::vector init_keys = {0, 1, 2, 3, 4}; + std::vector init_fres = {1, 1, 1, 1, 1}; + std::vector init_values; init_values.resize(init_keys.size() * emb_dim); - table->pull_sparse(init_values.data(), init_keys.data(), init_keys.size()); + + auto value = PullSparseValue(init_keys, init_fres, emb_dim); + table->pull_sparse(init_values.data(), value); // push gradient std::vector> trainer_keys; diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 159bdcabd65..277c93fad6a 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -119,6 +119,11 @@ class DistributedLookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "Output data type") .SetDefault(framework::proto::VarType::FP32); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training.") + .SetDefault(false); + AddComment(R"DOC( Lookup Tablel Prefetch Operator. 
This operator is used to perform lookup on parameter W, diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index 0f1a096e207..413b4ab3585 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -30,6 +30,7 @@ class DistributedLookupTableKernel : public framework::OpKernel { auto padding_idx = context.Attr("padding_idx"); auto table_id = context.Attr("table_id"); + bool is_test = context.Attr("is_test"); auto embedding_name = context.InputNames("W").front(); int64_t emb_dim = 0; @@ -55,7 +56,8 @@ class DistributedLookupTableKernel : public framework::OpKernel { if (platform::is_cpu_place(context.GetPlace())) { fleet->PullSparseToTensorSync(static_cast(table_id), emb_dim, static_cast(padding_idx), - context.GetPlace(), &inputs, &outputs); + context.GetPlace(), !is_test, &inputs, + &outputs); } else { auto inputs_variable = context.MultiInputVar("Ids"); auto outputs_variable = context.MultiOutputVar("Outputs"); @@ -93,7 +95,8 @@ class DistributedLookupTableKernel : public framework::OpKernel { // use fleet->PullSparse fleet->PullSparseToTensorSync(static_cast(table_id), emb_dim, static_cast(padding_idx), - cpu_place, &tmp_input_vec, &tmp_output_vec); + cpu_place, !is_test, &tmp_input_vec, + &tmp_output_vec); // cp temp to origin for (size_t idx = 0; idx < output_var_size; ++idx) { diff --git a/python/paddle/distributed/fleet/utils/ps_util.py b/python/paddle/distributed/fleet/utils/ps_util.py index a409d02c984..7bf7bec43de 100644 --- a/python/paddle/distributed/fleet/utils/ps_util.py +++ b/python/paddle/distributed/fleet/utils/ps_util.py @@ -16,6 +16,7 @@ import numpy as np import os import paddle +import warnings class DistributedInfer: @@ -104,8 +105,6 @@ class DistributedInfer: vars=need_load_vars) def get_dist_infer_program(self): - import paddle.distributed.fleet as fleet - varname2tables = self._get_sparse_table_map() convert_program = self._convert_program(self.origin_main_program, varname2tables) @@ -185,6 +184,7 @@ class DistributedInfer: "is_distributed": is_distributed, "padding_idx": padding_idx, "table_id": table_id, + "is_test": True, "lookup_table_version": op_type }) else: @@ -193,6 +193,9 @@ class DistributedInfer: ) pull_sparse_ops = _get_pull_sparse_ops(program) + warnings.warn( + "lookup_table will be forced to test mode when use DistributedInfer" + ) _pull_sparse_fuse(program, pull_sparse_ops) return program -- GitLab From e625f884567d7be6a7554b872f5e92ba66c62afd Mon Sep 17 00:00:00 2001 From: iducn <45056973+iducn@users.noreply.github.com> Date: Wed, 7 Apr 2021 14:42:14 +0800 Subject: [PATCH 163/486] print build summary (#32110) * print build summary * print build summary * print build summary * print build summary --- paddle/scripts/paddle_build.sh | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7516e4c99ea..2df9e0198ee 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -404,7 +404,7 @@ EOF tar -czf paddle_inference.tgz paddle_inference buildSize=$(du -h --max-depth=0 ${PADDLE_ROOT}/build/paddle_inference.tgz |awk '{print $1}') echo "Paddle_Inference Size: $buildSize" - echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" + echo "ipipe_log_param_Paddle_Inference_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt else SYSTEM=`uname -s` 
if [ "$SYSTEM" == "Darwin" ]; then @@ -414,10 +414,10 @@ EOF fi buildSize=$($com ${PADDLE_ROOT}/build |awk '{print $1}') echo "Build Size: $buildSize" - echo "ipipe_log_param_Build_Size: $buildSize" + echo "ipipe_log_param_Build_Size: $buildSize" >> ${PADDLE_ROOT}/build/build_summary.txt PR_whlSize=$($com ${PADDLE_ROOT}/build/python/dist |awk '{print $1}') echo "PR whl Size: $PR_whlSize" - echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" + echo "ipipe_log_param_PR_whl_Size: $PR_whlSize" >> ${PADDLE_ROOT}/build/build_summary.txt fi } @@ -442,7 +442,7 @@ function cmake_gen_and_build() { build $2 endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function build_mac() { @@ -480,7 +480,7 @@ function cmake_gen_and_build_mac() { build_mac endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function run_test() { @@ -684,7 +684,7 @@ EOF #mactest_error=$? ut_endTime_s=`date +%s` echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_Mac_TestCases_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_Mac_TestCases_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt paddle version # Recovery proxy to avoid failure in later steps set +x @@ -993,10 +993,10 @@ EOF num=$(echo $testcases|grep -o '\^'|wc -l) if (( $2 == -1 )); then echo "exclusive TestCases count is $num" - echo "ipipe_log_param_Exclusive_TestCases_Count: $num" + echo "ipipe_log_param_Exclusive_TestCases_Count: $num" >> ${PADDLE_ROOT}/build/build_summary.txt else echo "$2 card TestCases count is $num" - echo "ipipe_log_param_${2}_Cards_TestCases_Count: $num" + echo "ipipe_log_param_${2}_Cards_TestCases_Count: $num" >> ${PADDLE_ROOT}/build/build_summary.txt fi } @@ -1098,10 +1098,10 @@ function card_test() { ut_endTime_s=`date +%s` if (( $2 == -1 )); then echo "exclusive TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_Exclusive_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt else echo "$2 card TestCases Total Time: $[ $ut_endTime_s - $ut_startTime_s ]s" - echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" + echo "ipipe_log_param_${2}_Cards_TestCases_Total_Time: $[ $ut_endTime_s - $ut_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt fi set +m } @@ -1448,7 +1448,7 @@ function parallel_test() { fi ut_total_endTime_s=`date +%s` echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" - echo "ipipe_log_param_TestCases_Total_Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" + echo "ipipe_log_param_TestCases_Total_Time: $[ $ut_total_endTime_s - $ut_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt } function enable_unused_var_check() { @@ -1728,7 +1728,7 @@ EOF fi endTime_s=`date +%s` echo "Build Time: $[ $endTime_s - $startTime_s ]s" - echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" + echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> 
${PADDLE_ROOT}/build/build_summary.txt build_size "paddle_inference" } @@ -1760,7 +1760,7 @@ EOF EXIT_CODE=$? fluid_endTime_s=`date +%s` echo "test_fluid_lib Total Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" - echo "ipipe_log_param_Test_Fluid_Lib_Total_Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" + echo "ipipe_log_param_Test_Fluid_Lib_Total_Time: $[ $fluid_endTime_s - $fluid_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt ./clean.sh if [[ "$EXIT_CODE" != "0" ]]; then exit 8; @@ -1807,7 +1807,7 @@ function example() { function collect_ccache_hits() { rate=$(ccache -s | grep 'cache hit rate' | awk '{print $4}') echo "ccache hit rate: ${rate}%" - echo "ipipe_log_param_Ccache_Hit_Rate: ${rate}%" + echo "ipipe_log_param_Ccache_Hit_Rate: ${rate}%" >> ${PADDLE_ROOT}/build/build_summary.txt } @@ -2029,6 +2029,12 @@ function main() { exit 1 ;; esac + set +x + if [[ -f ${PADDLE_ROOT}/build/build_summary.txt ]];then + echo "=====================build summary======================" + cat ${PADDLE_ROOT}/build/build_summary.txt + echo "========================================================" + fi echo "paddle_build script finished as expected" } -- GitLab From f5186c3c4bab17df887f674a5fcd490c1ddaf332 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 7 Apr 2021 15:35:55 +0800 Subject: [PATCH 164/486] update name of develop whl package and upgrade gcc 4.8.2 to gcc 5.4 (#31240) * update develop whl package name * distingush cpu and gpu name * fix ref_gcc * change whl name * upgrade gcc 4.8 to 5.4 in ubuntu_dev * update gcc4.8 to 5.4 in centos * Upgrade pip from 18.0 to 20.0.1 * change 2.1.0_dev0 to 2.1.0.dev0 in gpu version --- tools/dockerfile/Dockerfile.ubuntu | 8 ++-- tools/dockerfile/build_scripts/install_gcc.sh | 14 +++++++ tools/dockerfile/centos7_manylinux.sh | 4 ++ tools/dockerfile/ubuntu16_dev.sh | 38 ++++++++++--------- tools/dockerfile/ubuntu18_dev.sh | 38 ++++++++++--------- 5 files changed, 62 insertions(+), 40 deletions(-) diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index d68992717c5..f566e66a976 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -97,8 +97,8 @@ RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e44 WORKDIR /home/setuptools-40.6.2 RUN python setup.py build && python setup.py install WORKDIR /home -RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz && tar -zxvf pip-18.0.tar.gz -WORKDIR pip-18.0 +RUN wget https://files.pythonhosted.org/packages/28/af/2c76c8aa46ccdf7578b83d97a11a2d1858794d4be4a1610ade0d30182e8b/pip-20.0.1.tar.gz && tar -zxvf pip-20.0.1.tar.gz +WORKDIR pip-20.0.1 RUN python setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ @@ -106,8 +106,8 @@ RUN python setup.py install && \ python3 setup.py install WORKDIR /home -RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \ - rm -r Python-$version setuptools-40.6.2 pip-18.0 +RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-20.0.1.tar.gz && \ + rm -r Python-$version setuptools-40.6.2 pip-20.0.1 # Install Go and glide RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ diff --git a/tools/dockerfile/build_scripts/install_gcc.sh b/tools/dockerfile/build_scripts/install_gcc.sh index e75021b2a9b..e744e9ddac6 100644 --- a/tools/dockerfile/build_scripts/install_gcc.sh +++ b/tools/dockerfile/build_scripts/install_gcc.sh @@ -43,4 +43,18 @@ 
if [ "$1" == "gcc82" ]; then ln -s /usr/local/gcc-8.2/lib64/libgfortran.so.5 ${lib_so_5} && \ ln -s /usr/local/gcc-8.2/lib64/libstdc++.so.6 ${lib_so_6} && \ cp /usr/local/gcc-8.2/lib64/libstdc++.so.6.0.25 ${lib_path} +elif [ "$1" == "gcc54" ]; then + wget -q http://ftp.tsukuba.wide.ad.jp/software/gcc/releases/gcc-5.4.0/gcc-5.4.0.tar.bz2 + tar -xvf gcc-5.4.0.tar.bz2 && \ + cd gcc-5.4.0 && \ + unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ + ./contrib/download_prerequisites && \ + cd .. && mkdir temp_gcc54 && cd temp_gcc54 && \ + ../gcc-5.4.0/configure --prefix=/usr/local/gcc-5.4 --enable-checking=release --enable-languages=c,c++ --disable-multilib && \ + make -j8 && make install + cd .. && rm -rf temp_gcc54 + cp ${lib_so_6} ${lib_so_6}.bak && rm -f ${lib_so_6} && + ln -s /usr/local/gcc-5.4/lib64/libgfortran.so.5 ${lib_so_5} && \ + ln -s /usr/local/gcc-5.4/lib64/libstdc++.so.6 ${lib_so_6} && \ + cp /usr/local/gcc-5.4/lib64/libstdc++.so.6.0.21 ${lib_path} fi diff --git a/tools/dockerfile/centos7_manylinux.sh b/tools/dockerfile/centos7_manylinux.sh index 490bff22826..5f8a48c8067 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -20,11 +20,15 @@ REPO="${REPO:-paddledocker}" function make_cuda9cudnn7(){ sed 's//9.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + } function make_cuda10cudnn7() { sed 's//10.0-cudnn7-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + } diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh index b7d0d8e3e2a..23578b4143f 100755 --- a/tools/dockerfile/ubuntu16_dev.sh +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -33,7 +33,7 @@ function ref_whl(){ fi if [[ ${WITH_GPU} != "ON" ]]; then - ref_gcc = "" + ref_gcc="" elif [[ ${gcc_version} == "8.2.0" ]];then ref_gcc=_gcc8.2 fi @@ -44,29 +44,31 @@ function ref_whl(){ ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then ref_version=.post101 - elif [[ ${ref_CUDA_MAJOR} == "10.2" ]];then + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} == "develop" ]];then + ref_version=.post102 + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} != "develop" ]];then ref_version="" elif [[ ${ref_CUDA_MAJOR} == "9" ]];then ref_version=.post90 fi + + ref_dev=2.1.0.dev0 ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" - if [[ ${PADDLE_VERSION} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - else - 
ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - fi - - if [[ ${PADDLE_VERSION} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp36-cp36m-linux_x86_64.whl @@ -106,7 +108,7 @@ function install_gcc(){ else sed -i 's##RUN apt-get update \ WORKDIR /usr/bin \ - RUN apt install -y gcc-4.8 g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' Dockerfile.tmp + RUN apt install -y gcc g++ #g' Dockerfile.tmp fi } diff --git a/tools/dockerfile/ubuntu18_dev.sh b/tools/dockerfile/ubuntu18_dev.sh index 19572f639bc..6c6a14529ca 100755 --- a/tools/dockerfile/ubuntu18_dev.sh +++ b/tools/dockerfile/ubuntu18_dev.sh @@ -33,7 +33,7 @@ function ref_whl(){ fi if [[ ${WITH_GPU} != "ON" ]]; then - ref_gcc = "" + ref_gcc="" elif [[ ${gcc_version} == "8.2.0" ]];then ref_gcc=_gcc8.2 fi @@ -44,29 +44,31 @@ function ref_whl(){ ref_version=.post100 elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then ref_version=.post101 - elif [[ ${ref_CUDA_MAJOR} == "10.2" ]];then + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} == "develop" ]];then + ref_version=.post102 + elif [[ ${ref_CUDA_MAJOR} == "10.2" && ${PADDLE_VERSION} != "develop" ]];then ref_version="" elif [[ ${ref_CUDA_MAJOR} == "9" ]];then ref_version=.post90 fi + + ref_dev=2.1.0.dev0 ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" - if [[ ${PADDLE_VERSION} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - 
ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - else - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl - fi - - if [[ ${PADDLE_VERSION} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + if [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}${ref_version}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} == "develop" && ${WITH_GPU} != "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${ref_dev}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${ref_dev}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${ref_dev}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${ref_dev}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${ref_dev}-cp38-cp38-linux_x86_64.whl + elif [[ ${PADDLE_VERSION} != "develop" && ${WITH_GPU} == "ON" ]]; then ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp27-cp27mu-linux_x86_64.whl ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp36-cp36m-linux_x86_64.whl @@ -107,7 +109,7 @@ function install_gcc(){ else sed -i 's##RUN apt-get update \ WORKDIR /usr/bin \ - RUN apt install -y gcc-4.8 g++-4.8 \&\& cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \&\& ln -s gcc-4.8 gcc \&\& ln -s g++-4.8 g++ #g' Dockerfile.tmp + RUN apt install -y gcc g++ #g' Dockerfile.tmp fi } -- GitLab From 10af966afb115ccfaf783c63b8172cb0b4fbbe7e Mon Sep 17 00:00:00 2001 From: CtfGo Date: Wed, 7 Apr 2021 16:38:02 +0800 Subject: [PATCH 165/486] update the TraceLayer.save_inference_model method with add file suffix automatically (#31989) As the title --- python/paddle/fluid/dygraph/jit.py | 78 ++++++++++++------- ...imperative_trace_non_persistable_inputs.py | 10 ++- .../unittests/test_traced_layer_err_msg.py | 40 ++++++++-- 3 files changed, 88 insertions(+), 40 deletions(-) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 4b35d778459..40ab19184c9 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -509,33 +509,33 @@ def save(layer, path, input_spec=None, **configs): Saves input Layer as ``paddle.jit.TranslatedLayer`` format model, which can be used for inference or fine-tuning after loading. - It will save the translated program and all related persistable + It will save the translated program and all related persistable variables of input Layer to given ``path`` . 
- - ``path`` is the prefix of saved objects, and the saved translated program file + + ``path`` is the prefix of saved objects, and the saved translated program file suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` , - and here also saved some additional variable description information to a file, + and here also saved some additional variable description information to a file, its suffix is ``.pdiparams.info``, these additional information is used in fine-tuning. The saved model can be loaded by follow APIs: - - ``paddle.jit.load`` - - ``paddle.static.load_inference_model`` + - ``paddle.jit.load`` + - ``paddle.static.load_inference_model`` - Other C++ inference APIs Args: layer (Layer): The Layer to be saved. path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. - input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward - method, which can be described by InputSpec or example Tensor. If None, all input variables of + input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - **configs (dict, optional): Other save configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other save configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: (1) output_spec (list[Tensor]): Selects the output targets of the saved model. - By default, all return variables of original Layer's forward method are kept as the - output of the saved model. If the provided ``output_spec`` list is not all output variables, - the saved model will be pruned according to the given ``output_spec`` list. + By default, all return variables of original Layer's forward method are kept as the + output of the saved model. If the provided ``output_spec`` list is not all output variables, + the saved model will be pruned according to the given ``output_spec`` list. Returns: None @@ -793,8 +793,8 @@ def load(path, **configs): """ :api_attr: imperative - Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or - paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, + Load model saved by ``paddle.jit.save`` or ``paddle.static.save_inference_model`` or + paddle 1.x API ``paddle.fluid.io.save_inference_model`` as ``paddle.jit.TranslatedLayer``, then performing inference or fine-tune training. .. note:: @@ -807,14 +807,14 @@ def load(path, **configs): Args: path (str): The path prefix to load model. The format is ``dirname/file_prefix`` or ``file_prefix`` . - **configs (dict, optional): Other load configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. 
The following options are currently supported: - (1) model_filename (str): The inference model file name of the paddle 1.x - ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (str): The persistable variables file name of the paddle 1.x - ``save_inference_model`` save format. No default file name, save variables separately + (1) model_filename (str): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (str): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately by default. @@ -960,7 +960,7 @@ def load(path, **configs): loader = paddle.io.DataLoader(dataset, feed_list=[image, label], places=place, - batch_size=BATCH_SIZE, + batch_size=BATCH_SIZE, shuffle=True, drop_last=True, num_workers=2) @@ -969,7 +969,7 @@ def load(path, **configs): for data in loader(): exe.run( static.default_main_program(), - feed=data, + feed=data, fetch_list=[avg_loss]) model_path = "fc.example.model" @@ -1052,7 +1052,7 @@ def _trace(layer, class TracedLayer(object): """ :api_attr: imperative - + TracedLayer is used to convert a forward dygraph model to a static graph model. This is mainly used to save the dygraph model for online inference using C++. Besides, users can also do inference in Python @@ -1132,7 +1132,7 @@ class TracedLayer(object): def forward(self, input): return self._fc(input) - + layer = ExampleLayer() in_var = paddle.uniform(shape=[2, 3], dtype='float32') out_dygraph, static_layer = paddle.jit.TracedLayer.trace(layer, inputs=[in_var]) @@ -1244,13 +1244,16 @@ class TracedLayer(object): return self._run(self._build_feed(inputs)) @switch_to_static_graph - def save_inference_model(self, dirname, feed=None, fetch=None): + def save_inference_model(self, path, feed=None, fetch=None): """ Save the TracedLayer to a model for inference. The saved inference model can be loaded by C++ inference APIs. + ``path`` is the prefix of saved objects, and the saved translated program file + suffix is ``.pdmodel`` , the saved persistable variables file suffix is ``.pdiparams`` . + Args: - dirname (str): the directory to save the inference model. + path(str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. feed (list[int], optional): the input variable indices of the saved inference model. 
If None, all input variables of the TracedLayer object would be the inputs of the saved inference @@ -1294,7 +1297,7 @@ class TracedLayer(object): fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars) print(fetch.shape) # (2, 10) """ - check_type(dirname, "dirname", str, + check_type(path, "path", str, "fluid.dygraph.jit.TracedLayer.save_inference_model") check_type(feed, "feed", (type(None), list), "fluid.dygraph.jit.TracedLayer.save_inference_model") @@ -1309,6 +1312,18 @@ class TracedLayer(object): check_type(f, "each element of fetch", int, "fluid.dygraph.jit.TracedLayer.save_inference_model") + # path check + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError( + "The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but received " + "file_prefix is empty string.") + + dirname = os.path.dirname(path) + if dirname and not os.path.exists(dirname): + os.makedirs(dirname) + from paddle.fluid.io import save_inference_model def get_feed_fetch(all_vars, partial_vars): @@ -1326,9 +1341,14 @@ class TracedLayer(object): assert target_var is not None, "{} cannot be found".format(name) target_vars.append(target_var) + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX + save_inference_model( dirname=dirname, feeded_var_names=feeded_var_names, target_vars=target_vars, executor=self._exe, - main_program=self._program.clone()) + main_program=self._program.clone(), + model_filename=model_filename, + params_filename=params_filename) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py index 2a74d29e1ee..645a05e75f6 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_trace_non_persistable_inputs.py @@ -75,10 +75,12 @@ class TestTracedLayerRecordNonPersistableInput(unittest.TestCase): self.assertEqual(actual_persistable_vars, expected_persistable_vars) - dirname = './traced_layer_test_non_persistable_vars' - traced_layer.save_inference_model(dirname=dirname) - filenames = set([f for f in os.listdir(dirname) if f != '__model__']) - self.assertEqual(filenames, expected_persistable_vars) + traced_layer.save_inference_model( + path='./traced_layer_test_non_persistable_vars') + self.assertTrue('traced_layer_test_non_persistable_vars.pdmodel' in + os.listdir('./')) + self.assertTrue('traced_layer_test_non_persistable_vars.pdiparams' in + os.listdir('./')) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py index 38543fecac8..cb518646889 100644 --- a/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py +++ b/python/paddle/fluid/tests/unittests/test_traced_layer_err_msg.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import six import unittest import paddle.nn as nn +import os class SimpleFCLayer(nn.Layer): @@ -115,36 +116,41 @@ class TestTracedLayerErrMsg(unittest.TestCase): dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( self.layer, [in_x]) - dirname = './traced_layer_err_msg' + path = './traced_layer_err_msg' with self.assertRaises(TypeError) as e: traced_layer.save_inference_model([0]) self.assertEqual( - "The type of 'dirname' in fluid.dygraph.jit.TracedLayer.save_inference_model must 
be <{} 'str'>, but received <{} 'list'>. ". + "The type of 'path' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'str'>, but received <{} 'list'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [0], [None]) + traced_layer.save_inference_model(path, [0], [None]) self.assertEqual( "The type of 'each element of fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [0], False) + traced_layer.save_inference_model(path, [0], False) self.assertEqual( "The type of 'fetch' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ". format(self.type_str, self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, [None], [0]) + traced_layer.save_inference_model(path, [None], [0]) self.assertEqual( "The type of 'each element of feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be <{} 'int'>, but received <{} 'NoneType'>. ". format(self.type_str, self.type_str), str(e.exception)) with self.assertRaises(TypeError) as e: - traced_layer.save_inference_model(dirname, True, [0]) + traced_layer.save_inference_model(path, True, [0]) self.assertEqual( "The type of 'feed' in fluid.dygraph.jit.TracedLayer.save_inference_model must be (<{} 'NoneType'>, <{} 'list'>), but received <{} 'bool'>. ". format(self.type_str, self.type_str, self.type_str), str(e.exception)) + with self.assertRaises(ValueError) as e: + traced_layer.save_inference_model("") + self.assertEqual( + "The input path MUST be format of dirname/file_prefix [dirname\\file_prefix in Windows system], " + "but received file_prefix is empty string.", str(e.exception)) - traced_layer.save_inference_model(dirname) + traced_layer.save_inference_model(path) def _train_simple_net(self): layer = None @@ -174,5 +180,25 @@ class TestOutVarWithNoneErrMsg(unittest.TestCase): [in_x]) +class TestTracedLayerSaveInferenceModel(unittest.TestCase): + """test save_inference_model will automaticlly create non-exist dir""" + + def setUp(self): + self.save_path = "./nonexist_dir/fc" + import shutil + if os.path.exists(os.path.dirname(self.save_path)): + shutil.rmtree(os.path.dirname(self.save_path)) + + def test_mkdir_when_input_path_non_exist(self): + fc_layer = SimpleFCLayer(3, 4, 2) + input_var = paddle.to_tensor(np.random.random([4, 3]).astype('float32')) + with fluid.dygraph.guard(): + dygraph_out, traced_layer = fluid.dygraph.TracedLayer.trace( + fc_layer, inputs=[input_var]) + self.assertFalse(os.path.exists(os.path.dirname(self.save_path))) + traced_layer.save_inference_model(self.save_path) + self.assertTrue(os.path.exists(os.path.dirname(self.save_path))) + + if __name__ == '__main__': unittest.main() -- GitLab From 363b25aaecd0d3c08332eb2788c9056cedf84f45 Mon Sep 17 00:00:00 2001 From: Ouyang Chao Date: Wed, 7 Apr 2021 16:59:36 +0800 Subject: [PATCH 166/486] improve performance of DepthwiseConv(NHWC) (#31677) * improve performance of DepthwiseConv(NWHC) --- paddle/fluid/operators/conv_op.h | 104 +- paddle/fluid/operators/math/depthwise_conv.cu | 903 +++++++++++++----- python/paddle/nn/functional/conv.py | 8 - 3 files changed, 683 insertions(+), 332 deletions(-) diff --git 
a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 364e3ab8d26..94d1f707b74 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -903,29 +903,19 @@ class DepthwiseConvKernel : public framework::OpKernel { "and input channel number is %d", output->dims()[1], input->dims()[1])); } - // transform tensor - Tensor transformed_input(input->type()); - Tensor transformed_output(output->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output, - &transformed_output); - - } else { - transformed_input = *input; - transformed_output = *output; - } // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); @@ -944,16 +934,12 @@ class DepthwiseConvKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } else { math::DepthwiseConvFunctor depthwiseConv; - depthwiseConv(dev_ctx, transformed_input, filter, strides, paddings, - dilations, &transformed_output); - } - if (channel_last) { - TransToChannelLast(context, &transformed_output, - output); + depthwiseConv(dev_ctx, *input, filter, strides, paddings, dilations, + output, data_layout); } } }; @@ -981,33 +967,18 @@ class DepthwiseConvGradKernel : public framework::OpKernel { context.Attr("padding_algorithm"); const std::string data_format = context.Attr("data_format"); - const bool channel_last = (data_format == "NHWC" || data_format == "NDHWC"); - - // transform Tensor - Tensor transformed_input(input->type()); - Tensor transformed_output_grad(output_grad->type()); - - if (channel_last) { - ResizeToChannelFirst(context, input, - &transformed_input); - TransToChannelFirst(context, input, &transformed_input); - - ResizeToChannelFirst(context, output_grad, - &transformed_output_grad); - TransToChannelFirst(context, output_grad, - &transformed_output_grad); - - } else { - transformed_input = *input; - transformed_output_grad = *output_grad; - } - // update padding and dilation - auto in_dims = transformed_input.dims(); + auto in_dims = input->dims(); auto filter_dims = filter.dims(); framework::DDim in_data_dims; - in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + const framework::DataLayout data_layout = + framework::StringToDataLayout(data_format); + if (data_layout != framework::DataLayout::kNHWC) { + in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size()); + } else { + in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1); + } framework::DDim filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); std::vector ksize = framework::vectorize(filter_data_dims); @@ -1025,33 +996,18 @@ class 
DepthwiseConvGradKernel : public framework::OpKernel { if (input_grad) { input_grad->mutable_data(context.GetPlace()); - Tensor transformed_input_grad(input_grad->type()); - if (channel_last) { - ResizeToChannelFirst(context, input_grad, - &transformed_input_grad); - - } else { - transformed_input_grad = *input_grad; - } - - set_zero(dev_ctx, &transformed_input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } else { math::DepthwiseConvInputGradFunctor depthwiseConvInputGrad; - depthwiseConvInputGrad(dev_ctx, transformed_input, filter, - transformed_output_grad, strides, paddings, - dilations, &transformed_input_grad); - } - if (channel_last) { - TransToChannelLast(context, &transformed_input_grad, - input_grad); + depthwiseConvInputGrad(dev_ctx, *input, filter, *output_grad, strides, + paddings, dilations, input_grad, data_layout); } } @@ -1061,15 +1017,13 @@ class DepthwiseConvGradKernel : public framework::OpKernel { if (fuse_relu) { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } else { math::DepthwiseConvFilterGradFunctor depthwiseConvFilterGrad; - depthwiseConvFilterGrad(dev_ctx, transformed_input, - transformed_output_grad, strides, paddings, - dilations, filter_grad); + depthwiseConvFilterGrad(dev_ctx, *input, *output_grad, strides, + paddings, dilations, filter_grad, data_layout); } } } diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index d116b620dc1..5fd543b5c6c 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -22,6 +22,7 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -52,8 +53,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { const int filter_multiplier, const int filter_height, \ const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ - const int dilate_height, const int dilate_width, T *const output_data, \ - const DataLayout data_layout = DataLayout::kNCHW + const int dilate_height, const int dilate_width, T *const output_data // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. 
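The hunks that follow split the layout-generic depthwise kernels into dedicated NCHW and NHWC variants, so each kernel computes its element offsets without a per-element layout branch. As a rough standalone illustration of the two indexing schemes (a minimal sketch with placeholder extents and indices, not taken from the patch itself), the flat offsets differ only in where the channel index enters the product:

// Minimal sketch: flat element offsets for the two memory layouts used by the
// specialized kernels. NCHW keeps a whole H*W image per channel contiguous;
// NHWC interleaves all channels at every spatial position.
#include <cstdio>

int main() {
  const int batch = 0, c_in = 2, h_in = 5, w_in = 7;                    // example indices
  const int input_channels = 32, input_height = 16, input_width = 16;   // placeholder extents

  // NCHW: ((n * C + c) * H + h) * W + w
  int offset_nchw =
      ((batch * input_channels + c_in) * input_height + h_in) * input_width + w_in;

  // NHWC: ((n * H + h) * W + w) * C + c
  int offset_nhwc =
      ((batch * input_height + h_in) * input_width + w_in) * input_channels + c_in;

  printf("NCHW offset: %d, NHWC offset: %d\n", offset_nchw, offset_nhwc);
  return 0;
}

Because the NHWC variants map output channels to threadIdx.x, neighbouring threads should touch consecutive channels, which are contiguous in NHWC memory, keeping global-memory accesses reasonably coalesced; the conv_op.h changes above additionally avoid the NHWC-to-NCHW transpose that was previously required.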
@@ -123,7 +123,6 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( const int batch = idx / output_width / output_height / output_channels; const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; T value = 0; const int h_in_start = -padding_height + h_out * stride_height; const int w_in_start = -padding_width + w_out * stride_width; @@ -142,13 +141,14 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { int offset = ((batch * input_height + h_in) * input_width + w_in) * - output_channels + + input_channels + c_in; T in_data = input_data[offset]; + const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { - value += weight[weight_offset] * max(0.0f, in_data); + value += weight[0] * max(0.0f, in_data); } else { - value += weight[weight_offset] * in_data; + value += weight[0] * in_data; } } weight_offset++; @@ -161,10 +161,10 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( } template -__device__ __inline__ void KernelDepthwiseConvCFilter( +__device__ __inline__ void KernelDepthwiseConvCFilterNCHW( ARG_DEFINE_KernelDepthwiseConv) { - const int kWeghtSize = c_filter * c_filter; - T r_weight[kWeghtSize]; + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_out = blockIdx.x; const T* weight = filter_data + c_out * c_filter * c_filter; @@ -182,13 +182,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( const int h_in_end = h_in_start + c_filter * dilate_height; const int w_in_end = w_in_start + c_filter * dilate_width; - int in_offset; - if (data_layout != DataLayout::kNHWC) { - in_offset = - ((batch * input_channels + c_in) * input_height) * input_width; - } else { - in_offset = batch * input_height * input_width * input_channels; - } + int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; const int h_end = h_in_end < input_height ? h_in_end : input_height; const int w_end = w_in_end < input_width ? 
w_in_end : input_width; @@ -201,13 +196,63 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( w_in += dilate_width, w_f++) { if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { - int offset; - if (data_layout != DataLayout::kNHWC) { - offset = in_offset + h_in * input_width + w_in; + int offset = in_offset + h_in * input_width + w_in; + if (fuse_relu_before_conv) { + value += r_weight[h_f * c_filter + w_f] * + max(0.0f, input_data[offset]); } else { - offset = in_offset + - (h_in * input_width + w_in) * input_channels + c_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__device__ __inline__ void KernelDepthwiseConvCFilterNHWC( + ARG_DEFINE_KernelDepthwiseConv) { + const int batch = blockIdx.z; + int h_out = blockIdx.x * dilate_height + blockIdx.y; + if (h_out >= output_height) { + return; + } + int in_offset = batch * input_height * input_width * input_channels; + int out_offset = + (batch * output_height + h_out) * output_width * output_channels; + const int h_in_start = -padding_height + h_out * stride_height; + const int wi_size = (output_width + dilate_width - 1) / dilate_width; + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; + + for (int c_out = threadIdx.x; c_out < output_channels; c_out += blockDim.x) { + for (int i = 0; i < c_filter * c_filter; i++) { + const T* weight = filter_data + i * output_channels + c_out; + r_weight[i] = weight[0]; + } + const int c_in = c_out / filter_multiplier; + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int w_out = i_wi * dilate_width + i_dw; + if (w_out >= output_width) { + continue; + } + T value = 0; + const int w_in_start = -padding_width + w_out * stride_width; + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + int offset = + in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * max(0.0f, input_data[offset]); @@ -217,23 +262,14 @@ __device__ __inline__ void KernelDepthwiseConvCFilter( } } } - int index; - if (data_layout != DataLayout::kNHWC) { - index = ((batch * gridDim.x + c_out) * output_height + h_out) * - output_width + - w_out; - } else { - index = ((batch * output_height + h_out) * output_width + w_out) * - gridDim.x + - c_out; - } + int index = out_offset + w_out * output_channels + c_out; output_data[index] = value; } } } template + DataLayout data_layout, bool fuse_relu_before_conv> __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { int final_filter_multiplier = filter_multiplier; int h_stride = stride_height; @@ -244,28 +280,37 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { w_stride = c_stride; } if (c_filter == -1) { - if (data_layout == DataLayout::kNCHW) { + if (data_layout != DataLayout::kNHWC) { KernelDepthwiseConvNCHW( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, final_filter_multiplier, filter_height, filter_width, h_stride, w_stride, padding_height, padding_width, dilate_height, dilate_width, 
- output_data, data_layout); + output_data); } else { KernelDepthwiseConvNHWC( input_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, final_filter_multiplier, filter_height, filter_width, h_stride, w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data, data_layout); + output_data); } } else { - KernelDepthwiseConvCFilter( - input_data, filter_data, batch_size, output_channels, output_height, - output_width, input_channels, input_height, input_width, - final_filter_multiplier, filter_height, filter_width, h_stride, - w_stride, padding_height, padding_width, dilate_height, dilate_width, - output_data, data_layout); + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvCFilterNCHW( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_data); + } else { + KernelDepthwiseConvCFilterNHWC( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } } @@ -280,40 +325,27 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { const int filter_width, const int stride_height, const int stride_width, \ const int padding_height, const int padding_width, \ const int dilate_height, const int dilate_width, \ - T *const input_grad_data, \ - const DataLayout data_layout = DataLayout::kNCHW + T *const input_grad_data template -__device__ __inline__ void KernelDepthwiseConvInputGrad( +__device__ __inline__ void KernelDepthwiseConvInputGradNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - const int c_out_start = c_in * filter_multiplier; - int h_out_start = h_in - (filter_height - 1) * dilate_height + padding_height; - int h_out_end = h_in + padding_height; - int w_out_start = w_in - (filter_width - 1) * dilate_width + padding_width; - int w_out_end = w_in + padding_width; T value = 0; - int index; - if (data_layout != DataLayout::kNHWC) { - index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - } else { - index = - ((batch * input_height + h_in) * input_width + w_in) * gridDim.x + - c_in; - } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; if (fuse_relu_before_conv) { if (input_data[index] <= 0) { @@ -335,20 +367,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset; - if (data_layout != DataLayout::kNHWC) { - output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - } else { - output_grad_offset = - ((batch * output_height + s_h_out) * output_width + - s_w_out) * - output_channels + - c_out; - } + int output_grad_offset = + ((batch * output_channels + c_out) * 
output_height + + s_h_out) * + output_width + + s_w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; + } + } + } + } + input_grad_data[index] = value; + } + } +} + +template +__device__ __inline__ void KernelDepthwiseConvInputGradNHWC( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int batch = blockIdx.z; + int h_in = blockIdx.x * dilate_height + blockIdx.y; + if (h_in >= input_height) { + return; + } + + for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { + for (int w_in = threadIdx.y; w_in < input_width; w_in += blockDim.y) { + int h_out_start = + h_in - (filter_height - 1) * dilate_height + padding_height; + int w_out_start = + w_in - (filter_width - 1) * dilate_width + padding_width; + + T value = 0; + int index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + int weight_offset = filter_height * filter_width; + for (int h_out = h_out_start, h_f = 0; h_f < filter_height; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < filter_width; + w_out += dilate_width, w_f++) { + weight_offset--; + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + int output_grad_offset = + ((batch * output_height + s_h_out) * output_width + s_w_out) * + output_channels + + c_out; + int filter_offset = weight_offset * output_channels + c_out; value += output_grad_data[output_grad_offset] * filter_data[filter_offset]; } @@ -362,10 +441,10 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( template -__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( +__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; - T r_weight[kWeghtSize]; + const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeightSize]; const int batch = blockIdx.y; const int c_in = blockIdx.x; @@ -379,24 +458,13 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; - int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; T value = 0; - int index; - if (data_layout != DataLayout::kNHWC) { - index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; - } else { - index = - ((batch * input_height + h_in) * input_width + w_in) * gridDim.x + - c_in; - } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; if (fuse_relu_before_conv) { if (input_data[index] <= 0) { input_grad_data[index] = 0; @@ -415,20 +483,11 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( if (h_out % stride_height == 0 && w_out % stride_width == 0 && s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && s_w_out < output_width) { - int output_grad_offset; - if (data_layout != DataLayout::kNHWC) { - 
output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - } else { - output_grad_offset = - ((batch * output_height + s_h_out) * output_width + - s_w_out) * - output_channels + - c_out; - } + int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; value += output_grad_data[output_grad_offset] * r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; @@ -441,47 +500,137 @@ __device__ __inline__ void KernelDepthwiseConvInputGradCFilter( } } -template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilterNHWC( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + int h_in = blockIdx.x * dilate_height + blockIdx.y; + if (h_in >= input_height) { + return; + } + const int kWeightSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeightSize]; + const int batch = blockIdx.z; + const int wi_size = (input_width + dilate_width - 1) / dilate_width; + const int h_out_start = + h_in - (c_filter - 1) * dilate_height + padding_height; + + for (int c_in = threadIdx.x; c_in < input_channels; c_in += blockDim.x) { + for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { + int c_out = c_in * c_filter_multiplier + c_i; + for (int i = 0; i < c_filter * c_filter; i++) + r_weight[i + c_i * c_filter * c_filter] = + filter_data[(c_filter * c_filter - i - 1) * output_channels + + c_out]; + } + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int w_in = i_wi * dilate_width + i_dw; + if (w_in >= input_width) { + continue; + } + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + int index = ((batch * input_height + h_in) * input_width + w_in) * + input_channels + + c_in; + if (fuse_relu_before_conv) { + if (input_data[index] <= 0) { + input_grad_data[index] = 0; + continue; + } + } + + for (int c_i = 0; c_i < c_filter_multiplier; c_i++) { + int c_out = c_in * c_filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + int output_grad_offset = + ((batch * output_height + s_h_out) * output_width + s_w_out) * + output_channels + + c_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvInputGrad( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, input_grad_data, data_layout); - else if (c_filter == -1) - KernelDepthwiseConvInputGrad( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, - 
padding_height, padding_width, dilate_height, dilate_width, - input_grad_data, data_layout); - else - KernelDepthwiseConvInputGradCFilter( - input_data, output_grad_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, - padding_height, padding_width, dilate_height, dilate_width, - input_grad_data, data_layout); + int final_filter_multiplier = filter_multiplier; + int h_stride = stride_height; + int w_stride = stride_width; + if (c_filter_multiplier != 0) { + final_filter_multiplier = c_filter_multiplier; + h_stride = c_stride; + w_stride = c_stride; + } + + if (c_filter_multiplier == 0 || c_filter == -1) { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvInputGradNCHW( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, final_filter_multiplier, filter_height, + filter_width, h_stride, w_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } else { + KernelDepthwiseConvInputGradNHWC( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, final_filter_multiplier, filter_height, + filter_width, h_stride, w_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } + } else { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvInputGradCFilterNCHW( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, filter_height, + filter_width, c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } else { + KernelDepthwiseConvInputGradCFilterNHWC( + input_data, output_grad_data, filter_data, batch_size, + output_channels, output_height, output_width, input_channels, + input_height, input_width, c_filter_multiplier, filter_height, + filter_width, c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, input_grad_data); + } + } } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. 
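The NCHW and NHWC kernel variants in this file differ chiefly in how a (batch, channel, h, w) coordinate is flattened into a linear offset, and in which dimension the thread index walks. A minimal Python sketch of the two addressing schemes (the function names are illustrative, not part of this patch):

    def offset_nchw(n, c, h, w, channels, height, width):
        # ((n * C + c) * H + h) * W + w, as used by the *NCHW kernels
        return ((n * channels + c) * height + h) * width + w

    def offset_nhwc(n, c, h, w, channels, height, width):
        # ((n * H + h) * W + w) * C + c, as used by the *NHWC kernels
        return ((n * height + h) * width + w) * channels + c

In the NHWC variants threadIdx.x additionally iterates over channels, so neighbouring threads touch neighbouring channel elements, which are contiguous in channels-last memory.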
template -__device__ __inline__ void KernelDepthwiseConvFilterGrad( +__device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, const int input_channels, const int input_height, const int input_width, const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* filter_grad_data, - const DataLayout data_layout = DataLayout::kNCHW) { + const int dilate_width, T* filter_grad_data) { T s = 0; - int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; for (int image_w = threadIdx.x; image_w < output_width; @@ -499,45 +648,137 @@ __device__ __inline__ void KernelDepthwiseConvFilterGrad( if (image_wk < 0 || image_wk >= input_width) continue; #define gaid(N, C, H, W) \ ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) -#define gaid_nhwc(N, H, W, C) \ - ((((N)*output_height + (H)) * output_width + (W)) * gridDim.z + (C)) - int input_id; - if (data_layout != DataLayout::kNHWC) { - input_id = ((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk; - if (fuse_relu_before_conv) { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - max(0.0f, input_data[input_id]); - } else { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[input_id]; - } + int input_id = ((bid * (gridDim.z / filter_multiplier) + + kernel_id / filter_multiplier) * + input_height + + image_hk) * + input_width + + image_wk; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + max(0.0f, input_data[input_id]); } else { - input_id = + s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * + input_data[input_id]; + } +#undef gaid + } + } + } + CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); +} + +template +__device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + int bid = blockIdx.z; + int image_h = blockIdx.y; + int kernel_iw = blockIdx.x % filter_width; + int kernel_ih = blockIdx.x / filter_width; + for (int kernel_id = threadIdx.x; kernel_id < output_channels; + kernel_id += blockDim.x) { + T s = 0; + int gbid = + ((kernel_id * filter_height) + kernel_ih) * filter_width + kernel_iw; + for (int image_w = threadIdx.y; image_w < output_width; + image_w += blockDim.y) { + int kernel_h = kernel_ih * dilate_height - padding_height; + int kernel_w = kernel_iw * dilate_width - padding_width; + + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; +#define gaid(N, H, W, C) \ + ((((N)*output_height + (H)) * output_width + (W)) * output_channels + (C)) + int input_id = + ((bid * 
input_height + image_hk) * input_width + image_wk) * + input_channels + + kernel_id / filter_multiplier; + if (fuse_relu_before_conv) { + s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * + max(0.0f, input_data[input_id]); + } else { + s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * + input_data[input_id]; + } +#undef gaid + } + platform::CudaAtomicAdd(&filter_grad_data[gbid], s); + } +} + +template +__device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( + const T* output_grad_data, const T* input_data, const int num, + const int output_channels, const int output_height, const int output_width, + const int input_channels, const int input_height, const int input_width, + const int filter_multiplier, const int filter_height, + const int filter_width, const int stride_height, const int stride_width, + const int padding_height, const int padding_width, const int dilate_height, + const int dilate_width, T* filter_grad_data) { + const int bid = blockIdx.z; + int image_h = blockIdx.x * dilate_height + blockIdx.y; + if (image_h >= output_height) { + return; + } + const int kWeightSize = c_filter * c_filter; + T r_weight[kWeightSize]; + const int wi_size = (output_width + dilate_width - 1) / dilate_width; + + for (int kernel_id = threadIdx.x; kernel_id < output_channels; + kernel_id += blockDim.x) { + for (int i = 0; i < c_filter * c_filter; ++i) { + r_weight[i] = 0; + } + for (int i = threadIdx.y; i < wi_size * dilate_width; i += blockDim.y) { + int i_dw = i / wi_size; + int i_wi = i - i_dw * wi_size; + int image_w = i_wi * dilate_width + i_dw; + if (image_w >= output_width) { + continue; + } + for (int kernel_ih = 0; kernel_ih < c_filter; ++kernel_ih) { + for (int kernel_iw = 0; kernel_iw < c_filter; ++kernel_iw) { + int kernel_h = kernel_ih * dilate_height - padding_height; + int kernel_w = kernel_iw * dilate_width - padding_width; + int image_hk = image_h * stride_height + kernel_h; + int image_wk = image_w * stride_width + kernel_w; + if (image_hk < 0 || image_hk >= input_height) continue; + if (image_wk < 0 || image_wk >= input_width) continue; + int input_id = ((bid * input_height + image_hk) * input_width + image_wk) * - (gridDim.z / filter_multiplier) + + input_channels + kernel_id / filter_multiplier; + int output_id = + ((bid * output_height + image_h) * output_width + image_w) * + output_channels + + kernel_id; + T s = 0; if (fuse_relu_before_conv) { - s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] * - max(0.0f, input_data[input_id]); + s = output_grad_data[output_id] * max(0.0f, input_data[input_id]); } else { - s += output_grad_data[gaid_nhwc(bid, image_h, image_w, kernel_id)] * - input_data[input_id]; + s = output_grad_data[output_id] * input_data[input_id]; } + r_weight[kernel_ih * c_filter + kernel_iw] += s; } - -#undef gaid } } + for (int i = 0; i < c_filter * c_filter; ++i) { + T* weight = filter_grad_data + i * output_channels + kernel_id; + platform::CudaAtomicAdd(&weight[0], r_weight[i]); + } } - CudaAtomicAddWithWarp(&filter_grad_data[gbid], s); } -template +template __global__ void KernelDepthwiseConvFilterGradSp( const T* output_grad_data, const T* input_data, const int num, const int output_channels, const int output_height, const int output_width, @@ -545,22 +786,49 @@ __global__ void KernelDepthwiseConvFilterGradSp( const int filter_multiplier, const int filter_height, const int filter_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, const int 
dilate_height, - const int dilate_width, T* filter_grad_data, - const DataLayout data_layout = DataLayout::kNCHW) { - if (c_filter_multiplier == 0) - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data, data_layout); - else - KernelDepthwiseConvFilterGrad( - output_grad_data, input_data, num, output_channels, output_height, - output_width, input_channels, input_height, input_width, - c_filter_multiplier, filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, dilate_height, - dilate_width, filter_grad_data, data_layout); + const int dilate_width, T* filter_grad_data) { + int final_filter_multiplier = filter_multiplier; + int h_stride = stride_height; + int w_stride = stride_width; + if (c_filter_multiplier != 0) { + final_filter_multiplier = c_filter_multiplier; + h_stride = c_stride; + w_stride = c_stride; + } + if (c_filter_multiplier == 0 || c_filter == -1) { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvFilterGradNCHW( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } else { + KernelDepthwiseConvFilterGradNHWC( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } + } else { + if (data_layout != DataLayout::kNHWC) { + KernelDepthwiseConvFilterGradNCHW( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } else { + KernelDepthwiseConvFilterGradCFilterNHWC( + output_grad_data, input_data, num, output_channels, output_height, + output_width, input_channels, input_height, input_width, + final_filter_multiplier, filter_height, filter_width, h_stride, + w_stride, padding_height, padding_width, dilate_height, dilate_width, + filter_grad_data); + } + } } /* @@ -608,19 +876,45 @@ class DepthwiseConvFunctor(); T* output_data = output->mutable_data(context.GetPlace()); + framework::Tensor filter_hwc; + if (data_layout == DataLayout::kNHWC) { + framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], + filter.dims()[0], filter.dims()[1]}); + filter_hwc.Resize(filter_hwc_dims); + filter_hwc.mutable_data(context.GetPlace()); + std::vector perm_axis({2, 3, 0, 1}); + math::TransposeNormal trans; + trans(context, filter, &filter_hwc, perm_axis); + filter_data = filter_hwc.data(); + } + int thread = 512; - if (output_width > 1024 && output_width <= 2048) - thread = (output_width - 1) / 2 + 1; - else if (output_width > 512 && output_width <= 1024) - thread = output_width; + int blocks; + dim3 threads; + dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (output_width > 1024 && output_width <= 2048) + thread = (output_width - 1) / 2 + 1; + else if (output_width > 512 && 
output_width <= 1024) + thread = output_width; +#ifdef __HIPCC__ + thread = std::min(thread, 256); +#endif + blocks = std::min(std::max(thread / output_width, 1), output_height); + threads = dim3(std::min(output_width, thread), blocks, 1); + grid = dim3(output_channels, batch_size, 1); + } else { #ifdef __HIPCC__ - thread = std::min(thread, 256); + thread = std::min(thread, 256); #endif - int blocks = std::min(std::max(thread / output_width, 1), output_height); - dim3 threads(std::min(output_width, thread), blocks, 1); - dim3 grid(output_channels, batch_size, 1); + blocks = std::min( + std::max(thread / output_channels, 1), + ((output_width + dilate_width - 1) / dilate_width) * dilate_width); + threads = dim3(std::min(output_channels, thread), blocks, 1); + grid = dim3((output_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + } int filter_multiplier = output_channels / input_channels; - int nums_output = batch_size * output_channels * output_height * output_width; #ifdef __HIPCC__ @@ -631,26 +925,37 @@ class DepthwiseConvFunctor<<>>( \ - input_data, filter_data, batch_size, output_channels, output_height, \ - output_width, input_channels, input_height, input_width, \ - filter_multiplier, ksize_height, ksize_width, stride_height, \ - stride_width, padding_height, padding_width, dilate_height, \ - dilate_width, output_data, data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + } else { \ + KernelDepthwiseConvSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, filter_data, batch_size, output_channels, output_height, \ + output_width, input_channels, input_height, input_width, \ + filter_multiplier, ksize_height, ksize_width, stride_height, \ + stride_width, padding_height, padding_width, dilate_height, \ + dilate_width, output_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -714,32 +1019,67 @@ class DepthwiseConvInputGradFunctor(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + framework::Tensor filter_hwc; + if (data_layout == DataLayout::kNHWC) { + framework::DDim filter_hwc_dims({filter.dims()[2], filter.dims()[3], + filter.dims()[0], filter.dims()[1]}); + filter_hwc.Resize(filter_hwc_dims); + filter_hwc.mutable_data(context.GetPlace()); + std::vector perm_axis({2, 3, 0, 1}); + math::TransposeNormal trans; + trans(context, filter, &filter_hwc, perm_axis); + filter_data = filter_hwc.data(); + } + int thread = 512; - if (input_width > 1024 && input_width <= 2048) - thread = (input_width - 1) / 2 + 1; - else if (input_width > 512 && input_width <= 1024) - thread = input_width; - int blocks = 
std::min(std::max(thread / input_width, 1), input_height); - dim3 threads(std::min(input_width, thread), blocks, 1); - dim3 grid(input_channels, batch_size, 1); + int blocks; + dim3 threads; + dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (input_width > 1024 && input_width <= 2048) { + thread = (input_width - 1) / 2 + 1; + } else if (input_width > 512 && input_width <= 1024) { + thread = input_width; + } + blocks = std::min(std::max(thread / input_width, 1), input_height); + threads = dim3(std::min(input_width, thread), blocks, 1); + grid = dim3(input_channels, batch_size, 1); + } else { + blocks = std::min( + std::max(thread / input_channels, 1), + ((input_width + dilate_width - 1) / dilate_width) * dilate_width); + threads = dim3(std::min(input_channels, thread), blocks, 1); + grid = dim3((input_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + } int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride, c_filter) \ - if (c_filter_multiplier == 0 || \ - filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride && \ - (ksize_height == ksize_width && ksize_height == c_filter || \ - c_filter == -1)) { \ - KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, c_stride, c_filter, \ - fuse_relu_before_conv><<>>( \ - input_data, output_grad_data, filter_data, batch_size, \ - output_channels, output_height, output_width, input_channels, \ - input_height, input_width, filter_multiplier, ksize_height, \ - ksize_width, stride_height, stride_width, padding_height, \ - padding_width, dilate_height, dilate_width, input_grad_data, \ - data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ + } else { \ + KernelDepthwiseConvInputGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + input_data, output_grad_data, filter_data, batch_size, \ + output_channels, output_height, output_width, input_channels, \ + input_height, input_width, filter_multiplier, ksize_height, \ + ksize_width, stride_height, stride_width, padding_height, \ + padding_width, dilate_height, dilate_width, input_grad_data); \ + } \ + return; \ } check_case(1, 1, 3); check_case(1, 1, 5); @@ -802,30 +1142,95 @@ class DepthwiseConvFilterGradFunctormutable_data(context.GetPlace()); int block_size = 512; - if (output_width > 1024 && output_width <= 2048) - block_size = (output_width - 1) / 2 + 1; - else if (output_width > 512 && output_width <= 1024) - block_size = output_width; - int crop_output_height = - std::min(std::max(block_size / output_width, 1), output_height); - dim3 grid(ksize_width, ksize_height, output_channels); - dim3 threads(std::min(output_width, block_size), 
crop_output_height, 1); + int blocks; + dim3 threads; + dim3 grid; + if (data_layout != DataLayout::kNHWC) { + if (output_width > 1024 && output_width <= 2048) { + block_size = (output_width - 1) / 2 + 1; + } else if (output_width > 512 && output_width <= 1024) { + block_size = output_width; + } + blocks = std::min(std::max(block_size / output_width, 1), output_height); + grid = dim3(ksize_width, ksize_height, output_channels); + threads = dim3(std::min(output_width, block_size), blocks, 1); + } else { + blocks = std::min( + std::max(block_size / output_channels, 1), + ((output_width + dilate_width - 1) / dilate_width) * dilate_width); + grid = dim3((output_height + dilate_height - 1) / dilate_height, + dilate_height, batch_size); + threads = dim3(std::min(output_channels, block_size), blocks, 1); + } int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier) \ - if (c_filter_multiplier == 0 || c_filter_multiplier == filter_multiplier) { \ - KernelDepthwiseConvFilterGradSp< \ - T, c_filter_multiplier, \ - fuse_relu_before_conv><<>>( \ - output_grad_data, input_data, batch_size, output_channels, \ - output_height, output_width, input_channels, input_height, \ - input_width, filter_multiplier, ksize_height, ksize_width, \ - stride_height, stride_width, padding_height, padding_width, \ - dilate_height, dilate_width, filter_grad_data, data_layout); \ - return; \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ + if (c_filter_multiplier == 0 || \ + filter_multiplier == c_filter_multiplier && \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + if (data_layout != DataLayout::kNHWC) { \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNCHW, \ + fuse_relu_before_conv><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + dilate_height, dilate_width, filter_grad_data); \ + } else { \ + framework::Tensor filter_grad_hwc; \ + if (c_filter != -1) { \ + framework::DDim filter_grad_hwc_dims( \ + {filter_grad->dims()[2], filter_grad->dims()[3], \ + filter_grad->dims()[0], filter_grad->dims()[1]}); \ + filter_grad_hwc.Resize(filter_grad_hwc_dims); \ + filter_grad_hwc.mutable_data(context.GetPlace()); \ + math::SetConstant set_zero; \ + set_zero(context, &filter_grad_hwc, static_cast(0)); \ + filter_grad_data = filter_grad_hwc.data(); \ + } else { \ + block_size = 512; \ + if (output_channels > 1024 && output_channels <= 2048) { \ + block_size = (output_channels - 1) / 2 + 1; \ + } else if (output_channels > 512 && output_channels <= 1024) { \ + block_size = output_channels; \ + } \ + blocks = \ + std::min(std::max(block_size / output_channels, 1), output_width); \ + grid = dim3(ksize_width * ksize_height, output_height, batch_size); \ + threads = dim3(std::min(output_channels, block_size), blocks, 1); \ + } \ + KernelDepthwiseConvFilterGradSp< \ + T, c_filter_multiplier, c_stride, c_filter, DataLayout::kNHWC, \ + fuse_relu_before_conv><<>>( \ + output_grad_data, input_data, batch_size, output_channels, \ + output_height, output_width, input_channels, input_height, \ + input_width, filter_multiplier, ksize_height, ksize_width, \ + stride_height, stride_width, padding_height, padding_width, \ + 
dilate_height, dilate_width, filter_grad_data); \ + if (c_filter != -1) { \ + std::vector perm_axis({2, 3, 0, 1}); \ + math::TransposeNormal trans; \ + trans(context, filter_grad_hwc, filter_grad, perm_axis); \ + } \ + } \ + return; \ } - check_case(1); - check_case(0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); #undef check_case } }; diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 75dc62e530d..3a520615625 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -112,10 +112,6 @@ def _conv_nd(x, # Due to the poor performance of NHWC, we transpose the input to NCHW. origin_format = data_format - if origin_format == "NHWC" and op_type == "depthwise_conv2d": - x = nn.transpose(x, perm=[0, 3, 1, 2]) - data_format = "NCHW" - channel_dim = 1 if in_dygraph_mode(): attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', @@ -159,10 +155,6 @@ def _conv_nd(x, 'use_mkldnn': use_mkldnn}) else: out = pre_bias - - if origin_format == "NHWC" and op_type == "depthwise_conv2d": - out = nn.transpose(out, perm=[0, 2, 3, 1]) - return out -- GitLab From 1e60a0c4b4f044036c8f9bd95482ec110ac8e8c6 Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Wed, 7 Apr 2021 18:24:16 +0800 Subject: [PATCH 167/486] [3D-parallelism] Hybrid Model Parallelism (#32074) --- .../framework/distributed_strategy.proto | 18 +- .../meta_optimizers/pipeline_optimizer.py | 5 + .../meta_optimizers/sharding/fp16_helper.py | 66 ++- .../sharding/gradient_clip_helper.py | 81 ++- .../sharding/offload_helper.py | 281 ++++++++++ .../fleet/meta_optimizers/sharding/prune.py | 4 + .../fleet/meta_optimizers/sharding/utils.py | 66 ++- .../meta_optimizers/sharding_optimizer.py | 517 ++++++++++++++---- python/paddle/fluid/backward.py | 15 + python/paddle/fluid/optimizer.py | 56 +- .../test_fleet_sharding_meta_optimizer.py | 66 ++- 11 files changed, 1023 insertions(+), 152 deletions(-) mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py create mode 100755 python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py mode change 100644 => 100755 python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 805ef1c3e91..6363eedc80a 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -29,14 +29,18 @@ message RecomputeConfig { } message ShardingConfig { - optional float segment_broadcast_MB = 1 [ default = 32.0 ]; - optional bool hybrid_dp = 2 [ default = false ]; - optional int32 sharding_degree = 3 [ default = 8 ]; - optional int32 mp_degree = 4 [ default = 1 ]; - optional string sharding_segment_strategy = 5 + optional string sharding_segment_strategy = 1 [ default = 'segment_broadcast_MB' ]; - repeated string segment_anchors = 6; - optional int32 gradient_merge_acc_step = 7 [ default = 1 ]; + optional float segment_broadcast_MB = 2 [ default = 32.0 ]; + repeated string segment_anchors = 3; + optional int32 sharding_degree = 4 [ default = 8 ]; + optional int32 
mp_degree = 5 [ default = 1 ]; + optional int32 dp_degree = 6 [ default = 1 ]; + optional bool hybrid_dp = 7 [ default = false ]; + optional int32 gradient_merge_acc_step = 8 [ default = 1 ]; + optional bool optimize_offload = 9 [ default = false ]; + optional bool pp_allreduce_in_optimize = 10 [ default = false ]; + optional int32 pp_degree = 11 [ default = 1 ]; } message AMPConfig { diff --git a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py old mode 100644 new mode 100755 index 6cb7593b6bf..ae2daa9b9d8 --- a/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py @@ -45,11 +45,16 @@ class PipelineOptimizer(MetaOptimizerBase): 'accumulate_steps'] self.schedule_mode = user_defined_strategy.pipeline_configs[ 'schedule_mode'] + self.use_sharding = user_defined_strategy.sharding def _can_apply(self): if not self.role_maker._is_collective: return False + # FIXME revise for hybrid parallelism + if self.use_sharding: + return False + if self.user_defined_strategy.pipeline == True: return True return False diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py index cf399f66946..40ba7781566 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -81,7 +81,10 @@ class FP16Utils(object): if not FP16Utils.is_fp32_cast_op(block, op): continue output_name = op.desc.output_arg_names()[0] - param_name = output_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + param_name = output_name.strip( + "@GRAD@MERGED" + ) if "@MERGED" in output_name else output_name.strip("@GRAD") if param_name not in shard.global_params: raise ValueError("Output 'X' of cast_op must be a grad of" "model param, but {} is not a grad".format( @@ -105,7 +108,11 @@ class FP16Utils(object): reversed_x = [] reversed_x_paramname = [] for input_name in op.desc.input('X'): - param_name = input_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if "@MERGED" in input_name: + param_name = input_name.strip("@GRAD@MERGED") + else: + param_name = input_name.strip("@GRAD") if param_name not in shard.global_params: raise ValueError( "Input 'X' of check_finite_and_unscale must" @@ -169,3 +176,58 @@ class FP16Utils(object): OP_ROLE_KEY: OpRole.Optimize }) block._sync_with_cpp() + + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + @staticmethod + def sync_amp_check_nan_inf(block, ring_id): + update_loss_scaling_op_idx = -1 + + for idx, op in reversed(list(enumerate(block.ops))): + if op.type == "update_loss_scaling": + update_loss_scaling_op_idx = idx + inf_var_name = op.desc.input('FoundInfinite')[0] + op._rename_input(inf_var_name, inf_var_name + "@GLOBAL_WORLD") + + # not use amp + if update_loss_scaling_op_idx == -1: + return + inf_var = block.var(inf_var_name) + inf_var_int32 = block.create_var( + name=inf_var_name + "@cast_int32", + shape=inf_var.shape, + dtype=core.VarDesc.VarType.INT32) + inf_var_global = block.create_var( + name=inf_var_name + "@GLOBAL_WORLD", + shape=inf_var.shape, + dtype=inf_var.dtype) + block._insert_op_without_sync( + update_loss_scaling_op_idx, + type='cast', + inputs={'X': inf_var}, + outputs={'Out': inf_var_int32}, + attrs={ + "in_dtype": 
inf_var.dtype, + "out_dtype": inf_var_int32.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + block._insert_op_without_sync( + update_loss_scaling_op_idx + 1, + type='c_allreduce_max', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_int32}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize + }) + block._insert_op_without_sync( + update_loss_scaling_op_idx + 2, + type='cast', + inputs={'X': inf_var_int32}, + outputs={'Out': inf_var_global}, + attrs={ + "in_dtype": inf_var_int32.dtype, + "out_dtype": inf_var_global.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py index 5082bc33830..d5a012b147a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py @@ -32,6 +32,7 @@ class GradientClipHelper(object): deperated_vars = set() deperate_op_idx = set() reversed_x_paramname = [] + global_norm_sum_op_idx = -1 for idx, op in enumerate(block.ops): if not self._is_gradient_clip_op(op): continue @@ -41,7 +42,11 @@ class GradientClipHelper(object): for input_name in op.desc.input_arg_names(): if input_name in deperated_vars: deperate_op = True - param_name = input_name.strip("@GRAD") + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if "@MERGED" in input_name: + param_name = input_name.strip("@GRAD@MERGED") + else: + param_name = input_name.strip("@GRAD") if shard.is_param(param_name) and \ not shard.has_param(param_name): deperate_op = True @@ -51,7 +56,8 @@ class GradientClipHelper(object): if deperate_op: deperate_op_idx.add(idx) for output_name in op.desc.output_arg_names(): - deperated_vars.add(output_name) + if output_name not in op.desc.input_arg_names(): + deperated_vars.add(output_name) if not deperated_vars: # got no gradient_clip op @@ -65,6 +71,7 @@ class GradientClipHelper(object): continue reversed_inputs = [] if op.type == "sum": + global_norm_sum_op_idx = idx for input_name in op.desc.input_arg_names(): if input_name not in deperated_vars: reversed_inputs.append(input_name) @@ -86,20 +93,20 @@ class GradientClipHelper(object): OP_ROLE_KEY: OpRole.Optimize, }) - # global norm should only be sum within each model parallelism word size when use global group - if pure_dp_degree > 1: - block._insert_op_without_sync( - idx + 2, - type='scale', - inputs={'X': sum_res}, - outputs={'Out': sum_res}, - attrs={ - 'scale': 1.0 / float(pure_dp_degree), - 'op_namescope': "/gradient_clip_model_parallelism", - 'bias': 0.0, - 'bias_after_scale': False, - OP_ROLE_KEY: OpRole.Optimize - }) + # global norm should only be sum within each model parallelism word size when use global group + if pure_dp_degree > 1: + block._insert_op_without_sync( + idx + 2, + type='scale', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'scale': 1.0 / float(pure_dp_degree), + 'op_namescope': "/gradient_clip_model_parallelism", + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) # the grad sum here should take the all and only param in the current shard to_check_param = set(reversed_x_paramname) @@ -115,3 +122,45 @@ class GradientClipHelper(object): block._remove_var(var_name, sync=False) block._sync_with_cpp() return + + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + def sync_global_norm(self, block, 
ring_id, pure_dp_degree=1): + """ + prune gradient_clip related ops for params that not belong to cur shard + prune: square, reduce_sum, elementwise_mul + keep: sum, sqrt, elementwise_max, elementwise_div + """ + for idx, op in reversed(list(enumerate(block.ops))): + if not self._is_gradient_clip_op(op): + continue + + if op.type == "sum": + sum_res = op.desc.output_arg_names()[0] + block._insert_op_without_sync( + idx + 1, + type='c_allreduce_sum', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'ring_id': ring_id, + 'op_namescope': "/gradient_clip_model_parallelism", + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Optimize, + }) + + # global norm should only be sum within each model parallelism word size + if pure_dp_degree > 1: + block._insert_op_without_sync( + idx + 2, + type='scale', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={ + 'scale': 1.0 / float(pure_dp_degree), + 'op_namescope': "/gradient_clip_model_parallelism", + 'bias': 0.0, + 'bias_after_scale': False, + OP_ROLE_KEY: OpRole.Optimize + }) + + return diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py new file mode 100755 index 00000000000..76803818453 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -0,0 +1,281 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole +from paddle.fluid import core, unique_name + + +class OffloadHelper(object): + cpu_place_type = 0 + cuda_place_type = 1 + cuda_pinned_place_type = 2 + + def __init__(self): + pass + "0: dst is on CPUPlace. " + "1: dst is on CUDAPlace. " + "2: dst is on CUDAPinnedPlace. 
" + + def _insert_cast_op(self, block, idx, src_name, dst_name): + src_var = block.var(src_name) + if not block.has_var(dst_name): + block.create_var( + name=dst_name, + shape=src_var.shape, + dtype=core.VarDesc.VarType.FP16, + persistable=True) + dst_var = block.var(dst_name) + assert dst_var.dtype == core.VarDesc.VarType.FP16 + block._insert_op_without_sync( + idx, + type='cast', + inputs={'X': src_var}, + outputs={'Out': dst_var}, + attrs={ + 'in_dtype': src_var.dtype, + 'out_dtype': dst_var.dtype, + OP_ROLE_KEY: OpRole.Optimize + }) + + def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): + src_var = block.var(src_name) + dst_var = block.var(dst_name) + block._insert_op_without_sync( + idx, + type='memcpy', + inputs={'X': src_var}, + outputs={'Out': dst_var}, + attrs={ + 'dst_place_type': dst_place_type, + OP_ROLE_KEY: OpRole.Optimize, + }) + + def _insert_fetch_op(self, block, idx, src_name, dst_name): + self._insert_memcpy_op(block, idx, src_name, dst_name, + OffloadHelper.cuda_place_type) + + def _insert_offload_op(self, block, idx, src_name, dst_name): + self._insert_memcpy_op(block, idx, src_name, dst_name, + OffloadHelper.cuda_pinned_place_type) + + def _get_offload_var_name(self, name): + return unique_name.generate(name + '@offload') + + def _create_offload_var(self, var_name, offload_var_name, blocks): + for block in blocks: + var = block.var(var_name) + var.persistable = False + offload_var = block.create_var( + name=offload_var_name, + shape=var.shape, + dtype=var.dtype, + persistable=True) + + def offload_fp32param(self, block, startup_block): + """ + (p_fp16) = cast(p) + (p_fp16_recompute) = cast(p) + (pout,) = adam(p) + ===========================> + rename(p_fp16_recompute, p_fp16) + + (p,) = prefetch(p@offload) + (pout,) = adam(p) + (p_fp16) = cast(p) + (p@offload) = memcpy(p) + """ + param_to_idx = dict() + param_to_fp16 = dict() + # recompute_var which need rename to fp16_param + fp16_param_to_recompute = dict() + recompute_to_fp16 = dict() + + def remove_param(input_name): + param_to_idx.pop(input_name) + if input_name in param_to_fp16: + fp16_param = param_to_fp16.pop(input_name) + if fp16_param in fp16_param_to_recompute: + recompute = fp16_param_to_recompute.pop(fp16_param) + recompute_to_fp16.pop(recompute) + + # step1: record param + for idx, op in reversed(list(enumerate(block.ops))): + if op.type in ('adam', 'momentum', 'lars', 'lamb'): + param = op.desc.input("Param")[0] + param_to_idx[param] = idx + + # step2: remove param which can't offload + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + break + for input_name in op.desc.input_arg_names(): + if input_name not in param_to_idx: + continue + + # param is real used by fp32 op + if op.type != 'cast': + remove_param(input_name) + continue + + # param is only used by cast op, + # which to cast fp32_param to fp16_param + output_name = op.output_arg_names[0] + if 'cast_fp16' not in output_name: + remove_param(input_name) + continue + + if 'subprog' not in output_name: + assert output_name == input_name + '.cast_fp16' + assert input_name not in param_to_fp16, \ + "There must be only one cast op from fp32 param to fp16 param." 
+ param_to_fp16[input_name] = output_name + else: + # fp16-->recompute_var + assert input_name in param_to_fp16, \ + "param must first be cast to fp16" + fp16_param = param_to_fp16[input_name] + fp16_param_to_recompute[fp16_param] = output_name + recompute_to_fp16[output_name] = fp16_param + + param_name_to_offload_name = dict() + # step3: main_block add offload, cast op + # change recompute to fp16, remove cast(param) to fp16 + for idx, op in reversed(list(enumerate(block.ops))): + if op.type in ('adam', 'momentum', 'lars', 'lamb'): + param = op.desc.input("Param")[0] + if param not in param_to_idx: continue + # step3.1: create offload_var + offload_var_name = self._get_offload_var_name(param) + param_name_to_offload_name[param] = offload_var_name + self._create_offload_var(param, offload_var_name, + [block, startup_block]) + + # step3.2: insert cast op and offload op + self._insert_offload_op(block, idx + 1, param, offload_var_name) + + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + self._insert_cast_op(block, idx + 1, param, + param_to_fp16[param]) + + # step3.3: insert fetch op + self._insert_fetch_op(block, idx, offload_var_name, param) + continue + + # step3.4: remove cast op + if op.type == 'cast': + input_name = op.desc.input_arg_names()[0] + if input_name in param_to_idx: + block._remove_op(idx, sync=False) + continue + + # step3.5: change recompute_param to fp16_param + for input_name in op.desc.input_arg_names(): + if input_name in recompute_to_fp16: + op._rename_input(input_name, recompute_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in recompute_to_fp16: + op._rename_output(output_name, + recompute_to_fp16[output_name]) + + # step4: remove recompute_param + for name in recompute_to_fp16.keys(): + block._remove_var(name, sync=False) + + # step5: startup_block add offload + visited_vars = set() + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: + continue + + if out_name in param_name_to_offload_name: + var_name = out_name + offload_var_name = param_name_to_offload_name[var_name] + self._insert_offload_op(startup_block, idx + 1, var_name, + offload_var_name) + self._insert_cast_op(startup_block, idx + 1, var_name, + param_to_fp16[var_name]) + + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() + + def offload(self, block, startup_block): + """ + (m1, m2) = prefetch(m1@offload, m2@offload) + (m1out, m2out, pout) = adam(m1, m2, p) + (m1@offload, m2@offload) = memcpy(m1, m2) + """ + vars_name_to_offload_name = dict() + + # main_block add offload + for idx, op in reversed(list(enumerate(block.ops))): + if not is_optimizer_op(op): + break + + vars_name = [] + if op.type == "adam": + # {Moment1Out = [''], Moment2Out = [''], ParamOut = ['']} = + # adam(inputs={Moment1 = [''], Moment2 = [''], Param = ['']}) + vars_name.append(op.desc.input("Moment1")[0]) + vars_name.append(op.desc.input("Moment2")[0]) + elif op.type == 'momentum': + pass + elif op.type == 'lars': + pass + elif op.type == 'lamb': + pass + + # step1: create and init offload_var + for var_name in vars_name: + assert var_name not in vars_name_to_offload_name + + offload_var_name = self._get_offload_var_name(var_name) + vars_name_to_offload_name[var_name] = offload_var_name + + self._create_offload_var(var_name, offload_var_name, + [block, startup_block]) + + # 
step2: insert offload op + for var_name in vars_name: + offload_var_name = vars_name_to_offload_name[var_name] + self._insert_offload_op(block, idx + 1, var_name, + offload_var_name) + + # step3: insert fetch op + for var_name in vars_name: + offload_var_name = vars_name_to_offload_name[var_name] + self._insert_fetch_op(block, idx, offload_var_name, var_name) + + # startup_block add offload + visited_vars = set() + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: + continue + + if out_name in vars_name_to_offload_name: + var_name = out_name + offload_var_name = vars_name_to_offload_name[var_name] + # insert offload op after var is generated + self._insert_offload_op(startup_block, idx + 1, var_name, + offload_var_name) + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py old mode 100644 new mode 100755 index 70753b59ccc..5a43367cf1a --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py @@ -126,6 +126,10 @@ class ProgramDeps(object): def should_remove_op(self, op_idx): op = self._block.ops[op_idx] + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + # remove check_finite_and_unscale op if its input 'X' is empty + if op.type == 'check_finite_and_unscale' and len(op.input('X')) == 0: + return True for output_name in op.desc.output_arg_names(): if output_name not in self._should_removed_var: return False diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 8b111026bdb..f4ceb2d287a 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -274,6 +274,10 @@ def insert_sync_comm_ops(block, insert_idx, ring_id, comm_dep_vars): """ insert sync_comm_op for vars """ + # NOTE (JZ-LIANG) to be check, may result undefined case + if len(comm_dep_vars) == 0: + return 0 + op_role = get_valid_op_role(block, insert_idx) block._insert_op_without_sync( insert_idx, @@ -324,27 +328,45 @@ def insert_cast_ops(block, insert_idx, cast_ops): return -def insert_allreduce_ops(block, insert_idx, ring_id, allreduce_vars): +def insert_allreduce_ops(block, + insert_idx, + ring_id, + allreduce_vars, + op_role=OpRole.Backward, + use_calc_stream=False): """ _add_allreduce_ops """ + if len(allreduce_vars) == 0: + return + for var in allreduce_vars: block._insert_op_without_sync( insert_idx, type='c_allreduce_sum', inputs={'X': var}, outputs={'Out': var}, - attrs={'ring_id': ring_id, - OP_ROLE_KEY: OpRole.Backward}) + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + OP_ROLE_KEY: op_role + }) return -def insert_reduce_ops(block, insert_idx, ring_id, reduce_vars, shard): +def insert_reduce_ops(block, + insert_idx, + ring_id, + reduce_vars, + shard, + op_role=OpRole.Backward, + use_calc_stream=False): """ _add_allreduce_ops """ for var in reduce_vars: + root_id = get_grad_device(var, shard) assert root_id >= 0, "root id should be a positive int".format(var) block._insert_op_without_sync( @@ -355,12 +377,40 @@ def insert_reduce_ops(block, insert_idx, ring_id, reduce_vars, shard): attrs={ 'ring_id': ring_id, 'root_id': root_id, - OP_ROLE_KEY: OpRole.Backward + 
'use_calc_stream': use_calc_stream, + OP_ROLE_KEY: op_role }) - return +def get_grad_device(grad_name, shard): + assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( + grad_name) + base_name = None + # mind the traversal order + possible_suffixes = [ + '.cast_fp16@GRAD@MERGED', '.cast_fp16@GRAD', '@GRAD@MERGED', '@GRAD' + ] + for suffix in possible_suffixes: + if suffix in grad_name: + base_name = re.sub(suffix, '', grad_name) + break + + assert base_name in shard.global_param2device, "[{}] should be a param variable.".format( + base_name) + + return shard.global_param2device[base_name] + + +def get_first_check_finite_and_unscale_op_idx(block): + + for idx, op in enumerate(block.ops): + if op.type == "check_finite_and_unscale": + return idx + + raise ValueError("check_finite_and_unscale does not exist in block") + + def insert_broadcast_ops(block, insert_idx, ring_id, broadcast2root): """ _add_broadcast_ops @@ -420,6 +470,7 @@ def insert_scale_loss_grad_ops(block, scale=1.0): outputs={'Out': loss_grad_var}, attrs={'scale': scale, OP_ROLE_KEY: OpRole.Backward}) + break def comm_analyse(main_program): @@ -502,6 +553,9 @@ def save_persistables(exe, dirname, main_program, filename=None): and part of persistable vars are duplicated and exist in all the ranks with different values. This function handles the model saving for sharding training. """ + # TODO (JZ-LIANG) revise this for uniform mixed parallelism + if main_program._pipeline_opt: + main_program = main_program._pipeline_opt['section_program']['program'] def is_opt_vars(var): # NOTE(JZ-LIANG): The checks should be updated when add new compatible optimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index cf3f75740ee..a83ae226a9d 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -16,16 +16,16 @@ import paddle from paddle.fluid import unique_name, core import paddle.fluid as fluid from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper -from paddle.distributed.fleet.meta_optimizers.common import is_backward_op +from paddle.distributed.fleet.meta_optimizers.common import is_backward_op, is_optimizer_op, is_update_op from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase from paddle.distributed.fleet.meta_optimizers.sharding.shard import Shard, ProgramSegment from paddle.distributed.fleet.meta_optimizers.sharding.fp16_helper import FP16Utils from paddle.distributed.fleet.meta_optimizers.sharding.weight_decay_helper import WeightDecayHelper from paddle.distributed.fleet.meta_optimizers.sharding.gradient_clip_helper import GradientClipHelper +from .sharding.offload_helper import OffloadHelper from paddle.distributed.fleet.meta_optimizers.sharding.prune import ProgramDeps from paddle.distributed.fleet.meta_optimizers.sharding.utils import * from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard - from paddle.fluid import layers import logging @@ -38,6 +38,8 @@ __all__ = ["ShardingOptimizer"] class ShardingOptimizer(MetaOptimizerBase): + """Sharding Optimizer.""" + def __init__(self, optimizer): super(ShardingOptimizer, self).__init__(optimizer) self.inner_opt = optimizer @@ -46,7 +48,8 @@ class ShardingOptimizer(MetaOptimizerBase): "AMPOptimizer", 
"LarsOptimizer", "LambOptimizer", - "ModelParallelOptimizer", + # "ModelParallelOptimizer", + # "PipelineOptimizer", ] self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] self._main_program = None @@ -88,26 +91,6 @@ class ShardingOptimizer(MetaOptimizerBase): self._nrings_sharding = 1 self._nrings_dp = 1 - # parallelism - self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ - "sharding_degree"]) - assert self.sharding_degree > 1, "sharding degree must be larger than zero" - self.mp_degree = int(self.user_defined_strategy.sharding_configs[ - "mp_degree"]) - self.hybrid_dp = self.user_defined_strategy.sharding_configs[ - "hybrid_dp"] - - self.pp_degree = 1 - - # dp here is the pure dp as the outest parallelism - self.dp_degree = int(self.role_maker._worker_num() // self.mp_degree // - self.sharding_degree) - assert self.role_maker._worker_num( - ) == self.dp_degree * self.mp_degree * self.sharding_degree * self.pp_degree - if self.hybrid_dp: - assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( - self.dp_degree) - # segment self._sharding_segment_strategy = str( self.user_defined_strategy.sharding_configs[ @@ -128,55 +111,231 @@ class ShardingOptimizer(MetaOptimizerBase): "the sharding segment strategy [{}] is not implemented".format( str(self._sharding_segment_strategy))) + # parallelism + self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ + "sharding_degree"]) + assert self.sharding_degree > 0, "sharding degree must be larger than zero" + self.mp_degree = int(self.user_defined_strategy.sharding_configs[ + "mp_degree"]) + # pipeline setting + # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline + self.pp_degree = int(self.user_defined_strategy.sharding_configs[ + "pp_degree"]) + if self.pp_degree > 1: + assert self.user_defined_strategy.pipeline == True + + self.dp_degree = int(self.user_defined_strategy.sharding_configs[ + 'dp_degree']) + assert self.role_maker._worker_num( + ) == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + self.role_maker._worker_num(), + self.mp_degree, + self.sharding_degree, + self.pp_degree, + self.dp_degree, ) + + self.hybrid_dp = self.user_defined_strategy.sharding_configs[ + "hybrid_dp"] + # NOTE (JZ-LIANG) + # there 2 kind of modes for gradient-merge and hybrid-dp in mixed parallism [sharding] and [pipeline]. + # we distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place according different mode to have best performance: + # sharding: communication within node, and therefore should insert within backward segment to overlap with bw calc, conduct every micro step + # pipeline: communication accross nodes, and therefore should insert in update segemnt, conduct just once per global step + self.hybrid_dp_mode = None + # dp here is the pure dp as the outest parallelism + if self.hybrid_dp: + assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( + self.dp_degree) + if self.pp_degree > 1: + self.hybrid_dp_mode = "pp_hybrid_dp" + else: + assert self.sharding_degree > 1, "by now we only support five kind of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp." 
+ self.hybrid_dp_mode = "sharding_hybrid_dp" + # gradient merge self._gradient_merge_acc_step = int( self.user_defined_strategy.sharding_configs[ "gradient_merge_acc_step"]) - self._grad2merged_grad = dict() + self.gradient_merge_mode = None + if self.pp_degree <= 1: + self.gradient_merge_mode = "sharding_gm" + self._grad2merged_grad = dict() + else: + self.gradient_merge_mode = "pp_gm" + self._gradient_merge_acc_step = self.user_defined_strategy.pipeline_configs[ + 'accumulate_steps'] + if self._gradient_merge_acc_step > 1: + logging.info("Gradient merge in [{}], acc step = [{}]".format( + self.gradient_merge_mode, self._gradient_merge_acc_step)) + + # optimize offload + self.optimize_offload = self.user_defined_strategy.sharding_configs[ + "optimize_offload"] + + # this feature is design for ascend, and should NOT be used in GPU training + self.pp_allreduce_in_optimize = self.user_defined_strategy.sharding_configs[ + "pp_allreduce_in_optimize"] if self.inner_opt is None: raise ValueError( "self.inner_opt of ShardingOptimizer should not be None.") - optimize_ops, params_grads = self.inner_opt.minimize( - loss, startup_program, parameter_list, no_grad_set) + + if self.pp_degree > 1: + pp_optimizer = fluid.optimizer.PipelineOptimizer( + self.inner_opt, self._gradient_merge_acc_step) + main_program = loss.block.program + main_program._pipeline_opt = dict() + self.schedule_mode = self.user_defined_strategy.pipeline_configs[ + 'schedule_mode'] + main_program._pipeline_opt['schedule_mode'] = self.schedule_mode + main_program._pipeline_opt[ + 'micro_batch_size'] = self.user_defined_strategy.pipeline_configs[ + 'micro_batch_size'] + self.pp_rank_ = self.role_maker._worker_index() // ( + self.sharding_degree * self.mp_degree) % self.pp_degree + main_program._pipeline_opt['local_rank'] = self.pp_rank_ + main_program._pipeline_opt[ + 'global_rank'] = self.role_maker._worker_index() + main_program._pipeline_opt['use_sharding'] = True + # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline + main_program._pipeline_opt['ring_id'] = 20 + main_program._pipeline_opt['global_ring_id'] = 3 + + optimize_ops, params_grads, program_list, self.pipeline_pair, self.pp_ring_map = pp_optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set) + self.pp_degree = len(program_list) + else: + optimize_ops, params_grads = self.inner_opt.minimize( + loss, startup_program, parameter_list, no_grad_set) if startup_program is None: startup_program = default_startup_program() - main_block = loss.block + + if self.pp_degree > 1: + startup_program = startup_program._pipeline_opt['startup_program'] + #main_program = main_program._pipeline_opt['section_program']['program'] + print("pp_rank:", self.pp_rank_) + main_program = program_list[self.pp_rank_] + with open("main_%d" % self.role_maker._worker_index(), 'w') as f: + f.writelines(str(main_program)) + main_block = main_program.global_block() + new_params_grads = [] + for param, grad in params_grads: + if main_block.has_var(param.name): + new_params_grads.append((param, grad)) + params_grads = new_params_grads + + else: + main_block = loss.block + startup_block = startup_program.global_block() self._main_program = main_block.program self._startup_program = startup_program + if self.pp_degree > 1: + pp_optimizer._rename_gradient_var_name(main_block) + with open("main_%d" % self.role_maker._worker_index(), 'w') as f: + f.writelines(str(main_program)) + # step0: _init_comm self._init_comm() - # step1: _build_shard - 
self._build_shard(params_grads) - - # step2: split_program - self._split_program(main_block) + if self.sharding_degree > 1: - # step3: add broadcast and reduce ops - self._add_broadcast_allreduce(main_block) - main_block._sync_with_cpp() - startup_block._sync_with_cpp() + # step1: build shard + self._build_shard(params_grads) + + # step2: split_program + self._split_program(main_block) + + # step3: add broadcast and reduce ops + self._add_broadcast_allreduce(main_block) + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + main_block._sync_with_cpp() + + # step4: remove unneeded ops and vars from block + self._prune_main_program(main_block) + self._prune_startup_program(startup_block) + + if self.pp_degree > 1: + # sharding-pp related logic + # pp_optimizer._rename_gradient_var_name(main_block) + # crop ops + if self.sharding_degree > 1: + for idx, op in reversed(list(enumerate(main_block.ops))): + if is_update_op(op): + op_role_var = op.attr('op_role_var') + param_name = op_role_var[0] + if not self._shard.has_param(param_name): + main_block._remove_op(idx) + + for idx, op in reversed(list(enumerate(main_block.ops))): + if op.type != 'cast': continue + in_name = op.input_arg_names[0] + if in_name not in self._params: continue + #if self._shard.has_param(param_name): continue + if in_name not in main_block.vars: + main_block._remove_op(idx) + + accumulated_grad_names = pp_optimizer._accumulate_gradients( + main_block) + # accumulated_grad_names = sorted(accumulated_grad_names) + if self.pp_allreduce_in_optimize: + print("persistable FP32 grad: ") + print(accumulated_grad_names) + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block) + insert_reduce_ops( + main_block, + first_optimize_op_index, + self.sharding_ring_id, + accumulated_grad_names, + self._shard, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) + if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block) + insert_allreduce_ops( + main_block, + first_optimize_op_index, + self.dp_ring_id, + accumulated_grad_names, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) + + # if not use sharding, adapt amp/clip, for remain parallelism. + # cast --> amp --> clip --> opt + if self.sharding_degree <= 1: + # amp + FP16Utils.sync_amp_check_nan_inf(main_block, self.global_ring_id) + + # clip + gradientclip_helper = GradientClipHelper(self.global_ring_id) + gradientclip_helper.sync_global_norm( + main_block, self.global_ring_id, self.dp_degree) - # step4: scale the loss by the num of dp degree - # sharding is also a senario of dp - scale_ = self.dp_degree * self.sharding_degree - if scale_ > 1: - insert_scale_loss_grad_ops(main_block, scale=1.0 / scale_) + # step6: loss div dp_degree + global_dp_degree = self.sharding_degree * self.dp_degree + assert int(global_dp_degree) == global_dp_degree + if global_dp_degree > 1: + insert_scale_loss_grad_ops(main_block, scale=1.0 / global_dp_degree) main_block._sync_with_cpp() - # step5: remove unneeded ops and vars from block - self._prune_main_program(main_block) - self._prune_startup_program(startup_block) - if self.hybrid_dp: - self._initialization_broadcast(startup_program) - - # step6: optional gradient merge - if self._gradient_merge_acc_step > 1: + # TODO(wangxi): add optimize offload + # opt offload should be enable while gradient merge is enable && acc_step is quite large (e.g. 
>> 100) + # sync its memcpy could not be overlap with calc, otherwise it will slower down training severely. + if self.optimize_offload: + logging.info("Sharding with optimize offload !") + offload_helper = OffloadHelper() + offload_helper.offload(main_block, startup_block) + offload_helper.offload_fp32param(main_block, startup_block) + + # step6: (optional) sharding gradient merge + if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: self._sharding_gradient_merge(main_block) # # check op dependecy @@ -184,14 +343,29 @@ class ShardingOptimizer(MetaOptimizerBase): # check_broadcast(main_block) # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, # self.dp_ring_id) + + if self.hybrid_dp: + # NOTE(JZ-LIANG) ensure in both sharding_hybrid_dp & pp_hybrid_dp + # init param broadcast should be called after startup pruning + self._initialization_broadcast(startup_block) + + with open("start_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(startup_block.program)) + with open("main_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(main_block.program)) + self._wait() return optimize_ops, params_grads def _init_comm(self): + # config sharding & dp groups - self._build_group() + self._build_groups() + # sync var startup_block = self._startup_program.global_block() self.startup_prog_sync_var = startup_block.create_var( name="startup_prog_sync_var", @@ -199,7 +373,7 @@ class ShardingOptimizer(MetaOptimizerBase): dtype=core.VarDesc.VarType.INT32, persistable=False) - # global + # global ring self._collective_helper._init_communicator( self._startup_program, self.current_endpoint, @@ -212,7 +386,7 @@ class ShardingOptimizer(MetaOptimizerBase): append_naive_sync(startup_block, self.startup_prog_sync_var, self.global_ring_id) - # mp + # mp ring if self.mp_degree > 1: self._collective_helper._init_communicator( self._startup_program, @@ -226,7 +400,7 @@ class ShardingOptimizer(MetaOptimizerBase): append_naive_sync(startup_block, self.startup_prog_sync_var, self.global_ring_id) - # sharding + # sharding ring if self.sharding_degree > 1: self._collective_helper._init_communicator( self._startup_program, @@ -240,7 +414,65 @@ class ShardingOptimizer(MetaOptimizerBase): append_naive_sync(startup_block, self.startup_prog_sync_var, self.global_ring_id) - # dp + # pp ring + if self.pp_degree > 1: + if self.schedule_mode == 'F-then-B': # GPipe + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + # append_naive_sync(startup_block, self.startup_prog_sync_var, + # self.global_ring_id) + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + self.pp_group_endpoints, + self.pp_rank, + self.pp_ring_id + 2, + False, + global_ring_id=self.global_ring_id, + sync=False) + # append_naive_sync(startup_block, self.startup_prog_sync_var, + # self.global_ring_id) + else: + assert self.schedule_mode == '1F1B' + for pair in self.pipeline_pair: + pair_key = pair[0] * 1000 + pair[1] + ring_id = self.pp_ring_map[pair_key] + print("pp pair:{}, ring_id: {}".format(pair, ring_id)) + if self.pp_rank not in pair: continue + pp_group_endpoints = [ + self.pp_group_endpoints[pair[0]], + self.pp_group_endpoints[pair[1]], + ] + if pair[0] < pair[1]: + start_ring_id = self.pp_ring_id + pair[1] - pair[0] - 1 + else: + start_ring_id = 
self.pp_ring_id + 2 + pair[0] - pair[ + 1] - 1 + pp_rank = 0 if self.pp_rank == pair[0] else 1 + self._collective_helper._init_communicator( + self._startup_program, + self.current_endpoint, + pp_group_endpoints, + pp_rank, + ring_id, + False, + global_ring_id=self.global_ring_id, + sync=False) + # append_naive_sync(startup_block, self.startup_prog_sync_var, + # self.global_ring_id) + + # TODO (JZ-LIANG) to unify this shit + assert self.pp_rank_ == self.pp_rank, "pp rank for pp opt [{}], pp rank for sharding opt [{}]".format( + self.pp_rank_, self.pp_rank) + + # pure dp ring if self.dp_degree > 1: self._collective_helper._init_communicator( self._startup_program, @@ -360,17 +592,22 @@ class ShardingOptimizer(MetaOptimizerBase): self._main_program.global_block().var(input_name)) # find reduce vars - if is_backward_op(op) and \ - OP_ROLE_VAR_KEY in op.attr_names: - op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] - if len(op_role_var) != 0: - assert len(op_role_var) % 2 == 0 - for i in range(0, len(op_role_var), 2): - param, reduced_grad = op_role_var[i], op_role_var[i + 1] - segment._allreduce_vars.append(reduced_grad) - assert ( - reduced_grad not in self._reduced_grads_to_param) - self._reduced_grads_to_param[reduced_grad] = param + if self.pp_degree > 1 and self.pp_allreduce_in_optimize: + # place pipeline gradient allreduce in optimize + pass + else: + if is_backward_op(op) and \ + OP_ROLE_VAR_KEY in op.attr_names: + op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY] + if len(op_role_var) != 0: + assert len(op_role_var) % 2 == 0 + for i in range(0, len(op_role_var), 2): + param, reduced_grad = op_role_var[i], op_role_var[ + i + 1] + segment._allreduce_vars.append(reduced_grad) + assert (reduced_grad not in + self._reduced_grads_to_param) + self._reduced_grads_to_param[reduced_grad] = param # find cast op if FP16Utils.is_fp16_cast_op(block, op, self._params): @@ -462,8 +699,13 @@ class ShardingOptimizer(MetaOptimizerBase): # Prune for idx, op in reversed(list(enumerate(block.ops))): if op.type in [ - "c_allreduce_sum", "c_sync_comm_stream", - "c_calc_comm_stream", "c_gen_nccl_id", "c_comm_init" + "c_allreduce_sum", + "c_sync_comm_stream", + "c_calc_comm_stream", + "c_gen_nccl_id", + "c_comm_init", + 'send_v2', + 'recv_v2', ]: pass elif op.type == "conditional_block": @@ -500,6 +742,16 @@ class ShardingOptimizer(MetaOptimizerBase): if program_deps.should_remove_op(idx): program_deps.remove_op(idx) + # NOTE (JZ-LIANG) revise and unify logic here + # sharding support fp16_allreduce logic + block._sync_with_cpp() + for idx, op in reversed(list(enumerate(block.ops))): + if op.type == 'concat' and is_optimizer_op(op): + # remove inputs that not on this card + reserved_x = [] + for var_name in op.desc.input("X"): + if block.has_var(var_name): reserved_x.append(var_name) + op.desc.set_input('X', reserved_x) block._sync_with_cpp() return @@ -507,21 +759,41 @@ class ShardingOptimizer(MetaOptimizerBase): """ add broadcast allreduce op if enable gradient_merge, insert related ops + + if combined with pipeline(grad accumulate), + the grad allreduce should be done in optimize role """ if len(self._segments) < 1: return # sharding + if self.pp_degree > 1 and self.pp_allreduce_in_optimize: + for idx in range(len(self._segments)): + assert len(self._segments[idx]._allreduce_vars) == 0 + + # NOTE (JZ-LIANG) revise and unify logic here + # fix the _end_idx for segments[-1] if pp is used. 
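+            # the backward walk below finds fill_constant/sum ops whose output
+            # name contains "MERGED" (and cast ops whose output contains "@TMP")
+            # and clamps _end_idx to just past the earliest such op in the segment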
+ new_end_idx = self._segments[-1]._end_idx + for idx in range(self._segments[-1]._end_idx - 1, + self._segments[-1]._start_idx - 1, -1): + op = block.ops[idx] + if op.type == "fill_constant" or op.type == "sum": + if "MERGED" in op.output_arg_names[0]: new_end_idx = idx + 1 + elif op.type == "cast": + if "@TMP" in op.output_arg_names[0]: new_end_idx = idx + 1 + self._segments[-1]._end_idx = new_end_idx + if self._segments[-1]._allreduce_vars: shard_allredue_vars = self._shard.filter_grads(self._segments[-1] ._allreduce_vars) - if self._gradient_merge_acc_step <= 1: - if self.hybrid_dp and len(shard_allredue_vars) >= 1: + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: insert_sync_comm_ops(block, self._segments[-1]._end_idx, self.dp_ring_id, shard_allredue_vars) insert_allreduce_ops(block, self._segments[-1]._end_idx, self.dp_ring_id, shard_allredue_vars) # gradient merge - else: + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: self.create_persistable_gradients_and_insert_merge_ops( block, self._startup_program.global_block(), @@ -532,9 +804,14 @@ class ShardingOptimizer(MetaOptimizerBase): self.sharding_ring_id, self._segments[-1]._allreduce_vars) # allreduce --> reduce - insert_reduce_ops(block, self._segments[-1]._end_idx, - self.sharding_ring_id, - self._segments[-1]._allreduce_vars, self._shard) + insert_reduce_ops( + block, + self._segments[-1]._end_idx, + self.sharding_ring_id, + self._segments[-1]._allreduce_vars, + self._shard, + op_role=OpRole.Backward, + use_calc_stream=False) for idx, segment in reversed(list(enumerate(self._segments))): allreduce_vars = self._segments[ @@ -574,8 +851,9 @@ class ShardingOptimizer(MetaOptimizerBase): # step2: add Sync ops shard_allredue_vars = self._shard.filter_grads(allreduce_vars) - if self._gradient_merge_acc_step <= 1: - if self.hybrid_dp and len(shard_allredue_vars) >= 1: + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: insert_sync_comm_ops(block, segment._end_idx, self.dp_ring_id, shard_allredue_vars) @@ -593,7 +871,7 @@ class ShardingOptimizer(MetaOptimizerBase): self.sharding_ring_id, comm_dep_vars) # gradient merge - else: + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: broad_cast_vars = [x[0] for x in broadcast_vars] if len(broad_cast_vars) > 0: insert_sync_comm_ops(block, segment._end_idx, @@ -616,7 +894,7 @@ class ShardingOptimizer(MetaOptimizerBase): # step5: add broadcast ops # gradient merge - if self._gradient_merge_acc_step > 1: + if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: self.create_persistable_gradients_and_insert_merge_ops( block, self._startup_program.global_block(), segment._start_idx, @@ -627,20 +905,29 @@ class ShardingOptimizer(MetaOptimizerBase): # step6: add all_reduce ops # dp - if self._gradient_merge_acc_step <= 1: - if self.hybrid_dp and len(shard_allredue_vars) >= 1: + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + if self.hybrid_dp and self.hybrid_dp_mode == "sharding_hybrid_dp" and len( + shard_allredue_vars) >= 1: insert_allreduce_ops(block, segment._start_idx, self.dp_ring_id, shard_allredue_vars) insert_sync_comm_ops(block, segment._start_idx, self.sharding_ring_id, 
allreduce_vars) # gradient merge - else: + elif self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: insert_sync_comm_ops(block, segment._start_idx, self.sharding_ring_id, allreduce_vars) # sharding # allreduce --> reduce - insert_reduce_ops(block, segment._start_idx, self.sharding_ring_id, - allreduce_vars, self._shard) + # TODO temp change + if len(allreduce_vars) > 0: + insert_reduce_ops( + block, + segment._start_idx, + self.sharding_ring_id, + allreduce_vars, + self._shard, + op_role=OpRole.Backward, + use_calc_stream=False) block._sync_with_cpp() @@ -691,14 +978,14 @@ class ShardingOptimizer(MetaOptimizerBase): block._remove_var(var_name, sync=False) block._sync_with_cpp() - def _build_group(self): + def _build_groups(self): """ pre-assign ring ids - mp: 0 - sharding: 1 - pure-dp: 2 - global: 3 - pp: >= 20 + mp: 0 + sharding: 1 + pure-dp: 2 + global: 3 + pp: >= 20 if one parallelism is not enable: -1 and only support parallelism hierarchy: mp --> sharding --> pp --> dp """ @@ -768,6 +1055,30 @@ class ShardingOptimizer(MetaOptimizerBase): self.sharding_group_id = -1 self.sharding_group_endpoints = [] + # pp + if self.pp_degree > 1: + self.pp_ring_id = 20 + self.pp_rank = self.global_rank // (self.sharding_degree * + self.mp_degree) % self.pp_degree + # (NOTE): Already adjust for (outter-pure) dp + self.pp_group_id = self.global_rank // ( + self.mp_degree * self.sharding_degree * self.pp_degree) + pp_first_stage_idx = self.global_rank % ( + self.sharding_degree * self.mp_degree) + self.pp_group_id * ( + self.mp_degree * self.sharding_degree * self.pp_degree) + pp_stage_offset = self.sharding_degree * self.mp_degree + self.pp_group_endpoints = [] + for i in range(self.pp_degree): + self.pp_group_endpoints.append(self.global_endpoints[ + pp_first_stage_idx + pp_stage_offset * i]) + assert self.current_endpoint in self.pp_group_endpoints + else: + self.pp_degree = 1 + self.pp_ring_id = -1 + self.pp_rank = -1 + self.pp_group_id = -1 + self.pp_group_endpoints = [] + # outter-pure-dp group # NOTE (JZ-LIANG) support outter-pure-dp to scale the throughput in 3D parallelism # e.g. 
mp-sharding-pp-dp @@ -775,6 +1086,7 @@ class ShardingOptimizer(MetaOptimizerBase): assert self.global_word_size == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "mp_degree: [{}], sharding_degree: [{}], pp_degree: [{}], dp_degree: [{}]; BUT global nrank: [{}]".format( self.mp_degree, self.sharding_degree, self.pp_degree, self.dp_degree, self.global_word_size) + if self.dp_degree > 1: self.dp_ring_id = 2 self.dp_rank = self.global_rank // (self.sharding_degree * @@ -794,6 +1106,8 @@ class ShardingOptimizer(MetaOptimizerBase): self.dp_group_endpoints = [] # global group + # use for gen_nccl_comm_sync, amp check nan inf, clip by global norm + # NOTE (JZ-LIANG) when use global ring for calc global norm and dp_degree > 1, the allreduce result should be devided by dp_degree self.global_ring_id = 3 logging.info("global word size: {}".format(self.global_word_size)) @@ -817,25 +1131,31 @@ class ShardingOptimizer(MetaOptimizerBase): logging.info("sharding ring id: {}".format(self.sharding_ring_id)) logging.info("#####" * 6) - logging.info("outter pure dp group size: {}".format(self.dp_degree)) - logging.info("outter pure dp rank: {}".format(self.dp_rank)) - logging.info("outter pure dp group endpoints: {}".format( + logging.info("pp group size: {}".format(self.pp_degree)) + logging.info("pp rank: {}".format(self.pp_rank)) + logging.info("pp group id: {}".format(self.pp_group_id)) + logging.info("pp group endpoints: {}".format(self.pp_group_endpoints)) + logging.info("pp ring id: {}".format(self.pp_ring_id)) + logging.info("#####" * 6) + + logging.info("pure dp group size: {}".format(self.dp_degree)) + logging.info("pure dp rank: {}".format(self.dp_rank)) + logging.info("pure dp group endpoints: {}".format( self.dp_group_endpoints)) - logging.info("outter pure dp ring id: {}".format(self.dp_ring_id)) + logging.info("pure dp ring id: {}".format(self.dp_ring_id)) logging.info("#####" * 6) return - def _initialization_broadcast(self, startup_prog): + def _initialization_broadcast(self, startup_block): """ this funtion is to ensure the initialization between dp group to be identical when hybrid-dp is used. 
""" - block = startup_prog.global_block() params = [] - for param in block.iter_parameters(): + for param in startup_block.iter_parameters(): params.append(param) - block.append_op( + startup_block.append_op( type='c_broadcast', inputs={'X': param}, outputs={'Out': param}, @@ -844,15 +1164,14 @@ class ShardingOptimizer(MetaOptimizerBase): 'root': 0, OP_ROLE_KEY: OpRole.Forward }) - block.append_op( + startup_block.append_op( type='c_sync_comm_stream', inputs={'X': params}, outputs={'Out': params}, attrs={'ring_id': self.dp_ring_id, OP_ROLE_KEY: OpRole.Forward}) - # sync within global group - append_naive_sync(block, self.startup_prog_sync_var, + append_naive_sync(startup_block, self.startup_prog_sync_var, self.global_ring_id) # sharding gradient merge diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index b3a1834d49d..572ebb26d73 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -233,6 +233,8 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) + if desc.has_attr('op_device'): + new_op_desc._set_attr('op_device', desc.attr('op_device')) result_descs.append(new_op_desc) return result_descs @@ -252,6 +254,8 @@ def _add_descs_to_block(descs, block): new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) + if desc.has_attr('op_device'): + new_op_desc._set_attr('op_device', desc.attr('op_device')) result_descs.append(new_op_desc) return result_descs @@ -843,6 +847,7 @@ def _append_backward_ops_with_checkpoints_( vars_in_memory = vars_should_be_hold + checkpoints_name max_calculated_op_position = len(ops) + device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() if recompute_segments == []: gap_ops = ops[0:max_calculated_op_position] for op in reversed(gap_ops): @@ -852,6 +857,11 @@ def _append_backward_ops_with_checkpoints_( _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + # Set device for grad_op according to forward Op + if op.desc.has_attr(device_attr_name): + op_device = op.desc.attr(device_attr_name) + for op_desc in grad_op_desc: + op_desc._set_attr(device_attr_name, op_device) added_descs = _add_descs_to_block(grad_op_desc, local_block) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -866,6 +876,11 @@ def _append_backward_ops_with_checkpoints_( _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + # Set device for grad_op according to forward Op + if op.desc.has_attr(device_attr_name): + op_device = op.desc.attr(device_attr_name) + for op_desc in grad_op_desc: + op_desc._set_attr(device_attr_name, op_device) added_descs = _add_descs_to_block(grad_op_desc, local_block) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 76c5a309103..27ce44a257e 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -4033,6 +4033,12 @@ class PipelineOptimizer(object): """ Find the post op that has variable named var_name as input. 
""" + # bugfix for uniform hybrid parallelism + if '.cast_fp32' in var_name: + var_name = var_name.replace('.cast_fp32', '') + if '.cast_fp16' in var_name: + var_name = var_name.replace('.cast_fp16', '') + post_ops = self.input_var_to_op[var_name] if post_ops == None: return None result_op = None @@ -4114,7 +4120,23 @@ class PipelineOptimizer(object): # For LRSched ops, we should put them on all sub-programs to # make sure each sub-program update the lr correctly op._set_attr(self._op_device_key, "gpu:all") - elif op.type == "scale" and self._is_backward_op(op): + # bugfix in hybrid parallelism + elif op.type == "sum" and self._is_backward_op(op): + # For sum ops that compute the sum of @RENAMED@ vars + for name in op.desc.input_arg_names(): + assert '@RENAME@' in name, \ + "The op must be sum used to accumulate renamed vars." + assert len(op.desc.output_arg_names()) == 1 + out_name = op.desc.output_arg_names()[0] + post_op = self._find_post_op(idx, out_name) + assert post_op.has_attr( + 'op_device'), "{} has no op_device attr for var {}".format( + post_op.type, out_name) + device = post_op.attr(self._op_device_key) + assert device, "The post op must have op_device set." + op._set_attr(self._op_device_key, device) + elif (op.type == "cast" or + op.type == "scale") and self._is_backward_op(op): prev_op = self._find_prev_op(idx, op.desc.input("X")[0]) op._set_attr(self._op_device_key, prev_op.attr(self._op_device_key)) elif op.type == "memcpy" and not self._is_optimize_op(op): @@ -4249,11 +4271,19 @@ class PipelineOptimizer(object): Insert a pair of send and recv ops for every two consecutive ops on different devices. """ - extra_index_info = {'index': 0} - # A map from var to device where op takes it as input, # avoiding multiple send and recv ops. input_var_to_device = dict() + # bugfix hybrid parallelism + first_optimize_index = None + for index, op in enumerate(list(block.ops)): + if self._is_optimize_op(op): + first_optimize_index = index + break + extra_index_info = { + 'index': 0, + 'first_optimize_index': first_optimize_index + } for index, op in enumerate(list(block.ops)): cur_device = op.attr(self._op_device_key) @@ -4371,17 +4401,26 @@ class PipelineOptimizer(object): 'peer': 1, }) extra_index_info['index'] += 1 + insert_index = None + if int(op_role) == int(self._op_role.Backward): + insert_index = extra_index_info[ + 'first_optimize_index'] + new_op_role = self._op_role.Optimize + else: + insert_index = index + new_op_role = self._op_role.Backward block._insert_op( - index=index + extra_index_info['index'], + index=insert_index + extra_index_info['index'], type='c_sync_comm_stream', inputs={'X': [var]}, outputs={'Out': [var]}, attrs={ self._op_device_key: prev_dev, - self._op_role_key: self._op_role.Backward, + self._op_role_key: new_op_role, 'ring_id': ring_id, }) - extra_index_info['index'] += 1 + if int(op_role) == int(self._op_role.Forward): + extra_index_info['index'] += 1 var_shape = list(var.shape) var_shape[0] = self.micro_batch_size if var_shape[ 0] < 0 else var_shape[0] @@ -4768,8 +4807,9 @@ class PipelineOptimizer(object): # Step4: Special Case: process persistable vars that exist in # multiple sections - self._process_persistable_vars_in_multi_sections( - main_program, startup_program, program_list) + # FIXME + # self._process_persistable_vars_in_multi_sections( + # main_program, startup_program, program_list) # Step5: Add sub blocks for section programs self._add_sub_blocks(main_block, program_list) diff --git 
a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 4d6744f2b6f..f28bf89ff5c 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -354,6 +354,7 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): "segment_broadcast_MB": 0.2, "segment_anchors": None, "sharding_degree": 2, + "dp_degree": 2, "hybrid_dp": True, "gradient_merge_acc_step": 1, "mp_degree": 1 @@ -422,6 +423,7 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): "segment_broadcast_MB": 0.2, "segment_anchors": None, "sharding_degree": 2, + "dp_degree": 2, "hybrid_dp": True, "gradient_merge_acc_step": 4, "mp_degree": 1 @@ -458,20 +460,56 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): fw_bw_ops = [op.type for op in train_prog.blocks[0].ops] opt_ops = [op.type for op in train_prog.blocks[2].ops] self.assertEqual(fw_bw_ops, [ - 'fill_constant', 'fill_constant', 'fill_constant', - 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', - 'c_sync_comm_stream', 'mul', 'elementwise_add', 'tanh', 'mul', - 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', - 'cross_entropy2', 'mean', 'fill_constant', 'scale', 'mean_grad', - 'cross_entropy_grad2', 'softmax_grad', 'elementwise_add_grad', - 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'tanh_grad', 'elementwise_add_grad', 'mul_grad', - 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_sync_comm_stream', 'elementwise_add', 'elementwise_add', - 'elementwise_add', 'increment', 'elementwise_mod', 'equal', - 'conditional_block' + 'fill_constant', + 'fill_constant', + 'fill_constant', + 'c_sync_calc_stream', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_broadcast', + 'c_sync_comm_stream', + 'mul', + 'elementwise_add', + 'tanh', + 'mul', + 'elementwise_add', + 'tanh', + 'mul', + 'elementwise_add', + 'softmax', + 'cross_entropy2', + 'mean', + 'fill_constant', + 'scale', + 'mean_grad', + 'cross_entropy_grad2', + 'softmax_grad', + 'elementwise_add_grad', + 'mul_grad', + 'tanh_grad', + 'elementwise_add_grad', + 'mul_grad', + 'tanh_grad', + 'elementwise_add_grad', + 'mul_grad', + 'c_sync_calc_stream', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_reduce_sum', + 'c_sync_comm_stream', + 'elementwise_add', + 'elementwise_add', + 'elementwise_add', + 'increment', + 'elementwise_mod', + 'equal', + 'conditional_block', ]) self.assertEqual(opt_ops, [ 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', -- GitLab From 8c7c53b3d5237bcdbcb42e492ec51bc581223549 Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Wed, 7 Apr 2021 19:06:26 +0800 Subject: [PATCH 168/486] =?UTF-8?q?=E3=80=90NPU=E3=80=91Merge=20ascend=20G?= =?UTF-8?q?E&distributed=20code=20by=200208=20from=20ascendrc=20(#31957)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Ascend rc (#30483) * Fix compilcation on CANN20.1 and older (#30494) Fix compilcation on CANN20.1 and older * Add distribution supported (#30578) Add distribution supported * Build praser for Hcom* operators (#30627) Build praser for Hcom* operators * Pass device_ids info from launch to trainer. 
(#30632) Pass device_ids info from launch to trainer * Add Hccl program group (#30642) Add Hccl program group * Add startup bash files of test_ascend_group. (#30645) Add startup bash files of test_ascend_group * cleanup (#30646) cleanup test_ascend_group.py * [Feature] Build parser to support distributed training (#30658) [Feature] Build parser to support distributed training * fix compilation on ascend-20.1 (#30722) fix compilation on ascend-20.1 * Dev/fix ascend string (#30749) Dev/fix ascend string * code style (#30781) code style * Merge ascend_optimizer and ascend_parser. (#30776) Merge ascend_optimizer and ascend_parser. * Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug (#30797) Ascendrc add converted op : [range/equal/range/uniform_random/expand/squeeze], fix cast op bug * Add paddle ascend distribution training supported (#30796) Add paddle ascend distribution training supported * pass cxx_flags to gloo cmake (#30857) * Destroy session first. (#30954) Destroy session first. * merge * fix, test=develop * fix, test=develop * fix style, test=develop * fix, test=develop * fix * fix log fatal, test=develop * fix enforce style, test=develop * fix, test=develop * fix, test=develop * fix rccl, test=develop * fix test, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix node_num, test=develop * fix ids str, test=develop * fix ids str, test=develop * fix ids str, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix, test=develop * fix style code, test=develop * fix style code, test=develop * fix style code, test=develop * fix style code, test=develop Co-authored-by: hutuxian Co-authored-by: gongweibao Co-authored-by: Void Main Co-authored-by: Leo Chen Co-authored-by: dingsiyu <18369187719@163.com> Co-authored-by: OleNet --- CMakeLists.txt | 4 + cmake/external/ascend.cmake | 85 +- cmake/external/gloo.cmake | 48 +- cmake/external/protobuf.cmake | 11 +- cmake/external/threadpool.cmake | 6 +- cmake/external/warpctc.cmake | 106 +- paddle/fluid/framework/fleet/CMakeLists.txt | 2 +- paddle/fluid/framework/fleet/ascend_wrapper.h | 45 +- paddle/fluid/memory/allocation/CMakeLists.txt | 2 + .../fluid/operators/collective/CMakeLists.txt | 6 + .../operators/collective/c_gen_nccl_id_op.cc | 16 + .../operators/collective/gen_nccl_id_op.cc | 15 + paddle/fluid/platform/CMakeLists.txt | 10 + paddle/fluid/platform/ascend_npu_info.cc | 36 + paddle/fluid/platform/ascend_npu_info.h | 31 + paddle/fluid/pybind/ascend_wrapper_py.cc | 300 ++- paddle/fluid/pybind/ascend_wrapper_py.h | 1 + paddle/fluid/pybind/op_function_generator.cc | 15 + paddle/fluid/pybind/pybind.cc | 10 + python/paddle/distributed/fleet/__init__.py | 11 + .../distributed/fleet/base/fleet_base.py | 12 + .../distributed/fleet/base/role_maker.py | 26 + python/paddle/distributed/fleet/launch.py | 44 +- .../paddle/distributed/fleet/launch_utils.py | 100 +- .../fleet/meta_optimizers/ascend/__init__.py | 13 + .../ascend/ascend_optimizer.py | 119 +- .../meta_optimizers/ascend/ascend_parser.py | 2076 +++++++++++++++-- .../graph_execution_optimizer.py | 5 +- .../fluid/tests/unittests/CMakeLists.txt | 9 +- .../fluid/tests/unittests/ascend_group.py | 140 ++ .../ascend_multi_process_collective.py | 41 + .../tests/unittests/test_ascend_group.sh | 30 + .../unittests/test_fleet_launch_ascend.sh | 59 + .../fluid/transpiler/ascend_transpiler.py | 74 + python/setup.py.in | 1 + 35 files 
changed, 3057 insertions(+), 452 deletions(-) create mode 100644 paddle/fluid/platform/ascend_npu_info.cc create mode 100644 paddle/fluid/platform/ascend_npu_info.h create mode 100644 python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py create mode 100644 python/paddle/fluid/tests/unittests/ascend_group.py create mode 100644 python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py create mode 100644 python/paddle/fluid/tests/unittests/test_ascend_group.sh create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh create mode 100644 python/paddle/fluid/transpiler/ascend_transpiler.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 2d2f613eff5..59bc768aa41 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,7 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -57,6 +58,9 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") +endif() if(WIN32) option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bcf0c0a0646..a0b6f480f95 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -12,50 +12,47 @@ # See the License for the specific language governing permissions and # limitations under the License. 
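The rewritten logic below stops downloading a prebuilt ascend.tar.gz and instead imports libge_runner.so, libgraph.so and libascendcl.so from a locally installed toolkit rooted at ASCEND_CUSTOM_PATH (default /usr/local/Ascend). A small sketch of the layout it assumes, with paths copied from the new cmake variables, for illustration only:

    import os

    root = os.environ.get("ASCEND_CUSTOM_PATH", "/usr/local/Ascend")
    fwkacllib = os.path.join(root, "ascend-toolkit", "latest", "fwkacllib")
    for rel in ("lib64/libge_runner.so",            # imported as target ascend_ge
                "lib64/libgraph.so",                # imported as target ascend_graph
                "lib64/libascendcl.so",             # imported as target atlas_acl
                "include/graph/ascend_string.h"):   # presence turns on PADDLE_WITH_ASCEND_STRING
        path = os.path.join(fwkacllib, rel)
        print(path, "ok" if os.path.exists(path) else "missing")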
-INCLUDE(ExternalProject) - -SET(ASCEND_PROJECT "extern_ascend") -IF((NOT DEFINED ASCEND_VER) OR (NOT DEFINED ASCEND_URL)) - MESSAGE(STATUS "use pre defined download url") - SET(ASCEND_VER "0.1.1" CACHE STRING "" FORCE) - SET(ASCEND_NAME "ascend" CACHE STRING "" FORCE) - SET(ASCEND_URL "http://paddle-ascend.bj.bcebos.com/ascend.tar.gz" CACHE STRING "" FORCE) -ENDIF() -MESSAGE(STATUS "ASCEND_NAME: ${ASCEND_NAME}, ASCEND_URL: ${ASCEND_URL}") -SET(ASCEND_SOURCE_DIR "${THIRD_PARTY_PATH}/ascend") -SET(ASCEND_DOWNLOAD_DIR "${ASCEND_SOURCE_DIR}/src/${ASCEND_PROJECT}") -SET(ASCEND_DST_DIR "ascend") -SET(ASCEND_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") -SET(ASCEND_INSTALL_DIR ${ASCEND_INSTALL_ROOT}/${ASCEND_DST_DIR}) -SET(ASCEND_ROOT ${ASCEND_INSTALL_DIR}) -SET(ASCEND_INC_DIR ${ASCEND_ROOT}/include) -SET(ASCEND_LIB_DIR ${ASCEND_ROOT}/lib) -SET(ASCEND_LIB ${ASCEND_LIB_DIR}/libge_runner.so) -SET(ASCEND_GRAPH_LIB ${ASCEND_LIB_DIR}/libgraph.so) -SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${ASCEND_ROOT}/lib") - -INCLUDE_DIRECTORIES(${ASCEND_INC_DIR}) -FILE(WRITE ${ASCEND_DOWNLOAD_DIR}/CMakeLists.txt - "PROJECT(ASCEND)\n" - "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${ASCEND_NAME}/include ${ASCEND_NAME}/lib \n" - " DESTINATION ${ASCEND_DST_DIR})\n") -ExternalProject_Add( - ${ASCEND_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${ASCEND_SOURCE_DIR} - DOWNLOAD_DIR ${ASCEND_DOWNLOAD_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate ${ASCEND_URL} -c -q -O ${ASCEND_NAME}.tar.gz - && tar zxvf ${ASCEND_NAME}.tar.gz - DOWNLOAD_NO_PROGRESS 1 - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ASCEND_INSTALL_ROOT} - CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ASCEND_INSTALL_ROOT} -) -ADD_LIBRARY(ascend SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend PROPERTY IMPORTED_LOCATION ${ASCEND_LIB}) + +#NOTE: Logic is from +# https://github.com/mindspore-ai/graphengine/blob/master/CMakeLists.txt +if(DEFINED ENV{ASCEND_CUSTOM_PATH}) + set(ASCEND_DIR $ENV{ASCEND_CUSTOM_PATH}) +else() + set(ASCEND_DIR /usr/local/Ascend) +endif() + +set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) +set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) +set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) +set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) +set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) +set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) +set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + +set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) +set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) +set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) +set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) +set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) +set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) +set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + +set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) +set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) +set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) +INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + +if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + +ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY 
IMPORTED_LOCATION ${ASCEND_GRAPH_LIB}) -ADD_DEPENDENCIES(ascend ascend_graph ${ASCEND_PROJECT}) +SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + +ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) +add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ea7af315e1a..2e4a67093dc 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -32,21 +32,39 @@ cache_third_party(extern_gloo TAG ${GLOO_TAG} DIR GLOO_SOURCE_DIR) -ExternalProject_Add( - extern_gloo - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${GLOO_DOWNLOAD_CMD}" - PREFIX "${GLOO_PREFIX_DIR}" - SOURCE_DIR "${GLOO_SOURCE_DIR}" - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build - && cd ${GLOO_SOURCE_DIR}/build && cmake .. && make - && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo - INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} && make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +else() + ExternalProject_Add( + extern_gloo + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${GLOO_DOWNLOAD_CMD}" + PREFIX "${GLOO_PREFIX_DIR}" + SOURCE_DIR "${GLOO_SOURCE_DIR}" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build + && cd ${GLOO_SOURCE_DIR}/build && cmake .. 
&& make + && mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR} + COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo" + ) +endif() ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 40a27f506f3..1466664c126 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,8 +198,13 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() +if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) +else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) +endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} @@ -234,7 +239,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1.0) +if(WITH_ASCEND) + SET(PROTOBUF_VERSION 3.8.0) +else() + SET(PROTOBUF_VERSION 3.1.0) +endif() IF(NOT PROTOBUF_FOUND) build_protobuf(extern_protobuf FALSE) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 205e8d26d93..0eabdb4e127 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,11 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +if(WITH_ASCEND) + SET(THREADPOOL_REPOSITORY https://gitee.com/tianjianhe/ThreadPool.git) +else() + SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) +endif() SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index ac28f7561f6..a4367510ac7 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -21,6 +21,8 @@ ENDIF() SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) +# in case of low internet speed +#set(WARPCTC_REPOSITORY https://gitee.com/tianjianhe/warp-ctc.git) set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) set(WARPCTC_TAG c690fc5755abbdbdc98ef78d51ec10a6748a8cd1) @@ -41,39 +43,77 @@ cache_third_party(extern_warpctc TAG ${WARPCTC_TAG} DIR WARPCTC_SOURCE_DIR) -ExternalProject_Add( - extern_warpctc - ${EXTERNAL_PROJECT_LOG_ARGS} - ${SHALLOW_CLONE} - "${WARPCTC_DOWNLOAD_CMD}" - PREFIX ${WARPCTC_PREFIX_DIR} - SOURCE_DIR ${WARPCTC_SOURCE_DIR} - #UPDATE_COMMAND "" - PATCH_COMMAND "" - BUILD_ALWAYS 1 - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_C_FLAGS=$ - -DCMAKE_C_FLAGS_DEBUG=$ - -DCMAKE_C_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS=$ - -DCMAKE_CXX_FLAGS_RELEASE=$ - -DCMAKE_CXX_FLAGS_DEBUG=$ - -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} - -DWITH_GPU=${WITH_GPU} - -DWITH_ROCM=${WITH_ROCM} - -DWITH_OMP=${USE_OMP} - -DWITH_TORCH=OFF - -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON - -DBUILD_SHARED=ON - -DBUILD_TESTS=OFF - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS 
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} -) +if(WITH_ASCEND) + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}" + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +else() + ExternalProject_Add( + extern_warpctc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + "${WARPCTC_DOWNLOAD_CMD}" + PREFIX ${WARPCTC_PREFIX_DIR} + SOURCE_DIR ${WARPCTC_SOURCE_DIR} + #UPDATE_COMMAND "" + PATCH_COMMAND "" + BUILD_ALWAYS 1 + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=$ + -DCMAKE_C_FLAGS_DEBUG=$ + -DCMAKE_C_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS=$ + -DCMAKE_CXX_FLAGS_RELEASE=$ + -DCMAKE_CXX_FLAGS_DEBUG=$ + -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} + -DWITH_GPU=${WITH_GPU} + -DWITH_ROCM=${WITH_ROCM} + -DWITH_OMP=${USE_OMP} + -DWITH_TORCH=OFF + -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON + -DBUILD_SHARED=ON + -DBUILD_TESTS=OFF + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} + ) +endif() + + IF(WIN32) SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE) diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index 61f3c026f1f..ce0a905afc6 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -42,5 +42,5 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) if(WITH_ASCEND) - cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend ascend_graph) + cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) endif(WITH_ASCEND) diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index da79fccb8ca..baa2fd126a4 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -37,25 +37,50 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -// typedef std::vector AscendGraphDesc; typedef ge::Graph AscendGraphDesc; +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = ge::AscendString; +#else +using AscendString = std::string; +#endif + class AscendInstance { public: virtual ~AscendInstance() {} AscendInstance() {} - std::map GetDefaultInitSessionOptions() { - std::map init_options; - init_options["a"] = "b"; - init_options["ge.trainFlag"] = "1"; + std::map _GetDefaultInitOptions() { + std::map init_options; + init_options["ge.exec.deviceId"] = "0"; + init_options["ge.graphRunMode"] = "1"; + return init_options; + } + + std::map _GetDefaultInitSessionOptions() { + std::map init_options; + // init_options["a"] = "b"; + // init_options["ge.trainFlag"] = "1"; return init_options; } - // add other parameters here to init + ge::Status InitGEForUT() { + return ge::GEInitialize(_GetDefaultInitOptions()); + } + void InitGlobalResouces() { - session_.reset(new ge::Session(GetDefaultInitSessionOptions())); - VLOG(1) << "InitGlobalResouces Done"; + LOG(INFO) << "Begin ascend InitGlobalResouces"; + session_.reset(new ge::Session(_GetDefaultInitSessionOptions())); + if (session_ == nullptr) { + PADDLE_THROW(platform::errors::Fatal("new session error: nullptr")); + } + LOG(INFO) << "End ascend InitGlobalResouces"; + } + + void DestroyGlobalResouces() { + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; + session_ = nullptr; + LOG(INFO) << "Begin ascend DestroyGlobalResouces"; } static std::shared_ptr GetInstance() { @@ -178,6 +203,6 @@ class AscendInstance { private: static std::shared_ptr ascend_instance_; }; -} // end namespace framework -} // end namespace paddle +} // namespace framework +} // namespace paddle #endif diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 377ea376773..565797d51dd 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -33,6 +33,8 @@ if (WITH_GPU OR WITH_ROCM) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) elseif(WITH_XPU) set(AllocatorFacadeDeps xpu_info) +elseif(WITH_ASCEND) + set(AllocatorFacadeDeps ascend_npu_info) else () set(AllocatorFacadeDeps) endif() diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 8920541b9b9..977a208d20e 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -19,6 +19,12 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND) + op_library(gen_nccl_id_op) + op_library(c_gen_nccl_id_op) +endif() + + if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 1592d809f91..7da30f64d1c 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -27,6 +27,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -84,6 +85,21 @@ class CGenNCCLIdOp : public framework::OperatorBase { } }; +#else +class CGenNCCLIdOp : public framework::OperatorBase { + public: + CGenNCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class CGenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/operators/collective/gen_nccl_id_op.cc b/paddle/fluid/operators/collective/gen_nccl_id_op.cc index 679713d05bc..99a92469e85 100644 --- a/paddle/fluid/operators/collective/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/gen_nccl_id_op.cc @@ -34,6 +34,7 @@ class Scope; namespace paddle { namespace operators { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) static void GenNCCLID(std::vector* nccl_ids) { for (size_t i = 0; i < nccl_ids->size(); ++i) { PADDLE_ENFORCE_CUDA_SUCCESS( @@ -194,6 +195,20 @@ class GenNCCLIdOp : public framework::OperatorBase { } }; +#else +class GenNCCLIdOp : public framework::OperatorBase { + public: + GenNCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 47344f0e373..1e16008f36b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -10,6 +10,12 @@ ELSE() set(XPU_CTX_DEPS) endif(WITH_XPU) +if(WITH_ASCEND) + set(ASCEND_DEPS xpulib) +ELSE() + set(ASCEND_DEPS) +endif(WITH_ASCEND) + if (WITH_PYTHON) py_proto_compile(profiler_py_proto SRCS profiler.proto) add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -66,6 +72,10 @@ if(WITH_XPU) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib) endif() +if(WITH_ASCEND) + cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl) +endif() + add_subdirectory(dynload) add_subdirectory(stream) diff --git a/paddle/fluid/platform/ascend_npu_info.cc b/paddle/fluid/platform/ascend_npu_info.cc new file mode 100644 index 00000000000..db8dafeae1e --- /dev/null +++ b/paddle/fluid/platform/ascend_npu_info.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/platform/ascend_npu_info.h" +#include +#include "acl/acl_rt.h" + +namespace paddle { +namespace platform { +namespace ascend { + +int NPUDevice::GetDeviceCount() { + uint32_t count = 0; + aclError status = aclrtGetDeviceCount(&count); + if (status != 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "aclrtGetDeviceCount error code: %d", status)); + return -1; + } + + return count; +} + +} // namespace ascend +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h new file mode 100644 index 00000000000..7afed121a5a --- /dev/null +++ b/paddle/fluid/platform/ascend_npu_info.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_ASCEND + +namespace paddle { +namespace platform { +namespace ascend { + +class NPUDevice { + public: + //! Get the total number of XPU devices in system. + static int GetDeviceCount(); +}; + +} // namespace ascend +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 00eca380859..303ab5c0fe8 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -32,6 +32,8 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#include "paddle/fluid/platform/ascend_npu_info.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/ascend_wrapper_py.h" using namespace ge; // NOLINT @@ -40,6 +42,12 @@ namespace py = pybind11; namespace paddle { namespace pybind { +#ifdef PADDLE_WITH_ASCEND_STRING +using AscendString = AscendString; +#else +using AscendString = std::string; +#endif + void BindAscendWrapper(py::module *m) { py::class_>(*m, "AscendInstance") @@ -47,13 +55,31 @@ void BindAscendWrapper(py::module *m) { .def("init_global_resources", &framework::AscendInstance::InitGlobalResouces, py::call_guard()) + .def("destroy_global_resources", + &framework::AscendInstance::DestroyGlobalResouces, + py::call_guard()) .def("add_ascend_subgraph", &framework::AscendInstance::AddAscendSubgraph, py::call_guard()); -} // end AscendWrapper +} -Status ge_initialize(std::map &options) { // NOLINT +std::map convert_map( + const std::map &options) { + std::map rets; + for (auto &option : options) { + AscendString key = option.first.c_str(); + AscendString val = option.second.c_str(); + rets[key] = val; + } + return rets; +} + +ge::Status ge_initialize( + std::map &options) { // NOLINT py::gil_scoped_release release; - Status res = GEInitialize(options); + auto init_options = convert_map(options); + ge::Status res = ge::GEInitialize(init_options); + PADDLE_ENFORCE_EQ(res, ge::SUCCESS, platform::errors::Fatal( + "ge initialize not success:%d", res)); py::gil_scoped_acquire acquire; return res; } @@ -82,11 +108,18 @@ enum AttrType { AT_NAMEATTR }; +void BindAscendDevice(py::module *m) { + py::class_(*m, "NPUDevice") + .def_static( + "get_device_count", + static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); +} + void BindAscendGraph(py::module *m) { m->def("ge_initialize", &ge_initialize, "GEInitialize"); m->def("ge_finalize", &GEFinalize, "GEFinalize"); - //枚举封装 + // enum py::enum_(*m, "GEGraphRunMode") .value("PREDICTION", GraphRunMode::PREDICTION) .value("TRAIN", GraphRunMode::TRAIN) @@ -214,24 +247,34 @@ void BindAscendGraph(py::module *m) { // 类封装 py::class_(*m, "GESession") - .def(py::init &>()) + .def(py::init([](const std::map &options) { + return std::unique_ptr( + new ge::Session(convert_map(options))); + })) + .def("add_graph", (ge::Status (Session::*)(uint32_t, const Graph &)) & + Session::AddGraph) .def("add_graph", - (Status (Session::*)(uint32_t, const Graph &)) & Session::AddGraph) - .def("add_graph", - (Status (Session::*)(uint32_t, const Graph &, - const std::map &)) & - Session::AddGraph) + [](Session &ss, uint32_t index, const Graph &graph, + const std::map &options) { + return ss.AddGraph(index, graph, convert_map(options)); + }) .def("remove_graph", &Session::RemoveGraph) .def("run_graph", [](Session &ss, uint32_t graphId, const std::vector &inputs) -> py::tuple { std::vector outputs; - Status res = ss.RunGraph(graphId, inputs, outputs); + ge::Status res = ss.RunGraph(graphId, inputs, outputs); return py::make_tuple(outputs, res); }, py::call_guard()) .def("build_graph", &Session::BuildGraph) .def("run_graph_async", &Session::RunGraphAsync) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("register_call_back_func", + static_cast( + &ge::Session::RegisterCallBackFunc)) +#else .def("register_call_back_func", (Status (Session::*)( // NOLINT const std::string &, @@ -239,11 +282,12 @@ void BindAscendGraph(py::module *m) { uint32_t graph_id, const std::map ¶ms_list)>)) & Session::RegisterCallBackFunc) +#endif 
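+      // PADDLE_WITH_ASCEND_STRING comes from cmake/external/ascend.cmake: it is
+      // defined when the installed toolkit ships graph/ascend_string.h, and the
+      // #ifdef above then picks the matching RegisterCallBackFunc overload.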
.def("is_graph_need_rebuild", &Session::IsGraphNeedRebuild); py::class_(*m, "GEGraph") .def(py::init<>()) - .def(py::init()) + .def(py::init()) .def("set_inputs", &Graph::SetInputs) .def("set_outputs", (Graph & (Graph::*)(const std::vector &)) & Graph::SetOutputs) @@ -253,40 +297,70 @@ void BindAscendGraph(py::module *m) { Graph::SetOutputs) .def("set_outputs", (Graph & - (Graph::*)(const std::vector> + (Graph::*)(const std::vector> &)) & Graph::SetOutputs) .def("set_targets", &Graph::SetTargets) .def("is_valid", &Graph::IsValid) .def("add_op", &Graph::AddOp) .def("find_op_by_name", - [](Graph &graph, const std::string &name) -> py::tuple { + [](Graph &graph, const char *name) -> py::tuple { ge::Operator op; graphStatus status = graph.FindOpByName(name, op); return py::make_tuple(op, status); }) .def("find_op_by_type", - [](Graph &graph, const std::string &type) -> py::tuple { + [](Graph &graph, const char *type) -> py::tuple { std::vector ops; graphStatus status = graph.FindOpByType(type, ops); return py::make_tuple(ops, status); }) .def("get_all_op_name", [](Graph &graph) -> py::tuple { - std::vector op_name; + std::vector op_name; graphStatus status = graph.GetAllOpName(op_name); return py::make_tuple(op_name, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("save_to_file", + static_cast( + &ge::Graph::SaveToFile)) + .def("load_from_file", + static_cast( + &Graph::LoadFromFile)) + .def("get_name", + static_cast( + &Graph::GetName)) +#else .def("save_to_file", &Graph::SaveToFile) .def("load_from_file", &Graph::LoadFromFile) .def("get_name", &Graph::GetName) +#endif .def("set_need_iteration", &Graph::SetNeedIteration); py::class_(*m, "GEOperator") .def(py::init<>()) - .def(py::init()) - .def(py::init()) + .def(py::init()) + .def(py::init()) .def("is_empty", &Operator::IsEmpty) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_name", + static_cast( + &Operator::GetName)) + .def("get_op_type", + static_cast( + &Operator::GetOpType)) + .def("set_input", + (Operator & (Operator::*)(const char *, const Operator &)) & + Operator::SetInput) + .def("set_input", + (Operator & + (Operator::*)(const char *, const Operator &, const char *)) & + Operator::SetInput) + .def("set_input", (Operator & (Operator::*)(const char *, + const Operator &, uint32_t)) & + Operator::SetInput) +#else .def("get_name", &Operator::GetName) .def("get_op_type", &Operator::GetOpType) .def("set_input", @@ -299,13 +373,28 @@ void BindAscendGraph(py::module *m) { .def("set_input", (Operator & (Operator::*)(const std::string &, const Operator &, uint32_t)) & Operator::SetInput) +#endif .def("add_control_input", &Operator::AddControlInput) .def("get_input_const_data", - [](Operator &op, const std::string &dst_name) -> py::tuple { + [](Operator &op, const char *dst_name) -> py::tuple { Tensor data; graphStatus res = op.GetInputConstData(dst_name, data); return py::make_tuple(data, res); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_input_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetInputDesc) + .def("get_input_desc", + [](Operator &op, const std::string &name) { + return op.GetInputDescByName(name.c_str()); + }) + .def("get_dynamic_output_num", + static_cast( + &Operator::GetDynamicOutputNum)) + .def("get_dynamic_input_num", + static_cast( + &Operator::GetDynamicInputNum)) +#else .def("get_input_desc", (TensorDesc (Operator::*)(const std::string &) const) & Operator::GetInputDesc) @@ -313,12 +402,41 @@ void BindAscendGraph(py::module *m) { (TensorDesc (Operator::*)(uint32_t) const) & 
Operator::GetInputDesc) .def("get_dynamic_output_num", &Operator::GetDynamicOutputNum) .def("get_dynamic_input_num", &Operator::GetDynamicInputNum) +#endif .def("try_get_input_desc", - [](Operator &op, const std::string &name) -> py::tuple { + [](Operator &op, const char *name) -> py::tuple { TensorDesc tensor_desc; graphStatus status = op.TryGetInputDesc(name, tensor_desc); return py::make_tuple(tensor_desc, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("update_input_desc", + static_cast(&Operator::UpdateInputDesc)) + .def("get_output_desc", + [](Operator &op, const std::string &name) { + return op.GetOutputDescByName(name.c_str()); + }) + .def("get_output_desc", + (TensorDesc (Operator::*)(uint32_t) const) & Operator::GetOutputDesc) + .def("update_output_desc", + static_cast(&Operator::UpdateOutputDesc)) + .def("get_dynamic_input_desc", + static_cast(&Operator::GetDynamicInputDesc)) + .def("update_dynamic_input_desc", + static_cast( + &Operator::UpdateDynamicInputDesc)) + .def("get_dynamic_output_desc", + static_cast(&Operator::GetDynamicOutputDesc)) + .def("update_dynamic_output_desc", + static_cast( + &Operator::UpdateDynamicOutputDesc)) +#else .def("update_input_desc", &Operator::UpdateInputDesc) .def("get_output_desc", (TensorDesc (Operator::*)(const std::string &) const) & @@ -330,33 +448,38 @@ void BindAscendGraph(py::module *m) { .def("update_dynamic_input_desc", &Operator::UpdateDynamicInputDesc) .def("get_dynamic_output_desc", &Operator::GetDynamicOutputDesc) .def("update_dynamic_output_desc", &Operator::UpdateDynamicOutputDesc) +#endif .def("infer_shape_and_type", &Operator::InferShapeAndType) .def("set_inference_context", &Operator::SetInferenceContext) .def("get_inference_context", &Operator::GetInferenceContext) .def("verify_all_attr", &Operator::VerifyAllAttr) .def("get_inputs_size", &Operator::GetInputsSize) .def("get_outputs_size", &Operator::GetOutputsSize) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_all_attr_names_and_types", + static_cast &) const>( + &Operator::GetAllAttrNamesAndTypes)) +#else .def("get_all_attr_names_and_types", &Operator::GetAllAttrNamesAndTypes) +#endif .def("set_attr_int64", - [](Operator &op, const std::string &name, - int64_t value) -> Operator & { + [](Operator &op, const char *name, int64_t value) -> Operator & { int64_t tar = (int64_t)value; return op.SetAttr(name, tar); }) .def("set_attr_int32", - [](Operator &op, const std::string &name, - int32_t value) -> Operator & { + [](Operator &op, const char *name, int32_t value) -> Operator & { int32_t tar = (int32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_uint32", - [](Operator &op, const std::string &name, - uint32_t value) -> Operator & { + [](Operator &op, const char *name, uint32_t value) -> Operator & { uint32_t tar = (uint32_t)value; return op.SetAttr(name, tar); }) .def("set_attr_vec_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -368,7 +491,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_int32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -380,7 +503,7 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_vec_uint32", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { 
int len = value.size(); std::vector tar; @@ -392,21 +515,20 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_list_int64", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, std::initializer_list &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) .def("set_attr_attrvalue", - [](Operator &op, const std::string &name, AttrValue &attrValue) + [](Operator &op, const char *name, AttrValue &attrValue) -> Operator & { return op.SetAttr(name, std::move(attrValue)); }) - .def( - "set_attr_float", - [](Operator &op, const std::string &name, float value) -> Operator & { - float tar = static_cast(value); - return op.SetAttr(name, tar); - }) + .def("set_attr_float", + [](Operator &op, const char *name, float value) -> Operator & { + float tar = static_cast(value); + return op.SetAttr(name, tar); + }) .def("set_attr_vec_float", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -417,6 +539,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_string", + (Operator & (Operator::*)(const char *, const char *)) & + Operator::SetAttr) + .def("set_attr_vec_string", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_string", (Operator & (Operator::*)(const std::string &, const std::string &)) & Operator::SetAttr) @@ -424,15 +555,16 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_bool", - [](Operator &op, const std::string &name, bool value) -> Operator & { + [](Operator &op, const char *name, bool value) -> Operator & { if (value) return op.SetAttr(name, true); else return op.SetAttr(name, false); }) .def("set_attr_vec_bool", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -444,6 +576,15 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_tensor", + (Operator & (Operator::*)(const char *, const Tensor &)) & + Operator::SetAttr) + .def("set_attr_vec_tensor", + (Operator & + (Operator::*)(const char *, const std::vector &)) & + Operator::SetAttr) +#else .def("set_attr_tensor", (Operator & (Operator::*)(const std::string &, const Tensor &)) & Operator::SetAttr) @@ -451,8 +592,9 @@ void BindAscendGraph(py::module *m) { (Operator & (Operator::*)(const std::string &, const std::vector &)) & Operator::SetAttr) +#endif .def("set_attr_vec_uint8", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ -463,13 +605,21 @@ void BindAscendGraph(py::module *m) { } return op.SetAttr(name, tar); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_attr_vec_vec_int64", + (Operator & + (Operator::*)(const char *, + const std::vector> &)) & + Operator::SetAttr) +#else .def("set_attr_vec_vec_int64", (Operator & (Operator::*)(const std::string &, const std::vector> &)) & Operator::SetAttr) +#endif .def("set_attr_vec_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const std::vector &value) -> Operator & { int len = value.size(); std::vector tar; @@ 
-481,15 +631,13 @@ void BindAscendGraph(py::module *m) { return op.SetAttr(name, tar); }) .def("set_attr_dtype", - [](Operator &op, const std::string &name, + [](Operator &op, const char *name, const DataType &value) -> Operator & { ge::DataType tar = (ge::DataType)value; return op.SetAttr(name, tar); }) - .def("get_attr", - [](Operator &op, const std::string &name, - AttrType type) -> py::tuple { + [](Operator &op, const char *name, AttrType type) -> py::tuple { graphStatus res = -1; switch (type) { case AT_INT64: { @@ -538,12 +686,12 @@ void BindAscendGraph(py::module *m) { return py::make_tuple(o_av, res); } break; case AT_STRING: { - std::string s_av; + AscendString s_av; res = op.GetAttr(name, s_av); return py::make_tuple(s_av, res); } break; case AT_LIST_STRING: { - std::vector v_s_av; + std::vector v_s_av; res = op.GetAttr(name, v_s_av); return py::make_tuple(v_s_av, res); } break; @@ -594,11 +742,31 @@ void BindAscendGraph(py::module *m) { }) .def("break_connect", &Operator::BreakConnect) .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("get_subgraph_names", + static_cast &) const>(&Operator::GetSubgraphNames)) + .def("get_subgraph_builder", + static_cast(&Operator::GetSubgraphBuilder)) + .def("get_subgraph", + static_cast( + &Operator::GetSubgraph)) + .def("get_dynamic_subgraph_builder", + static_cast( + &Operator::GetDynamicSubgraphBuilder)) + .def("get_dynamic_subgraph", + static_cast(&Operator::GetDynamicSubgraph)); +#else + .def("get_subgraph_names_count", &Operator::GetSubgraphNamesCount) .def("get_subgraph_names", &Operator::GetSubgraphNames) .def("get_subgraph_builder", &Operator::GetSubgraphBuilder) .def("get_subgraph", &Operator::GetSubgraph) .def("get_dynamic_subgraph_builder", &Operator::GetDynamicSubgraphBuilder) .def("get_dynamic_subgraph", &Operator::GetDynamicSubgraph); +#endif py::class_(*m, "GETensor") .def(py::init<>()) @@ -613,10 +781,15 @@ void BindAscendGraph(py::module *m) { Tensor::SetData) .def("set_data", (graphStatus (Tensor::*)(const uint8_t *, size_t)) & Tensor::SetData) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_data", + (graphStatus (Tensor::*)(const char *)) & Tensor::SetData) +#else .def("set_data", (graphStatus (Tensor::*)(const std::string &)) & Tensor::SetData) +#endif .def("set_data", - (graphStatus (Tensor::*)(const std::vector &)) & + (graphStatus (Tensor::*)(const std::vector &)) & Tensor::SetData) .def("get_data", @@ -638,8 +811,8 @@ void BindAscendGraph(py::module *m) { .def(py::init(), py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def(py::init()) - .def("update", - (void (TensorDesc::*)(Shape, Format, DataType)) & TensorDesc::Update, + .def("update", (void (TensorDesc::*)(const Shape &, Format, DataType)) & + TensorDesc::Update, py::arg("shape"), py::arg("format") = FORMAT_ND, py::arg("dt") = DT_FLOAT) .def("set_shape", &TensorDesc::SetShape) @@ -660,8 +833,16 @@ void BindAscendGraph(py::module *m) { .def("get_origin_format", &TensorDesc::GetOriginFormat) .def("set_data_type", &TensorDesc::SetDataType) .def("get_data_type", &TensorDesc::GetDataType) +#ifdef PADDLE_WITH_ASCEND_STRING + .def("set_name", static_cast( + &TensorDesc::SetName)) + .def("get_name", + static_cast( + &TensorDesc::GetName)) +#else .def("set_name", &TensorDesc::SetName) .def("get_name", &TensorDesc::GetName) +#endif .def("set_size", &TensorDesc::SetSize) .def("get_size", &TensorDesc::GetSize) .def("set_real_dim_cnt", &TensorDesc::SetRealDimCnt) @@ -679,16 +860,27 @@ void 
BindAscendGraph(py::module *m) { py::class_(*m, "GEAttrValue").def(py::init<>()); py::class_(*m, "GEOperatorFactory") +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("create_operator", + static_cast( + &ge::OperatorFactory::CreateOperator)) +#else .def("create_operator", &OperatorFactory::CreateOperator) +#endif .def("get_ops_type_list", []() -> py::tuple { - std::vector all_ops; + std::vector all_ops; graphStatus status = OperatorFactory::GetOpsTypeList(all_ops); return py::make_tuple(all_ops, status); }) +#ifdef PADDLE_WITH_ASCEND_STRING + .def_static("is_exist_op", static_cast( + &OperatorFactory::IsExistOp)); +#else .def("is_exist_op", &OperatorFactory::IsExistOp); +#endif } -} // end namespace pybind -} // end namespace paddle +} // namespace pybind +} // namespace paddle #endif diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index 4af96d6ef4b..e999080544c 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -25,6 +25,7 @@ namespace pybind { void BindAscendGraph(py::module* m); void BindAscendWrapper(py::module* m); +void BindAscendDevice(py::module* m); } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 282b0e1d81c..2c1927f49f6 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -16,6 +16,9 @@ #include #include #include +#ifndef _WIN32 +#include +#endif #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" @@ -23,6 +26,9 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" +#ifdef PADDLE_WITH_ASCEND +#include "paddle/fluid/framework/fleet/ascend_wrapper.h" +#endif // NOTE(zhiqiu): Commonly, the inputs in auto-generated OP function are // determined by the OP`s proto automatically, i.e., all the inputs registered @@ -561,6 +567,11 @@ int main(int argc, char* argv[]) { return -1; } +#ifdef PADDLE_WITH_ASCEND + auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); + ascend_ptr->InitGEForUT(); +#endif + std::vector headers{"\"paddle/fluid/imperative/tracer.h\""}; std::ofstream out(argv[1], std::ios::out); @@ -590,5 +601,9 @@ int main(int argc, char* argv[]) { << "} // namespace paddle\n"; out.close(); + +#ifdef PADDLE_WITH_ASCEND + ge::GEFinalize(); +#endif return 0; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 29c7f00142d..5bf70d1126b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -143,6 +143,14 @@ bool IsCompiledWithROCM() { #endif } +bool IsCompiledWithAscend() { +#ifndef PADDLE_WITH_ASCEND + return false; +#else + return true; +#endif +} + bool IsCompiledWithXPU() { #ifndef PADDLE_WITH_XPU return false; @@ -1756,6 +1764,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_devices", []() { framework::InitDevices(); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); @@ -2885,6 +2894,7 @@ All parameter, weight, gradient are variables in Paddle. 
#ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); + BindAscendDevice(&m); #endif #ifdef PADDLE_WITH_CRYPTO BindCrypto(&m); diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index ddbf8cbbe3f..6d4aedddba6 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -37,6 +37,17 @@ init = fleet.init is_first_worker = fleet.is_first_worker worker_index = fleet.worker_index worker_num = fleet.worker_num +node_num = fleet.node_num +rank = fleet.worker_index +nranks = fleet.worker_num +world_size = fleet.worker_num +# device id in current trainer +local_device_ids = fleet.local_device_ids +# device ids in world +world_device_ids = fleet.world_device_ids +# rank in node +local_rank = fleet.local_rank +rank_in_node = local_rank is_worker = fleet.is_worker worker_endpoints = fleet.worker_endpoints server_num = fleet.server_num diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 470d1a2b78f..0a60cbf78d5 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -289,6 +289,18 @@ class Fleet(object): """ return self._role_maker._worker_num() + def node_num(self): + return self._role_maker._get_node_num() + + def local_rank(self): + return self._role_maker._get_local_rank() + + def local_device_ids(self): + return self._role_maker._get_local_device_ids() + + def world_device_ids(self): + return self._role_maker._get_world_device_ids() + def is_worker(self): """ Check whether the node is an instance of worker. diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index a8683aea97f..62c8faa0757 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -622,6 +622,29 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._generate_role() return self._nodes_num + def _get_node_num(self): + """ + return the training node number + """ + if not self._role_is_generated: + self._generate_role() + return self._nodes_num + + def _get_local_rank(self): + if not self._role_is_generated: + self._generate_role() + return self._local_rank + + def _get_local_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._local_device_ids + + def _get_world_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._world_device_ids + def _get_trainer_endpoints(self): """ get endpoint of all trainers @@ -782,6 +805,9 @@ class PaddleCloudRoleMaker(RoleMakerBase): self._trainers_num = len(self._worker_endpoints) self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) + self._local_rank = os.getenv("PADDLE_RANK_IN_NODE") + self._local_device_ids = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + self._world_device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 0f9b13d8a12..d6f4227a923 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -108,6 +108,21 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra "In gpu training, it should be less or equal to the gpus number of you system(or you set by --gpus). 
And so each process can" " bound to one or average number of gpus.") + base_group.add_argument( + "--run_mode", + type=str, + default="collective", + help="run mode of job, can be: collective/ps/ps-heter") + + base_group.add_argument( + "--ascend_npus", + type=str, + default=None, + help="It's for ascend npu training." + "For example:" + "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." + ) + if fluid.core.is_compiled_with_cuda(): base_group.add_argument( "--gpus", @@ -243,6 +258,9 @@ def launch_collective(args): log_dir=args.log_dir, envs=global_envs) + for idx, proc in enumerate(procs): + print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) + while True: alive = watch_local_trainers(procs, cluster.trainers_nranks()) @@ -276,6 +294,16 @@ def launch_ps(args, distribute_mode): def which_distributed_mode(args): + if args.run_mode is not None: + assert args.run_mode in ["collective", "ps", "ps-heter"] + + if args.run_mode == "collective": + return DistributeMode.COLLECTIVE + elif args.run_mode == "ps": + return DistributeMode.PS + elif args.run_mode == "ps-heter": + return DistributeMode.PS_HETER + ps_args = [ '--worker_num', '--server_num', '--heter_worker_num', '--servers', '--workers', '--heter_workers', '--http_port' ] @@ -298,24 +326,26 @@ def which_distributed_mode(args): ) if fluid.core.is_compiled_with_cuda(): - device_count = fluid.core.get_cuda_device_count() + accelerators = fluid.core.get_cuda_device_count() + elif fluid.core.is_compiled_with_ascend(): + accelerators = fluid.core.NPUDevice.get_device_count() elif fluid.core.is_compiled_with_xpu(): - device_count = fluid.core.get_xpu_device_count() + accelerators = fluid.core.get_xpu_device_count() else: - device_count = 0 + accelerators = 0 if len(has_ps_args) > 0: logger.info( - "Run parameter-sever mode. pserver arguments:{}, cuda or xpu count:{}". - format(has_ps_args, device_count)) + "Run parameter-server mode. pserver arguments:{}, accelerators count:{}". + format(has_ps_args, accelerators)) has_ps_heter_args = list(set(has_ps_args) & set(ps_heter_args)) if len(has_ps_heter_args) > 0: return DistributeMode.PS_HETER else: return DistributeMode.PS elif len(has_collective_args) > 0: - logger.info("Run collective gpu mode. gpu arguments:{}, cuda count:{}". - format(has_collective_args, device_count)) + logger.info("Run collective mode. gpu arguments:{}, accelerators count:{}".
+ format(has_collective_args, accelerators)) return DistributeMode.COLLECTIVE else: if not fluid.core.is_compiled_with_cuda( diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c5cb1ec94ac..2d2807bce28 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -52,6 +52,8 @@ class DeviceMode(): GPU = 1 KUNLUN = 2 XPU = 2 + ASCEND_NPU = 3 + UNKNOWN = 3 class Cluster(object): @@ -98,6 +100,14 @@ class Cluster(object): r.append(t.endpoint) return r + def world_device_ids(self): + r = [] + for pod in self.pods: + for t in pod.trainers: + str_accelerators = [str(acc) for acc in t.accelerators] + r.append(str_accelerators) + return r + def pods_endpoints(self): r = [] for pod in self.pods: @@ -105,7 +115,6 @@ class Cluster(object): assert pod.port != None and pod.addr != None, "{} not a valid endpoint".format( ep) r.append(ep) - return r def get_pod_by_id(self, pod_id): @@ -132,23 +141,23 @@ class JobServer(object): class Trainer(object): def __init__(self): - self.gpus = [] + self.accelerators = [] self.endpoint = None self.rank = None def __str__(self): - return "gpu:{} endpoint:{} rank:{}".format(self.gpus, self.endpoint, - self.rank) + return "accelerator:{} endpoint:{} rank:{}".format( + self.accelerators, self.endpoint, self.rank) def __eq__(self, t): - if len(self.gpus) != len(t.gpus): + if len(self.accelerators) != len(t.accelerators): return False if self.endpoint != t.endpoint or \ self.rank != t.rank: return False - for a, b in zip(self.gpus, t.gpus): + for a, b in zip(self.accelerators, t.accelerators): if a != b: return False @@ -171,12 +180,13 @@ class Pod(object): self.servers = [] self.workers = [] self.heter_workers = [] - self.gpus = [] + self.accelerators = [] + self.device_mode = None def __str__(self): - return "rank:{} id:{} addr:{} port:{} visible_gpu:{} trainers:{} servers:{} \ + return "rank:{} id:{} addr:{} port:{} visible_accelerator:{} trainers:{} servers:{} \ workers:{} heter_workers:{}".format( - self.rank, self.id, self.addr, self.port, self.gpus, [ + self.rank, self.id, self.addr, self.port, self.accelerators, [ str(t) for t in self.trainers ], [str(s) for s in self.servers], [str(w) for w in self.workers], [str(h) for h in self.heter_workers]) @@ -231,12 +241,12 @@ class Pod(object): def rank(self): return self.rank - def get_visible_gpus(self): + def get_visible_accelerators(self): r = "" - for g in self.gpus: + for g in self.accelerators: r += "{},".format(g) - assert r != "", "this pod {} can't see any gpus".format(self) + assert r != "", "this pod {} can't see any accelerators".format(self) r = r[:-1] return r @@ -264,23 +274,27 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, pod = Pod() pod.rank = node_rank pod.addr = ip + pod.device_mode = device_mode + cur_node_endpoints = trainer_endpoints[node_rank] # when use paddlecloud, endpoints may > devices_per_proc(user_defined) assert len(cur_node_endpoints) >= len( devices_per_proc - ), "current trainer_endpoints size should be greater equal than selected_gpus size." + ), "current trainer_endpoints size should be greater than or equal to accelerators size."
for i in range(len(devices_per_proc)): trainer = Trainer() - if device_mode == DeviceMode.GPU: + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU: if isinstance(devices_per_proc[i], (list, tuple)): - trainer.gpus.extend(devices_per_proc[i]) + trainer.accelerators.extend(devices_per_proc[i]) + pod.accelerators.extend(devices_per_proc[i]) else: - trainer.gpus.append(devices_per_proc[i]) + trainer.accelerators.append(devices_per_proc[i]) + pod.accelerators.append(devices_per_proc[i]) elif device_mode == DeviceMode.XPU: if isinstance(devices_per_proc[i], (list, tuple)): - trainer.gpus.extend(devices_per_proc[i]) + trainer.accelerators.extend(devices_per_proc[i]) else: - trainer.gpus.append(devices_per_proc[i]) + trainer.accelerators.append(devices_per_proc[i]) trainer.endpoint = "%s" % (cur_node_endpoints[i]) trainer.rank = trainer_rank trainer_rank += 1 @@ -451,21 +465,32 @@ def start_local_trainers(cluster, current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) + ids = cluster.world_device_ids() + res = [':'.join(ele) for ele in ids] procs = [] for idx, t in enumerate(pod.trainers): proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), - "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) + "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), + "PADDLE_RANK_IN_NODE": str(idx), + "PADDLE_LOCAL_DEVICE_IDS": + ",".join([str(acc) for acc in t.accelerators]), + "PADDLE_WORLD_DEVICE_IDS": ",".join(res), } - if fluid.core.is_compiled_with_cuda() and len(t.gpus) > 0: + if len(t.accelerators) > 0 and pod.device_mode == DeviceMode.GPU: proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( - [str(g) for g in t.gpus]) - elif fluid.core.is_compiled_with_xpu() and len(t.gpus) > 0: + [str(g) for g in t.accelerators]) + + if len(t.accelerators) > 0: + proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) + # to do: same code style in future + if fluid.core.is_compiled_with_xpu() and len(t.accelerators) > 0: proc_env["FLAGS_selected_xpus"] = "%s" % ",".join( - [str(g) for g in t.gpus]) + [str(g) for g in t.accelerators]) current_env.update(proc_env) @@ -564,6 +589,17 @@ def watch_local_trainers(procs, nranks): return alive +def get_ascend_npus(npus): + if npus is None: + count = fluid.core.NPUDevice.get_device_count() + if count <= 0: + return ret + ret = [x for x in range(count)] + else: + ret = [x.strip() for x in npus.split(',')] + return ret + + def get_gpus(gpus): if gpus is None: gpus_num = fluid.core.get_cuda_device_count() @@ -623,11 +659,17 @@ def get_xpus(xpus): def get_device_mode(): - if fluid.core.is_compiled_with_cuda() and fluid.core.get_cuda_device_count( - ) > 0: - print("launch train in GPU mode") + if fluid.core.is_compiled_with_ascend() and \ + fluid.core.NPUDevice.get_device_count() > 0: + print("launch train in ascend npu mode!") + return DeviceMode.ASCEND_NPU + + if fluid.core.is_compiled_with_cuda() and \ + fluid.core.get_cuda_device_count() > 0: + print("launch train in GPU mode!") return DeviceMode.GPU - elif fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( + + if fluid.core.is_compiled_with_xpu() and fluid.core.get_xpu_device_count( ) > 0: print("launch train in XPU mode") return DeviceMode.XPU @@ -654,6 +696,10 @@ def get_device_proc_info(args): ] else: devices_per_proc = gpus + elif device_mode == DeviceMode.ASCEND_NPU: + npus = 
get_ascend_npus(args.ascend_npus) + assert args.nproc_per_node is None, "ascend_npus doesn't need the nproc_per_node argument" + devices_per_proc = npus elif device_mode == DeviceMode.XPU: xpus = get_xpus(args.xpus) if args.nproc_per_node is not None: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py new file mode 100644 index 00000000000..b9a7651e449 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index d7ac81bb5c5..978899604ea 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -12,16 +12,26 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import paddle.fluid.framework as framework from paddle.fluid.optimizer import Optimizer import paddle.fluid.core as core import numpy as np -import ascend_parser +from .
import ascend_parser +from paddle.distributed import fleet +import hccl.manage.api as hccl +from collections import namedtuple + +HcomGroupConfig = namedtuple('HcomGroupConfig', ['name', 'nranks', 'rank_ids']) class AscendIRParser(object): - def __init__(self): + def __init__(self, auto_dp=False, world_rank_size=1): self.graph_idx = 0 + self.hcom_endpoints = {} + self.groups_to_create = [] + self._auto_dp = auto_dp + self._world_rank_size = world_rank_size def _construct_input_map(self, input_varlist): ret_map = {} @@ -43,15 +53,52 @@ class AscendIRParser(object): ret_map[var.name] = ge_input return ge_in_operator, ret_map + def _endpoint_to_world_rank_id(self, endpoint): + world_endpoints = fleet.worker_endpoints() + assert endpoint in world_endpoints, "endpoint (%s) not in worker_endpoints (%s) " % ( + endpoint, fleet.world_device_ids()) + return world_endpoints.index(endpoint) + def parse_op(self, op): - if op.type in ascend_parser.registerd_op: - print("Op[%s] has been registered, begin to parse it" % (op.type)) + if op.type == 'c_gen_nccl_id': + endpoint = op.attr("endpoint") + other_endpoints = op.attr("other_endpoints") + rank = op.attr("rank") + + nccl_id = op.output_arg_names[0] + + # c_gen_nccl_id operator splits endpoints into local endpoint and other_endpoints + # we should combine these together to produce world_rank_ids + self.hcom_endpoints[nccl_id] = other_endpoints[:] + self.hcom_endpoints[nccl_id].insert(rank, endpoint) + + print("nccl_id (%s) registered endpoints %s" % + (nccl_id, self.hcom_endpoints[nccl_id])) + elif op.type == 'c_comm_init': + nccl_id = op.input_arg_names[0] + nranks = op.attr("nranks") + assert nranks == len(self.hcom_endpoints[ + nccl_id]), "nranks doesn't match endpoint count" + rank = op.attr("rank") + ring_id = op.attr("ring_id") + + group_name = "hcom_group_" + str(ring_id) + global_rank_ids = [ + self._endpoint_to_world_rank_id(endpoint) + for endpoint in self.hcom_endpoints[nccl_id] + ] + self.groups_to_create.append( + HcomGroupConfig( + name=group_name, nranks=nranks, rank_ids=global_rank_ids)) + print("append to create group: %s, with rank_ids: %s" % + (group_name, global_rank_ids)) + elif op.type in ascend_parser.registerd_op: op_parser = self.parser_factory.create_parse( ascend_parser.registerd_op[op.type]) op_parser.apply(op) else: - print("Op[%s] has not been registered, so we have to skip it" % - (op.type)) + assert False, "Op[%s] has not been registered, so we have to skip it" % ( + op.type) def _parse_program(self, graph_name, @@ -84,7 +131,7 @@ class AscendIRParser(object): name = e.name ge_out_operator.append(self.var2geop[name]) - # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: + # (Debug) If you want to print back prop vars, append/assign the varname in ge_out_operator here, such as: # if graph_name == "main": # ge_out_operator.append(self.var2geop["reduce_sum_0.tmp_0@GRAD"]) @@ -115,6 +162,17 @@ class AscendIRParser(object): startup_graph = self._parse_program("startup", startup_program) main_graph = self._parse_program("main", main_program, input_varlist, fetch_list) + if self._auto_dp and self._world_rank_size > 1: + assert len(self.groups_to_create + ) == 0, "can't parse program under auto_dp mode" + + from paddle.distributed import fleet + self.groups_to_create.append( + HcomGroupConfig( + name="hcom_group_0", + nranks=fleet.world_size(), + rank_ids=[x for x in range(fleet.world_size())])) + return startup_graph, main_graph @@ -124,9 +182,14 @@ class 
AscendOptimizer(Optimizer): def __init__(self, optimizer, fetch_list=[]): self.inner_opt = optimizer self.fetch_list = fetch_list + self.ascend_instance = None def __del__(self): + print("begin AscendOptimizer del") + if self.ascend_instance is not None: + self.ascend_instance.destroy_global_resources() core.ge_finalize() + print("end AscendOptimizer del") def _can_apply(self): if not self.user_defined_strategy.ascend: @@ -138,7 +201,7 @@ class AscendOptimizer(Optimizer): dist_strategy.ascend = False dist_strategy.ascend_configs = {} - def _get_input_varlist(program): + def _get_input_varlist(self, program): ret_list = [] for var in program.list_vars(): if var.is_data or var.persistable: @@ -149,30 +212,56 @@ class AscendOptimizer(Optimizer): loss, startup_program=None, parameter_list=None, - no_grad_set=None): - minimized = self.inner_opt.minimize( - loss, startup_program=startup_program) + no_grad_set=None, + auto_dp=False, + rank_table_file=None): + minimized = None + if self.inner_opt: + minimized = self.inner_opt.minimize( + loss, startup_program=startup_program) self.ascend_instance = core.AscendInstance() + from paddle.distributed import fleet + if auto_dp and fleet.world_size() > 1: + from paddle.fluid.transpiler import ascend_transpiler + t = ascend_transpiler.AscendTranspiler(startup_program, + loss.block.program) + t.transpile() + #print(loss.block.program) + # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": "0", + "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", - "ge.exec.precision_mode": "must_keep_origin_dtype" + "ge.exec.precision_mode": "must_keep_origin_dtype", } + # if multi trainers + if rank_table_file and fleet.world_size() > 1: + config["ge.exec.rankTableFile"] = rank_table_file + config["ge.exec.rankId"] = str(fleet.worker_index()) + config["ge.exec.isUseHcom"] = "1" + config["ge.exec.deployMode"] = "0" + print("ge_initialize config:", config) core.ge_initialize(config) # Init Session self.ascend_instance.init_global_resources() main_block = loss.block - self.parser = AscendIRParser() + self.parser = AscendIRParser( + auto_dp=auto_dp, world_rank_size=fleet.world_size()) + + input_varlist = self._get_input_varlist(main_block.program) - input_varlist = _get_input_varlist(main_block.program) startup_graph, main_graph = self.parser.parse_program( startup_program, main_block.program, input_varlist, self.fetch_list) + for cfg in self.parser.groups_to_create: + print("create group (%s), nranks: %d, rank_ids: %s" % + (cfg.name, cfg.nranks, cfg.rank_ids)) + hccl.create_group(cfg.name, cfg.nranks, cfg.rank_ids) + self.ascend_instance.add_ascend_subgraph(0, startup_graph) self.ascend_instance.add_ascend_subgraph(1, main_graph) diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 2c5930c5b9f..f2ecaf48438 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1,41 +1,106 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - import paddle.fluid.framework as framework from paddle.fluid.optimizer import Optimizer import paddle.fluid.core as core import numpy as np - -registerd_op = { - "elementwise_add": "AddParser", - "matmul": "MatMulParser", - "mul": "MulParser", - "relu": "ReluParser", - "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser", - "shape": "ShapeParser", - "fill_constant": "FillConstantParser", - "reduce_sum": "ReduceSumParser", - "reduce_sum_grad": "ReduceSumGradParser", - "matmul_grad": "MatMulGradParser", - "mul_grad": "MulGradParser", - "reshape2": "ReshapeParser", - "scale": "ScaleParser", - "relu_grad": "ReluGradParser", - "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser", - "truncated_gaussian_random": "TruncatedNormalParser", - "sgd": "SGDParser" -} +from paddle.distributed import fleet +from functools import reduce + +registerd_op = {## forwards + "elementwise_add": "AddParser", + "matmul": "MatMulParser", + "mul": "MulParser", + "relu": "ReluParser", + "softmax_with_cross_entropy": "SoftmaxWithCrossEntropyParser", + "shape": "ShapeParser", + "fill_constant": "FillConstantParser", + "reduce_sum": "ReduceSumParser", + "elementwise_mul": "DotMulParser", + "elementwise_div": "DotDivParser", + "elementwise_pow": "DotPowParser", + "elementwise_max": "MaxParser", + "elementwise_min": "MinParser", + "elementwise_sub": "DotSubParser", + "pow": "PowParser", + "gelu": "GeluParser", + "sqrt": "SqrtParser", + "log": "LogParser", + "sum": "SumParser", + "logical_not": "LogicalNotParser", + "gather": "GatherParser", + "scatter": "ScatterParser", + "cast": "CastParser", + "tanh": "TanhParser", + "stack": "StackParser", + "square": "SquareParser", + "unsqueeze2": "UnSqueezeParser", + "assign": "AssignParser", + "softmax": "SoftMaxParser", + "reshape2": "ReshapeParser", + "transpose2": "TransposeParser", + "layer_norm": "LayerNormParser", + "less_than": "LessParser", + "mean": "MeanParser", + "scale": "ScaleParser", + "slice": "SliceParser", + "top_k": "TopkParser", + "accuracy": "AccuracyParser", + #"increment": "IncrementParser", + "lookup_table": "LookupTableParser", + "truncated_gaussian_random": "TruncatedNormalParser", + "c_allgather": "AllGatherParser", + "c_allreduce_sum": "AllReduceSumParser", + "c_allreduce_max": "AllReduceMaxParser", + "c_broadcast": "BroadcastParser", + "c_reduce_scatter": "ReduceScatterParser", + "c_send": "SendParser", + "c_receive": "ReceiveParser", + "uniform_random": "UniformRandomParser", + "range": "RangeParser", + "equal": "EqualParser", + "expand": "ExpandParser", + "squeeze2": "SqueezeParser", + + + ## backwords + "matmul_grad": "MatMulGradParser", + "mul_grad": "MulGradParser", + "relu_grad": "ReluGradParser", + "reduce_sum_grad": "ReduceSumGradParser", + "softmax_with_cross_entropy_grad": "SoftmaxWithCrossEntropyGradParser", + "tanh_grad":"TanhGradParser", + "log_grad":"LogGradParser", + "pow_grad": "PowGradParser", + "sqrt_grad": "SqrtGradParser", + "gelu_grad": "GeluGradParser", + "mean_grad": "MeanGradParser", + 'lookup_table_grad': "LookUpTableGradParser", + "elementwise_mul_grad": "DotMulGradParser", + 
"elementwise_add_grad": "DotAddGradParser", + "elementwise_div_grad": "DotDivGradParser", + "softmax_grad": "SoftmaxGradParser", + "slice_grad": "SliceGradParser", + "reshape2_grad": "ReshapeGradParser", + "gather_grad": "GatherGradParser", + "transpose2_grad": "TransposeGradParser", + "layer_norm_grad": "LayerNormGradParser", + + ## opt + "sgd": "SGDParser", + #"adam": "AdamParser", + } global_cnt = -1 global_input_cnt = -1 @@ -60,6 +125,7 @@ class AscendHelper(object): 5: "float32", 6: "float64" } + self.dtype2paddle_inv_map = {"VarType.FP32": 0, "VarType.FP16": 1} def dtype2ge(self, dtype): assert dtype in self.dtype2ge_map, "dtype[%d] is not supported %d" % ( @@ -105,7 +171,6 @@ class AscendParserBase(object): self.parser_name, len(index_list), output_num) for output_id in range(output_num): arguments = self.op.output(self.op.output_names[output_id]) - print("%d argument: %s" % (output_id, str(arguments))) if len(arguments) > 0: assert len(arguments) == len( index_list[output_id] @@ -113,8 +178,6 @@ class AscendParserBase(object): self.parser_name, output_id, len(index_list[output_id]), len(arguments)) for i in range(len(arguments)): - print("assgin index_list[%d][%d] to %s" % - (output_id, i, arguments[i])) self.var2geop[arguments[i]] = geop_list[index_list[ output_id][i]] @@ -125,7 +188,7 @@ class AscendParserBase(object): self.op = op assert self.op.type == self.parser_name, "op [%s] != parser_name[%s]" % ( self.op.type, self.parser_name) - print("begin to parse op %s" % (self.parser_name)) + #print("begin to parse op %s" % (self.parser_name)) geop_list, index_list = self._apply() self.update_output(geop_list, index_list) @@ -152,6 +215,63 @@ class AscendParserBase(object): tensor.set_data(data_8) return tensor + def _get_ge_tensor(self, shape, dtype, value_list): + tensor_desc = core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + self.ascend_helper.dtype2ge(dtype)) + tensor = core.GETensor(tensor_desc) + + data = np.array(value_list).reshape(shape).astype( + self.ascend_helper.dtype2np(dtype)) + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + + tensor_const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + + return tensor_const + + def _get_variable(self, shape, dtype, tensor): + if dtype == "int32": + type = core.GEDataType.DT_INT32 + elif dtype == "float32": + type = core.GEDataType.DT_FLOAT + + var = core.GEOperatorFactory.create_operator( + "variable" + self._accumulated_op_id(), "Variable") + var.update_output_desc("y", + core.GETensorDesc( + core.GEShape(shape), core.GEFormat.FORMAT_ND, + type)) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", tensor).set_input("ref", var) + + return assign + + def _create_shape_tensor(self): + tensor_desc = core.GETensorDesc( + core.GEShape([2]), core.GEFormat.FORMAT_ND, + core.GEDataType.DT_INT32) + tensor = core.GETensor(tensor_desc) + + data = np.ones((2)).astype("int32").reshape([2]) + data[0] = 64 + buf = data.tobytes() + data_8 = np.frombuffer(buf, dtype=np.uint8) + tensor.set_data(data_8) + return tensor + + def _get_GEtensor_shape(self, tensor): + tensor_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", tensor) + tensor_shape = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", 
tensor_shape).set_attr_int32("dst_type", 0) + return tensor_shape + class AddParser(AscendParserBase): def __init__(self, graph, var2geop): @@ -162,109 +282,276 @@ class AddParser(AscendParserBase): x = self._get_ge_input(self.op.input_arg_names[0]) y = self._get_ge_input(self.op.input_arg_names[1]) add = core.GEOperatorFactory.create_operator( - "add" + self._accumulated_op_id(), "Add").set_input( - "x1", x).set_input("x2", y) + "add" + self._accumulated_op_id(), + "Add").set_input("x1", x).set_input("x2", y) return [add], [[0]] -class ReduceSumParser(AscendParserBase): +class DotSubParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum" + super(DotSubParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_sub" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - axes = self.op.attr("dim") - keep_dims = self.op.attr("keep_dim") - reduce_sum = core.GEOperatorFactory.create_operator( - "reduce_sum" + self._accumulated_op_id(), "ReduceSumD").set_input( - "x", x, 0).set_attr_vec_int32("axes", axes).set_attr_bool( - "keep_dims", keep_dims) - return [reduce_sum], [[0]] + y = self._get_ge_input(self.op.input_arg_names[1]) + sub = core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), + "Sub").set_input("x1", x).set_input("x2", y) + return [sub], [[0]] -class ReduceSumGradParser(AscendParserBase): +class DotMulParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReduceSumGradParser, self).__init__(graph, var2geop) - self.parser_name = "reduce_sum_grad" + super(DotMulParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_mul" def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) - input = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[1]) + mul = core.GEOperatorFactory.create_operator( + "dotmul" + self._accumulated_op_id(), + "Mul").set_input("x1", x).set_input("x2", y) + return [mul], [[0]] - shape_tensor = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Shape").set_input("x", input, - 0) - axis_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", self._create_ge_tensor([1], 2, -1)) - self._mark_as_input(axis_const) - broadcast = core.GEOperatorFactory.create_operator( - "broadcast_to_d" + self._accumulated_op_id(), - "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) - # unsqueeze cannot get right result, but ExpandDims seems have the same functionality. 
- reduce_sum_grad = core.GEOperatorFactory.create_operator( - "expand" + self._accumulated_op_id(), "ExpandDims").set_input( - "x", broadcast).set_input("axis", axis_const) - return [shape_tensor, axis_const, broadcast, reduce_sum_grad], [[3]] +class DotDivParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotDivParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_div" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + div = core.GEOperatorFactory.create_operator( + "dotdiv" + self._accumulated_op_id(), + "Div").set_input("x1", x).set_input("x2", y) + return [div], [[0]] -class MatMulParser(AscendParserBase): +class DotPowParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MatMulParser, self).__init__(graph, var2geop) - self.parser_name = "matmul" + super(DotPowParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_pow" def _apply(self): - x1 = self._get_ge_input(self.op.input_arg_names[0]) - x2 = self._get_ge_input(self.op.input_arg_names[1]) - matmul = core.GEOperatorFactory.create_operator( - "matmul" + self._accumulated_op_id(), "MatMul").set_input( - "x1", x1).set_input("x2", x2) - return [matmul], [[0]] + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + pow = core.GEOperatorFactory.create_operator( + "dotpow" + self._accumulated_op_id(), + "Pow").set_input("x1", x1).set_input("x2", y) + return [pow], [[0]] -class MatMulGradParser(AscendParserBase): +class LessParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MatMulGradParser, self).__init__(graph, var2geop) - self.parser_name = "matmul_grad" + super(LessParser, self).__init__(graph, var2geop) + self.parser_name = "less_than" def _apply(self): - out_grad = self._get_ge_input(self.op.input_arg_names[0]) - x = self._get_ge_input(self.op.input_arg_names[1]) - y = self._get_ge_input(self.op.input_arg_names[2]) + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + less_than = core.GEOperatorFactory.create_operator( + "less_than" + self._accumulated_op_id(), + "Less").set_input("x1", x).set_input("x2", y) + return [less_than], [[0]] - x_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", out_grad).set_input("x2", y).set_attr_bool( - "transpose_x1", False).set_attr_bool("transpose_x2", True) - y_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", out_grad).set_attr_bool( - "transpose_x1", True).set_attr_bool("transpose_x2", False) - return [x_grad, y_grad], [[0], [1]] +class MaxParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MaxParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_max" -class MulGradParser(AscendParserBase): + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + max_out = core.GEOperatorFactory.create_operator( + "max" + self._accumulated_op_id(), + "Maximum").set_input("x1", x).set_input("x2", y) + return [max_out], [[0]] + + +class MinParser(AscendParserBase): def __init__(self, graph, var2geop): - super(MulGradParser, self).__init__(graph, var2geop) - self.parser_name = "mul_grad" + super(MinParser, self).__init__(graph, var2geop) + self.parser_name = 
"elementwise_min" def _apply(self): - out_grad = self._get_ge_input(self.op.input_arg_names[0]) - x = self._get_ge_input(self.op.input_arg_names[1]) - y = self._get_ge_input(self.op.input_arg_names[2]) + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + min_out = core.GEOperatorFactory.create_operator( + "min" + self._accumulated_op_id(), + "Minimum").set_input("x1", x).set_input("x2", y) + return [min_out], [[0]] - x_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", out_grad).set_input("x2", y).set_attr_bool( - "transpose_x1", False).set_attr_bool("transpose_x2", True) - y_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", out_grad).set_attr_bool( - "transpose_x1", True).set_attr_bool("transpose_x2", False) - return [x_grad, y_grad], [[0], [1]] +## cal +class LogParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogParser, self).__init__(graph, var2geop) + self.parser_name = "log" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + log = core.GEOperatorFactory.create_operator( + "log" + self._accumulated_op_id(), "Log").set_input("x", x) + return [log], [[0]] + + +class SqrtParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqrtParser, self).__init__(graph, var2geop) + self.parser_name = "sqrt" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sqrt = core.GEOperatorFactory.create_operator( + "sqrt" + self._accumulated_op_id(), "Sqrt").set_input("x", x) + return [sqrt], [[0]] + + +class PowParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(PowParser, self).__init__(graph, var2geop) + self.parser_name = "pow" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + factor = self.op.attr("factor") + pow_value = core.GEOperatorFactory.create_operator( + "pow" + self._accumulated_op_id(), + "Power").set_input("x", x).set_attr_float( + "power", factor).set_attr_float("scale", 1.0).set_attr_float( + "shift", 0.0) + return [pow_value], [[0]] + + +class SquareParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SquareParser, self).__init__(graph, var2geop) + self.parser_name = "square" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + square = core.GEOperatorFactory.create_operator( + "square" + self._accumulated_op_id(), "Square").set_input("x", x) + return [square], [[0]] + + +class SumParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SumParser, self).__init__(graph, var2geop) + self.parser_name = "sum" + + def _apply(self): + len_list = len(self.op.input_arg_names) + if len_list < 2: + assert False, "the size of input list must large or equal 2" + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + sum = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), + "Add").set_input("x1", x).set_input("x2", y) + for i in range(2, len_list): + y = self._get_ge_input(self.op.input_arg_names[i]) + sum = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), + "Add").set_input("x1", sum).set_input("x2", y) + return [sum], [[0]] + + +class LogicalNotParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogicalNotParser, self).__init__(graph, var2geop) + 
self.parser_name = "logical_not" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + logical_not = core.GEOperatorFactory.create_operator( + "logical_not" + self._accumulated_op_id(), + "LogicalNot").set_input("x", x) + return [logical_not], [[0]] + + +class MeanParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MeanParser, self).__init__(graph, var2geop) + self.parser_name = "mean" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + mean = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), + "ReduceMeanD").set_input("x", x).set_attr_bool( + "keep_dims", False).set_attr_vec_int32("axes", []) + return [mean], [[0]] + + +class ReduceSumParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("dim") + keep_dims = self.op.attr("keep_dim") + reduce_all = self.op.attr("reduce_all") + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + if reduce_all: + axes = list(range(len(x_shape))) + reduce_sum = core.GEOperatorFactory.create_operator( + "reduce_sum" + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x, 0).set_attr_vec_int32( + "axes", axes).set_attr_bool("keep_dims", keep_dims) + return [reduce_sum], [[0]] + + +#class IncrementParser(AscendParserBase): +# def __init__(self, graph, var2geop): +# super(IncrementParser, self).__init__(graph, var2geop) +# self.parser_name = "increment" +# +# def _apply(self): +# x = self._get_ge_input(self.op.input_arg_names[0]) +# step = self.op.attr("step") #self._get_ge_input(self.op.input_arg_names[1]) +# print("step: ", step) +# +# increment = core.GEOperatorFactory.create_operator("adds" + self._accumulated_op_id(), "Adds").set_input("x", x).set_attr_float("value", step) #set_input("x2", bias) +# +# return [increment] + + +## matrix cal +class MatMulParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulParser, self).__init__(graph, var2geop) + self.parser_name = "matmul" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + y = self._get_ge_input(self.op.input_arg_names[1]) + transpose_x = self.op.attr("transpose_X") + transpose_y = self.op.attr("transpose_Y") + + x1_shape = self.op.block.var(self.op.input_arg_names[0]).shape + x2_shape = self.op.block.var(self.op.input_arg_names[1]).shape + + if len(x1_shape) > 2: + matmul = core.GEOperatorFactory.create_operator( + "matmul" + self._accumulated_op_id(), "BatchMatMul").set_input( + "x1", x).set_input("x2", y).set_attr_bool( + "adj_x1", + transpose_x).set_attr_bool("adj_x2", transpose_y) + elif len(x1_shape) == 2: + matmul = core.GEOperatorFactory.create_operator( + "matmul" + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input("x2", y).set_attr_bool( + "transpose_x1", transpose_x).set_attr_bool("transpose_x2", + transpose_y) + else: + assert False, "not support" + return [matmul], [[0]] class MulParser(AscendParserBase): @@ -275,13 +562,105 @@ class MulParser(AscendParserBase): def _apply(self): x = self._get_ge_input(self.op.input_arg_names[0]) y = self._get_ge_input(self.op.input_arg_names[1]) + x_num_col_dims = self.op.attr("x_num_col_dims") + y_num_col_dims = self.op.attr("y_num_col_dims") + shape_x1 = self.op.block.var(self.op.input_arg_names[0]).shape + shape_x2 = self.op.block.var(self.op.input_arg_names[1]).shape + + if 
x_num_col_dims == 1 and y_num_col_dims == 1: + if len(shape_x1) == 2 and len(shape_x2) == 2: + matmul = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input("x2", y) + elif len(shape_x1) == 3 and len(shape_x2) == 2: + flatten_x1 = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "Flatten").set_input("x", x) + matmul = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", flatten_x1, 0).set_input("x2", y, 0) + else: + assert False, "not support" + else: + if len(shape_x1) == 3 and len(shape_x2) == 2: + assert x_num_col_dims == 2, "only support 2" + flatten_x1 = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", x).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + matmul_m = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "MatMul").set_input( + "x1", flatten_x1, 0).set_input("x2", y, 0) + matmul_transpose = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), + "TransposeD").set_input( + "x", matmul_m).set_attr_vec_int32("perm", [1, 0]) + tensor = self._create_ge_tensor( + [3], 2, [shape_x2[1], shape_x1[0], shape_x1[1]]) + const_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + reshape_matmul = core.GEOperatorFactory.create_operator( + "reshape" + self._accumulated_op_id(), "Reshape").set_input( + "x", matmul_transpose).set_input( + "shape", const_shape).set_attr_int32("axis", 0) + matmul = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), + "TransposeD").set_input( + "x", + reshape_matmul).set_attr_vec_int32("perm", [1, 2, 0]) + else: + assert False, "not support" - matmul = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "MatMul").set_input( - "x1", x).set_input("x2", y) return [matmul], [[0]] +class LayerNormParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LayerNormParser, self).__init__(graph, var2geop) + self.parser_name = "layer_norm" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[2]) + scale = self._get_ge_input(self.op.input_arg_names[1]) + bias = self._get_ge_input(self.op.input_arg_names[0]) + epsilon = self.op.attr("epsilon") + begin_norm_axis = self.op.attr("begin_norm_axis") + x_dtype = self.op.block.var(self.op.input_arg_names[2]).dtype + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + scale_expand = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", + scale).set_input("shape", shape_tensor) + bias_expand = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", bias).set_input("shape", shape_tensor) + layer_norm = core.GEOperatorFactory.create_operator( + "layer_norm" + self._accumulated_op_id(), + "LayerNorm").set_input("x", x).set_input( + "gamma", + scale_expand).set_input("beta", bias_expand).set_attr_int32( + "begin_norm_axis", begin_norm_axis).set_attr_int32( + "begin_params_axis", + begin_norm_axis).set_attr_float("epsilon", epsilon) + + cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str( + x_dtype)] == 0 else 1 + y = core.GEOperatorFactory.create_operator( + "cast" + 
self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 0).set_attr_int32("dst_type", cast_dtype) + mean = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 1).set_attr_int32("dst_type", cast_dtype) + variance = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", layer_norm, 2).set_attr_int32("dst_type", cast_dtype) + return [y, mean, variance], [[1], [2], [0]] + + +## activate function class ReluParser(AscendParserBase): def __init__(self, graph, var2geop): super(ReluParser, self).__init__(graph, var2geop) @@ -294,20 +673,31 @@ class ReluParser(AscendParserBase): return [relu], [[0]] -class ReluGradParser(AscendParserBase): +class GeluParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ReluGradParser, self).__init__(graph, var2geop) - self.parser_name = "relu_grad" + super(GeluParser, self).__init__(graph, var2geop) + self.parser_name = "gelu" def _apply(self): - out = self._get_ge_input(self.op.input_arg_names[0]) - out_grad = self._get_ge_input(self.op.input_arg_names[1]) - relu_grad = core.GEOperatorFactory.create_operator( - self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input( - "gradients", out_grad).set_input("features", out) - return [relu_grad], [[0]] + x = self._get_ge_input(self.op.input_arg_names[0]) + gelu = core.GEOperatorFactory.create_operator( + "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x) + return [gelu], [[0]] + + +class TanhParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TanhParser, self).__init__(graph, var2geop) + self.parser_name = "tanh" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + tanh = core.GEOperatorFactory.create_operator( + "tanh" + self._accumulated_op_id(), "Tanh").set_input("x", x) + return [tanh], [[0]] +## loss function class SoftmaxWithCrossEntropyParser(AscendParserBase): def __init__(self, graph, var2geop): super(SoftmaxWithCrossEntropyParser, self).__init__(graph, var2geop) @@ -316,80 +706,61 @@ class SoftmaxWithCrossEntropyParser(AscendParserBase): def _apply(self): label = self._get_ge_input(self.op.input_arg_names[0]) logits = self._get_ge_input(self.op.input_arg_names[1]) - cls_num = self.op.block.var(self.op.input_arg_names[1]).shape[1] + softmax = core.GEOperatorFactory.create_operator( - "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input( - "x", logits) + "softmax" + self._accumulated_op_id(), + "SoftmaxV2").set_input("x", logits) label = core.GEOperatorFactory.create_operator( "cast" + self._accumulated_op_id(), "Cast").set_input( "x", label).set_attr_int32("dst_type", 3) tensoron = self._create_ge_tensor([1], 5, 1) - on_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoron) - self._mark_as_input(on_const) + on = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) tensoroff = self._create_ge_tensor([1], 5, 0) - off_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoroff) - self._mark_as_input(off_const) + off = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoroff) + self._mark_as_input(on) + self._mark_as_input(off) onehot = core.GEOperatorFactory.create_operator( "onehot" + self._accumulated_op_id(), 
"OneHotD").set_input( - "x", label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) + "x", label).set_input("on_value", on).set_input( + "off_value", off).set_attr_int32("depth", cls_num) squeeze = core.GEOperatorFactory.create_operator( "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - loss = core.GEOperatorFactory.create_operator( + + loss_all = core.GEOperatorFactory.create_operator( "loss" + self._accumulated_op_id(), "SoftmaxCrossEntropyWithLogits").set_input( "features", logits).set_input("labels", squeeze) - - return [label, softmax, on_const, off_const, onehot, squeeze, - loss], [[6], [1]] + loss = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", loss_all, 0).set_attr_int32("dst_type", 0) + loss_expand = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", loss).set_attr_vec_int32("axes", [1]) + return [label, softmax, loss_expand], [[2], [1]] -class SoftmaxWithCrossEntropyGradParser(AscendParserBase): +class SoftMaxParser(AscendParserBase): def __init__(self, graph, var2geop): - super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop) - self.parser_name = "softmax_with_cross_entropy_grad" + super(SoftMaxParser, self).__init__(graph, var2geop) + self.parser_name = "softmax" def _apply(self): - label = self._get_ge_input(self.op.input_arg_names[0]) - loss_grad = self._get_ge_input(self.op.input_arg_names[1]) - softmax = self._get_ge_input(self.op.input_arg_names[2]) - cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1] + logits = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axis") - tensoron = self._create_ge_tensor([1], 5, 1) - on_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoron) - self._mark_as_input(on_const) - tensoroff = self._create_ge_tensor([1], 5, 0) - off_const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensoroff) - self._mark_as_input(off_const) - label = core.GEOperatorFactory.create_operator( - "cast" + self._accumulated_op_id(), "Cast").set_input( - "x", label).set_attr_int32("dst_type", 3) - onehot = core.GEOperatorFactory.create_operator( - "onehot" + self._accumulated_op_id(), "OneHotD").set_input( - "x", label).set_input("on_value", on_const).set_input( - "off_value", off_const).set_attr_int32("depth", cls_num) - # the fuck onehot will add a demension, so must call squeeze afterward - squeeze = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Squeeze").set_input("x", onehot) - sub = core.GEOperatorFactory.create_operator( - "sub" + self._accumulated_op_id(), "Sub").set_input( - "x1", softmax).set_input("x2", squeeze) - grad = core.GEOperatorFactory.create_operator( - "mul" + self._accumulated_op_id(), "Mul").set_input( - "x1", loss_grad).set_input("x2", sub) - return [on_const, off_const, label, onehot, squeeze, sub, grad], [[-1]] + softmax = core.GEOperatorFactory.create_operator( + "softmax" + self._accumulated_op_id(), "SoftmaxV2").set_input( + "x", logits).set_attr_vec_int32("axes", [axes]) + return [softmax], [[0]] +## general class ShapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ShapeParser, self).__init__(graph, var2geop) @@ -411,16 +782,15 @@ class FillConstantParser(AscendParserBase): shape = 
self.op.attr("shape") dtype = self.op.attr("dtype") value = self.op.attr("value") - print("shape: ", shape) - print("dtype: ", dtype) - print("value: ", value) + tensor = self._create_ge_tensor(shape, dtype, value) const = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) self._mark_as_input(const) if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s fill_constant" % (self.op.output('Out')[0])) + #print("%s is Persistable in fill_constant" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -432,26 +802,7 @@ class FillConstantParser(AscendParserBase): "assign" + self._accumulated_op_id(), "Assign").set_input( "value", const).set_input("ref", var) return [const], [[0]] - else: - print( - "self.op.output('Out')[0] is not persistable in fill_constant") - return [const], [[0]] - - -class SGDParser(AscendParserBase): - def __init__(self, graph, var2geop): - super(SGDParser, self).__init__(graph, var2geop) - self.parser_name = "sgd" - - def _apply(self): - grad = self._get_ge_input(self.op.input_arg_names[0]) - lr = self._get_ge_input(self.op.input_arg_names[1]) - param = self._get_ge_input(self.op.input_arg_names[2]) - sgd = core.GEOperatorFactory.create_operator( - "momentum" + self._accumulated_op_id(), - "ApplyGradientDescent").set_input("var", param).set_input( - "alpha", lr).set_input("delta", grad) - return [sgd], [[0]] + return [const], [[0]] class TruncatedNormalParser(AscendParserBase): @@ -465,30 +816,27 @@ class TruncatedNormalParser(AscendParserBase): mean = self.op.attr("mean") std = self.op.attr("std") seed = self.op.attr("seed") + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) shape_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor1) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor1) tensor2 = self._create_ge_tensor([1], dtype, mean) mean_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor2) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor2) tensor3 = self._create_ge_tensor([1], dtype, std) std_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor3) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor3) tensor4 = self._create_ge_tensor([1], dtype, mean - 2 * std) min_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor4) - + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor4) tensor5 = self._create_ge_tensor([1], dtype, mean + 2 * std) max_tensor = core.GEOperatorFactory.create_operator( - "const" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor5) + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor5) self._mark_as_input(shape_tensor) self._mark_as_input(mean_tensor) @@ -507,9 +855,8 @@ class TruncatedNormalParser(AscendParserBase): ## wirte the output of truncatedNormal from startup_program to main_program if self.op.block.var(self.op.output('Out')[0]).persistable: - print("%s is Persistable in 
truncated_normal" % - (self.op.output('Out')[0])) - #var = core.GEOperatorFactory.create_operator(self.op.output('Out')[0], "Variable").set_input("x", truncated_normal) + #print("%s is Persistable in truncated_normal" % + # (self.op.output('Out')[0])) var = core.GEOperatorFactory.create_operator( self.op.output('Out')[0], "Variable") var.update_output_desc("y", @@ -524,66 +871,1313 @@ class TruncatedNormalParser(AscendParserBase): shape_tensor, mean_tensor, std_tensor, min_tensor, max_tensor, truncated_normal ], [[-1]] - else: - print( - "self.op.output('Out')[0] is not persistable in truncated_noraml" - ) - return [truncated_normal], [[0]] #[assign] + #else: + # print( + # "self.op.output('Out')[0] is not persistable in truncated_noraml" + # ) + return [truncated_normal], [[0]] -class ScaleParser(AscendParserBase): +class GatherParser(AscendParserBase): def __init__(self, graph, var2geop): - super(ScaleParser, self).__init__(graph, var2geop) - self.parser_name = "scale" + super(GatherParser, self).__init__(graph, var2geop) + self.parser_name = "gather" def _apply(self): - x = self._get_ge_input(self.op.input_arg_names[0]) - scale = self.op.attr( - "scale") #self.get_ge_input(self.op.input_arg_names[1]) - bias = self.op.attr("bias") - bias_after_scale = self.op.attr("bias_after_scale") - if bias_after_scale: - scale_value = core.GEOperatorFactory.create_operator( - "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x).set_attr_float("power", 1.0).set_attr_float( + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + clo = self.op.block.var(self.op.input_arg_names[1]).shape[-1] + + gather = core.GEOperatorFactory.create_operator( + "gather" + self._accumulated_op_id(), "Gather").set_input( + "x", x).set_input("indices", index).set_attr_bool( + "validate_indices", True) + return [gather], [[0]] + + +class ScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScatterParser, self).__init__(graph, var2geop) + self.parser_name = "scatter" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + updates = self._get_ge_input(self.op.input_arg_names[2]) + overwrite = self.op.attr("overwrite") + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self.getid(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + if not overwrite: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updatesi_var) + else: + scatter_value = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input( + "x", x_var).set_input("indices", index_var).set_input( + "updates", updates_var) + return [x_var, index_var, updates_var, scatter_value], [[-1]] + + +class CastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(CastParser, self).__init__(graph, var2geop) + self.parser_name = "cast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + dtype = self.op.attr("out_dtype") + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x).set_attr_int32("dst_type", dtype) + return [cast], [[0]] + + +class AssignParser(AscendParserBase): 
+ def __init__(self, graph, var2geop): + super(AssignParser, self).__init__(graph, var2geop) + self.parser_name = "assign" + + def _apply(self): + const = self._get_ge_input(self.op.input_arg_names[0]) + var = self._get_ge_input(self.op.input_arg_names[1]) + assign = core.GEOperatorFactory.create_operator( + "assign" + self._accumulated_op_id(), "Assign").set_input( + "value", const).set_input("ref", var) + return [assign], [[0]] + + +class ScaleParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ScaleParser, self).__init__(graph, var2geop) + self.parser_name = "scale" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + scale = self.op.attr("scale") + bias = self.op.attr("bias") + bias_after_scale = self.op.attr("bias_after_scale") + + if bias_after_scale: + scale_value = core.GEOperatorFactory.create_operator( + "scale" + self._accumulated_op_id(), "Power").set_input( + "x", x).set_attr_float("power", 1.0).set_attr_float( "scale", scale).set_attr_float("shift", bias) else: x_add_bias = core.GEOperatorFactory.create_operator( "adds" + self._accumulated_op_id(), "Adds").set_input( - "x", x).set_attr_float("value", - bias) #set_input("x2", bias) + "x", x).set_attr_float("value", bias) scale_value = core.GEOperatorFactory.create_operator( "scale" + self._accumulated_op_id(), "Power").set_input( - "x", x_add_bias).set_attr_float( - "power", 1.0).set_attr_float( - "scale", scale).set_attr_float("shift", 0.0) - #tensor_zeros = core.GEOperatorFactory.create_operator("zeroslike" + self.getid(), "ZerosLike").set_input("x", x) - #bias_ = self.create_ge_tensor([1], 5, bias) - #const_bias = core.GEOperatorFactory.create_operator("const" + self.getid(), "Const").set_attr_tensor("value", tensor_bias) + "x", + x_add_bias).set_attr_float("power", 1.0).set_attr_float( + "scale", scale).set_attr_float("shift", 0.0) return [scale_value], [[0]] +class SliceParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SliceParser, self).__init__(graph, var2geop) + self.parser_name = "slice" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + starts = self.op.attr("starts") + ends = self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(ends[cnt]) + else: + ends_cor.append(x_shape[i]) + if i in axes: + cnt += 1 + size = [ends_cor[i] - starts_cor[i] for i in range(len(axes_cor))] + + assert len(axes_cor) == len(starts_cor) == len( + ends_cor), "the three fields must have same size" + slice_value = core.GEOperatorFactory.create_operator( + "slice" + self._accumulated_op_id(), "SliceD").set_input( + "x", x).set_attr_vec_int32( + "offsets", starts_cor).set_attr_vec_int32("size", size) + + return [slice_value], [[0]] + + class ReshapeParser(AscendParserBase): def __init__(self, graph, var2geop): super(ReshapeParser, self).__init__(graph, var2geop) self.parser_name = "reshape2" def _apply(self): - print("swbuf:", self.op.input_arg_names) + org_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert org_shape.count(-1) == 0, "do not allow the dim is -1" shape = self.op.attr("shape") - axis = 0 - if shape[0] == -1: - axis = 1 - shape = shape[1:] - print("shape: ", shape) - data_x1_shape = 
self._get_ge_input(self.op.input_arg_names[0]) + for cnt in range(len(shape)): + if shape[cnt] == 0: + shape[cnt] = org_shape[cnt] + + if -1 in shape: + assert shape.count(-1) == 1, "only allow one dim is -1" + mul_res_org = reduce(lambda x, y: x * y, org_shape) + mul_res_refine = reduce(lambda x, y: x * y, shape) * -1 + idx = shape.index(-1) + shape[idx] = mul_res_org // mul_res_refine + + x = self._get_ge_input(self.op.input_arg_names[0]) tensor = self._create_ge_tensor([len(shape)], 2, shape) const_shape = core.GEOperatorFactory.create_operator( - "shape" + self._accumulated_op_id(), "Const").set_attr_tensor( - "value", tensor) + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) reshape = core.GEOperatorFactory.create_operator( "reshape" + self._accumulated_op_id(), "Reshape").set_input( - "x", data_x1_shape).set_input( - "shape", const_shape).set_attr_int32("axis", axis) + "x", + x).set_input("shape", const_shape).set_attr_int32("axis", 0) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, reshape], [[1], [0]] + + +class TransposeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeParser, self).__init__(graph, var2geop) + self.parser_name = "transpose2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + perm = self.op.attr("axis") + transpose = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", x).set_attr_vec_int32("perm", perm) + x_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", x) + + return [x_shape, transpose], [[1], [0]] + + +class AccuracyParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AccuracyParser, self).__init__(graph, var2geop) + self.parser_name = "accuracy" + + def _apply(self): + pred = self._get_ge_input(self.op.input_arg_names[0]) + label = self._get_ge_input(self.op.input_arg_names[1]) + logits = self._get_ge_input(self.op.input_arg_names[2]) + + pred = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", pred).set_attr_int32("dst_type", 3) + label = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", label).set_attr_int32("dst_type", 3) + equal = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", pred).set_input("x2", label) + cast = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", equal).set_attr_int32("dst_type", 0) + acc = core.GEOperatorFactory.create_operator( + "mean" + self._accumulated_op_id(), "ReduceMeanD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + correct = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", cast).set_attr_bool("keep_dims", False).set_attr_vec_int32( + "axes", []) + ones_tensor = core.GEOperatorFactory.create_operator( + "oneslike" + self._accumulated_op_id(), + "OnesLike").set_input("x", label) + ones_tensor = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", ones_tensor).set_attr_int32("dst_type", 0) + total = core.GEOperatorFactory.create_operator( + "sum" + self._accumulated_op_id(), "ReduceSumD").set_input( + "x", ones_tensor).set_attr_bool( + 
"keep_dims", False).set_attr_vec_int32("axes", []) + + return [acc, correct, total], [[0], [1], [2]] + + +class TopkParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TopkParser, self).__init__(graph, var2geop) + self.parser_name = "top_k" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + k = self.op.attr("k") + + tensor = self._create_ge_tensor([1], 2, k) + const_k = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + cast_x = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), + "Cast").set_input("x", x).set_attr_int32("dst_type", 1) + topk = core.GEOperatorFactory.create_operator( + "topk" + self._accumulated_op_id(), + "TopK").set_input("x", cast_x).set_input("k", const_k) + value = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 0).set_attr_int32("dst_type", 0) + index = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", topk, 1).set_attr_int32("dst_type", 0) + return [value, index], [[1], [0]] + + +class LookupTableParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookupTableParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + w = self._get_ge_input(self.op.input_arg_names[1]) + + ids_squeeze = core.GEOperatorFactory.create_operator( + "squeeze" + self._accumulated_op_id(), "Squeeze").set_input( + "x", ids).set_attr_vec_int32("axes", [-1]) + out = core.GEOperatorFactory.create_operator( + "lookup" + self._accumulated_op_id(), "Gather").set_input( + "x", w).set_input("indices", ids_squeeze) + return [out], [[0]] + + +class StackParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(StackParser, self).__init__(graph, var2geop) + self.parser_name = "stack" + + def _apply(self): + tiles = len(self.op.input_arg_names) + data_x_lst = [] + for index in range(tiles): + data_x_lst.append( + self._get_ge_input(self.op.input_arg_names[index])) + axis = self.op.attr("axis") + + data_x = data_x_lst[0] + tensor = self._create_ge_tensor([1], 2, axis) + tensor_axis = core.GEOperatorFactory.create_operator( + "axis" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + expand = core.GEOperatorFactory.create_operator( + "expand" + self._accumulated_op_id(), + "ExpandDims").set_input("x", data_x).set_input("axis", tensor_axis) + + stack = core.GEOperatorFactory.create_operator( + "stack" + self._accumulated_op_id(), + "TileWithAxis").set_input("x", expand).set_attr_int32( + "axis", axis).set_attr_int32("tiles", tiles) + + return [stack], [[0]] + + +class UnSqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(UnSqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "unsqueeze2" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr('axes') + + output = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", x).set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), "Shape").set_input("x", output) + return [shape, output], [[1], [0]] + + +## parallel +class AllGatherParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AllGatherParser, self).__init__(graph, 
var2geop) + self.parser_name = "c_allgather" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + rank_size = self.op.attr("rank_size") + group = self.op.attr("group") + + allgather = core.GEOperatorFactory.create_operator( + "allgather" + self._accumulated_op_id(), "HcomAllGather").set_input( + "x", x).set_attr_int32( + "rank_size", rank_size).set_attr_string("group", group) + return [allgather], [[0]] + + +class AllReduceParser(AscendParserBase): + def __init__(self, graph, var2geop, reduction): + super(AllReduceParser, self).__init__(graph, var2geop) + self.parser_name = "c_allreduce_" + reduction + self.reduction = reduction + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.reduction + ring_id = self.op.attr("ring_id") + group = "hcom_group_" + str(ring_id) + fusion = None #self.op.attr("fusion") + fusion_id = None #self.op.attr("fusion_id") + + allreduce = core.GEOperatorFactory.create_operator( + "allreduce" + self._accumulated_op_id(), "HcomAllReduce").set_input( + "x", x).set_attr_string( + "reduction", reduction).set_attr_string("group", group) + if fusion is not None: + allreduce.set_attr_int32("fusion", fusion) + + if fusion_id is not None: + allreduce.set_attr_int32("fusion_id", fusion_id) + return [allreduce], [[0]] + + +class AllReduceSumParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceSumParser, self).__init__(graph, var2geop, 'sum') + + +class AllReduceMaxParser(AllReduceParser): + def __init__(self, graph, var2geop): + super(AllReduceMaxParser, self).__init__(graph, var2geop, 'max') + + +class BroadcastParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(BroadcastParser, self).__init__(graph, var2geop) + self.parser_name = "c_broadcast" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + root_rank = self.op.attr("root_rank") + group = self.op.attr("group") + + broadcast = core.GEOperatorFactory.create_operator( + "broadcast" + self._accumulated_op_id(), "HcomBroadcast").set_input( + "x", x).set_attr_int32( + "root_rank", root_rank).set_attr_string("group", group) + return [broadcast], [[0]] + + +class ReduceScatterParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceScatterParser, self).__init__(graph, var2geop) + self.parser_name = "c_reduce_scatter" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + reduction = self.op.attr("reduction") + group = self.op.attr("group") + rank_size = self.op.attr("rank_size") + + reduce_scatter = core.GEOperatorFactory.create_operator( + "reducescatter" + self._accumulated_op_id(), + "HcomReduceScatter").set_input("x", x).set_attr_string( + "reduction", reduction).set_attr_string( + "group", group).set_attr_int32("rank_size", rank_size) + return [reduce_scatter], [[0]] + + +class SendParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SendParser, self).__init__(graph, var2geop) + self.parser_name = "c_send" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + dest_rank = self.op.attr("dest_rank") + group = self.op.attr("group") + + send = core.GEOperatorFactory.create_operator( + "send" + self._accumulated_op_id(), "HcomSend").set_input( + "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32( + "dest_rank", dest_rank).set_attr_string("group", group) + return [send], [[0]] + + +class ReceiveParser(AscendParserBase): + def __init__(self, graph, var2geop): + 
super(ReceiveParser, self).__init__(graph, var2geop) + self.parser_name = "c_receive" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + sr_tag = self.op.attr("sr_tag") + src_rank = self.op.attr("src_rank") + group = self.op.attr("group") + shape = self.op.attr("shape") + dtype = self.op.attr("dtype") + + receive = core.GEOperatorFactory.create_operator( + "receive" + self._accumulated_op_id(), "HcomReceive").set_input( + "x", x).set_attr_int32("sr_tag", sr_tag).set_attr_int32( + "src_rank", src_rank).set_attr_string( + "group", group).set_attr_vec_int32( + "shape", shape).set_attr_int32("dtype", dtype) + return [receive], [[0]] + + +class RangeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(RangeParser, self).__init__(graph, var2geop) + self.parser_name = "range" + + def _apply(self): + # TODO not support range type yet + start = self._get_ge_input(self.op.input_arg_names[0]) + end = self._get_ge_input(self.op.input_arg_names[1]) + delta = self._get_ge_input(self.op.input_arg_names[2]) + + ge_range = core.GEOperatorFactory.create_operator( + "range" + self._accumulated_op_id(), "Range")\ + .set_input("start", end)\ + .set_input("limit", start) \ + .set_input("delta", delta) + + return [ge_range], [[0]] + + +class UniformRandomParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(UniformRandomParser, self).__init__(graph, var2geop) + self.parser_name = "uniform_random" + + def _apply(self): + shape = self.op.attr("shape") + + min_v = self.op.attr("min") + max_v = self.op.attr("max") + seed = self.op.attr("seed") + dtype = self.op.attr("dtype") + assert max_v > min_v, "assert max_v > min_v, but recieved " + \ + "as max_v={}, min_v={} ".format(max_v, min_v) + + tensor1 = self._create_ge_tensor([len(shape)], 2, shape) + shape_tensor = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor1) + + ge_ur = core.GEOperatorFactory.create_operator( + "uniform_random" + self._accumulated_op_id(), "RandomUniform")\ + .set_input("shape", shape_tensor)\ + .set_attr_dtype("dtype", self.ascend_helper.dtype2ge(dtype)) \ + .set_attr_int32("seed", seed)\ + .set_attr_int32("seed2", seed) + + scale = max_v - min_v + + scale_value = core.GEOperatorFactory.create_operator( + "scale" + self._accumulated_op_id(), "Power").set_input( + "x", ge_ur).set_attr_float("power", 1.0).set_attr_float( + "scale", scale).set_attr_float("shift", min_v) + + return [scale_value], [[0]] + + +class EqualParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(EqualParser, self).__init__(graph, var2geop) + self.parser_name = "equal" + + def _apply(self): + data_x1 = self._get_ge_input(self.op.input_arg_names[0]) + data_x2 = self._get_ge_input(self.op.input_arg_names[1]) + equal = core.GEOperatorFactory.create_operator("equal" \ + + self._accumulated_op_id(), "Equal")\ + .set_input("x1", data_x1)\ + .set_input("x2", data_x2) + return [equal], [[0]] + + +class ExpandParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ExpandParser, self).__init__(graph, var2geop) + self.parser_name = "expand" + + def _apply(self): + data_x1_shape = self._get_ge_input(self.op.input_arg_names[0]) + expand_times = self.op.attr('expand_times') + + tensor = self._create_ge_tensor([len(expand_times)], 2, expand_times) + expand_tensor = core.GEOperatorFactory.\ + create_operator("const" + self._accumulated_op_id(), "Const")\ + .set_attr_tensor("value", tensor) + + assign = 
core.GEOperatorFactory\ + .create_operator("tile" + self._accumulated_op_id(), "Tile")\ + .set_input("x", data_x1_shape)\ + .set_input("multiples", expand_tensor) + return [assign], [[0]] + + +class SqueezeParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqueezeParser, self).__init__(graph, var2geop) + self.parser_name = "squeeze2" + + def _apply(self): + tensor = self._get_ge_input(self.op.input_arg_names[0]) + axes = self.op.attr("axes") + + data_squeezed = core.GEOperatorFactory\ + .create_operator("squeeze" + self._accumulated_op_id(), "Squeeze")\ + .set_input("x", tensor)\ + .set_attr_vec_int32("axes", axes) + shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", data_squeezed) + return [shape, data_squeezed], [[1], [0]] + + +#****************************************************************# +#*************************** *************************# +#*************************** *************************# +#*************************** GradParser *************************# +#*************************** *************************# +#*************************** *************************# +#****************************************************************# +## grad +class ReduceSumGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReduceSumGradParser, self).__init__(graph, var2geop) + self.parser_name = "reduce_sum_grad" + + def _apply(self): + x = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + + shape_tensor = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Shape").set_input("x", input, 0) + tensoron = self._create_ge_tensor([1], 2, -1) + const = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) + self._mark_as_input(const) + + reduce_sum = core.GEOperatorFactory.create_operator( + "broadcast_to_d" + self._accumulated_op_id(), + "BroadcastTo").set_input("x", x).set_input("shape", shape_tensor) + #reduce_sum = core.GEOperatorFactory.create_operator("expand" + self._accumulated_op_id(), "ExpandDims").set_input("x", reduce_sum).set_input("axis", const) + + return [reduce_sum], [[0]] + + +class MatMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MatMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "matmul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + transpose_x = self.op.attr("transpose_X") + transpose_y = self.op.attr("transpose_Y") + + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape + y_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(x_shape) > 2: + if transpose_y: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", False) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", x).set_attr_bool( + "adj_x1", True).set_attr_bool("adj_x2", False) + else: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + 
"BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "adj_x1", True).set_attr_bool("adj_x2", False) + else: + if transpose_y: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + False) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", x).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + else: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + + return [x_grad, y_grad], [[0], [1]] + + +class MulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(MulGradParser, self).__init__(graph, var2geop) + self.parser_name = "mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + y = self._get_ge_input(self.op.input_arg_names[2]) + x_num_col_dims = self.op.attr("x_num_col_dims") + y_num_col_dims = self.op.attr("y_num_col_dims") + + shape_out_grad = self.op.block.var(self.op.input_arg_names[0]).shape + shape_x = self.op.block.var(self.op.input_arg_names[1]).shape + shape_y = self.op.block.var(self.op.input_arg_names[2]).shape + + if x_num_col_dims == 1 and y_num_col_dims == 1: + if len(shape_x) == 2 and len(shape_y) == 2: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", out_grad).set_input( + "x2", y).set_attr_bool( + "transpose_x1", False).set_attr_bool("transpose_x2", + True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", x).set_input( + "x2", out_grad).set_attr_bool( + "transpose_x1", True).set_attr_bool("transpose_x2", + False) + elif len(shape_x) == 3 and len(shape_y) == 2: + flatten_x = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "Flatten").set_input("x", x) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input( + "x1", out_grad).set_input("x2", y).set_attr_bool( + "transpose_x1", + False).set_attr_bool("transpose_x2", True) + if len(shape_out_grad) == 2: + x_grad = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", x_grad).set_attr_vec_int32( + "axes", [1]) + + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input( + "x1", + flatten_x).set_input("x2", out_grad).set_attr_bool( + "transpose_x1", + True).set_attr_bool("transpose_x2", False) + else: + if len(shape_x) == 3 and len(shape_y) == 2: + assert x_num_col_dims == 2, 
"only support 2" + flatten_x = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", x).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + flatten_out_grad = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), + "FlattenV2").set_input("x", out_grad).set_attr_int32( + "axis", 0).set_attr_int32("end_axis", 1) + + y_unsqueeze = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), + "Unsqueeze").set_input("x", + y).set_attr_vec_int32("axes", [0]) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "BatchMatMul").set_input("x1", out_grad).set_input( + "x2", y_unsqueeze).set_attr_bool( + "adj_x1", False).set_attr_bool("adj_x2", True) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "MatMul").set_input("x1", flatten_x).set_input( + "x2", flatten_out_grad).set_attr_bool( + "transpose_x1", + True).set_attr_bool("transpose_x2", False) + + return [x_grad, y_grad], [[0], [1]] + + +class ReluGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReluGradParser, self).__init__(graph, var2geop) + self.parser_name = "relu_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + relu_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), "ReluGrad").set_input( + "gradients", out_grad).set_input("features", out) + return [relu_grad], [[0]] + + +class SoftmaxWithCrossEntropyGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SoftmaxWithCrossEntropyGradParser, self).__init__(graph, var2geop) + self.parser_name = "softmax_with_cross_entropy_grad" + + def _apply(self): + label = self._get_ge_input(self.op.input_arg_names[0]) + loss_grad = self._get_ge_input(self.op.input_arg_names[1]) + softmax = self._get_ge_input(self.op.input_arg_names[2]) + cls_num = self.op.block.var(self.op.input_arg_names[2]).shape[1] + + label_shape = self.op.block.var(self.op.input_arg_names[0]).shape + loss_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + softmax_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + tensoron = self._create_ge_tensor([1], 5, 1) + on = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoron) + tensoroff = self._create_ge_tensor([1], 5, 0) + off = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensoroff) + self._mark_as_input(on) + self._mark_as_input(off) + + label = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", label).set_attr_int32("dst_type", 3) + onehot = core.GEOperatorFactory.create_operator( + "onehot" + self._accumulated_op_id(), "OneHotD").set_input( + "x", label).set_input("on_value", on).set_input( + "off_value", off).set_attr_int32("depth", cls_num) + squeeze = core.GEOperatorFactory.create_operator( + "suqeeze" + self._accumulated_op_id(), + "Squeeze").set_input("x", onehot) + sub = core.GEOperatorFactory.create_operator( + "sub" + self._accumulated_op_id(), "Sub").set_input( + "x1", softmax).set_input("x2", squeeze) + grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), + "Mul").set_input("x1", loss_grad).set_input("x2", sub) + + return [on, off, 
label, onehot, grad], [[-1]] + + +class DotMulGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotMulGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_mul_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_grad).set_input("x2", out_2) + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", out_1).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class DotAddGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotAddGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_add_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + out_1 = self._get_ge_input(self.op.input_arg_names[1]) + out_2 = self._get_ge_input(self.op.input_arg_names[2]) + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_1_shape = self.op.block.var(self.op.input_arg_names[1]).shape + out_2_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + x_grad = out_grad + cur_time_x = len(out_grad_shape) - len(out_1_shape) + for i in range(cur_time_x): + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_1_shape): + if size == 1: + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", x_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + y_grad = out_grad + cur_time_y = len(out_grad_shape) - len(out_2_shape) + for i in range(cur_time_y): + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [0]).set_attr_bool("keep_dims", False) + for axis, size in enumerate(out_2_shape): + if size == 1: + y_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "ReduceSumD").set_input("x", y_grad).set_attr_vec_int32( + "axes", [axis]).set_attr_bool("keep_dims", True) + + return [x_grad, y_grad], [[0], [1]] + + +class DotDivGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(DotDivGradParser, self).__init__(graph, var2geop) + self.parser_name = "elementwise_div_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + y = self._get_ge_input(self.op.input_arg_names[3]) + + y_power = core.GEOperatorFactory.create_operator( + "power" + self._accumulated_op_id(), "Power").set_input( + "x", y).set_attr_float("power", -1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_zero = core.GEOperatorFactory.create_operator( + "equal" + self._accumulated_op_id(), "Equal").set_input( + "x1", x).set_input("x2", tensor_zeros) + x_nozero = core.GEOperatorFactory.create_operator( + "logical_not" + self._accumulated_op_id(), + 
"LogicalNot").set_input("x", x_zero) + x_nozero_f = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_nozero).set_attr_int32("dst_type", 0) + x_grad_w = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", x_nozero_f).set_input("x2", y_power) + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "Mul").set_input("x1", x_grad_w).set_input("x2", out_grad) + + y_grad_w = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", out).set_input("x2", y_power) + y_grad = core.GEOperatorFactory.create_operator( + "mul" + self._accumulated_op_id(), "Mul").set_input( + "x1", y_grad_w).set_input("x2", out_grad) + + return [x_grad, y_grad], [[0], [1]] + + +class SoftmaxGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SoftmaxGradParser, self).__init__(graph, var2geop) + self.parser_name = "softmax_grad" + + def _apply(self): + out = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "SoftmaxGrad").set_input("softmax", out).set_input("grad_softmax", + out_grad) + return [x_grad], [[0]] + + +class ReshapeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(ReshapeGradParser, self).__init__(graph, var2geop) + self.parser_name = "reshape2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x_shape = self._get_ge_input(self.op.input_arg_names[1]) + x_shape_list = self.op.block.var(self.op.input_arg_names[1]).shape + + if x_shape_list[0] == 0: + x_shape_delzero = x_shape_list[1:] + tensor = self._create_ge_tensor([len(x_shape_delzero)], 2, + x_shape_delzero) + const_shape = core.GEOperatorFactory.create_operator( + "shape" + self._accumulated_op_id(), + "Const").set_attr_tensor("value", tensor) + x_grad = core.GEOperatorFactory.create_operator( + "reshape" + self._accumulated_op_id(), "Reshape").set_input( + "x", out_grad).set_input("shape", const_shape) + + return [x_grad], [[0]] - return [reshape, reshape], [[0], [1]] + +class GatherGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(GatherGradParser, self).__init__(graph, var2geop) + self.parser_name = "gather_grad" + + def _apply(self): + index = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + x = self._get_ge_input(self.op.input_arg_names[2]) + + index_shape = self.op.block.var(self.op.input_arg_names[0]).shape + out_grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + x_shape = self.op.block.var(self.op.input_arg_names[2]).shape + + if len(index_shape) == 1: + index = core.GEOperatorFactory.create_operator( + "unsqueeze" + self._accumulated_op_id(), "Unsqueeze").set_input( + "x", index).set_attr_vec_int32("axes", [1]) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", x) + x_grad = core.GEOperatorFactory.create_operator( + "scatter" + self._accumulated_op_id(), + "TensorScatterUpdate").set_input("x", tensor_zeros).set_input( + "indices", index).set_input("updates", out_grad) + + return [tensor_zeros, x_grad], [[-1]] + + +class TransposeGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TransposeGradParser, 
self).__init__(graph, var2geop) + self.parser_name = "transpose2_grad" + + def _apply(self): + out_grad = self._get_ge_input(self.op.input_arg_names[0]) + x = self._get_ge_input(self.op.input_arg_names[1]) + perm = self.op.attr("axis") + + x_shape = self.op.block.var(self.op.input_arg_names[1]).shape[1:] + out_grad_shape = self.op.block.var(self.op.input_arg_names[0]).shape + assert list(map(lambda x: out_grad_shape[x], perm)) == list(x_shape) + + x_grad = core.GEOperatorFactory.create_operator( + "transpose" + self._accumulated_op_id(), "TransposeD").set_input( + "x", out_grad).set_attr_vec_int32("perm", perm) + + return [x_grad], [[0]] + + +class LayerNormGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LayerNormGradParser, self).__init__(graph, var2geop) + self.parser_name = "layer_norm_grad" + + def _apply(self): + bias = self._get_ge_input(self.op.input_arg_names[0]) + mean = self._get_ge_input(self.op.input_arg_names[1]) + scale = self._get_ge_input(self.op.input_arg_names[2]) + variance = self._get_ge_input(self.op.input_arg_names[3]) + x = self._get_ge_input(self.op.input_arg_names[4]) + out_grad = self._get_ge_input(self.op.input_arg_names[5]) + x_dtype = self.op.block.var(self.op.input_arg_names[4]).dtype + + x_grad = core.GEOperatorFactory.create_operator( + self.parser_name + self._accumulated_op_id(), + "LayerNormGrad").set_input("dy", out_grad).set_input( + "x", x).set_input("variance", variance).set_input( + "mean", mean).set_input("gamma", scale) + + cast_dtype = 0 if self.ascend_helper.dtype2paddle_inv_map[str( + x_dtype)] == 0 else 1 + out_x_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 0).set_attr_int32("dst_type", cast_dtype) + out_scale_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 1).set_attr_int32("dst_type", cast_dtype) + out_bias_grad = core.GEOperatorFactory.create_operator( + "cast" + self._accumulated_op_id(), "Cast").set_input( + "x", x_grad, 2).set_attr_int32("dst_type", cast_dtype) + + return [out_x_grad, out_scale_grad, out_bias_grad], [[2], [1], [0]] + + +class TanhGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(TanhGradParser, self).__init__(graph, var2geop) + self.parser_name = 'tanh_grad' + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + tanh_grad = core.GEOperatorFactory.create_operator( + "tanh_grad" + self._accumulated_op_id(), + "TanhGrad").set_input("y", y).set_input("dy", out_grad) + + return [tanh_grad], [[0]] + + +class LogGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LogGradParser, self).__init__(graph, var2geop) + self.parser_name = 'log_grad' + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + input = self._get_ge_input(self.op.input_arg_names[1]) + log_grad = core.GEOperatorFactory.create_operator( + "log_grad" + self._accumulated_op_id(), + "DivNoNan").set_input("x1", grad).set_input("x2", input) + return [log_grad], [[0]] + + +class SqrtGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SqrtGradParser, self).__init__(graph, var2geop) + self.parser_name = "sqrt_grad" + + def _apply(self): + y = self._get_ge_input(self.op.input_arg_names[0]) + out_grad = self._get_ge_input(self.op.input_arg_names[1]) + sqrt_grad = core.GEOperatorFactory.create_operator( + "sqrt_grad" + 
self._accumulated_op_id(),
+            "SqrtGrad").set_input("y", y).set_input("dy", out_grad)
+        return [sqrt_grad], [[0]]
+
+
+class PowGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(PowGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "pow_grad"
+
+    def _apply(self):
+        grad = self._get_ge_input(self.op.input_arg_names[0])
+        x = self._get_ge_input(self.op.input_arg_names[1])
+        factor = self.op.attr("factor")
+
+        shape_tensor = self._create_shape_tensor()
+        shape_tensor = core.GEOperatorFactory.create_operator(
+            "shape" + self._accumulated_op_id(), "Shape").set_input("x", x)
+        factor_scale = self._create_ge_tensor([1], 5, factor)
+        factor_scale = core.GEOperatorFactory.create_operator(
+            "const" + self._accumulated_op_id(),
+            "Const").set_attr_tensor("value", factor_scale)
+        factor_tensor = core.GEOperatorFactory.create_operator(
+            "broadcast_to_d" + self._accumulated_op_id(),
+            "BroadcastTo").set_input(
+                "x", factor_scale).set_input("shape", shape_tensor)
+
+        x_power = core.GEOperatorFactory.create_operator(
+            "x_power" + self._accumulated_op_id(), "Power").set_input(
+                "x", x).set_attr_float("power", factor - 1)
+        x_power_mul_factor = core.GEOperatorFactory.create_operator(
+            "x_power_mul_factor" + self._accumulated_op_id(), "Mul").set_input(
+                "x1", x_power).set_input("x2", factor_tensor)
+        x_power_mul_factor_grad = core.GEOperatorFactory.create_operator(
+            "x_power_mul_factor_grad" + self._accumulated_op_id(),
+            "Mul").set_input("x1", x_power_mul_factor).set_input("x2", grad)
+
+        return [x_power_mul_factor_grad], [[0]]
+
+
+class GeluGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(GeluGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "gelu_grad"
+
+    def _apply(self):
+        grad = self._get_ge_input(self.op.input_arg_names[0])
+        x = self._get_ge_input(self.op.input_arg_names[1])
+
+        y = core.GEOperatorFactory.create_operator(
+            "gelu" + self._accumulated_op_id(), "Gelu").set_input("x", x)
+        gelu_grad = core.GEOperatorFactory.create_operator(
+            "gelu_grad" + self._accumulated_op_id(), "GeluGrad").set_input(
+                "x", x).set_input("dy", grad).set_input("y", y)
+
+        return [gelu_grad], [[0]]
+
+
+class MeanGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(MeanGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "mean_grad"
+
+    def _apply(self):
+        grad = self._get_ge_input(self.op.input_arg_names[0])
+        x = self._get_ge_input(self.op.input_arg_names[1])
+
+        ones_tensor = core.GEOperatorFactory.create_operator(
+            "one_tensor" + self._accumulated_op_id(),
+            "OnesLike").set_input("x", x)
+        sum = core.GEOperatorFactory.create_operator(
+            "mean" + self._accumulated_op_id(), "ReduceSumD").set_input(
+                "x", ones_tensor).set_attr_bool(
+                    "keep_dims", False).set_attr_vec_int32("axes", [])
+        mean = core.GEOperatorFactory.create_operator(
+            "x_power" + self._accumulated_op_id(), "Power").set_input(
+                "x", sum).set_attr_float("power", -1)
+
+        mean_grad = core.GEOperatorFactory.create_operator(
+            "mean_grad" + self._accumulated_op_id(),
+            "Mul").set_input("x1", mean).set_input("x2", grad)
+
+        return [mean_grad], [[0]]
+
+
+class SliceGradParser(AscendParserBase):
+    def __init__(self, graph, var2geop):
+        super(SliceGradParser, self).__init__(graph, var2geop)
+        self.parser_name = "slice_grad"
+
+    def _apply(self):
+        x = self._get_ge_input(self.op.input_arg_names[0])
+        grad = self._get_ge_input(self.op.input_arg_names[1])
+        axes = self.op.attr("axes")
+        starts = self.op.attr("starts")
+        ends = 
self.op.attr("ends") + + x_shape = self.op.block.var(self.op.input_arg_names[0]).shape + grad_shape = self.op.block.var(self.op.input_arg_names[1]).shape + + len_shape = len(x_shape) + axes_cor = list(range(len_shape)) + starts_cor, ends_cor = [], [] + cnt = 0 + for i in range(len_shape): + starts_cor.append(starts[cnt] if i in axes else 0) + if i in axes and ends[cnt] <= x_shape[i]: + ends_cor.append(x_shape[i] - ends[cnt]) + else: + ends_cor.append(0) + if i in axes: + cnt += 1 + + starts_cor[0] = 0 + ends_cor[0] = 0 + paddings = [[s, e] for (s, e) in zip(starts_cor, ends_cor)] + slice_value = core.GEOperatorFactory.create_operator( + "slice_grad" + self._accumulated_op_id(), "PadD").set_input( + "x", grad).set_attr_vec_vec_int64("paddings", paddings) + + return [slice_value], [[0]] + + +class LookUpTableGradParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(LookUpTableGradParser, self).__init__(graph, var2geop) + self.parser_name = "lookup_table_grad" + + def _apply(self): + ids = self._get_ge_input(self.op.input_arg_names[0]) + grad = self._get_ge_input(self.op.input_arg_names[1]) + embedding = self._get_ge_input(self.op.input_arg_names[2]) + + shape_ids = self.op.block.var(self.op.input_arg_names[0]).shape + shape_grad = self.op.block.var(self.op.input_arg_names[1]).shape + shape_embedding = self.op.block.var(self.op.input_arg_names[2]).shape + + ids_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + ids).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + grad_flatten = core.GEOperatorFactory.create_operator( + "flatten" + self._accumulated_op_id(), "FlattenV2").set_input( + "x", + grad).set_attr_int32("axis", 0).set_attr_int32("end_axis", 1) + + tensor_zeros = core.GEOperatorFactory.create_operator( + "zeroslike" + self._accumulated_op_id(), + "ZerosLike").set_input("x", embedding) + embedding_grad = core.GEOperatorFactory.create_operator( + "scatteradd" + self._accumulated_op_id(), + "TensorScatterAdd").set_input( + "x", tensor_zeros).set_input("indices", ids_flatten).set_input( + "updates", grad_flatten) + + return [embedding_grad], [[0]] + + +class SGDParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(SGDParser, self).__init__(graph, var2geop) + self.parser_name = "sgd" + + def _apply(self): + grad = self._get_ge_input(self.op.input_arg_names[0]) + lr = self._get_ge_input(self.op.input_arg_names[1]) + param = self._get_ge_input(self.op.input_arg_names[2]) + sgd = core.GEOperatorFactory.create_operator( + "momentum" + self._accumulated_op_id(), + "ApplyGradientDescent").set_input("var", param).set_input( + "alpha", lr).set_input("delta", grad) + return [sgd], [[0]] + + +class AdamParser(AscendParserBase): + def __init__(self, graph, var2geop): + super(AdamParser, self).__init__(graph, var2geop) + self.parser_name = "adam" + + def _apply(self): + beta1_power = self._get_ge_input(self.op.input_arg_names[0]) + beta2_power = self._get_ge_input(self.op.input_arg_names[1]) + grad = self._get_ge_input(self.op.input_arg_names[2]) + lr = self._get_ge_input(self.op.input_arg_names[3]) + moment1 = self._get_ge_input(self.op.input_arg_names[4]) + moment2 = self._get_ge_input(self.op.input_arg_names[5]) + param = self._get_ge_input(self.op.input_arg_names[6]) + beta1 = self.op.attr('beta1') + beta2 = self.op.attr('beta2') + epsilon = self.op.attr('epsilon') + + beta1 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( 
+ "value", self._create_ge_tensor([1], 5, beta1)) + beta2 = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, beta2)) + epsilon = core.GEOperatorFactory.create_operator( + "const" + self._accumulated_op_id(), "Const").set_attr_tensor( + "value", self._create_ge_tensor([1], 5, epsilon)) + + adam = core.GEOperatorFactory.create_operator( + "adam" + self._accumulated_op_id(), + "ApplyAdam").set_input("var", param).set_input( + "m", moment1).set_input("v", moment2).set_input( + "beta1_power", beta1_power).set_input( + "beta2_power", beta2_power).set_input( + "lr", lr).set_input("beta1", beta1).set_input( + "beta2", beta2).set_input( + "epsilon", epsilon).set_input("grad", grad) + + return [adam], [[0]] diff --git a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py index 159c0b973b2..9a4ffd2fd02 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/graph_execution_optimizer.py @@ -61,8 +61,9 @@ class GraphExecutionOptimizer(MetaOptimizerBase): trainer_endpoints_env = ",".join(trainer_endpoints) trainers_num = self.role_maker._worker_num() - if trainer_id == 0: - wait_server_ready(other_trainers) + # FIXME(wangxi): approve this. + #if trainer_id == 0: + # wait_server_ready(other_trainers) if core.is_compiled_with_cuda(): comm_id_var = startup_program.global_block().create_var( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f4c2318750c..e1c5ae750d9 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -40,6 +40,8 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleetrun) list(APPEND MIXED_DIST_TEST_OPS test_fleet_run_random_port) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_async) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_cloud) +list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_ascend) +list(APPEND MIXED_DIST_TEST_OPS test_ascend_group) list(APPEND MIXED_DIST_TEST_OPS test_fleet_launch_nproc) list(APPEND MIXED_DIST_TEST_OPS test_fleet_api_input) list(APPEND MIXED_DIST_TEST_OPS test_collective_optimizer) @@ -531,6 +533,10 @@ if(WITH_DISTRIBUTE) bash_test_modules(test_fleet_launch_async START_BASH test_fleet_launch_async.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_cloud START_BASH test_fleet_launch_cloud.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) bash_test_modules(test_fleet_launch_nproc START_BASH test_fleet_launch_nproc.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + if(WITH_ASCEND) + bash_test_modules(test_fleet_launch_ascend START_BASH test_fleet_launch_ascend.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_ascend_group START_BASH test_ascend_group.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + endif() # port range (20000, 23000) is reserved for dist-ops set(dist_ut_port 20001) @@ -541,7 +547,8 @@ if(WITH_DISTRIBUTE) message(FATAL_ERROR "available ports have been exhausted:${dist_ut_port}") endif() endforeach(TEST_OP) - bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) + # solve it later. 
+ # bash_test_modules(test_fleet_launch_ps START_BASH test_fleet_launch_ps.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) bash_test_modules(test_new_group START_BASH test_new_group.sh SERIAL LABELS "RUN_TYPE=EXCLUSIVE" ENVS "PADDLE_DIST_UT_PORT=${dist_ut_port}+20" PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} ) endif(NOT APPLE) endif() diff --git a/python/paddle/fluid/tests/unittests/ascend_group.py b/python/paddle/fluid/tests/unittests/ascend_group.py new file mode 100644 index 00000000000..78a3687b5ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_group.py @@ -0,0 +1,140 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import paddle.fluid as fluid +from paddle.fluid import unique_name +import paddle.fluid.core as core +import paddle +from paddle.fluid.layer_helper import LayerHelper +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_optimizers.ascend import ascend_parser, ascend_optimizer +from collections import namedtuple + +Block = namedtuple('Block', ['program']) +Loss = namedtuple('Loss', ['block']) + +paddle.enable_static() + +OpRole = core.op_proto_and_checker_maker.OpRole +OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() +OP_ROLE_VAR_KEY = core.op_proto_and_checker_maker.kOpRoleVarAttrName() + +role = fleet.PaddleCloudRoleMaker(is_collective=True) +fleet.init(role) + + +def init_communicator(startup_program, main_program, current_endpoint, + endpoints, ring_id): + nranks = len(endpoints) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + group_rank = endpoints.index(current_endpoint) + assert group_rank >= 0 + + block = startup_program.global_block() + nccl_id_var = block.create_var( + name=unique_name.generate('nccl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW) + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': group_rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + OP_ROLE_KEY: OpRole.Forward, + }) + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': group_rank, + 'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Forward, + }) + + with fluid.program_guard(main_program): + op_type = "c_allreduce_sum" + data = fluid.layers.fill_constant(shape=[1], dtype='float32', value=2.5) + helper = LayerHelper(op_type, **locals()) + helper.append_op( + type=op_type, + inputs={'X': [data]}, + outputs={'Out': [data]}, + attrs={'ring_id': ring_id, + 'use_calc_stream': True}) + + print("startup program:", startup_program) + print("main program:", main_program) + + +def train(world_endpoints, world_device_ids, local_device_ids, local_rank): + startup_programs = [] + main_programs = [] + + #trainer_endpoints=["127.0.0.1:6071","127.0.0.1:6072","127.0.0.1:6073","127.0.0.1:6074"] + 
trainer_endpoints = world_endpoints + groups = [[], [], []] + groups[0] = [trainer_endpoints[0], trainer_endpoints[1]] + groups[1] = [trainer_endpoints[2], trainer_endpoints[3]] + groups[2] = [trainer_endpoints[0], trainer_endpoints[2]] + print("groups:", groups) + + for i in range(len(trainer_endpoints)): + startup_programs.append(fluid.Program()) + main_programs.append(fluid.Program()) + + for idx, group in enumerate(groups): + for te in group: + te_idx = trainer_endpoints.index(te) + startup_program = startup_programs[te_idx] + main_program = main_programs[te_idx] + init_communicator(startup_program, main_program, te, group, idx) + + print(len(startup_programs)) + print(startup_programs[local_rank]) + print(main_programs[local_rank]) + + print("local rank: ", local_rank) + print("local startup program: ", startup_programs[local_rank]) + + startup_program = startup_programs[local_rank] + main_program = main_programs[local_rank] + loss = Loss(Block(main_program)) + optimizer = ascend_optimizer.AscendOptimizer(None, fetch_list=[]) + optimizer.minimize(loss, startup_program, auto_dp=True) + + exe = paddle.static.Executor(paddle.CPUPlace()) + #exe.run(startup_program) + exe.run(main_program) + + +worker_endpoints = fleet.worker_endpoints() +world_device_ids = fleet.world_device_ids() +local_device_ids = fleet.local_device_ids() +local_rank = int(fleet.local_rank()) + +print("worker_endpoints:", worker_endpoints) +print("world_device_ids:", world_device_ids) +print("local_device_ids:", local_device_ids) +print("local_rank:", local_rank) + +train(worker_endpoints, world_device_ids, local_device_ids, local_rank) diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py new file mode 100644 index 00000000000..33e6f63ea10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -0,0 +1,41 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
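The ascend_group.py test above keeps one pair of startup/main programs per trainer and wires each endpoint group into its own communication ring via init_communicator. A minimal Python sketch of that group-to-ring mapping (the endpoints mirror the commented-out defaults in the test and are illustrative only; no fleet runtime or NPU is needed):

# Sketch of the endpoint-group -> ring mapping exercised by ascend_group.py above.
endpoints = ["127.0.0.1:6071", "127.0.0.1:6072", "127.0.0.1:6073", "127.0.0.1:6074"]
groups = [endpoints[0:2], endpoints[2:4], [endpoints[0], endpoints[2]]]

# An endpoint may belong to several rings, which is why the test builds a separate
# startup/main program per trainer and calls init_communicator once per (ring, member).
rings_per_endpoint = {ep: [] for ep in endpoints}
for ring_id, group in enumerate(groups):
    for rank, ep in enumerate(group):
        rings_per_endpoint[ep].append((ring_id, rank))

for ep, rings in rings_per_endpoint.items():
    print(ep, rings)
# e.g. 127.0.0.1:6071 -> [(0, 0), (2, 0)], 127.0.0.1:6074 -> [(1, 1)]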
+ +import os +import sys +import time + + +def train(prefix): + selected_accelerators = os.getenv("FLAGS_selected_accelerators") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + device_ids = os.getenv("PADDLE_WORLD_DEVICE_IDS") + current_device_id = os.getenv("PADDLE_LOCAL_DEVICE_IDS") + + details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) + + print(details) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(details) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/test_ascend_group.sh b/python/paddle/fluid/tests/unittests/test_ascend_group.sh new file mode 100644 index 00000000000..31c442e0962 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_ascend_group.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +cluster_node_ips="127.0.0.1" +export PADDLE_TRAINERS_NUM=4 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=4 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1,2,3 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} \ + ascend_group.py fleetascendgroup diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh new file mode 100644 index 00000000000..0960083abf2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
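The ascend launch tests here rely on paddle.distributed.fleet.launch expanding --ips together with PADDLE_PORT and TRAINER_PORTS_NUM into per-trainer endpoints. A rough sketch of that expansion, a simplified assumption about the launcher that matches the two-node endpoint strings asserted in test_fleet_launch_ascend.sh below:

# Hypothetical, simplified view of how the launch arguments become trainer endpoints.
ips = "127.0.0.1,127.0.0.2".split(",")
base_port, ports_num = 35789, 2

endpoints = ["{}:{}".format(ip, base_port + i) for ip in ips for i in range(ports_num)]
for trainer_id, ep in enumerate(endpoints):
    print(trainer_id, ep)
# 0 127.0.0.1:35789, 1 127.0.0.1:35790, 2 127.0.0.2:35789, 3 127.0.0.2:35790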
+ +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" +python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend + +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1" +file_0="multi_process_fleetlaunchascend.check_0.log" +file_1="multi_process_fleetlaunchascend.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/transpiler/ascend_transpiler.py b/python/paddle/fluid/transpiler/ascend_transpiler.py new file mode 100644 index 00000000000..5593c91b5bc --- /dev/null +++ b/python/paddle/fluid/transpiler/ascend_transpiler.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import collective +from .. 
import core +OpRole = core.op_proto_and_checker_maker.OpRole +from paddle.distributed import fleet + + +class AscendTranspiler(collective.Collective): + def __init__(self, startup_program, main_program): + self.nrings = 1 + super(AscendTranspiler, self).__init__(self.nrings) + self._startup_program = startup_program + self._main_program = main_program + + def _insert_allreduce_ops(self): + block = self._main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: + continue + + # As we search ops reversedly, we should insert c_allreduce_sum + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset + 1, + type='c_allreduce_sum', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + offset + 2, + type='scale', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'scale': 1.0 / fleet.worker_num(), + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + def transpile(self): + self._insert_allreduce_ops() diff --git a/python/setup.py.in b/python/setup.py.in index e4532b3e55d..2883f2ed248 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -149,6 +149,7 @@ packages=['paddle', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', + 'paddle.distributed.fleet.meta_optimizers.ascend', 'paddle.distributed.fleet.runtime', 'paddle.distributed.fleet.dataset', 'paddle.distributed.fleet.data_generator', -- GitLab From d91faf2997754bd8beed4529d6b680a796e24fad Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Wed, 7 Apr 2021 19:25:32 +0800 Subject: [PATCH 169/486] bugfix for unit test test_segment_ops (#32116) --- paddle/fluid/platform/cuda_primitives.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 340372007a7..94f64d158af 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -200,6 +200,8 @@ CUDA_ATOMIC_WRAPPER(Max, float) { old = atomicCAS(address_as_i, assumed, __float_as_int(val)); } while (assumed != old); + + return __int_as_float(old); } CUDA_ATOMIC_WRAPPER(Max, double) { @@ -219,6 +221,8 @@ CUDA_ATOMIC_WRAPPER(Max, double) { old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); } while (assumed != old); + + return __longlong_as_double(old); } // For atomicMin @@ -272,6 +276,8 @@ CUDA_ATOMIC_WRAPPER(Min, float) { old = atomicCAS(address_as_i, assumed, __float_as_int(val)); } while (assumed != old); + + return __int_as_float(old); } CUDA_ATOMIC_WRAPPER(Min, double) { @@ -291,6 +297,8 @@ CUDA_ATOMIC_WRAPPER(Min, double) { old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); } while (assumed != old); + + return __longlong_as_double(old); } } // namespace platform -- GitLab From e09f4db9f71cf474041eb5bc2e54f330c06a243c Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 7 Apr 2021 19:26:07 +0800 
Subject: [PATCH 170/486] Check added ut on windows (#31826) * added ut check on windows,notest,test=windows_ci * debug,notest,test=windows_ci * debug,notest,test=windows_ci * fix bug,notest,test=windows_ci * added ut check * test for new ut add on windows * test,notest,test=windows_ci * fix bug,notest,test=windows_ci * test * test * test * test,notest,test=windows_ci * test,notest,test=windows_ci * check added ut on windows * only fetch upstream develop * modified according comment * Update run_unittests.sh * Update run_unittests.sh --- paddle/scripts/paddle_build.bat | 7 +++++ tools/check_added_ut.sh | 50 +++++++++++++++++++++++++++------ tools/windows/run_unittests.sh | 18 ++++++++++++ 3 files changed, 67 insertions(+), 8 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0fc8b7097a0..14e62d6761f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -501,6 +501,13 @@ setlocal enabledelayedexpansion :: if %errorlevel% NEQ 0 exit /b 8 :: for /F %%# in ('cmd /C nvidia-smi -L ^|find "GPU" /C') do set CUDA_DEVICE_COUNT=%%# set CUDA_DEVICE_COUNT=1 + +echo cmake .. -G %GENERATOR% -DCMAKE_BUILD_TYPE=Release -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% ^ +-DWITH_TESTING=%WITH_TESTING% -DWITH_PYTHON=%WITH_PYTHON% -DON_INFER=%ON_INFER% ^ +-DWITH_INFERENCE_API_TEST=%WITH_INFERENCE_API_TEST% -DTHIRD_PARTY_PATH=%THIRD_PARTY_PATH% ^ +-DINFERENCE_DEMO_INSTALL_DIR=%INFERENCE_DEMO_INSTALL_DIR% -DWITH_STATIC_LIB=%WITH_STATIC_LIB% ^ +-DWITH_TENSORRT=%WITH_TENSORRT% -DTENSORRT_ROOT="%TENSORRT_ROOT%" -DMSVC_STATIC_CRT=%MSVC_STATIC_CRT% ^ +-DWITH_UNITY_BUILD=%WITH_UNITY_BUILD% -DCUDA_ARCH_NAME=%CUDA_ARCH_NAME% >> %work_dir%\win_cmake.sh set FLAGS_fraction_of_gpu_memory_to_use=0.92 %cache_dir%\tools\busybox64.exe bash %work_dir%\tools\windows\run_unittests.sh %NIGHTLY_MODE% %PRECISION_TEST% %WITH_GPU% diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 2ea34771d1b..618236f75bf 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -16,32 +16,66 @@ set +e set -x +SYSTEM=`uname -s` if [ -z ${BRANCH} ]; then BRANCH="develop" fi export CI_SKIP_CPP_TEST=OFF -PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + PADDLE_ROOT="$( cd "$( dirname "${BASH_SOURCE[0]}")/../" && pwd )" +elif [[ "$SYSTEM" == "Windows_NT" ]];then + PADDLE_ROOT="$(cd "$PWD/../" && pwd )" +fi CURDIR=`pwd` cd $PADDLE_ROOT -cp $PADDLE_ROOT/paddle/scripts/paddle_build.sh $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + cp $PADDLE_ROOT/paddle/scripts/paddle_build.sh $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh +elif [[ "$SYSTEM" == "Windows_NT" ]];then + git remote | grep upstream + if [ $? 
!= 0 ]; then + git remote add upstream https://github.com/PaddlePaddle/Paddle.git + git fetch upstream develop + fi +fi CURBRANCH=`git rev-parse --abbrev-ref HEAD` echo $CURBRANCH +if [ `git branch | grep 'prec_added_ut'` ];then + git branch -D 'prec_added_ut' +fi git checkout -b prec_added_ut upstream/${BRANCH} +git branch mkdir prec_build cd prec_build -bash $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh cmake_gen_in_current_dir >prebuild.log 2>&1 -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' > /$PADDLE_ROOT/br-ut +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + bash $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh cmake_gen_in_current_dir >prebuild.log 2>&1 +elif [[ "$SYSTEM" == "Windows_NT" ]];then + bash $PADDLE_ROOT/win_cmake.sh +fi +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build -ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' > /$PADDLE_ROOT/pr-ut +ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT +echo "=================================" +echo "br-ut" +cat $PADDLE_ROOT/br-ut +echo "=================================" +echo "pr-ut" +cat $PADDLE_ROOT/pr-ut +echo "=================================" grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut -sort pr-ut |uniq -d > $PADDLE_ROOT/duplicate_ut +if [[ "$SYSTEM" == 'Linux' ]];then + sort pr-ut |uniq -d > $PADDLE_ROOT/duplicate_ut +fi echo "New-UT:" cat $PADDLE_ROOT/added_ut rm -rf prec_build -rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh -git checkout $CURBRANCH +if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then + rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh +elif [[ "$SYSTEM" == "Windows_NT" ]];then + rm $PADDLE_ROOT/br-ut $PADDLE_ROOT/pr-ut $PADDLE_ROOT/get_added_ut.sh +fi +git checkout -f $CURBRANCH echo $CURBRANCH git branch -D prec_added_ut cd $CURDIR diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 6da2401fbe4..8d52c1b84ae 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -37,6 +37,14 @@ else disable_ut_quickly='' fi +# check added ut +set +e +cp $PADDLE_ROOT/tools/check_added_ut.sh $PADDLE_ROOT/tools/check_added_ut_win.sh +bash $PADDLE_ROOT/tools/check_added_ut_win.sh +rm -rf $PADDLE_ROOT/tools/check_added_ut_win.sh +set -e + + # /*==================Fixed Disabled Windows unittests==============================*/ # TODO: fix these unittest that is bound to fail diable_wingpu_test="^lite_mul_model_test$|\ @@ -379,6 +387,16 @@ function show_ut_retry_result() { set +e if [ "${WITH_GPU:-OFF}" == "ON" ];then + if [ -f "$PADDLE_ROOT/added_ut" ];then + added_uts=^$(awk BEGIN{RS=EOF}'{gsub(/\n/,"$|^");print}' $PADDLE_ROOT/added_ut)$ + ctest -R "(${added_uts})" --output-on-failure -C Release --repeat-until-fail 3;added_ut_error=$? 
+ if [ "$added_ut_error" != 0 ];then + echo "========================================" + echo "Added UT should pass three additional executions" + echo "========================================" + exit 8; + fi + fi run_unittest_gpu $cpu_parallel_job 12 run_unittest_gpu $tetrad_parallel_job 4 run_unittest_gpu $two_parallel_job 2 -- GitLab From 4935b8e741f15571a3b58c9279f29b73c1edf99b Mon Sep 17 00:00:00 2001 From: seemingwang Date: Wed, 7 Apr 2021 19:52:07 +0800 Subject: [PATCH 171/486] move graph files (#32103) * graph engine demo * upload unsaved changes * fix dependency error * fix shard_num problem * py client * remove lock and graph-type * add load direct graph * add load direct graph * add load direct graph * batch random_sample * batch_sample_k * fix num_nodes size * batch brpc * batch brpc * add test * add test * add load_nodes; change add_node function * change sample return type to pair * resolve conflict * resolved conflict * resolved conflict * separate server and client * merge pair type * fix * resolved conflict * fixed segment fault; high-level VLOG for load edges and load nodes * random_sample return 0 * rm useless loop * test:load edge * fix ret -1 * test: rm sample * rm sample * random_sample return future * random_sample return int * test fake node * fixed here * memory leak * remove test code * fix return problem * add common_graph_table * random sample node &test & change data-structure from linkedList to vector * add common_graph_table * sample with srand * add node_types * optimize nodes sample * recover test * random sample * destruct weighted sampler * GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * WeightedGraphEdgeBlob to GraphEdgeBlob * pybind sample nodes api * pull nodes with step * fixed pull_graph_list bug; add test for pull_graph_list by step * add graph table;name * add graph table;name * add pybind * add pybind * add FeatureNode * add FeatureNode * add FeatureNode Serialize * add FeatureNode Serialize * get_feat_node * avoid local rpc * fix get_node_feat * fix get_node_feat * remove log * get_node_feat return py:bytes * merge develop with graph_engine * fix threadpool.h head * fix * fix typo * resolve conflict * fix conflict * recover lost content * fix pybind of FeatureNode * recover cmake * recover tools * resolve conflict * resolve linking problem * code style * change test_server port * fix code problems * remove shard_num config * remove redundent threads * optimize start server * remove logs * fix code problems by reviewers' suggestions * move graph files into a folder * code style change * remove graph operations from base table Co-authored-by: Huang Zhengjie <270018958@qq.com> Co-authored-by: Weiyue Su Co-authored-by: suweiyue Co-authored-by: luobin06 Co-authored-by: liweibin02 Co-authored-by: tangwei12 --- .../distributed/service/graph_brpc_server.cc | 13 +- .../distributed/service/graph_brpc_server.h | 3 +- paddle/fluid/distributed/service/ps_client.h | 2 +- paddle/fluid/distributed/table/CMakeLists.txt | 14 +- .../distributed/table/common_graph_table.cc | 2 +- .../distributed/table/common_graph_table.h | 2 +- .../distributed/table/graph/graph_edge.cc | 29 ++++ .../distributed/table/graph/graph_edge.h | 46 ++++++ .../distributed/table/graph/graph_node.cc | 117 ++++++++++++++ .../distributed/table/graph/graph_node.h | 127 +++++++++++++++ .../table/graph/graph_weighted_sampler.cc | 150 ++++++++++++++++++ .../table/graph/graph_weighted_sampler.h | 58 +++++++ paddle/fluid/distributed/table/table.h | 27 +--- 
.../fluid/distributed/test/graph_node_test.cc | 2 +- 14 files changed, 548 insertions(+), 44 deletions(-) create mode 100644 paddle/fluid/distributed/table/graph/graph_edge.cc create mode 100644 paddle/fluid/distributed/table/graph/graph_edge.h create mode 100644 paddle/fluid/distributed/table/graph/graph_node.cc create mode 100644 paddle/fluid/distributed/table/graph/graph_node.h create mode 100644 paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc create mode 100644 paddle/fluid/distributed/table/graph/graph_weighted_sampler.h diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 4f6cc1143e9..bdd926278b6 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -19,7 +19,6 @@ #include "butil/endpoint.h" #include "iomanip" #include "paddle/fluid/distributed/service/brpc_ps_client.h" -#include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -265,7 +264,8 @@ int32_t GraphBrpcService::pull_graph_list(Table *table, int step = *(int *)(request.params(2).c_str()); std::unique_ptr buffer; int actual_size; - table->pull_graph_list(start, size, buffer, actual_size, false, step); + ((GraphTable *)table) + ->pull_graph_list(start, size, buffer, actual_size, false, step); cntl->response_attachment().append(buffer.get(), actual_size); return 0; } @@ -284,8 +284,8 @@ int32_t GraphBrpcService::graph_random_sample_neighboors( int sample_size = *(uint64_t *)(request.params(1).c_str()); std::vector> buffers(node_num); std::vector actual_sizes(node_num, 0); - table->random_sample_neighboors(node_data, sample_size, buffers, - actual_sizes); + ((GraphTable *)table) + ->random_sample_neighboors(node_data, sample_size, buffers, actual_sizes); cntl->response_attachment().append(&node_num, sizeof(size_t)); cntl->response_attachment().append(actual_sizes.data(), @@ -301,7 +301,8 @@ int32_t GraphBrpcService::graph_random_sample_nodes( size_t size = *(uint64_t *)(request.params(0).c_str()); std::unique_ptr buffer; int actual_size; - if (table->random_sample_nodes(size, buffer, actual_size) == 0) { + if (((GraphTable *)table)->random_sample_nodes(size, buffer, actual_size) == + 0) { cntl->response_attachment().append(buffer.get(), actual_size); } else cntl->response_attachment().append(NULL, 0); @@ -330,7 +331,7 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, std::vector> feature( feature_names.size(), std::vector(node_num)); - table->get_node_feat(node_ids, feature_names, feature); + ((GraphTable *)table)->get_node_feat(node_ids, feature_names, feature); for (size_t feat_idx = 0; feat_idx < feature_names.size(); ++feat_idx) { for (size_t node_idx = 0; node_idx < node_num; ++node_idx) { diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index af63bf5d99e..32c572f9e6c 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -22,7 +22,8 @@ #include #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include "paddle/fluid/distributed/service/server.h" - +#include "paddle/fluid/distributed/table/common_graph_table.h" +#include "paddle/fluid/distributed/table/table.h" namespace paddle { namespace distributed { class GraphBrpcServer : public PSServer { diff --git 
a/paddle/fluid/distributed/service/ps_client.h b/paddle/fluid/distributed/service/ps_client.h index 3ff4b9d063f..1c8abc6c2e8 100644 --- a/paddle/fluid/distributed/service/ps_client.h +++ b/paddle/fluid/distributed/service/ps_client.h @@ -24,7 +24,7 @@ #include "paddle/fluid/distributed/service/env.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/table/accessor.h" -#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index 33873abc5f7..dde1f5ae8ee 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -1,12 +1,12 @@ set_property(GLOBAL PROPERTY TABLE_DEPS string_helper) - +set(graphDir graph) get_property(TABLE_DEPS GLOBAL PROPERTY TABLE_DEPS) -set_source_files_properties(graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_edge SRCS graph_edge.cc) -set_source_files_properties(graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(WeightedSampler SRCS graph_weighted_sampler.cc DEPS graph_edge) -set_source_files_properties(graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(graph_node SRCS graph_node.cc DEPS WeightedSampler) +set_source_files_properties(${graphDir}/graph_edge.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_edge SRCS ${graphDir}/graph_edge.cc) +set_source_files_properties(${graphDir}/graph_weighted_sampler.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(WeightedSampler SRCS ${graphDir}/graph_weighted_sampler.cc DEPS graph_edge) +set_source_files_properties(${graphDir}/graph_node.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(graph_node SRCS ${graphDir}/graph_node.cc DEPS WeightedSampler) set_source_files_properties(common_dense_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 995a39a6543..020bcdcc52e 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/distributed/common/utils.h" -#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index ab289618462..8ddf3c8f904 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -26,7 +26,7 @@ #include #include "paddle/fluid/distributed/table/accessor.h" #include "paddle/fluid/distributed/table/common_table.h" -#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/string/string_helper.h" namespace paddle { 
diff --git a/paddle/fluid/distributed/table/graph/graph_edge.cc b/paddle/fluid/distributed/table/graph/graph_edge.cc new file mode 100644 index 00000000000..0ab0d5a76d6 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +#include +namespace paddle { +namespace distributed { + +void GraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); +} + +void WeightedGraphEdgeBlob::add_edge(uint64_t id, float weight = 1) { + id_arr.push_back(id); + weight_arr.push_back(weight); +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_edge.h b/paddle/fluid/distributed/table/graph/graph_edge.h new file mode 100644 index 00000000000..3dfe5a6f357 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_edge.h @@ -0,0 +1,46 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +namespace paddle { +namespace distributed { + +class GraphEdgeBlob { + public: + GraphEdgeBlob() {} + virtual ~GraphEdgeBlob() {} + size_t size() { return id_arr.size(); } + virtual void add_edge(uint64_t id, float weight); + uint64_t get_id(int idx) { return id_arr[idx]; } + virtual float get_weight(int idx) { return 1; } + + protected: + std::vector id_arr; +}; + +class WeightedGraphEdgeBlob : public GraphEdgeBlob { + public: + WeightedGraphEdgeBlob() {} + virtual ~WeightedGraphEdgeBlob() {} + virtual void add_edge(uint64_t id, float weight); + virtual float get_weight(int idx) { return weight_arr[idx]; } + + protected: + std::vector weight_arr; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.cc b/paddle/fluid/distributed/table/graph/graph_node.cc new file mode 100644 index 00000000000..816d31b9790 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_node.h" +#include +namespace paddle { +namespace distributed { + +GraphNode::~GraphNode() { + if (sampler != nullptr) { + delete sampler; + sampler = nullptr; + } + if (edges != nullptr) { + delete edges; + edges = nullptr; + } +} + +int Node::weight_size = sizeof(float); +int Node::id_size = sizeof(uint64_t); +int Node::int_size = sizeof(int); + +int Node::get_size(bool need_feature) { return id_size + int_size; } + +void Node::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + memcpy(buffer, &feat_num, sizeof(int)); +} + +void Node::recover_from_buffer(char* buffer) { memcpy(&id, buffer, id_size); } + +int FeatureNode::get_size(bool need_feature) { + int size = id_size + int_size; // id, feat_num + if (need_feature) { + size += feature.size() * int_size; + for (const std::string& fea : feature) { + size += fea.size(); + } + } + return size; +} + +void GraphNode::build_edges(bool is_weighted) { + if (edges == nullptr) { + if (is_weighted == true) { + edges = new WeightedGraphEdgeBlob(); + } else { + edges = new GraphEdgeBlob(); + } + } +} +void GraphNode::build_sampler(std::string sample_type) { + if (sample_type == "random") { + sampler = new RandomSampler(); + } else if (sample_type == "weighted") { + sampler = new WeightedSampler(); + } + sampler->build(edges); +} +void FeatureNode::to_buffer(char* buffer, bool need_feature) { + memcpy(buffer, &id, id_size); + buffer += id_size; + + int feat_num = 0; + int feat_len; + if (need_feature) { + feat_num += feature.size(); + memcpy(buffer, &feat_num, sizeof(int)); + buffer += sizeof(int); + for (int i = 0; i < feat_num; ++i) { + feat_len = feature[i].size(); + memcpy(buffer, &feat_len, sizeof(int)); + buffer += sizeof(int); + memcpy(buffer, feature[i].c_str(), feature[i].size()); + buffer += feature[i].size(); + } + } else { + memcpy(buffer, &feat_num, sizeof(int)); + } +} +void FeatureNode::recover_from_buffer(char* buffer) { + int feat_num, feat_len; + memcpy(&id, buffer, id_size); + buffer += id_size; + + memcpy(&feat_num, buffer, sizeof(int)); + buffer += sizeof(int); + + feature.clear(); + for (int i = 0; i < feat_num; ++i) { + memcpy(&feat_len, buffer, sizeof(int)); + buffer += sizeof(int); + + char str[feat_len + 1]; + memcpy(str, buffer, feat_len); + buffer += feat_len; + str[feat_len] = '\0'; + feature.push_back(std::string(str)); + } +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_node.h b/paddle/fluid/distributed/table/graph/graph_node.h new file mode 100644 index 00000000000..8ad795ac97b --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_node.h @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +namespace paddle { +namespace distributed { + +class Node { + public: + Node() {} + Node(uint64_t id) : id(id) {} + virtual ~Node() {} + static int id_size, int_size, weight_size; + uint64_t get_id() { return id; } + void set_id(uint64_t id) { this->id = id; } + + virtual void build_edges(bool is_weighted) {} + virtual void build_sampler(std::string sample_type) {} + virtual void add_edge(uint64_t id, float weight) {} + virtual std::vector sample_k(int k) { return std::vector(); } + virtual uint64_t get_neighbor_id(int idx) { return 0; } + virtual float get_neighbor_weight(int idx) { return 1.; } + + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { return std::string(""); } + virtual void set_feature(int idx, std::string str) {} + virtual void set_feature_size(int size) {} + virtual int get_feature_size() { return 0; } + + protected: + uint64_t id; +}; + +class GraphNode : public Node { + public: + GraphNode() : Node(), sampler(nullptr), edges(nullptr) {} + GraphNode(uint64_t id) : Node(id), sampler(nullptr), edges(nullptr) {} + virtual ~GraphNode(); + virtual void build_edges(bool is_weighted); + virtual void build_sampler(std::string sample_type); + virtual void add_edge(uint64_t id, float weight) { + edges->add_edge(id, weight); + } + virtual std::vector sample_k(int k) { return sampler->sample_k(k); } + virtual uint64_t get_neighbor_id(int idx) { return edges->get_id(idx); } + virtual float get_neighbor_weight(int idx) { return edges->get_weight(idx); } + + protected: + Sampler *sampler; + GraphEdgeBlob *edges; +}; + +class FeatureNode : public Node { + public: + FeatureNode() : Node() {} + FeatureNode(uint64_t id) : Node(id) {} + virtual ~FeatureNode() {} + virtual int get_size(bool need_feature); + virtual void to_buffer(char *buffer, bool need_feature); + virtual void recover_from_buffer(char *buffer); + virtual std::string get_feature(int idx) { + if (idx < (int)this->feature.size()) { + return this->feature[idx]; + } else { + return std::string(""); + } + } + + virtual void set_feature(int idx, std::string str) { + if (idx >= (int)this->feature.size()) { + this->feature.resize(idx + 1); + } + this->feature[idx] = str; + } + virtual void set_feature_size(int size) { this->feature.resize(size); } + virtual int get_feature_size() { return this->feature.size(); } + + template + static std::string parse_value_to_bytes(std::vector feat_str) { + T v; + size_t Tsize = sizeof(T) * feat_str.size(); + char buffer[Tsize]; + for (size_t i = 0; i < feat_str.size(); i++) { + std::stringstream ss(feat_str[i]); + ss >> v; + std::memcpy(buffer + sizeof(T) * i, (char *)&v, sizeof(T)); + } + return std::string(buffer, Tsize); + } + + template + static std::vector parse_bytes_to_array(std::string feat_str) { + T v; + std::vector out; + size_t start = 0; + const char *buffer = feat_str.data(); + while (start < 
feat_str.size()) { + std::memcpy((char *)&v, buffer + start, sizeof(T)); + start += sizeof(T); + out.push_back(v); + } + return out; + } + + protected: + std::vector feature; +}; +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc new file mode 100644 index 00000000000..3a680875e3d --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.cc @@ -0,0 +1,150 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/table/graph/graph_weighted_sampler.h" +#include +#include +namespace paddle { +namespace distributed { + +void RandomSampler::build(GraphEdgeBlob *edges) { this->edges = edges; } + +std::vector RandomSampler::sample_k(int k) { + int n = edges->size(); + if (k > n) { + k = n; + } + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + std::vector sample_result; + std::unordered_map replace_map; + while (k--) { + int rand_int = rand() % n; + auto iter = replace_map.find(rand_int); + if (iter == replace_map.end()) { + sample_result.push_back(rand_int); + } else { + sample_result.push_back(iter->second); + } + + iter = replace_map.find(n - 1); + if (iter == replace_map.end()) { + replace_map[rand_int] = n - 1; + } else { + replace_map[rand_int] = iter->second; + } + --n; + } + return sample_result; +} + +WeightedSampler::WeightedSampler() { + left = nullptr; + right = nullptr; + edges = nullptr; +} + +WeightedSampler::~WeightedSampler() { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } +} + +void WeightedSampler::build(GraphEdgeBlob *edges) { + if (left != nullptr) { + delete left; + left = nullptr; + } + if (right != nullptr) { + delete right; + right = nullptr; + } + return build_one((WeightedGraphEdgeBlob *)edges, 0, edges->size()); +} + +void WeightedSampler::build_one(WeightedGraphEdgeBlob *edges, int start, + int end) { + count = 0; + this->edges = edges; + if (start + 1 == end) { + left = right = nullptr; + idx = start; + count = 1; + weight = edges->get_weight(idx); + + } else { + left = new WeightedSampler(); + right = new WeightedSampler(); + left->build_one(edges, start, start + (end - start) / 2); + right->build_one(edges, start + (end - start) / 2, end); + weight = left->weight + right->weight; + count = left->count + right->count; + } +} +std::vector WeightedSampler::sample_k(int k) { + if (k > count) { + k = count; + } + std::vector sample_result; + float subtract; + std::unordered_map subtract_weight_map; + std::unordered_map subtract_count_map; + struct timespec tn; + clock_gettime(CLOCK_REALTIME, &tn); + srand(tn.tv_nsec); + while (k--) { + float query_weight = rand() % 100000 / 100000.0; + query_weight *= weight - subtract_weight_map[this]; + sample_result.push_back(sample(query_weight, subtract_weight_map, + subtract_count_map, subtract)); + } 
+ return sample_result; +} + +int WeightedSampler::sample( + float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract) { + if (left == nullptr) { + subtract_weight_map[this] = weight; + subtract = weight; + subtract_count_map[this] = 1; + return idx; + } + int left_count = left->count - subtract_count_map[left]; + int right_count = right->count - subtract_count_map[right]; + float left_subtract = subtract_weight_map[left]; + int return_idx; + if (right_count == 0 || + left_count > 0 && left->weight - left_subtract >= query_weight) { + return_idx = left->sample(query_weight, subtract_weight_map, + subtract_count_map, subtract); + } else { + return_idx = + right->sample(query_weight - (left->weight - left_subtract), + subtract_weight_map, subtract_count_map, subtract); + } + subtract_weight_map[this] += subtract; + subtract_count_map[this]++; + return return_idx; +} +} +} diff --git a/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h new file mode 100644 index 00000000000..1787ab23b04 --- /dev/null +++ b/paddle/fluid/distributed/table/graph/graph_weighted_sampler.h @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
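RandomSampler::sample_k above draws k distinct neighbour indices with a hash-map variant of Fisher-Yates, so it never materialises or shuffles the whole edge list. The same trick in a short Python sketch (illustrative only; names are not taken from the C++ code):

# Draw k distinct indices from range(n) using the replace_map idea of RandomSampler::sample_k.
import random

def sample_k_without_replacement(n, k):
    k = min(k, n)
    replace = {}                               # slot index -> value currently living in that slot
    result = []
    for _ in range(k):
        r = random.randrange(n)
        result.append(replace.get(r, r))       # take whatever currently occupies slot r
        replace[r] = replace.get(n - 1, n - 1) # move the last live slot into r
        n -= 1
    return result

print(sample_k_without_replacement(10, 4))     # e.g. [7, 2, 9, 0]

WeightedSampler, by contrast, builds a weight-balanced binary tree over the edges and, per query, subtracts the weights and counts of already-sampled leaves, which is what the subtract_weight_map/subtract_count_map bookkeeping above implements.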
+ +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/table/graph/graph_edge.h" +namespace paddle { +namespace distributed { + +class Sampler { + public: + virtual ~Sampler() {} + virtual void build(GraphEdgeBlob *edges) = 0; + virtual std::vector sample_k(int k) = 0; +}; + +class RandomSampler : public Sampler { + public: + virtual ~RandomSampler() {} + virtual void build(GraphEdgeBlob *edges); + virtual std::vector sample_k(int k); + GraphEdgeBlob *edges; +}; + +class WeightedSampler : public Sampler { + public: + WeightedSampler(); + virtual ~WeightedSampler(); + WeightedSampler *left, *right; + float weight; + int count; + int idx; + GraphEdgeBlob *edges; + virtual void build(GraphEdgeBlob *edges); + virtual void build_one(WeightedGraphEdgeBlob *edges, int start, int end); + virtual std::vector sample_k(int k); + + private: + int sample(float query_weight, + std::unordered_map &subtract_weight_map, + std::unordered_map &subtract_count_map, + float &subtract); +}; +} +} diff --git a/paddle/fluid/distributed/table/table.h b/paddle/fluid/distributed/table/table.h index 8f014ac98ba..5bc818ff474 100644 --- a/paddle/fluid/distributed/table/table.h +++ b/paddle/fluid/distributed/table/table.h @@ -22,7 +22,7 @@ #include #include "paddle/fluid/distributed/table/accessor.h" #include "paddle/fluid/distributed/table/depends/sparse_utils.h" -#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/platform/device_context.h" @@ -88,31 +88,6 @@ class Table { return 0; } - // only for graph table - virtual int32_t pull_graph_list(int start, int total_size, - std::unique_ptr &buffer, - int &actual_size, bool need_feature, - int step = 1) { - return 0; - } - // only for graph table - virtual int32_t random_sample_neighboors( - uint64_t *node_ids, int sample_size, - std::vector> &buffers, - std::vector &actual_sizes) { - return 0; - } - - virtual int32_t random_sample_nodes(int sample_size, - std::unique_ptr &buffers, - int &actual_sizes) { - return 0; - } - virtual int32_t get_node_feat(const std::vector &node_ids, - const std::vector &feature_names, - std::vector> &res) { - return 0; - } virtual int32_t pour() { return 0; } virtual void clear() = 0; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 79ab2795963..b268bb449e1 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -33,7 +33,7 @@ limitations under the License. 
*/ #include "paddle/fluid/distributed/service/ps_client.h" #include "paddle/fluid/distributed/service/sendrecv.pb.h" #include "paddle/fluid/distributed/service/service.h" -#include "paddle/fluid/distributed/table/graph_node.h" +#include "paddle/fluid/distributed/table/graph/graph_node.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" -- GitLab From 297290a89edf4e8a6880c579a9287d2809e40df7 Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 7 Apr 2021 21:03:20 +0800 Subject: [PATCH 172/486] add uint8 type for flatten op (#32120) * add uint8 type for flatten;test=develop --- paddle/fluid/operators/flatten_op.cc | 8 ++++++++ paddle/fluid/operators/flatten_op.cu.cc | 8 ++++++++ python/paddle/fluid/layers/nn.py | 7 +++++-- python/paddle/fluid/tests/unittests/test_flatten2_op.py | 2 +- .../tests/unittests/test_flatten_contiguous_range_op.py | 2 +- python/paddle/tensor/manipulation.py | 5 +++-- 6 files changed, 26 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index d23beea7e4e..c94ce4174f2 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -429,6 +429,7 @@ REGISTER_OPERATOR(flatten_contiguous_range_grad, REGISTER_OP_CPU_KERNEL( flatten, ops::FlattenKernel, ops::FlattenKernel, + ops::FlattenKernel, ops::FlattenKernel, ops::FlattenKernel, ops::FlattenKernel); @@ -436,12 +437,14 @@ REGISTER_OP_CPU_KERNEL( flatten_grad, ops::FlattenGradKernel, ops::FlattenGradKernel, + ops::FlattenGradKernel, ops::FlattenGradKernel, ops::FlattenGradKernel, ops::FlattenGradKernel); REGISTER_OP_CPU_KERNEL( flatten2, ops::Flatten2Kernel, ops::Flatten2Kernel, + ops::Flatten2Kernel, ops::Flatten2Kernel, ops::Flatten2Kernel, ops::Flatten2Kernel); @@ -449,6 +452,7 @@ REGISTER_OP_CPU_KERNEL( flatten2_grad, ops::Flatten2GradKernel, ops::Flatten2GradKernel, + ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel); @@ -458,6 +462,8 @@ REGISTER_OP_CPU_KERNEL( float>, ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, ops::FlattenContiguousRangeKernel, ops::FlattenContiguousRangeKernel, @@ -469,6 +475,8 @@ REGISTER_OP_CPU_KERNEL( float>, ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, ops::FlattenContiguousRangeGradKernel, ops::FlattenContiguousRangeGradKernel, ops::FlattenKernel, + ops::FlattenKernel, ops::FlattenKernel, ops::FlattenKernel, ops::FlattenKernel); @@ -26,12 +27,14 @@ REGISTER_OP_CUDA_KERNEL( flatten_grad, ops::FlattenGradKernel, ops::FlattenGradKernel, + ops::FlattenGradKernel, ops::FlattenGradKernel, ops::FlattenGradKernel, ops::FlattenGradKernel); REGISTER_OP_CUDA_KERNEL( flatten2, ops::Flatten2Kernel, ops::Flatten2Kernel, + ops::Flatten2Kernel, ops::Flatten2Kernel, ops::Flatten2Kernel, ops::Flatten2Kernel); @@ -39,6 +42,7 @@ REGISTER_OP_CUDA_KERNEL( flatten2_grad, ops::Flatten2GradKernel, ops::Flatten2GradKernel, + ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel, ops::Flatten2GradKernel); @@ -48,6 +52,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, ops::FlattenContiguousRangeKernel, ops::FlattenContiguousRangeKernel, @@ -59,6 +65,8 @@ REGISTER_OP_CUDA_KERNEL( float>, ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, 
ops::FlattenContiguousRangeGradKernel, ops::FlattenContiguousRangeGradKernel= axis. A tensor with type float32, - float64, int8, int32, int64. + float64, int8, int32, int64, uint8. axis (int): Indicate up to which input dimensions (exclusive) should be flattened to the outer dimension of the output. The value for axis must be in the range [0, R], where R @@ -9962,14 +9962,17 @@ def flatten(x, axis=1, name=None): .. code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() x = fluid.data(name="x", shape=[4, 4, 3], dtype="float32") # x shape is [4, 4, 3] out = fluid.layers.flatten(x=x, axis=2) # out shape is [16, 3] """ check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten') + x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], + 'flatten') helper = LayerHelper('flatten', **locals()) if not (isinstance(x, Variable)): diff --git a/python/paddle/fluid/tests/unittests/test_flatten2_op.py b/python/paddle/fluid/tests/unittests/test_flatten2_op.py index 189a63a0868..0d50c65558a 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten2_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten2_op.py @@ -81,7 +81,7 @@ class TestFlatten2OpError(unittest.TestCase): self.assertRaises(TypeError, test_Variable) def test_type(): - # dtype must be float32, float64, int8, int32, int64. + # dtype must be float32, float64, int8, int32, int64, uint8. x2 = fluid.layers.data( name='x2', shape=[3, 2, 4, 5], dtype='float16') fluid.layers.flatten(x2, axis=1) diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index 28803f5ac62..d6cc6ecffc1 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -166,7 +166,7 @@ class TestFlatten2OpError(unittest.TestCase): self.assertRaises(ValueError, test_ValueError3) def test_type(): - # dtype must be float32, float64, int8, int32, int64. + # dtype must be float32, float64, int8, int32, int64, uint8. x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100. x2 = x2.astype('float16') diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 9bcda74d116..377435a5000 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -212,7 +212,7 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): Args: x (Tensor): A tensor of number of dimentions >= axis. A tensor with data type float32, - float64, int8, int32, int64. + float64, int8, int32, int64, uint8. start_axis (int): the start axis to flatten stop_axis (int): the stop axis to flatten name(str, Optional): For details, please refer to :ref:`api_guide_Name`. 
@@ -249,7 +249,8 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): raise ValueError("The input x should be a Tensor") check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten') + x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], + 'flatten') helper = LayerHelper('flatten', **locals()) x_dim = len(x.shape) -- GitLab From f74f97624d4c4ea5fbed7ca497872f46c92431cd Mon Sep 17 00:00:00 2001 From: Thomas Young <35565423+HexToString@users.noreply.github.com> Date: Thu, 8 Apr 2021 13:19:45 +0800 Subject: [PATCH 173/486] fix the XXX_GRAD_CASE bug by HexToString (#32004) --- paddle/fluid/operators/expand_as_op.h | 29 ++++++++++++++++++++---- paddle/fluid/operators/expand_as_v2_op.h | 26 ++++++++++++++++----- paddle/fluid/operators/expand_op.h | 26 ++++++++++++++++----- paddle/fluid/operators/expand_v2_op.h | 26 ++++++++++++++++----- paddle/fluid/operators/meshgrid_op.h | 17 ++++++++++---- paddle/fluid/operators/tile_op.h | 26 ++++++++++++++++----- 6 files changed, 116 insertions(+), 34 deletions(-) mode change 100644 => 100755 paddle/fluid/operators/expand_as_op.h mode change 100644 => 100755 paddle/fluid/operators/expand_as_v2_op.h mode change 100644 => 100755 paddle/fluid/operators/expand_op.h mode change 100644 => 100755 paddle/fluid/operators/expand_v2_op.h mode change 100644 => 100755 paddle/fluid/operators/meshgrid_op.h mode change 100644 => 100755 paddle/fluid/operators/tile_op.h diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h old mode 100644 new mode 100755 index 4cefadb24ec..406455af741 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -25,7 +25,14 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define EXPAND_AS_TEMPLATE(z, n, data) \ case n + 1: { \ ExpandAs(context); \ @@ -33,10 +40,10 @@ limitations under the License. 
*/ } #define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ +#define EXPAND_AS_GRAD_CASE(n) \ + case n + 1: { \ + ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ } #define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) @@ -145,6 +152,18 @@ class ExpandAsGradKernel : public framework::OpKernel { framework::TensorCopy(*in0, context.GetPlace(), context.device_context(), out0); } else { + PADDLE_ENFORCE_GE(dims, 1, + platform::errors::InvalidArgument( + "The rank of the input 'Out@GRAD' for " + "expand_as_grad op must be greater than or " + "equal to 1, but the value received is %d.", + dims)); + PADDLE_ENFORCE_LE(dims, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank of the input 'Out@GRAD' for " + "expand_as_grad op must be less than or equal " + "to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, dims)); switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) default: diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h old mode 100644 new mode 100755 index 441dd353804..6df4c592378 --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -26,7 +26,14 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define EXPAND_AS_TEMPLATE(z, n, data) \ case n + 1: { \ ExpandAs(context); \ @@ -34,10 +41,10 @@ limitations under the License. */ } #define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ +#define EXPAND_AS_GRAD_CASE(n) \ + case n + 1: { \ + ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ } #define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) @@ -178,7 +185,14 @@ class ExpandAsV2GradKernel : public framework::OpKernel { "expand_as_v2_grad op must be less than or equal " "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); - switch (dims) { REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (dims) { + REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But " + "received tensor's rank = %d.", + dims)); + } } } diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h old mode 100644 new mode 100755 index abd525497d6..e1a1ce0a817 --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -28,7 +28,14 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define EXPAND_TEMPLATE(z, n, data) \ case n + 1: { \ Expand(context); \ @@ -36,10 +43,10 @@ limitations under the License. */ } #define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ +#define EXPAND_GRAD_CASE(n) \ + case n + 1: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ } #define EXPAND_GRAD_TEMPLATE(z, n, data) \ BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) @@ -219,7 +226,14 @@ class ExpandGradKernel : public framework::OpKernel { "for Op(expand_grad) must be less than or equal " "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); - switch (dims) { REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. But " + "received tensor's rank = %d.", + dims)); + } } } diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h old mode 100644 new mode 100755 index af5fdf22cd9..8a87a067c51 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -29,7 +29,14 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define EXPAND_TEMPLATE(z, n, data) \ case n + 1: { \ Expand(context); \ @@ -37,10 +44,10 @@ limitations under the License. */ } #define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ +#define EXPAND_GRAD_CASE(n) \ + case n + 1: { \ + ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ } #define EXPAND_GRAD_TEMPLATE(z, n, data) \ BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) @@ -263,7 +270,14 @@ class ExpandV2GradKernel : public framework::OpKernel { "expand_v2_grad op must be less than or equal " "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); - switch (dims) { REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (dims) { + REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. 
But " + "received tensor's rank = %d.", + dims)); + } } } diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h old mode 100644 new mode 100755 index 345e007de4a..2aad894e11d --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -29,7 +29,14 @@ #include "paddle/fluid/platform/errors.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define MESHGRID_TEMPLATE(z, n, data) \ case n + 1: { \ MeshgridForward(context); \ @@ -38,10 +45,10 @@ #define REP_MESHGRID_TEMPLATE(n) BOOST_PP_REPEAT(n, MESHGRID_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define MESHGRID_GRAD_CASE(n) \ - case n: { \ - MeshgridBackward(context); \ - break; \ +#define MESHGRID_GRAD_CASE(n) \ + case n + 1: { \ + MeshgridBackward(context); \ + break; \ } #define MESHGRID_GRAD_TEMPLATE(z, n, data) \ BOOST_PP_IF(COND(n), MESHGRID_GRAD_CASE(n), ) diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h old mode 100644 new mode 100755 index 4bbde8d08e0..1fb0fa6ce51 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -29,7 +29,14 @@ limitations under the License. */ #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 - +// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. +// Usage: BOOST_PP_REPEAT(count, macro, data). +// This macro expands to the sequence: +// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). +// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). +// So the range of n is 0-5(which is count-1). +// We want to generate case 1-6 instead of case 0-5. +// So we need to change n to n + 1. #define TILE_TEMPLATE(z, n, data) \ case n + 1: { \ Tile(context); \ @@ -37,10 +44,10 @@ limitations under the License. */ } #define REP_TILE_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_TEMPLATE, ~) #define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define TILE_GRAD_CASE(n) \ - case n: { \ - TileBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ +#define TILE_GRAD_CASE(n) \ + case n + 1: { \ + TileBackward(context, reshape_dims_vec, reduce_dims_vec); \ + break; \ } #define TILE_GRAD_TEMPLATE(z, n, data) BOOST_PP_IF(COND(n), TILE_GRAD_CASE(n), ) #define REP_TILE_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_GRAD_TEMPLATE, ~) @@ -243,7 +250,14 @@ class TileGradKernel : public framework::OpKernel { "must be less than or equal " "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); - switch (dims) { REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (dims) { + REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support tensor with rank being between 1 and 6. 
But " + "received tensor's rank = %d.", + dims)); + } } } -- GitLab From 723020337d9bf392398e4990a7abf6df7940a69d Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Thu, 8 Apr 2021 13:20:21 +0800 Subject: [PATCH 174/486] fix bug (#32135) --- .../tests/unittests/test_parallel_dygraph_dataparallel.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 1d2a3975190..5491b451368 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -18,7 +18,7 @@ import unittest import time import paddle.fluid as fluid -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, get_gpus, start_local_trainers +from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, start_local_trainers def get_cluster_from_args(selected_gpus): @@ -41,6 +41,11 @@ def get_cluster_from_args(selected_gpus): return get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus) +def get_gpus(selected_gpus): + selected_gpus = [x.strip() for x in selected_gpus.split(',')] + return selected_gpus + + class TestMultipleGpus(unittest.TestCase): def run_mnist_2gpu(self, target_file_name): if not fluid.core.is_compiled_with_cuda( -- GitLab From 6e65fe02e262e39dbb6989f24ff5e564cc15fe50 Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Thu, 8 Apr 2021 14:34:21 +0800 Subject: [PATCH 175/486] The unsupported_fp16_list using in AMP will be created automatically during the runtime. (#32102) * Use the runtime to create the unsupported_fp16_list using in AMP. * Add more infos about supported ops. * Add some comments for the function of OpSupportedInfos. * Fix the unit test of test_multi_precision_fp16_train. --- paddle/fluid/pybind/pybind.cc | 63 ++++++++ .../contrib/mixed_precision/fp16_lists.py | 148 +----------------- .../tests/test_multi_precision_fp16_train.py | 3 +- 3 files changed, 71 insertions(+), 143 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 5bf70d1126b..215c81a00e8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -14,11 +14,15 @@ limitations under the License. */ #include #include +#include #include +#include #include #include #include // NOLINT // for call_once #include +#include +#include #include #include #include @@ -189,6 +193,64 @@ bool SupportsBfloat16FastPerformance() { #endif } +// According to the input `place` and `dtype`, this function returns a tuple +// consists of three sets: +// 1) All operators registered in the Paddle framework. +// 2) All operators supported for `place` and `dtype`. +// 3) All operators unsupported for `place` and `dtype`. +// The input `place` is a type of string, which can only be `GPU` or `CPU`. +// The input `dtype` is a type of paddle::framework::proto::VarType::Type, +// which can be paddle::framework::proto::VarType::FP16, +// paddle::framework::proto::VarType::FP32 and so on. 
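+//
+// For illustration only: once bound to Python as `op_supported_infos` (see the
+// pybind registration added further down in this diff), a typical call looks
+// like:
+//   all_ops, supported_ops, unsupported_ops = core.op_supported_infos(
+//       'GPU', core.VarDesc.VarType.FP16)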
+std::tuple, std::unordered_set, + std::unordered_set> +OpSupportedInfos(const std::string &place, + framework::proto::VarType::Type dtype) { + std::string query_place; + std::transform(place.begin(), place.end(), std::back_inserter(query_place), + [](unsigned char c) { return std::toupper(c); }); + using fn_type = std::add_pointer::type; + std::unordered_map is_target_place{ + {"GPU", &platform::is_gpu_place}, {"CPU", &platform::is_cpu_place}, + }; + PADDLE_ENFORCE_NE( + is_target_place.count(query_place), 0, + platform::errors::InvalidArgument( + "The argument `place` should be 'GPU' or 'CPU', but get '%s'.", + place)); + + std::unordered_set all_ops; + const auto &op_info = framework::OpInfoMap::Instance().map(); + for (auto it = op_info.begin(); it != op_info.end(); it++) { + all_ops.emplace(it->first); + } + + std::unordered_set supported_ops; + auto &all_kernels = framework::OperatorWithKernel::AllOpKernels(); + for (auto it = all_kernels.begin(); it != all_kernels.end(); it++) { + for (auto &kernel_type : it->second) { + if (is_target_place[query_place](kernel_type.first.place_) && + kernel_type.first.data_type_ == dtype) { + supported_ops.emplace(it->first); + } + } + } + + std::unordered_set unsupported_ops; + for (auto &op : all_ops) { + if (!supported_ops.count(op)) { + unsupported_ops.emplace(op); + } + } + + VLOG(4) << "-- The size of all_ops: " << all_ops.size() << " --"; + VLOG(4) << "-- The size of supported_ops: " << supported_ops.size() << " --"; + VLOG(4) << "-- The size of unsupported_ops: " << unsupported_ops.size() + << " --"; + return std::make_tuple(std::move(all_ops), std::move(supported_ops), + std::move(unsupported_ops)); +} + bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; @@ -1770,6 +1832,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance); + m.def("op_supported_infos", OpSupportedInfos); m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); m.def("_cuda_synchronize", [](const platform::CUDAPlace &place) { diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 6a524af4ee2..f940f6a3143 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -13,6 +13,7 @@ # limitations under the License. import copy +from ... import core __all__ = ["CustomOpLists", "AutoMixedPrecisionLists"] @@ -147,147 +148,10 @@ gray_list = { } # The set of ops that don't support fp16 calculation -unsupported_fp16_list = { - # from python/paddle/fluid/layers/io.py - 'send', - 'send_barrier', - 'recv', - 'fetch_barrier', - 'create_py_reader', - 'create_double_buffer_reader', - 'read', - 'load', - - # from python/paddle/fluid/control_flow.py - 'increment', - 'less_than', - 'less_equal', - 'greater_than', - 'greater_equal', - 'equal', - 'not_equal', - 'read_from_array', - 'shrink_rnn_memory', - 'lod_array_length', - 'logical_and', - 'logical_or', - 'logical_xor', - 'logical_not', - 'print', - 'conditional_block', - 'while', - 'ifelse', - 'is_empty', - 'lstm', - 'cudnn_lstm', - 'lstmp', - 'gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'bpr_loss', - 'chunk_eval', - 'sequence_conv', - 'sequence_softmax', - # Depthwise conv2d isn't fast and safe currently. 
- # ref: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/grappler/optimizers/auto_mixed_precision_lists.h#L79 - 'depthwise_conv2d', - # Tensor Core kernels are not available for 3D convolutions currently. - 'conv3d', - 'sequence_pool', - 'sequence_concat', - 'sequence_slice', - 'data_norm', - 'group_norm', - 'spectral_norm', - 'depthwise_conv2d_transpose', - 'sequence_expand', - 'conv_transposed2d', - 'conv_transposed3d', - 'sequence_expand_as', - 'sequence_pad', - 'sequence_unpad', - 'sequence_erase', - 'beam_search', - 'beam_search_decode', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'reduce_all', - 'reduce_any', - 'split', - 'edit_distance', - 'ctc_align', - 'warpctc', - 'sequence_reshape', - 'nce', - 'hierarchical_sigmoid', - 'im2sequence', - 'row_conv', - 'multiplex', - 'sample_logits', - 'one_hot', - 'smooth_l1_loss', - 'squeeze2', - 'unsqueeze2', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'selu', - 'crop', - 'affine_grid', - 'rank_loss', - 'margin_rank_loss', - 'pad2d', - 'elu', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'sequence_enumerate', - 'sequence_mask', - 'expand', - 'sampling_id', - 'maxout', - 'space_to_depth', - 'sequence_reverse', - 'similarity_focus', - 'hash', - 'grid_sampler', - 'log_loss', - 'teacher_student_sigmoid_loss', - 'add_position_encoding', - 'bilinear_tensor_product', - 'shuffle_channel', - 'temporal_shift', - 'psroi_pool', - 'huber_loss', - 'kldiv_loss', - 'tree_conv', - 'pixel_shuffle', - 'fsp', - 'cvm', - 'affine_channel', - 'roi_pool', - 'roi_align', - 'anchor_generator', - 'generate_proposals', - 'generate_proposal_labels', - 'generate_mask_labels', - # fp16 is slower than fp32, though fp16 is supported. - 'lookup_table', - 'lookup_table_v2', -} +# lookup_table fp16 is slower than fp32, though fp16 is supported. 
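+# core.op_supported_infos(place, dtype) returns three sets: all registered ops,
+# the ops that have a kernel registered for the given place ('GPU' or 'CPU')
+# and dtype, and the remaining unsupported ops; only the unsupported set is
+# needed here.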
+_, _, _sys_unsupported_fp16_list = core.op_supported_infos( + 'GPU', core.VarDesc.VarType.FP16) +unsupported_fp16_list = {'lookup_table', + 'lookup_table_v2'} | _sys_unsupported_fp16_list CustomOpLists = AutoMixedPrecisionLists diff --git a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py index b190a5d02ef..850b267411e 100644 --- a/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py +++ b/python/paddle/fluid/contrib/tests/test_multi_precision_fp16_train.py @@ -258,7 +258,8 @@ class TestAmpWithNonIterableDataLoader(unittest.TestCase): cast_model_to_fp16(main_prog, use_fp16_guard=False) def test_non_iterable_dataloader(self): - self.decorate_with_data_loader() + if fluid.core.is_compiled_with_cuda(): + self.decorate_with_data_loader() if __name__ == '__main__': -- GitLab From 54344964daef17517b13a866a5998e0f8f1e827d Mon Sep 17 00:00:00 2001 From: JZ-LIANG <38102074+JZ-LIANG@users.noreply.github.com> Date: Thu, 8 Apr 2021 15:33:32 +0800 Subject: [PATCH 176/486] 4D Hybrid Parallelism (#32134) --- .../meta_optimizers/sharding_optimizer.py | 13 +- .../unittests/fleet_meta_optimizer_base.py | 32 ++++ .../test_fleet_sharding_meta_optimizer.py | 163 ++++++++++++------ 3 files changed, 153 insertions(+), 55 deletions(-) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index a83ae226a9d..2c4ad33c361 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -134,8 +134,17 @@ class ShardingOptimizer(MetaOptimizerBase): self.pp_degree, self.dp_degree, ) - self.hybrid_dp = self.user_defined_strategy.sharding_configs[ - "hybrid_dp"] + # FIXME (JZ-LIANG) deprecated hybrid_dp + if self.user_defined_strategy.sharding_configs["hybrid_dp"]: + logging.warning( + "[hybrid_dp] API setting is deprecated. Now when dp_degree >= 2, its will be in hybrid dp mode automatically" + ) + assert self.dp_degree >= 1 + if self.dp_degree > 1: + self.hybrid_dp = True + else: + self.hybrid_dp = False + # NOTE (JZ-LIANG) # there 2 kind of modes for gradient-merge and hybrid-dp in mixed parallism [sharding] and [pipeline]. 
# we distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place according different mode to have best performance: diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 549975f5d3f..730fa4ca60d 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -50,6 +50,38 @@ class TestFleetMetaOptimizer(unittest.TestCase): strategy = paddle.distributed.fleet.DistributedStrategy() return avg_cost, strategy + def pp_net(self, main_prog, startup_prog, pp_degree=2): + def fc_block(input_x): + fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + fc_3 = paddle.fluid.layers.fc(input=fc_2, size=64, act='tanh') + return fc_3 + + with fluid.program_guard(main_prog, startup_prog): + with fluid.unique_name.guard(): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + with fluid.device_guard("gpu:0"): + input_x = paddle.fluid.layers.data( + name="x", shape=[32], dtype='float32') + input_y = paddle.fluid.layers.data( + name="y", shape=[1], dtype='int64') + + for stage_idx in range(pp_degree): + with fluid.device_guard("gpu:" + str(stage_idx)): + input_x = fc_block(input_x) + + with fluid.device_guard("gpu:" + str(pp_degree - 1)): + prediction = paddle.fluid.layers.fc(input=[input_x], + size=2, + act='softmax') + cost = paddle.fluid.layers.cross_entropy( + input=prediction, label=input_y) + avg_cost = paddle.fluid.layers.mean(x=cost) + + strategy = paddle.distributed.fleet.DistributedStrategy() + return avg_cost, strategy + def optimizer(self, loss, strategy, diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index f28bf89ff5c..4d1e936558a 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -298,6 +298,13 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): os.environ[ "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002,127.0.0.1:36003,127.0.0.1:36004" + # pre-assigned ring id + self.mp_ring_id = 0 + self.sharding_ring_id = 1 + self.dp_ring_id = 2 + self.global_ring_id = 3 + self.pp_ring_id = 20 + def test_sharding_with_mp(self): # NOTE(JZ-LIANG) MP parallelism need user to build model with MP API train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( @@ -323,7 +330,7 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): op.desc.attr("ring_id") for op in startup_prog_ops if op.type == "c_comm_init" ] - self.assertIn(0, created_ring_ids) + self.assertIn(self.mp_ring_id, created_ring_ids) # check correctness of MP group sharding_group_waiting_port = None @@ -368,7 +375,7 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): op.desc.attr("ring_id") for op in startup_prog_ops if op.type == "c_comm_init" ] - self.assertIn(2, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) # check correctness of sharding group sharding_group_waiting_port = None @@ -437,7 +444,7 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): op.desc.attr("ring_id") for op in startup_prog_ops if op.type == "c_comm_init" ] - self.assertIn(2, created_ring_ids) + self.assertIn(self.dp_ring_id, created_ring_ids) # check 
correctness of sharding group sharding_group_waiting_port = None @@ -460,56 +467,19 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): fw_bw_ops = [op.type for op in train_prog.blocks[0].ops] opt_ops = [op.type for op in train_prog.blocks[2].ops] self.assertEqual(fw_bw_ops, [ - 'fill_constant', - 'fill_constant', - 'fill_constant', - 'c_sync_calc_stream', - 'c_broadcast', - 'c_broadcast', - 'c_broadcast', - 'c_broadcast', - 'c_broadcast', - 'c_broadcast', - 'c_sync_comm_stream', - 'mul', - 'elementwise_add', - 'tanh', - 'mul', - 'elementwise_add', - 'tanh', - 'mul', - 'elementwise_add', - 'softmax', - 'cross_entropy2', - 'mean', - 'fill_constant', - 'scale', - 'mean_grad', - 'cross_entropy_grad2', - 'softmax_grad', - 'elementwise_add_grad', - 'mul_grad', - 'tanh_grad', - 'elementwise_add_grad', - 'mul_grad', - 'tanh_grad', - 'elementwise_add_grad', - 'mul_grad', - 'c_sync_calc_stream', - 'c_reduce_sum', - 'c_reduce_sum', - 'c_reduce_sum', - 'c_reduce_sum', - 'c_reduce_sum', - 'c_reduce_sum', - 'c_sync_comm_stream', - 'elementwise_add', - 'elementwise_add', - 'elementwise_add', - 'increment', - 'elementwise_mod', - 'equal', - 'conditional_block', + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'elementwise_add', 'elementwise_add', 'elementwise_add', + 'increment', 'elementwise_mod', 'equal', 'conditional_block' ]) self.assertEqual(opt_ops, [ 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', 'scale', @@ -524,6 +494,93 @@ class TestFleetMetaOptimizer(TestFleetMetaOptimizer): scale_ = float(op.desc.attr("scale")) self.assertEqual(scale_, 0.25) + def test_sharding_with_pp(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 0.1, + "sharding_degree": 2, + "hybrid_dp": False, + "gradient_merge_acc_step": 4, + "mp_degree": 1, + "pp_degree": 2 + } + strategy.pipeline = True + strategy.pipeline_configs = { + "schedule_mode": "1F1B", + "micro_batch_size": 2, + "accumulate_steps": 4, + } + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + self.assertEqual(startup_prog_op_types, [ + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', 'c_gen_nccl_id', + 'c_comm_init', 'fill_constant', 'c_allreduce_sum', 'c_gen_nccl_id', 
+ 'c_comm_init', 'fill_constant', 'c_allreduce_sum', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init' + ]) + + self.assertEqual(main_prog_op_types, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_sync_comm_stream', 'recv_v2', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', 'mul', + 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'softmax', + 'cross_entropy2', 'mean', 'fill_constant', 'scale', 'scale', + 'mean_grad', 'cross_entropy_grad2', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'send_v2', + 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'c_sync_comm_stream', 'fill_constant', 'sum', 'fill_constant', + 'sum', 'fill_constant', 'sum', 'fill_constant', 'sum', + 'fill_constant', 'sum', 'momentum', 'momentum', 'momentum', + 'momentum', 'momentum' + ]) + + # should has ring id for pp + created_ring_ids = [ + op.desc.attr("ring_id") for op in startup_prog_ops + if op.type == "c_comm_init" + ] + self.assertIn(self.sharding_ring_id, created_ring_ids) + self.assertIn(self.pp_ring_id, created_ring_ids) + + # check correctness of pp group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_1": + sharding_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(sharding_group_waiting_ports, ['127.0.0.1:36003']) + + # check correctness of sharding group + sharding_group_waiting_port = None + for op in startup_prog_ops: + if op.type == "c_gen_nccl_id" and op.desc.output_arg_names()[ + 0] == "nccl_id_2": + dp_group_waiting_ports = op.desc.attr("other_endpoints") + + self.assertEqual(dp_group_waiting_ports, ['127.0.0.1:36002']) + if __name__ == "__main__": unittest.main() -- GitLab From e45c3fa57d5d1dcc98b3744bcdbcd215bf0dccff Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 8 Apr 2021 17:47:50 +0800 Subject: [PATCH 177/486] Add LayerDict class (#31951) * add layerdict class * add docs and test cases for LayerDict class * remove the arguments type in function define * add update inputs type check --- .../test_imperative_container_layerdict.py | 105 +++++++ python/paddle/nn/__init__.py | 2 + python/paddle/nn/layer/__init__.py | 2 + python/paddle/nn/layer/container.py | 294 ++++++++++++++++++ 4 files changed, 403 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py create mode 100644 python/paddle/nn/layer/container.py diff --git a/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py b/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py new file mode 100644 index 00000000000..9cd3c6a8fb5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_container_layerdict.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle +from collections import OrderedDict + + +class TestLayerDict(unittest.TestCase): + def test_layer_dict(self): + layers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ]) + + layers_dicts = paddle.nn.LayerDict(sublayers=layers) + + def check_layer_dict(): + self.assertEqual(len(layers), len(layers_dicts)) + + for k1, k2 in zip(layers, layers_dicts): + self.assertIs(layers[k1], layers_dicts[k2]) + + for k, v in zip(layers, layers_dicts.children()): + self.assertIs(layers[k], v) + + for k in layers_dicts: + self.assertIs(layers[k], layers_dicts[k]) + + for k in layers.keys(): + self.assertTrue(k in layers_dicts) + + for k1, k2 in zip(layers.keys(), layers_dicts.keys()): + self.assertEqual(k1, k2) + + for k, v in layers_dicts.items(): + self.assertIs(layers[k], v) + + for v1, v2 in zip(layers.values(), layers_dicts.values()): + self.assertIs(v1, v2) + + check_layer_dict() + + layers['linear'] = paddle.nn.Linear(2, 4) + layers_dicts['linear'] = layers['linear'] + check_layer_dict() + + sublayer = OrderedDict([ + ('sigmod', paddle.nn.Sigmoid()), + ('relu', paddle.nn.ReLU()), + ]) + layers.update(sublayer) + layers_dicts.update(sublayer) + check_layer_dict() + + del layers['conv1d'] + del layers_dicts['conv1d'] + check_layer_dict() + + l = layers_dicts.pop('linear') + self.assertIs(layers['linear'], l) + layers.pop('linear') + check_layer_dict() + + layers_dicts.clear() + self.assertEqual(0, len(layers_dicts)) + layers.clear() + check_layer_dict() + + list_format_layers = [ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ] + layers = OrderedDict(list_format_layers) + layers_dicts.update(list_format_layers) + check_layer_dict() + + def test_layer_dict_error_inputs(self): + layers = [ + ('conv1d', paddle.nn.Conv1D(3, 2, 3), "conv1d"), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ] + + layers_dicts = paddle.nn.LayerDict() + self.assertRaises(ValueError, layers_dicts.update, layers) + + self.assertRaises(AssertionError, layers_dicts.update, 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 3a552d588be..79f21aadae6 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -151,6 +151,8 @@ from .layer.distance import PairwiseDistance #DEFINE_ALIAS from .layer.vision import PixelShuffle +from .layer.container import LayerDict #DEFINE_ALIAS + from .layer import loss #DEFINE_ALIAS from .layer import conv #DEFINE_ALIAS from .layer import vision #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 13fdde07087..17c4ca5c5d1 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -23,6 +23,7 @@ from . import rnn from . import vision from . import distance from . import transformer +from . 
import container from .activation import * from .loss import * @@ -99,3 +100,4 @@ from .norm import LocalResponseNorm #DEFINE_ALIAS from .vision import PixelShuffle #DEFINE_ALIAS from .distance import PairwiseDistance #DEFINE_ALIAS +from .container import LayerDict #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/container.py b/python/paddle/nn/layer/container.py new file mode 100644 index 00000000000..db317839ae8 --- /dev/null +++ b/python/paddle/nn/layer/container.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict +from ...fluid.dygraph.layers import Layer +from six.moves import collections_abc + +__all__ = ['LayerDict', ] + + +class LayerDict(Layer): + """ + LayerDict holds sublayers in the ordered dictionary, and sublayers it contains are properly registered. + Holded sublayers can be accessed like a regular ordered python dictionary. + + Parameters: + sublayers (LayerDict|OrderedDict|list[(key,Layer)...], optional): iterable of key/value pairs, the type of value is 'paddle.nn.Layer' . + + Examplex: + .. code-block:: python + + import paddle + import numpy as np + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layers_dict = paddle.nn.LayerDict(sublayers=sublayers) + + l = layers_dict['conv1d'] + + for k in layers_dict: + l = layers_dict[k] + + len(layers_dict) + #3 + + del layers_dict['conv2d'] + len(layers_dict) + #2 + + conv1d = layers_dict.pop('conv1d') + len(layers_dict) + #1 + + layers_dict.clear() + len(layers_dict) + #0 + + """ + + def __init__(self, sublayers=None): + super(LayerDict, self).__init__() + if sublayers is not None: + self.update(sublayers) + + def __getitem__(self, key): + return self._sub_layers[key] + + def __setitem__(self, key, sublayer): + return self.add_sublayer(key, sublayer) + + def __delitem__(self, key): + del self._sub_layers[key] + + def __len__(self): + return len(self._sub_layers) + + def __iter__(self): + return iter(self._sub_layers) + + def __contains__(self, key): + return key in self._sub_layers + + def clear(self): + """ + Clear all the sublayers in the LayerDict. + + Parameters: + None. + + Examplex: + .. code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + len(layer_dict) + #3 + + layer_dict.clear() + len(layer_dict) + #0 + + """ + self._sub_layers.clear() + + def pop(self, key): + """ + Remove the key from the LayerDict and return the layer of the key. + + Parameters: + key (str): the key to be removed. + + Examples: + .. 
code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + len(layer_dict) + #3 + + layer_dict.pop('conv2d') + len(layer_dict) + #2 + + """ + v = self[key] + del self[key] + return v + + def keys(self): + """ + Return the iterable of the keys in LayerDict. + + Parameters: + None. + + Examples: + .. code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + for k in layer_dict.keys(): + print(k) + + #conv1d + #conv2d + #conv3d + + """ + return self._sub_layers.keys() + + def items(self): + """ + Return the iterable of the key/value pairs in LayerDict. + + Parameters: + None. + + Examples: + .. code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + for k, v in layer_dict.items(): + print(k, ":", v) + + #conv1d : Conv1D(3, 2, kernel_size=[3], data_format=NCL) + #conv2d : Conv2D(3, 2, kernel_size=[3, 3], data_format=NCHW) + #conv3d : Conv3D(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW) + + """ + return self._sub_layers.items() + + def values(self): + """ + Return the iterable of the values in LayerDict. + + Parameters: + None. + + Examples: + .. code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + for v in layer_dict.values(): + print(v) + + #Conv1D(3, 2, kernel_size=[3], data_format=NCL) + #Conv2D(3, 2, kernel_size=[3, 3], data_format=NCHW) + #Conv3D(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW) + + """ + return self._sub_layers.values() + + def update(self, sublayers): + """ + Update the key/values pairs in sublayers to the LayerDict, overwriting the existing keys. + + Parameters: + sublayers (LayerDict|OrderedDict|list[(key,Layer)...]): iterable of key/value pairs, the type of value is 'paddle.nn.Layer' . + + Examples: + .. 
code-block:: python + + import paddle + from collections import OrderedDict + + sublayers = OrderedDict([ + ('conv1d', paddle.nn.Conv1D(3, 2, 3)), + ('conv2d', paddle.nn.Conv2D(3, 2, 3)), + ('conv3d', paddle.nn.Conv3D(4, 6, (3, 3, 3))), + ]) + + new_sublayers = OrderedDict([ + ('relu', paddle.nn.ReLU()), + ('conv2d', paddle.nn.Conv2D(4, 2, 4)), + ]) + layer_dict = paddle.nn.LayerDict(sublayers=sublayers) + + layer_dict.update(new_sublayers) + + for k, v in layer_dict.items(): + print(k, ":", v) + #conv1d : Conv1D(3, 2, kernel_size=[3], data_format=NCL) + #conv2d : Conv2D(4, 2, kernel_size=[4, 4], data_format=NCHW) + #conv3d : Conv3D(4, 6, kernel_size=[3, 3, 3], data_format=NCDHW) + #relu : ReLU() + + """ + + assert isinstance( + sublayers, collections_abc.Iterable + ), "The type of sublayers is not iterable of key/value pairs, the type of sublayers is " + type( + sublayers).__name__ + + if isinstance(sublayers, + (OrderedDict, LayerDict, collections_abc.Mapping)): + for key, layer in sublayers.items(): + self.add_sublayer(key, layer) + else: + # handle this format [(key1, layer1), (key2, layer2)...] + for i, kv in enumerate(sublayers): + if len(kv) != 2: + raise ValueError("The length of the " + str(i) + + "'s element in sublayers is " + str( + len(kv)) + ", which must be 2.") + self.add_sublayer(kv[0], kv[1]) -- GitLab From 1bae1e74277c61f60b65514c7af5147947c9e105 Mon Sep 17 00:00:00 2001 From: cc <52520497+juncaipeng@users.noreply.github.com> Date: Thu, 8 Apr 2021 20:20:51 +0800 Subject: [PATCH 178/486] Support converting the model from fp32 to fp16 (#32112) * Support converting the model from fp32 to fp16 --- paddle/fluid/operators/save_op.cc | 2 + .../post_training_quantization.py | 78 +++++++++ .../test_weight_quantization_mobilenetv1.py | 151 +++++++++++++++--- 3 files changed, 211 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index f619f3d59ce..194274cdd5b 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -88,6 +88,8 @@ REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker, REGISTER_OP_CPU_KERNEL( save, ops::SaveOpKernel, ops::SaveOpKernel, + ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, ops::SaveOpKernel, diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index b59534b5965..aba6005f0cf 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -16,9 +16,11 @@ import os import re import logging import numpy as np +import shutil from .... import io from .... import core from .... import framework +from .... import unique_name from ....executor import global_scope, Executor from ....framework import IrGraph from ....log_helper import get_logger @@ -1006,6 +1008,82 @@ class WeightQuantization(object): quantizable_op_type, weight_bits, weight_quantize_type, True, threshold_rate) + def convert_weight_to_fp16(self, save_model_dir): + """ + Convert all presistable vars from fp32 to fp16. + Note that, this api only changes the data type of variables in + __params__ file, and the __model__ file remains unchanged. + + Args: + save_model_dir(str): The path to save the fp16 model. 
+ """ + + # Load model + place = core.CPUPlace() + exe = Executor(place) + scope = global_scope() + [infer_program, feed_list, fetch_list] = \ + io.load_inference_model(dirname=self._model_dir, + executor=exe, + model_filename=self._model_filename, + params_filename=self._params_filename) + + # Clone and save fp16 weights + save_program = framework.Program() + save_block = save_program.global_block() + save_var_map = {} + + for var in infer_program.list_vars(): + if (var.type == core.VarDesc.VarType.RAW) or \ + (not var.persistable) or (var.name in ['feed', 'fetch']) \ + or (var.dtype != core.VarDesc.VarType.FP32): + continue + + #new_var = _clone_var_to_block_(var, save_block) + new_var = save_block._clone_variable(var) + if self._params_filename is not None: + save_var_map[new_var.name] = new_var + else: + save_file_path = os.path.join( + os.path.normpath(save_model_dir), new_var.name) + save_block.append_op( + type='save', + inputs={'X': [new_var]}, + outputs={}, + attrs={ + 'file_path': os.path.normpath(save_file_path), + 'save_as_fp16': True + }) + + if self._params_filename is not None: + save_var_list = [] + for name in sorted(save_var_map.keys()): + save_var_list.append(save_var_map[name]) + + saved_params_var = save_block.create_var( + type=core.VarDesc.VarType.RAW, + name=unique_name.generate("saved_params")) + saved_params_var.desc.set_persistable(True) + + save_path = os.path.join( + os.path.normpath(save_model_dir), self._params_filename) + save_block.append_op( + type='save_combine', + inputs={'X': save_var_list}, + outputs={'Y': saved_params_var}, + attrs={'file_path': save_path, + 'save_as_fp16': True}) + + save_program._sync_with_cpp() + exe.run(save_program) + + # Copy model + model_filename = "__model__" if self._model_filename is None \ + else self._model_filename + src_model = os.path.join(self._model_dir, model_filename) + dest_model = os.path.join(save_model_dir, model_filename) + shutil.copyfile(src_model, dest_model) + def _quantize_weight_to_int(self, save_model_dir, save_model_filename, save_params_filename, quantizable_op_type, weight_bits, weight_quantize_type, for_test, diff --git a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py index 1e8fa51d635..744c97c514b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_weight_quantization_mobilenetv1.py @@ -15,6 +15,7 @@ import unittest import os import time +import numpy as np from paddle.dataset.common import download, DATA_HOME from paddle.fluid.contrib.slim.quantization import WeightQuantization import paddle @@ -22,6 +23,28 @@ import paddle paddle.enable_static() +def _load_variable_data(scope, var_name): + ''' + Load variable value from scope + ''' + var_node = scope.find_var(var_name) + assert var_node is not None, \ + "Cannot find " + var_name + " in scope." + return np.array(var_node.get_tensor()) + + +def _set_variable_data(scope, place, var_name, np_value): + ''' + Set the value of var node by name, if the node exits, + ''' + assert isinstance(np_value, np.ndarray), \ + 'The type of value should be numpy array.' 
+ var_node = scope.find_var(var_name) + if var_node != None: + tensor = var_node.get_tensor() + tensor.set(np_value, place) + + class TestWeightQuantization(unittest.TestCase): def setUp(self): self.weight_quantization_dir = 'weight_quantization' @@ -45,18 +68,20 @@ class TestWeightQuantization(unittest.TestCase): zip_path) os.system(cmd) - def run_test(self, model_name, model_data_url, model_data_md5, weight_bits, - quantizable_op_type, weight_quantize_type, generate_test_model, - threshold_rate): + def quantize_to_int(self, model_name, model_data_url, model_data_md5, + weight_bits, quantizable_op_type, weight_quantize_type, + generate_test_model, threshold_rate): model_dir = self.download_model(model_name, model_data_url, model_data_md5) + load_model_dir = os.path.join(model_dir, model_name) timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) save_model_dir = os.path.join( os.getcwd(), model_name + "_wq_" + str(weight_bits) + "_" + timestamp) - weight_quant = WeightQuantization(model_dir=model_dir + "/model") + + weight_quant = WeightQuantization(model_dir=load_model_dir) weight_quant.quantize_weight_to_int( save_model_dir=save_model_dir, weight_bits=weight_bits, @@ -72,11 +97,79 @@ class TestWeightQuantization(unittest.TestCase): print("Failed to delete {} due to {}".format(save_model_dir, str( e))) + def convert_to_fp16(self, model_name, model_data_url, model_data_md5, + model_filename, params_filename): + model_dir = self.download_model(model_name, model_data_url, + model_data_md5) + load_model_dir = os.path.join(model_dir, model_name) + + timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + save_model_dir = os.path.join(os.getcwd(), + model_name + "_wq_fp16_" + timestamp) + + weight_quant = WeightQuantization(load_model_dir, model_filename, + params_filename) + + weight_quant.convert_weight_to_fp16(save_model_dir) + + print("finish converting the data type of weights to fp16 for " + + model_name) + print("fp16 model saved in " + save_model_dir + "\n") + + input_data = np.ones([1, 3, 224, 224], dtype=np.float32) + res_fp32 = self.run_models(load_model_dir, model_filename, + params_filename, input_data, False) + res_fp16 = self.run_models(save_model_dir, model_filename, + params_filename, input_data, True) + + self.assertTrue( + np.allclose( + res_fp32, res_fp16, rtol=1e-5, atol=1e-08, equal_nan=True), + msg='Failed to test the accuracy of the fp32 and fp16 model.') + + try: + os.system("rm -rf {}".format(save_model_dir)) + except Exception as e: + print("Failed to delete {} due to {}".format(save_model_dir, str( + e))) + + def run_models(self, model_dir, model_filename, params_filename, input_data, + is_fp16_model): + print(model_dir) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + [inference_program, feed_target_names, fetch_targets] = \ + paddle.fluid.io.load_inference_model(model_dir, exe, + model_filename=model_filename, + params_filename=params_filename) + + if is_fp16_model: + for var in inference_program.list_vars(): + if (var.type == paddle.fluid.core.VarDesc.VarType.RAW) or \ + (not var.persistable) or (var.name in ['feed', 'fetch']) \ + or (var.dtype != paddle.fluid.core.VarDesc.VarType.FP16): + continue + tensor = _load_variable_data(scope, var.name) + _set_variable_data(scope, place, var.name, + tensor.astype(np.float32)) + + results = exe.run(inference_program, + feed={feed_target_names[0]: input_data}, + fetch_list=fetch_targets) + return 
np.array(results[0]) + class TestWeightQuantizationMobilenetv1(TestWeightQuantization): - model_name = "mobilenetv1" - model_data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz" - model_data_md5 = "13892b0716d26443a8cdea15b3c6438b" + nocomb_model_name = "mobilenetv1_fp32_nocombined" + nocomb_model_data_url = "https://paddle-inference-dist.cdn.bcebos.com/Paddle-Inference-Demo/mobilenetv1_fp32_nocombined.tar.gz" + nocomb_model_data_md5 = "c9aae3b04d9d535c84590ae557be0a0b" + + comb_model_name = "mobilenetv1_fp32_combined" + comb_model_data_url = "https://paddle-inference-dist.cdn.bcebos.com/Paddle-Inference-Demo/mobilenetv1_fp32_combined.tar.gz" + comb_model_data_md5 = "087c67e2b2b0a8b689fcc570a56c005f" def test_weight_quantization_mobilenetv1_8bit_abs_max(self): weight_bits = 8 @@ -84,9 +177,10 @@ class TestWeightQuantizationMobilenetv1(TestWeightQuantization): weight_quantize_type = "abs_max" generate_test_model = True threshold_rate = 0.0 - self.run_test(self.model_name, self.model_data_url, self.model_data_md5, - weight_bits, quantizable_op_type, weight_quantize_type, - generate_test_model, threshold_rate) + self.quantize_to_int(self.nocomb_model_name, self.nocomb_model_data_url, + self.nocomb_model_data_md5, weight_bits, + quantizable_op_type, weight_quantize_type, + generate_test_model, threshold_rate) def test_weight_quantization_mobilenetv1_8bit_channel_wise_abs_max(self): weight_bits = 8 @@ -94,19 +188,21 @@ class TestWeightQuantizationMobilenetv1(TestWeightQuantization): weight_quantize_type = "channel_wise_abs_max" generate_test_model = True threshold_rate = 0.0 - self.run_test(self.model_name, self.model_data_url, self.model_data_md5, - weight_bits, quantizable_op_type, weight_quantize_type, - generate_test_model, threshold_rate) + self.quantize_to_int(self.nocomb_model_name, self.nocomb_model_data_url, + self.nocomb_model_data_md5, weight_bits, + quantizable_op_type, weight_quantize_type, + generate_test_model, threshold_rate) def test_weight_quantization_mobilenetv1_16bit_abs_max(self): weight_bits = 16 quantizable_op_type = ['conv2d', 'depthwise_conv2d', 'mul'] weight_quantize_type = "abs_max" generate_test_model = False - threshold_rate = 1e-9 - self.run_test(self.model_name, self.model_data_url, self.model_data_md5, - weight_bits, quantizable_op_type, weight_quantize_type, - generate_test_model, threshold_rate) + threshold_rate = 0 + self.quantize_to_int(self.nocomb_model_name, self.nocomb_model_data_url, + self.nocomb_model_data_md5, weight_bits, + quantizable_op_type, weight_quantize_type, + generate_test_model, threshold_rate) def test_weight_quantization_mobilenetv1_16bit_channel_wise_abs_max(self): weight_bits = 16 @@ -114,9 +210,24 @@ class TestWeightQuantizationMobilenetv1(TestWeightQuantization): weight_quantize_type = "channel_wise_abs_max" generate_test_model = False threshold_rate = 1e-9 - self.run_test(self.model_name, self.model_data_url, self.model_data_md5, - weight_bits, quantizable_op_type, weight_quantize_type, - generate_test_model, threshold_rate) + self.quantize_to_int(self.nocomb_model_name, self.nocomb_model_data_url, + self.nocomb_model_data_md5, weight_bits, + quantizable_op_type, weight_quantize_type, + generate_test_model, threshold_rate) + + def test_mobilenetv1_fp16_combined(self): + model_filename = '__model__' + params_filename = '__params__' + self.convert_to_fp16(self.comb_model_name, self.comb_model_data_url, + self.comb_model_data_md5, model_filename, + params_filename) + + def 
test_mobilenetv1_fp16_nocombined(self): + model_filename = None + params_filename = None + self.convert_to_fp16(self.nocomb_model_name, self.nocomb_model_data_url, + self.nocomb_model_data_md5, model_filename, + params_filename) if __name__ == '__main__': -- GitLab From 3822247f425f532c1417b7ed565390e4769a275d Mon Sep 17 00:00:00 2001 From: "Lei.C" <773703646@qq.com> Date: Fri, 9 Apr 2021 10:06:06 +0800 Subject: [PATCH 179/486] [ROCM] update rocm skip ut list, test=develop (#32149) --- tools/get_quick_disable_lt.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/get_quick_disable_lt.py b/tools/get_quick_disable_lt.py index 18ebdb00317..4805c909c1b 100644 --- a/tools/get_quick_disable_lt.py +++ b/tools/get_quick_disable_lt.py @@ -25,6 +25,13 @@ def download_file(): url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut_win') else: url = "https://sys-p0.bj.bcebos.com/prec/{}".format('disable_ut') + try: + import paddle.fluid.core as core + if core.is_compiled_with_rocm(): + url = "https://sys-p0.bj.bcebos.com/prec/{}".format( + 'disable_ut_rocm_ci') + except: + pass f = requests.get(url) data = f.text status_code = f.status_code -- GitLab From dabaca003fc6368bfa7804f304270533bb1b088f Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Fri, 9 Apr 2021 04:13:26 +0200 Subject: [PATCH 180/486] Candidate fix to #31992 (#32136) --- paddle/fluid/platform/device_context.h | 16 ++++++-------- paddle/fluid/platform/mkldnn_helper.h | 30 +++++++++++++++++--------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 411fe09c864..02ad22f780f 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -600,6 +600,8 @@ class MKLDNNDeviceContextThreadLocals { // MKL-DNN stream used for execution of primitives (per-thread) mkldnn::engine cur_engine; mkldnn::stream cur_stream; + std::string key_suffix; // Key identifying current Executor + bool key_attach_thread_id = true; Body(); ~Body(); @@ -612,6 +614,10 @@ class MKLDNNDeviceContextThreadLocals { void log_lib_version(void); const mkldnn::engine& get_engine(void); mkldnn::stream& get_stream(void); + void set_key_suffix(const std::string& suffix) { key_suffix = suffix; } + const std::string& get_key_suffix(void) const { return key_suffix; } + void disable_tid_in_key(void) { key_attach_thread_id = false; } + bool is_tid_used_in_key(void) const { return key_attach_thread_id; } }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -655,14 +661,6 @@ class MKLDNNDeviceContext : public CPUDeviceContext { // Remove all entries from the blob map void ResetBlobMap(); - // Set a suffix to be added to key - void SetKeySuffix(const std::string& suffix) { key_suffix_ = suffix; } - const std::string& GetKeySuffix(void) const { return key_suffix_; } - - // Disable adding thread ID to the key - void DisableThreadInfoInKey(void) { key_attach_thread_id_ = false; } - bool IsThreadIdUsedInKey(void) const { return key_attach_thread_id_; } - // Prevent next ResetBlobMap() void BlockNextCacheClearing(); @@ -686,8 +684,6 @@ class MKLDNNDeviceContext : public CPUDeviceContext { std::shared_ptr p_blobmap_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; - std::string key_suffix_; // Key identifying current Executor - bool key_attach_thread_id_ = true; }; #endif diff --git a/paddle/fluid/platform/mkldnn_helper.h 
b/paddle/fluid/platform/mkldnn_helper.h index 20e6dfe1c39..35776b9f1e6 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -439,14 +439,23 @@ inline void AppendKey(std::string* key, const std::vector& dims) { inline void AttachPointerHashToMKLDNNKey(void* ptr, const platform::Place& place) { if (platform::is_cpu_place(place)) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::MKLDNNDeviceContext* dev_ctx = - (platform::MKLDNNDeviceContext*)pool.Get(place); - dev_ctx->SetKeySuffix("E" + - std::to_string(reinterpret_cast(ptr))); - // When NaiveExecutor/Executor is used no info on thread id is needed in a - // key - dev_ctx->DisableThreadInfoInKey(); + // Static vars will remember first executor and its thread + // so both of them need to be processed by the same thread within + // critical section + static std::mutex static_vars_barrier; + static_vars_barrier.lock(); + static auto first_exec = ptr; + static auto first_thread = ThreadIDasStr(); + static_vars_barrier.unlock(); + + if (first_exec != ptr) { + paddle::platform::MKLDNNDeviceContext::tls().set_key_suffix( + "E" + std::to_string(reinterpret_cast(ptr))); + } + // For first thread + if (first_thread == ThreadIDasStr()) { + paddle::platform::MKLDNNDeviceContext::tls().disable_tid_in_key(); + } } } @@ -457,13 +466,14 @@ inline std::string CreateKey(const platform::MKLDNNDeviceContext& dev_ctx, key.reserve(64); using expand_type = int[]; expand_type{0, (AppendKey(&key, std::forward(args)), 0)...}; - key += dev_ctx.GetKeySuffix(); + key += paddle::platform::MKLDNNDeviceContext::tls().get_key_suffix(); return key; } inline std::string ExtendKeyWithThreadInfoIfNeeded( const platform::MKLDNNDeviceContext& dev_ctx, const std::string& key) { - return ((dev_ctx.IsThreadIdUsedInKey() == true) && + return ((paddle::platform::MKLDNNDeviceContext::tls().is_tid_used_in_key() == + true) && (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() == platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default)) ? key + "-t:" + ThreadIDasStr() -- GitLab From 55730d959af4fce9114c15b515f409a59ff14aed Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 9 Apr 2021 11:06:26 +0800 Subject: [PATCH 181/486] [Dy2Stat] Support DictCmp and zip grammer (#32159) * support DictCmp and zip grammar * fix code style --- .../dygraph_to_static/loop_transformer.py | 34 +++++++++++++----- .../fluid/dygraph/dygraph_to_static/utils.py | 35 +++++++++++++++++++ .../unittests/dygraph_to_static/test_dict.py | 34 ++++++++++++++++++ 3 files changed, 94 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index bd89a79c805..14bb54983b5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -378,6 +378,21 @@ class NameVisitor(gast.NodeVisitor): :param loop_node: Current loop node. 
""" + def filter_name_nodes_from(root_node, target_var_names): + """ + Filter children with gast.Name type from node.(inclusivly) + """ + name_nodes = set() + if isinstance(root_node, gast.Name): + if node.id in target_var_names: + name_nodes.add(root_node) + for child_node in gast.walk(root_node): + if isinstance(child_node, gast.Name): + if child_node.id in target_var_names: + name_nodes.add(child_node) + + return name_nodes + vars_of_list_generator = set() target_vars_of_for_node = set() @@ -412,15 +427,16 @@ class NameVisitor(gast.NodeVisitor): # 1.2 vars from target vars used in elt_node target_var_names = {var.id for var in target_vars} - listcomp_node = self._get_parent_node(parent_node) - elt_node = listcomp_node.elt - if isinstance(elt_node, gast.Name): - if elt_node.id in target_var_names: - vars_of_list_generator.add(elt_node) - for child_node in gast.walk(elt_node): - if isinstance(child_node, gast.Name): - if child_node.id in target_var_names: - vars_of_list_generator.add(child_node) + comp_node = self._get_parent_node(parent_node) + elt_nodes = [] + if isinstance(comp_node, gast.ListComp): + elt_nodes.append(comp_node.elt) + elif isinstance(comp_node, gast.DictComp): + elt_nodes.extend([comp_node.key, comp_node.value]) + + for node in elt_nodes: + vars_of_list_generator |= filter_name_nodes_from( + node, target_var_names) # 2. Get target vars or vars from target vars used in for-loop but the for-loop is # 1) not the "loop_node" itself diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 624ca085ac6..001116a74c9 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -79,6 +79,7 @@ FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' FOR_ITER_TUPLE_INDEX_PREFIX = '__for_loop_iter_tuple_index' FOR_ITER_VAR_LEN_PREFIX = '__for_loop_var_len' FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' +FOR_ITER_ZIP_TO_LIST_PREFIX = '__for_loop_iter_zip' # FullArgSpec is valid from Python3. Defined a Namedtuple to # to make it available in Python2. 
@@ -1012,6 +1013,9 @@ class ForNodeVisitor(object): # - for i, x enumerate(var|var.numpy()) # - for x in var self.iter_var_len_name = unique_name.generate(FOR_ITER_VAR_LEN_PREFIX) + # - created zip to list var : __for_loop_iter_zip_0 + self.iter_zip_to_list_name = unique_name.generate( + FOR_ITER_ZIP_TO_LIST_PREFIX) # - var.numpy()/var # - for x in var|var.numpy() @@ -1083,6 +1087,7 @@ class ForNodeVisitor(object): def _parse_for_stmts(self): init_stmts = [] + init_stmts.extend(self._build_iter_node()) init_stmts.append(self._build_index_init_node()) init_stmts.append(self._build_var_len_assign_node()) @@ -1105,6 +1110,7 @@ class ForNodeVisitor(object): def _parse_for_enumerate_stmts(self): init_stmts = [] + init_stmts.extend(self._build_iter_node()) init_stmts.append(self._build_index_init_node()) init_stmts.append(self._build_var_len_assign_node()) init_stmts.append(self._build_enum_init_node()) @@ -1163,6 +1169,34 @@ class ForNodeVisitor(object): return convert_len_node + def _build_iter_node(self): + """ + Process special cases for iter_node inclue: + - Case 1 (for zip): + + - for i, val in enumerate(zip(x, y)) # original code: + + - __for_loop_iter_zip_0 = list(zip(x, y)) + - for i, val in enumerate(__for_loop_iter_zip_0) + """ + new_nodes = [] + if isinstance(self.iter_node, gast.Call) and isinstance( + self.iter_node.func, gast.Name): + if self.iter_node.func.id == 'zip': + iter_var_name = ast_to_source_code(self.iter_node).strip() + zip_to_list_str = "{target} = list({value})".format( + target=self.iter_zip_to_list_name, value=iter_var_name) + zip_to_list_node = gast.parse(zip_to_list_str).body[0] + new_nodes.append(zip_to_list_node) + + self.iter_node = gast.Name( + id=self.iter_zip_to_list_name, + ctx=gast.Load(), + annotation=None, + type_comment=None) + + return new_nodes + def _build_enum_init_node(self): if self.is_for_enumerate_iter() and self.args_length != 1: init_value_str = ast_to_source_code(self.iter_args[1]).strip() @@ -1399,6 +1433,7 @@ def input_specs_compatible(src_input_specs, desired_input_specs): for spec in src_input_specs: if spec not in desired_input_specs: return False + else: for i in range(len_specs): src_shape = src_input_specs[i].shape diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py index 3a7994ee67e..dbd3952991c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py @@ -241,5 +241,39 @@ class TestDictPop(TestNetWithDict): static_result)) +class TestDictCmpInFor(unittest.TestCase): + def test_with_for(self): + def func(): + pos = [1, 3] + neg = [-1, -3] + dict_val = {'minus': 0} + # test `zip` with `for` + for (x, y) in zip(pos, neg): + val = x - y + dict_val.update( + {k: val + dict_val[k] + for k, v in dict_val.items()}) + + return dict_val + + self.assertEqual(paddle.jit.to_static(func)()['minus'], 8) + + def test_with_for_enumerate(self): + def func(): + pos = [1, 3] + neg = [-1, -3] + dict_val = {'minus': 0} + # test `zip` with `for` + for i, (x, y) in enumerate(zip(pos, neg)): + val = x - y + dict_val.update( + {k: val + dict_val[k] + for k, v in dict_val.items()}) + + return dict_val + + self.assertEqual(paddle.jit.to_static(func)()['minus'], 8) + + if __name__ == '__main__': unittest.main() -- GitLab From d815fbf90b7c1f88e01da7aff6189a7bcbcc2714 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 9 Apr 2021 11:39:46 +0800 Subject: [PATCH 182/486] 
[CustomOp]Support MacOS platform and Remove libpaddle_custom_op.so dependency (#31976) * Remove old custom OP to reduce whl package volume * [Custom OP]Remove old custom OP to reduce whl package volume * support macos --- paddle/fluid/framework/CMakeLists.txt | 38 ++---- .../fluid/platform/dynload/dynamic_loader.cc | 5 - python/paddle/fluid/tests/CMakeLists.txt | 7 +- .../fluid/tests/custom_op/CMakeLists.txt | 2 +- .../tests/custom_op/custom_relu_setup.py | 16 ++- .../fluid/tests/custom_op/test_check_abi.py | 32 ++++- .../tests/custom_op/test_custom_relu_model.py | 20 +-- .../custom_op/test_custom_relu_op_jit.py | 15 ++- .../custom_op/test_custom_relu_op_setup.py | 6 +- python/paddle/fluid/tests/custom_op/utils.py | 3 + .../utils/cpp_extension/cpp_extension.py | 74 ++++++++--- .../utils/cpp_extension/extension_utils.py | 123 ++++++++++++++++-- python/setup.py.in | 4 - 13 files changed, 245 insertions(+), 100 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2842f230ca9..13c37b93d7c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -367,29 +367,23 @@ endif() ##### 2.0 New custom op extension mechanism related ##### # if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - -set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) -set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) +if (WIN32) + set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) -cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) + set(PADDLE_CUSTOM_OP_SRCS + ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc + ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) + set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) -get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) -set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) -target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) + cc_library(paddle_custom_op_shared + SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) -if (LINUX) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_custom_op.so - CACHE INTERNAL "Paddle custom op lib") -endif() + get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) + target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) -if (WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) else() @@ -402,9 +396,3 @@ if (WIN32) ${paddle_custom_op_lib_path}/paddle_custom_op.dll CACHE INTERNAL "Paddle custom op dll") endif() - -if(APPLE) - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/paddle_custom_op.dylib - CACHE INTERNAL "Paddle custom op lib") -endif() diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 
fbdfc4928cf..956acfe2771 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -414,12 +414,7 @@ void* GetMKLMLDsoHandle() { } void* GetOpDsoHandle(const std::string& dso_name) { -#if defined(__APPLE__) || defined(__OSX__) - PADDLE_THROW(platform::errors::Unimplemented( - "Create custom cpp op outside framework do not support Apple.")); -#else return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); -#endif } void* GetNvtxDsoHandle() { diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 1d404151415..d73c4e3acb9 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -8,11 +8,6 @@ endforeach() add_subdirectory(unittests) add_subdirectory(book) - -# 2.0 New custom OP can support Windows/Linux now -# TODO: support 2.0 New Custom OP on Mac -if(NOT APPLE) - add_subdirectory(custom_op) -endif() +add_subdirectory(custom_op) set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index ceaf4bbdfeb..81f64038c7c 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -1,5 +1,5 @@ # New custom OP can support Windows/Linux now -if(WITH_GPU) +if(WITH_GPU OR APPLE) # GPU custom op tests: compile both .cc and .cu file py_test(test_custom_relu_op_setup SRCS test_custom_relu_op_setup.py) py_test(test_custom_relu_op_jit SRCS test_custom_relu_op_jit.py) diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_setup.py b/python/paddle/fluid/tests/custom_op/custom_relu_setup.py index 598b850c876..cbc4d17a4c7 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_setup.py +++ b/python/paddle/fluid/tests/custom_op/custom_relu_setup.py @@ -14,17 +14,21 @@ import os -from utils import paddle_includes, extra_compile_args -from paddle.utils.cpp_extension import CUDAExtension, setup +from utils import paddle_includes, extra_compile_args, IS_MAC +from paddle.utils.cpp_extension import CUDAExtension, setup, CppExtension + +# Mac-CI don't support GPU +Extension = CppExtension if IS_MAC else CUDAExtension +sources = ['custom_relu_op.cc', 'custom_relu_op_dup.cc'] +if not IS_MAC: + sources.append('custom_relu_op.cu') # custom_relu_op_dup.cc is only used for multi ops test, # not a new op, if you want to test only one op, remove this # source file setup( name='custom_relu_module_setup', - ext_modules=CUDAExtension( # test for not specific name here. - sources=[ - 'custom_relu_op.cc', 'custom_relu_op.cu', 'custom_relu_op_dup.cc' - ], # test for multi ops + ext_modules=Extension( # test for not specific name here. + sources=sources, # test for multi ops include_dirs=paddle_includes, extra_compile_args=extra_compile_args)) diff --git a/python/paddle/fluid/tests/custom_op/test_check_abi.py b/python/paddle/fluid/tests/custom_op/test_check_abi.py index ed2af83b234..75cf99458e7 100644 --- a/python/paddle/fluid/tests/custom_op/test_check_abi.py +++ b/python/paddle/fluid/tests/custom_op/test_check_abi.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -52,6 +52,8 @@ class TestCheckCompiler(TestABIBase): compiler = 'g++' elif utils.IS_WINDOWS: compiler = 'cl' + else: + compiler = 'clang' # Linux: all CI gcc version > 5.4.0 # Windows: all CI MSVC version > 19.00.24215 @@ -71,7 +73,7 @@ class TestCheckCompiler(TestABIBase): self.assertTrue( "Compiler Compatibility WARNING" in str(error[0].message)) - def test_exception(self): + def test_exception_linux(self): # clear environ self.del_environ() compiler = 'python' # fake command @@ -95,6 +97,28 @@ class TestCheckCompiler(TestABIBase): # restore utils._expected_compiler_current_platform = raw_func + def test_exception_mac(self): + # clear environ + self.del_environ() + compiler = 'python' # fake command + if utils.OS_NAME.startswith('darwin'): + + def fake(): + return [compiler] + + # mock a fake function + raw_func = utils._expected_compiler_current_platform + utils._expected_compiler_current_platform = fake + with warnings.catch_warnings(record=True) as error: + flag = utils.check_abi_compatibility(compiler, verbose=True) + # check return True + self.assertTrue(flag) + # check ABI Compatibility without WARNING + self.assertTrue(len(error) == 0) + + # restore + utils._expected_compiler_current_platform = raw_func + class TestRunCMDException(unittest.TestCase): def test_exception(self): diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py index 1d4b2ae161e..db97e86385a 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_model.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,9 +21,9 @@ from paddle import nn from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd -from utils import paddle_includes, extra_cc_args, extra_nvcc_args +from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_MAC -# Because Windows don't use docker, the shared lib already exists in the +# Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\custom_relu_for_model_jit\\custom_relu_for_model_jit.pyd'.format( get_build_directory()) @@ -35,9 +35,13 @@ if os.name == 'nt' and os.path.isfile(file): # custom_relu_op_dup.cc is only used for multi ops test, # not a new op, if you want to test only one op, remove this # source file +source_files = ['custom_relu_op.cc'] +if not IS_MAC: + source_files.append('custom_relu_op.cu') + custom_module = load( name='custom_relu_for_model_jit', - sources=['custom_relu_op.cc', 'custom_relu_op.cu'], + sources=source_files, extra_include_paths=paddle_includes, # add for Coverage CI extra_cxx_cflags=extra_cc_args, # test for cc flags extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags @@ -84,7 +88,7 @@ class TestDygraphModel(unittest.TestCase): for i in range(self.batch_num) ] - self.devices = ['cpu', 'gpu'] + self.devices = ['cpu', 'gpu'] if not IS_MAC else ['cpu'] # for saving model self.model_path_template = "infer_model/custom_relu_dygaph_model_{}.pdparams" @@ -191,7 +195,7 @@ class TestStaticModel(unittest.TestCase): for i in range(self.batch_num) ] - self.devices = ['cpu', 'gpu'] + self.devices = ['cpu', 'gpu'] if not IS_MAC else ['cpu'] # for saving model self.model_path_template = "infer_model/custom_relu_static_model_{}_{}" diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py index 641630b0f44..d8dcc76ac60 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_jit.py @@ -18,10 +18,10 @@ import paddle import numpy as np from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd -from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS +from utils import paddle_includes, extra_cc_args, extra_nvcc_args, IS_WINDOWS, IS_MAC from test_custom_relu_op_setup import custom_relu_dynamic, custom_relu_static -# Because Windows don't use docker, the shared lib already exists in the +# Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
file = '{}\\custom_relu_module_jit\\custom_relu_module_jit.pyd'.format( get_build_directory()) @@ -33,11 +33,13 @@ if os.name == 'nt' and os.path.isfile(file): # custom_relu_op_dup.cc is only used for multi ops test, # not a new op, if you want to test only one op, remove this # source file +sources = ['custom_relu_op.cc', 'custom_relu_op_dup.cc'] +if not IS_MAC: + sources.append('custom_relu_op.cu') + custom_module = load( name='custom_relu_module_jit', - sources=[ - 'custom_relu_op.cc', 'custom_relu_op.cu', 'custom_relu_op_dup.cc' - ], + sources=sources, extra_include_paths=paddle_includes, # add for Coverage CI extra_cxx_cflags=extra_cc_args, # test for cc flags extra_cuda_cflags=extra_nvcc_args, # test for nvcc flags @@ -112,6 +114,9 @@ class TestJITLoad(unittest.TestCase): self.assertTrue(caught_exception) caught_exception = False + # MAC-CI don't support GPU + if IS_MAC: + return try: x = np.random.uniform(-1, 1, [4, 8]).astype('int32') custom_relu_dynamic(custom_module.custom_relu, 'gpu', 'int32', x) diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 5c5c2d65a59..b2676174517 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -1,11 +1,11 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/python/paddle/fluid/tests/custom_op/utils.py b/python/paddle/fluid/tests/custom_op/utils.py index 57ce79b1f30..2d492da3d97 100644 --- a/python/paddle/fluid/tests/custom_op/utils.py +++ b/python/paddle/fluid/tests/custom_op/utils.py @@ -13,10 +13,13 @@ # limitations under the License. import os +import sys import six from distutils.sysconfig import get_python_lib from paddle.utils.cpp_extension.extension_utils import IS_WINDOWS +IS_MAC = sys.platform.startswith('darwin') + site_packages_path = get_python_lib() # Note(Aurelius84): We use `add_test` in Cmake to config how to run unittest in CI. 
# `PYTHONPATH` will be set as `build/python/paddle` that will make no way to find diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 606f5465e1b..83dc1d2582b 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -22,14 +22,15 @@ from setuptools.command.easy_install import easy_install from setuptools.command.build_ext import build_ext from distutils.command.build import build -from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension_kwargs, add_compile_flag +from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension_kwargs, add_compile_flag, run_cmd from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from -from .extension_utils import clean_object_if_change_cflags +from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS +from .extension_utils import CLANG_COMPILE_FLAGS, CLANG_LINK_FLAGS from ...fluid import core @@ -50,14 +51,14 @@ else: def setup(**attr): """ The interface is used to config the process of compiling customized operators, - mainly includes how to complile shared library, automatically generate python API + mainly includes how to compile shared library, automatically generate python API and install it into site-package. It supports using customized operators directly with ``import`` statement. It encapsulates the python built-in ``setuptools.setup`` function and keeps arguments and usage same as the native interface. Meanwhile, it hiddens Paddle inner framework concepts, such as necessary compiling flags, included paths of head files, and linking - flags. It also will automatically search and valid local enviromment and versions of + flags. It also will automatically search and valid local environment and versions of ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators supporting CPU or GPU device according to the specified Extension type. @@ -67,7 +68,7 @@ def setup(**attr): For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it shoule be greater than or equal to that of + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of PaddlePaddle (Visual Studio 2015 update3). If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. @@ -130,7 +131,7 @@ def setup(**attr): ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. If only compile operator supporting CPU device, please use ``CppExtension`` ; If compile operator supporting CPU and GPU devices, please use ``CUDAExtension`` . - include_dirs(list[str], optional): Specify the extra include directoies to search head files. 
The interface will automatically add + include_dirs(list[str], optional): Specify the extra include directories to search head files. The interface will automatically add ``site-package/paddle/include`` . Please add the corresponding directory path if including third-party head files. Default is None. extra_compile_args(list[str] | dict, optional): Specify the extra compiling flags such as ``-O3`` . If set ``list[str]`` , all these flags @@ -158,7 +159,7 @@ def setup(**attr): setup(name='custom_module', ext_modules=CUDAExtension( sources=['relu_op.cc', 'relu_op.cu']) - + # After running `python setup.py install` from custom_module import relu """ @@ -209,7 +210,7 @@ def CppExtension(sources, *args, **kwargs): Op Kernel only supporting CPU device. Please use ``CUDAExtension`` if you want to compile Op Kernel that supports both CPU and GPU devices. - It furtherly encapsulates python built-in ``setuptools.Extension`` .The arguments and + It further encapsulates python built-in ``setuptools.Extension`` .The arguments and usage are same as the native interface, except for no need to explicitly specify ``name`` . @@ -259,7 +260,7 @@ def CUDAExtension(sources, *args, **kwargs): Op Kernel supporting both CPU and GPU devices. Please use ``CppExtension`` if you want to compile Op Kernel that supports only CPU device. - It furtherly encapsulates python built-in ``setuptools.Extension`` .The arguments and + It further encapsulates python built-in ``setuptools.Extension`` .The arguments and usage are same as the native interface, except for no need to explicitly specify ``name`` . @@ -367,11 +368,14 @@ class BuildExtension(build_ext, object): self.build_lib = self.output_dir def build_extensions(self): + if OS_NAME.startswith("darwin"): + self._valid_clang_compiler() + self._check_abi() # Note(Aurelius84): If already compiling source before, we should check whether # cflags have changed and delete the built shared library to re-compile the source - # even though source file content keep unchanaged. + # even though source file content keep unchanged. so_name = self.get_ext_fullpath(self.extensions[0].name) clean_object_if_change_cflags( os.path.abspath(so_name), self.extensions[0]) @@ -397,17 +401,21 @@ class BuildExtension(build_ext, object): cflags = copy.deepcopy(extra_postargs) try: original_compiler = self.compiler.compiler_so - # ncvv compile CUDA source + # nvcc compile CUDA source if is_cuda_file(src): if core.is_compiled_with_rocm(): - assert ROCM_HOME is not None, "Not found ROCM runtime, please use `export ROCM_PATH= XXX` to specific it." + assert ROCM_HOME is not None, "Not found ROCM runtime, \ + please use `export ROCM_PATH= XXX` to specify it." + hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc') self.compiler.set_executable('compiler_so', hipcc_cmd) # {'nvcc': {}, 'cxx: {}} if isinstance(cflags, dict): cflags = cflags['hipcc'] else: - assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." + assert CUDA_HOME is not None, "Not found CUDA runtime, \ + please use `export CUDA_HOME= XXX` to specify it." 
+ nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') self.compiler.set_executable('compiler_so', nvcc_cmd) # {'nvcc': {}, 'cxx: {}} @@ -424,7 +432,7 @@ class BuildExtension(build_ext, object): original_compile(obj, src, ext, cc_args, cflags, pp_opts) finally: # restore original_compiler - self.compiler.compiler_so = original_compiler + self.compiler.set_executable('compiler_so', original_compiler) def win_custom_single_compiler(sources, output_dir=None, @@ -470,7 +478,9 @@ class BuildExtension(build_ext, object): src = src_list[0] obj = obj_list[0] if is_cuda_file(src): - assert CUDA_HOME is not None, "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." + assert CUDA_HOME is not None, "Not found CUDA runtime, \ + please use `export CUDA_HOME= XXX` to specify it." + nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') if isinstance(self.cflags, dict): cflags = self.cflags['nvcc'] @@ -548,22 +558,42 @@ class BuildExtension(build_ext, object): print("Compiling user custom op, it will cost a few seconds.....") build_ext.build_extensions(self) + # Reset runtime library path on MacOS platform + so_path = self.get_ext_fullpath(self.extensions[0]._full_name) + _reset_so_rpath(so_path) + def get_ext_filename(self, fullname): # for example: custommed_extension.cpython-37m-x86_64-linux-gnu.so ext_name = super(BuildExtension, self).get_ext_filename(fullname) + split_str = '.' + name_items = ext_name.split(split_str) if self.no_python_abi_suffix and six.PY3: - split_str = '.' - name_items = ext_name.split(split_str) assert len( name_items ) > 2, "Expected len(name_items) > 2, but received {}".format( len(name_items)) name_items.pop(-2) - # custommed_extension.so ext_name = split_str.join(name_items) + # custommed_extension.dylib + if OS_NAME.startswith('darwin'): + name_items[-1] = 'dylib' + ext_name = split_str.join(name_items) return ext_name + def _valid_clang_compiler(self): + """ + Make sure to use Clang as compiler on Mac platform + """ + compiler_infos = ['clang'] + CLANG_COMPILE_FLAGS + linker_infos = ['clang'] + CLANG_LINK_FLAGS + self.compiler.set_executables( + compiler=compiler_infos, + compiler_so=compiler_infos, + compiler_cxx=['clang'], + linker_exe=['clang'], + linker_so=linker_infos) + def _check_abi(self): """ Check ABI Compatibility. @@ -628,6 +658,8 @@ class EasyInstallCommand(easy_install, object): will_rename = False if OS_NAME.startswith('linux') and ext == '.so': will_rename = True + elif OS_NAME.startswith('darwin') and ext == '.dylib': + will_rename = True elif IS_WINDOWS and ext == '.pyd': will_rename = True @@ -702,7 +734,7 @@ def load(name, For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it shoule be greater than or equal to that of + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of PaddlePaddle (Visual Studio 2015 update3). If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. 
@@ -729,7 +761,7 @@ def load(name, custom_op_module = load( name="op_shared_libary_name", # name of shared library - sources=['relu_op.cc', 'relu_op.cu'], # source files of cusomized op + sources=['relu_op.cc', 'relu_op.cu'], # source files of customized op extra_cxx_cflags=['-g', '-w'], # optional, specify extra flags to compile .cc/.cpp file extra_cuda_cflags=['-O2'], # optional, specify extra flags to compile .cu file verbose=True # optional, specify to output log information @@ -761,7 +793,7 @@ def load(name, verbose(bool, optional): whether to verbose compiled log information. Default is False Returns: - Moudle: A callable python module contains all CustomOp Layer APIs. + Module: A callable python module contains all CustomOp Layer APIs. """ diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 65655eaf48e..06596c0fae8 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -44,6 +44,13 @@ MSVC_COMPILE_FLAGS = [ '/wd4190', '/EHsc', '/w', '/DGOOGLE_GLOG_DLL_DECL', '/DBOOST_HAS_STATIC_ASSERT', '/DNDEBUG', '/DPADDLE_USE_DSO' ] +CLANG_COMPILE_FLAGS = [ + '-fno-common', '-dynamic', '-DNDEBUG', '-g', '-fwrapv', '-O3', '-arch', + 'x86_64' +] +CLANG_LINK_FLAGS = [ + '-dynamiclib', '-undefined', 'dynamic_lookup', '-arch', 'x86_64' +] MSVC_LINK_FLAGS = ['/MACHINE:X64', 'paddle_custom_op.lib'] @@ -247,7 +254,7 @@ class VersionManager: def combine_hash(md5, value): """ Return new hash value. - DO NOT use `hash()` beacuse it doesn't generate stable value between different process. + DO NOT use `hash()` because it doesn't generate stable value between different process. See https://stackoverflow.com/questions/27522626/hash-function-in-python-3-3-returns-different-results-between-sessions """ md5.update(repr(value).encode()) @@ -286,13 +293,13 @@ def clean_object_if_change_cflags(so_path, extension): if os.path.exists(so_path) and os.path.exists(version_file): old_version_info = deserialize(version_file) so_version = old_version_info.get(so_name, None) - # delete shared library file if versison is changed to re-compile it. + # delete shared library file if version is changed to re-compile it. if so_version is not None and so_version != versioner.version: log_v( "Re-Compiling {}, because specified cflags have been changed. New signature {} has been saved into {}.". format(so_name, versioner.version, version_file)) os.remove(so_path) - # upate new version information + # update new version information new_version_info = versioner.details new_version_info[so_name] = versioner.version serialize(version_file, new_version_info) @@ -348,6 +355,54 @@ def get_cuda_arch_flags(cflags): return [] +def _get_fluid_path(): + """ + Return installed fluid dir path. + """ + import paddle + return os.path.join(os.path.dirname(paddle.__file__), 'fluid') + + +def _get_core_name(): + """ + Return pybind DSO module name. + """ + import paddle + if paddle.fluid.core.load_noavx: + return 'core_noavx.so' + else: + return 'core_avx.so' + + +def _get_lib_core_path(): + """ + Return real path of libcore_(no)avx.dylib on MacOS. + """ + raw_core_name = _get_core_name() + lib_core_name = "lib{}.dylib".format(raw_core_name[:-3]) + return os.path.join(_get_fluid_path(), lib_core_name) + + +def _reset_so_rpath(so_path): + """ + NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs` + in setup.py.in. 
While loading custom op, `@loader_path` is the dirname of custom op + instead of `paddle/fluid`. So we modify `@loader_path` from custom dylib into `@rpath` + to ensure dynamic loader find it correctly. + + Moreover, we will add `-rpath site-packages/paddle/fluid` while linking the dylib so + that we don't need to set `LD_LIBRARY_PATH` any more. + """ + assert os.path.exists(so_path) + if OS_NAME.startswith("darwin"): + origin_runtime_path = "@loader_path/../libs/" + rpath = "@rpath/{}".format(_get_core_name()) + cmd = 'install_name_tool -change {} {} {}'.format(origin_runtime_path, + rpath, so_path) + + run_cmd(cmd) + + def normalize_extension_kwargs(kwargs, use_cuda=False): """ Normalize include_dirs, library_dir and other attributes in kwargs. @@ -381,15 +436,28 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): extra_link_args.extend(['cudadevrt.lib', 'cudart_static.lib']) kwargs['extra_link_args'] = extra_link_args else: + ########################### Linux Platform ########################### + extra_link_args = kwargs.get('extra_link_args', []) + # On Linux, GCC support '-l:xxx.so' to specify the library name + # without `lib` prefix. + if OS_NAME.startswith('linux'): + extra_link_args.append('-l:{}'.format(_get_core_name())) + ########################### MacOS Platform ########################### + else: + # See _reset_so_rpath for details. + extra_link_args.append('-Wl,-rpath,{}'.format(_get_fluid_path())) + # On MacOS, ld don't support `-l:xx`, so we create a + # libcore_avx.dylib symbol link. + lib_core_name = create_sym_link_if_not_exist() + extra_link_args.append('-l{}'.format(lib_core_name)) + ########################### -- END -- ########################### + add_compile_flag(extra_compile_args, ['-w']) # disable warning # Note(Aurelius84): This marco will impact memory layout of `Tensor`. - # We align it automatially with pre-installed Paddle. + # We align it automatically with pre-installed Paddle. if core.is_compiled_with_mkldnn(): add_compile_flag(extra_compile_args, ['-DPADDLE_WITH_MKLDNN']) - # append link flags - extra_link_args = kwargs.get('extra_link_args', []) - extra_link_args.append('-lpaddle_custom_op') if use_cuda: extra_link_args.append('-lcudart') @@ -406,6 +474,30 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): return kwargs +def create_sym_link_if_not_exist(): + """ + Create soft symbol link of `core_avx.so` or `core_noavx.so` + """ + assert OS_NAME.startswith('darwin') + + raw_core_name = _get_core_name() + core_path = os.path.join(_get_fluid_path(), raw_core_name) + new_lib_core_path = _get_lib_core_path() + + # create symbol link + if not os.path.exists(new_lib_core_path): + try: + os.symlink(core_path, new_lib_core_path) + assert os.path.exists(new_lib_core_path) + except Exception: + raise RuntimeError( + "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`". + format(raw_core_name, core_path, new_lib_core_path)) + + # core_avx or core_noavx without suffix + return raw_core_name[:-3] + + def find_cuda_home(): """ Use heuristic method to find cuda path @@ -518,6 +610,11 @@ def find_paddle_includes(use_cuda=False): cuda_include_dir = find_cuda_includes() include_dirs.extend(cuda_include_dir) + if OS_NAME.startswith('darwin'): + # NOTE(Aurelius84): Ensure to find std v1 headers correctly. 
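+ # The Xcode Command Line Tools install the libc++ (std v1) headers under this path.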
+ std_v1_includes = '/Library/Developer/CommandLineTools/usr/include/c++/v1/' + include_dirs.append(std_v1_includes) + return include_dirs @@ -567,6 +664,9 @@ def find_paddle_libraries(use_cuda=False): cuda_lib_dir = find_cuda_libraries() paddle_lib_dirs.extend(cuda_lib_dir) + # add `paddle/fluid` to search `core_avx.so` or `core_noavx.so` + paddle_lib_dirs.append(_get_fluid_path()) + return paddle_lib_dirs @@ -614,9 +714,6 @@ def get_build_directory(verbose=False): if IS_WINDOWS: root_extensions_directory = os.path.normpath( root_extensions_directory) - elif OS_NAME.startswith('darwin'): - # TODO(Aurelius84): consider macOs - raise NotImplementedError("Not support Mac now.") log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.". format(root_extensions_directory), verbose) @@ -654,6 +751,8 @@ def _import_module_from_library(module_name, build_directory, verbose=False): """ if IS_WINDOWS: dynamic_suffix = '.pyd' + elif OS_NAME.startswith('darwin'): + dynamic_suffix = '.dylib' else: dynamic_suffix = '.so' ext_path = os.path.join(build_directory, module_name + dynamic_suffix) @@ -708,7 +807,7 @@ def _custom_api_content(op_name): # Set 'float32' temporarily, and the actual dtype of output variable will be inferred # in runtime. outs[out_name] = helper.create_variable(dtype='float32') - + helper.append_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) res = [outs[out_name] for out_name in out_names] @@ -757,7 +856,7 @@ def _get_api_inputs_str(op_name): # e.g: x, y, z param_names = in_names + attr_names # NOTE(chenweihang): we add suffix `@VECTOR` for std::vector input, - # but the string contains `@` cannot used as argument name, so we split + # but the string contains `@` cannot used as argument name, so we split # input name by `@`, and only use first substr as argument params_str = ','.join([p.split("@")[0].lower() for p in param_names]) # e.g: {'X': x, 'Y': y, 'Z': z} diff --git a/python/setup.py.in b/python/setup.py.in index 2883f2ed248..601e6e48703 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -351,10 +351,6 @@ if '${WITH_XPU}' == 'OFF' and '${XPU_SDK_ROOT}' != '': ### New custom op extension mechanism related ### -# copy libpaddle_custom_op.so to libs on linux -if sys.platform.startswith('linux'): - shutil.copy('${PADDLE_CUSTOM_OP_SHARED_LIB}', libs_path) - package_data['paddle.libs'] += ['libpaddle_custom_op.so'] # copy paddle_custom_op.lib/paddle_custom_op.dll to libs on Windows if os.name == 'nt': -- GitLab From 95122ebe8664a8d93558f07bf8c42c198a4b4653 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 9 Apr 2021 13:52:27 +0800 Subject: [PATCH 183/486] Advoid CPU -> CPU memory copy when start, end, step is already on CPU. (#29088) --- paddle/fluid/operators/range_op.cu | 24 ++++-------------------- paddle/fluid/operators/utils.h | 13 +++++++++++++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/range_op.cu b/paddle/fluid/operators/range_op.cu index f2c78e0f70b..6250d68730e 100644 --- a/paddle/fluid/operators/range_op.cu +++ b/paddle/fluid/operators/range_op.cu @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/range_op.h" +#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/cuda_primitives.h" namespace paddle { @@ -34,26 +35,9 @@ class CUDARangeKernel : public framework::OpKernel { auto* step_t = context.Input("Step"); auto* out = context.Output("Out"); - T start, end, step; - framework::Tensor n; - if (::paddle::platform::is_cpu_place(start_t->place())) { - start = start_t->data()[0]; - } else { - framework::TensorCopy(*start_t, platform::CPUPlace(), &n); - start = n.data()[0]; - } - if (::paddle::platform::is_cpu_place(end_t->place())) { - end = end_t->data()[0]; - } else { - framework::TensorCopy(*end_t, platform::CPUPlace(), &n); - end = n.data()[0]; - } - if (::paddle::platform::is_cpu_place(step_t->place())) { - step = step_t->data()[0]; - } else { - framework::TensorCopy(*step_t, platform::CPUPlace(), &n); - step = n.data()[0]; - } + T start = GetValue(start_t); + T end = GetValue(end_t); + T step = GetValue(step_t); int64_t size = 0; GetSize(start, end, step, &size); diff --git a/paddle/fluid/operators/utils.h b/paddle/fluid/operators/utils.h index 985c3512761..912d538d5e9 100644 --- a/paddle/fluid/operators/utils.h +++ b/paddle/fluid/operators/utils.h @@ -108,5 +108,18 @@ inline framework::DDim GetShape(const framework::ExecutionContext& ctx) { return framework::make_ddim(vec_shape); } +template +inline T GetValue(const framework::Tensor* x) { + T value = static_cast(0); + if (!platform::is_cpu_place(x->place())) { + framework::Tensor cpu_x; + framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x); + value = cpu_x.data()[0]; + } else { + value = x->data()[0]; + } + return value; +} + } // namespace operators } // namespace paddle -- GitLab From 4636d13616a1c7d4475fdc1135747c74dd38b7a8 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 9 Apr 2021 15:17:10 +0800 Subject: [PATCH 184/486] [Dy2Stat] Fix undefined var used in For (#32153) * fix undefind var in For * fix code style --- .../dygraph_to_static/ifelse_transformer.py | 12 ++++++++-- .../dygraph_to_static/test_ifelse.py | 23 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 79d24c05184..de788487fea 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -238,11 +238,16 @@ class NameVisitor(gast.NodeVisitor): return new_name_ids def _is_call_func_name_node(self, node): + white_func_names = set(['append', 'extend']) if len(self.ancestor_nodes) > 1: assert self.ancestor_nodes[-1] == node parent_node = self.ancestor_nodes[-2] if isinstance(parent_node, gast.Call) and parent_node.func == node: - return True + # e.g: var_list.append(elem), var_list is also a name_id. + should_skip = isinstance( + node, gast.Attribute) and node.attr in white_func_names + if not should_skip: + return True return False def _update_name_ids(self, new_name_ids): @@ -398,10 +403,13 @@ def parse_cond_return(parent_vars_dict, if_vars_dict, else_vars_dict, ]) def _vars_loaded_before_store(ids_dict): + """ + gast.Param is also a kind of `load` semantic. 
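+ e.g. the argument x in "def foo(x)" carries a gast.Param context, so it is treated as loaded before any store.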
+ """ new_dict = defaultdict(list) for k, ctxs in six.iteritems(ids_dict): for ctx in ctxs: - if isinstance(ctx, gast.Load): + if isinstance(ctx, (gast.Load, gast.Param)): new_dict[k].append(ctx) elif isinstance(ctx, gast.Store): break diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index 419150345b8..5db1bb2a384 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -342,5 +342,28 @@ class TestDiffModeNet2(TestDiffModeNet): self.Net = DiffModeNet2 +class TestNewVarCreateInOneBranch(unittest.TestCase): + def test_var_used_in_another_for(self): + def case_func(training): + # targets and targets_list is dynamically defined by training + if training: + targets = [1, 2, 3] + targets_list = [targets] + + num_step = 3 + for i in range(num_step): + if i > 0: + rois, rosi_num = 1, 2 + # targets is in loop_vars. + if training: + ros, rosi_num, targets = -1, -2, [-1, -2, -3] + targets_list.append(targets) + + return rosi_num + + self.assertEqual(paddle.jit.to_static(case_func)(False), 2) + self.assertEqual(paddle.jit.to_static(case_func)(True), -2) + + if __name__ == '__main__': unittest.main() -- GitLab From a73cb6798b30065846ee4069eb40103cb1aefa89 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Fri, 9 Apr 2021 16:31:34 +0800 Subject: [PATCH 185/486] fix unittest timeour (#32161) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e1c5ae750d9..792a2d32326 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -801,7 +801,7 @@ set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 120) set_tests_properties(test_pool2d_op PROPERTIES TIMEOUT 120) set_tests_properties(test_transpose_op PROPERTIES TIMEOUT 120) set_tests_properties(test_eager_deletion_gru_net PROPERTIES TIMEOUT 120) -set_tests_properties(test_activation_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_activation_op PROPERTIES TIMEOUT 180) set_tests_properties(test_normal PROPERTIES TIMEOUT 120) set_tests_properties(test_lstmp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) -- GitLab From ccf5709d3f9958dccedf1b79e3c834fc2398b9c2 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 9 Apr 2021 17:29:20 +0800 Subject: [PATCH 186/486] [NPU] cherry-pick basic NPU components/allocator/operator/executor supports from ascendrc (#32144) * [feature] support npu allocator (#30840) [feature] support npu allocator * [feature] support npu operator (#30951) [feature] support npu operator * [feature] support npu allocator, part 2 (#30972) * support npu allocator * add npu device context * fix some compile problem * fix some compile problem * add npu info * compile ok * fix include dir * support naive_best_fit_allocator * run ut ok, bug failed to exit * call aclrtResetDevice before exit * fix aclFinilize * add system allocatot test * add selected_gpus in gtest * add tensor_test for npu * support npu op, initial commit * add npu stream * add elementwise_add_op * compile ok * fix typo * fix elementwise_add_op_npu_test * support op run * test can run but failed * change aclopExecuteV2 to aclopCompileAndExecute * support parsing ascend rank table 
file (#31000) support parsing ascend rank table file * Fix reshape on GE graph. (#31084) Fix reshape on GE graph * add npu kernel for elementwise_sub and elementwise_sub_grad (#30973) * add npu sub op * fix typo * rename test * fix bug * fix bug * add fp16 kernel * fix typo * support sub grad op * support elementwise_sub_grad op Co-authored-by: frankwhzhang * Fix compilation problem (#31100) Fix compilation problem (#31100) * fix compile * fix code stype * remove const_cast * support adding correct npu op in pybind.h (#31143) * support adding correct npu op in pybind.h * refine code * [NPU] Support executor with NPU (#31057) * [NPU] Support executor with NPU * Fix code according to reviews * Fix code * Add unittest for sub op npu * refactor npu device manager (#31154) refactor npu device manager (#31154) * fix selected npus * fix compile * fix reading flags from env * format Co-authored-by: xiayanming <41795079@qq.com> Co-authored-by: gongweibao Co-authored-by: frankwhzhang Co-authored-by: liym27 <33742067+liym27@users.noreply.github.com> --- CMakeLists.txt | 5 +- cmake/configure.cmake | 4 + cmake/external/ascend.cmake | 82 ++-- cmake/external/protobuf.cmake | 3 + cmake/operators.cmake | 41 +- cmake/third_party.cmake | 11 +- paddle/fluid/framework/dlpack_tensor.cc | 5 + paddle/fluid/framework/executor.cc | 8 + paddle/fluid/framework/garbage_collector.cc | 3 +- paddle/fluid/framework/garbage_collector.h | 3 +- paddle/fluid/framework/library_type.h | 2 + paddle/fluid/framework/op_registry.h | 3 + paddle/fluid/framework/operator.cc | 20 + paddle/fluid/framework/parallel_executor.cc | 3 + paddle/fluid/framework/tensor_test.cc | 82 +++- paddle/fluid/framework/tensor_util.cc | 76 ++++ paddle/fluid/framework/tensor_util.h | 16 + .../fluid/imperative/gradient_accumulator.cc | 17 + paddle/fluid/memory/allocation/CMakeLists.txt | 4 + .../memory/allocation/allocator_facade.cc | 12 + .../allocation/naive_best_fit_allocator.cc | 133 ++++++ .../naive_best_fit_allocator_test.cc | 16 + .../fluid/memory/allocation/npu_allocator.cc | 73 ++++ .../fluid/memory/allocation/npu_allocator.h | 41 ++ paddle/fluid/memory/detail/CMakeLists.txt | 2 + paddle/fluid/memory/detail/buddy_allocator.cc | 18 + paddle/fluid/memory/detail/buddy_allocator.h | 1 + .../memory/detail/buddy_allocator_test.cc | 36 +- .../fluid/memory/detail/system_allocator.cc | 64 +++ paddle/fluid/memory/detail/system_allocator.h | 16 + .../memory/detail/system_allocator_test.cc | 8 + paddle/fluid/memory/memcpy.cc | 79 ++++ paddle/fluid/memory/memcpy.h | 20 + paddle/fluid/operators/CMakeLists.txt | 5 + .../operators/elementwise/CMakeLists.txt | 4 + .../elementwise/elementwise_add_op_npu.cc | 50 +++ .../elementwise/elementwise_op_npu_test.cc | 181 ++++++++ .../elementwise/elementwise_sub_op_npu.cc | 171 ++++++++ paddle/fluid/operators/math/math_function.cc | 7 + paddle/fluid/operators/npu_op_runner.cc | 260 +++++++++++ paddle/fluid/operators/npu_op_runner.h | 84 ++++ paddle/fluid/platform/CMakeLists.txt | 17 +- paddle/fluid/platform/device_context.cc | 44 +- paddle/fluid/platform/device_context.h | 56 ++- paddle/fluid/platform/dynload/cudnn.h | 3 + paddle/fluid/platform/enforce.h | 41 +- paddle/fluid/platform/flags.cc | 19 +- paddle/fluid/platform/gpu_info.cc | 1 + paddle/fluid/platform/init.cc | 17 + paddle/fluid/platform/monitor.cc | 10 + paddle/fluid/platform/monitor.h | 10 + paddle/fluid/platform/npu_info.cc | 409 ++++++++++++++++++ paddle/fluid/platform/npu_info.h | 156 +++++++ paddle/fluid/platform/place.cc | 7 + paddle/fluid/platform/place.h | 
49 ++- paddle/fluid/platform/stream/CMakeLists.txt | 4 + paddle/fluid/platform/stream/cuda_stream.cc | 4 +- paddle/fluid/platform/stream/cuda_stream.h | 2 +- paddle/fluid/platform/stream/npu_stream.cc | 51 +++ paddle/fluid/platform/stream/npu_stream.h | 76 ++++ .../fluid/platform/stream_callback_manager.cc | 52 ++- .../fluid/platform/stream_callback_manager.h | 5 +- .../pybind/global_value_getter_setter.cc | 12 + paddle/fluid/pybind/pybind.cc | 121 +++++- paddle/fluid/pybind/tensor_py.h | 16 + paddle/testing/paddle_gtest_main.cc | 13 +- python/paddle/__init__.py | 2 + python/paddle/device.py | 19 +- .../paddle/distributed/fleet/ascend_utils.py | 125 ++++++ python/paddle/distributed/fleet/launch.py | 10 +- .../paddle/distributed/fleet/launch_utils.py | 4 +- .../ascend/ascend_optimizer.py | 5 +- .../meta_optimizers/ascend/ascend_parser.py | 9 +- python/paddle/fluid/__init__.py | 14 +- python/paddle/fluid/executor.py | 1 + python/paddle/fluid/framework.py | 23 +- .../fluid/tests/unittests/CMakeLists.txt | 4 + .../fluid/tests/unittests/npu/CMakeLists.txt | 6 + .../npu/test_elementwise_add_op_npu.py | 162 +++++++ .../npu/test_elementwise_sub_op_npu.py | 224 ++++++++++ .../tests/unittests/npu/test_npu_place.py | 61 +++ .../paddle/fluid/tests/unittests/op_test.py | 14 +- .../fluid/tests/unittests/test_device.py | 43 +- .../unittests/test_fleet_launch_ascend2.sh | 103 +++++ python/paddle/framework/__init__.py | 5 +- 85 files changed, 3568 insertions(+), 130 deletions(-) create mode 100644 paddle/fluid/memory/allocation/npu_allocator.cc create mode 100644 paddle/fluid/memory/allocation/npu_allocator.h create mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc create mode 100644 paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc create mode 100644 paddle/fluid/operators/npu_op_runner.cc create mode 100644 paddle/fluid/operators/npu_op_runner.h create mode 100644 paddle/fluid/platform/npu_info.cc create mode 100644 paddle/fluid/platform/npu_info.h create mode 100644 paddle/fluid/platform/stream/npu_stream.cc create mode 100644 paddle/fluid/platform/stream/npu_stream.h create mode 100644 python/paddle/distributed/fleet/ascend_utils.py create mode 100644 python/paddle/fluid/tests/unittests/npu/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_npu_place.py create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 59bc768aa41..8d96c339dad 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,11 +33,14 @@ option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# to develop some acl related functionality on x86 +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND 
WITH_ASCEND) +if (WITH_GPU AND WITH_ASCEND) message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 2a1e6897c02..9f1eb16fcf0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -82,6 +82,10 @@ if(WITH_ASCEND) add_definitions(-DPADDLE_WITH_ASCEND) endif() +if(WITH_ASCEND_CL) + add_definitions(-DPADDLE_WITH_ASCEND_CL) +endif() + if(WITH_XPU) message(STATUS "Compile with XPU!") add_definitions(-DPADDLE_WITH_XPU) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index a0b6f480f95..bddd2023b43 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,38 +21,60 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() -set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) -set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) -set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) -set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) -set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) -set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) -set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) - -set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) -set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) -set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) -set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) -set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) -set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) -set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) - -set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) -set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) -set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) -INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - -if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) +if(WITH_ASCEND) + set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) + set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) + set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) + set(ASCEND_RUNTIME_DIR ${ASCEND_DIR}/fwkacllib/lib64) + set(ASCEND_ATC_DIR ${ASCEND_DIR}/atc/lib64) + set(ASCEND_ACL_DIR ${ASCEND_DIR}/acllib/lib64) + set(STATIC_ACL_LIB ${ASCEND_ACL_DIR}) + + set(ASCEND_MS_RUNTIME_PATH ${ASCEND_RUNTIME_DIR} ${ASCEND_ACL_DIR} ${ASCEND_ATC_DIR}) + set(ASCEND_MS_DRIVER_PATH ${ASCEND_DRIVER_DIR} ${ASCEND_DRIVER_COMMON_DIR}) + set(ATLAS_RUNTIME_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ATLAS_RUNTIME_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ATLAS_ACL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/lib64) + set(ATLAS_ATC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/atc/lib64) + set(ATLAS_MS_RUNTIME_PATH ${ATLAS_RUNTIME_DIR} ${ATLAS_ACL_DIR} ${ATLAS_ATC_DIR}) + + set(atlas_graph_lib ${ATLAS_RUNTIME_DIR}/libgraph.so) + set(atlas_ge_runner_lib ${ATLAS_RUNTIME_DIR}/libge_runner.so) + set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) + INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) + + if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) + add_definitions(-DPADDLE_WITH_ASCEND_STRING) + endif() + + ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) + + ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION 
${atlas_graph_lib}) + + ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + + add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) endif() -ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) +if(WITH_ASCEND_CL) + set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + + set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) + set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) + set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) -ADD_LIBRARY(ascend_graph SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ascend_graph PROPERTY IMPORTED_LOCATION ${atlas_graph_lib}) + message(STATUS "ASCEND_CL_INC_DIR ${ASCEND_CL_INC_DIR}") + message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") + INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) -ADD_LIBRARY(atlas_acl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET atlas_acl PROPERTY IMPORTED_LOCATION ${atlas_acl_lib}) + ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) -add_custom_target(extern_ascend DEPENDS ascend_ge ascend_graph atlas_acl) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) + add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) + +endif() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 1466664c126..82d64fd0228 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -201,6 +201,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) SET(PROTOBUF_TAG v3.8.0) +elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) else() SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0343ff3cc29..7dac91e531e 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -11,6 +11,7 @@ function(op_library TARGET) set(cu_cc_srcs) set(hip_cc_srcs) set(xpu_cc_srcs) + set(npu_cc_srcs) set(cudnn_cu_cc_srcs) set(miopen_cu_cc_srcs) set(cudnn_cu_srcs) @@ -20,6 +21,9 @@ function(op_library TARGET) set(mkldnn_cc_srcs) set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function layer common_infer_shape_functions) + if (WITH_ASCEND_CL) + set(op_common_deps ${op_common_deps} npu_op_runner) + endif() # Option `UNITY` is used to specify that operator `TARGET` will compiles with Unity Build. 
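Note on the `npu_op_runner` dependency added above: the NPU kernels introduced by this patch are written against the `NpuOpRunner` helper rather than calling ACL directly, which is why it is appended to `op_common_deps` for every operator when WITH_ASCEND_CL is on. A minimal sketch of that usage pattern, with illustrative variable names and assuming the kernel's input/output tensors and the NPU device context are already in scope (the real instance is elementwise_add_op_npu.cc further down in this diff):

    // Build a runner for the ACL "Add" operator from the kernel's inputs and
    // outputs, then launch it on the stream owned by the NPU device context.
    auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
    auto stream =
        ctx.template device_context<platform::NPUDeviceContext>().stream();
    runner.Run(stream);
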
set(options UNITY) set(oneValueArgs "") @@ -85,6 +89,12 @@ function(op_library TARGET) list(APPEND xpu_cc_srcs ${XPU_FILE}.cc) endif() endif() + if(WITH_ASCEND_CL) + string(REPLACE "_op" "_op_npu" NPU_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${NPU_FILE}.cc) + list(APPEND npu_cc_srcs ${NPU_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if(WITH_ROCM AND ${src} MATCHES ".*_cudnn_op.cu$") @@ -107,6 +117,8 @@ function(op_library TARGET) list(APPEND cu_cc_srcs ${src}) elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$") list(APPEND xpu_cc_srcs ${src}) + elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$") + list(APPEND npu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") list(APPEND cc_srcs ${src}) else() @@ -176,7 +188,7 @@ function(op_library TARGET) # Unity Build relies on global option `WITH_UNITY_BUILD` and local option `UNITY`. if(WITH_UNITY_BUILD AND op_library_UNITY) # Combine the cc source files. - compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs}) + compose_unity_target_sources(${UNITY_TARGET} cc ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs}) if(TARGET ${UNITY_TARGET}) # If `UNITY_TARGET` exists, add source files to `UNITY_TARGET`. target_sources(${UNITY_TARGET} PRIVATE ${unity_target_cc_sources}) @@ -187,7 +199,7 @@ function(op_library TARGET) # Add alias library to handle dependencies. add_library(${TARGET} ALIAS ${UNITY_TARGET}) else() - cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS} + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} ${npu_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) endif() endif() @@ -207,6 +219,7 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. 
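The NPU branch added just below follows the same handshake between kernel registration and the generated pybind file: it reads `${TARGET}_npu.cc`, scrapes the op name out of the `REGISTER_OP_NPU_KERNEL(...)` call, and the `file(APPEND ...)` that follows writes the matching `USE_OP_DEVICE_KERNEL(..., NPU)` line so the kernel registration gets linked into the pybind module. A concrete sketch using the elementwise_add kernel added later in this patch (kernel template arguments elided here; see that file for the exact form):

    // In elementwise_add_op_npu.cc -- the macro call the CMake regex matches:
    REGISTER_OP_NPU_KERNEL(elementwise_add,
                           ops::ElementwiseAddNPUKernel</* args elided */>);

    // Line appended to the generated pybind file as a result:
    USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
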
+ set(ORIGINAL_TARGET ${TARGET}) file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}") # [ \t\r\n]* is used for blank characters @@ -239,8 +252,9 @@ function(op_library TARGET) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH xpu_cc_srcs xpu_cc_srcs_len) list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len) + list(LENGTH npu_cc_srcs npu_cc_srcs_len) if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND - ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0) + ${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND ${npu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -280,6 +294,26 @@ function(op_library TARGET) if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() + + if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0) + file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT) + # It is different from the logic above, becareful + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}") + # [ \t\r\n]* is used for blank characters + string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}") + + if (one_npu_register STREQUAL "") + string(REPLACE "_op" "" NPU_TARGET "${TARGET}") + else () + string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}") + string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}") + # [ \t\r\n]+ is used for blank characters. + # Here we use '+' instead of '*' since it is a REPLACE operation. 
+ string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}") + endif() + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator @@ -330,6 +364,7 @@ function(register_operators) file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") string(REPLACE "_mkldnn" "" OPS "${OPS}") string(REPLACE "_xpu" "" OPS "${OPS}") + string(REPLACE "_npu" "" OPS "${OPS}") string(REPLACE ".cc" "" OPS "${OPS}") list(REMOVE_DUPLICATES OPS) list(LENGTH register_operators_DEPS register_operators_DEPS_len) diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 6488d29afc5..81fa7d0dfa9 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -274,10 +274,15 @@ if(WITH_BOX_PS) list(APPEND third_party_deps extern_box_ps) endif(WITH_BOX_PS) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - list(APPEND third_party_deps extern_ascend) -endif (WITH_ASCEND) + if(WITH_ASCEND) + list(APPEND third_party_deps extern_ascend) + endif() + if(WITH_ASCEND_CL) + list(APPEND third_party_deps extern_ascend_cl) + endif() +endif () if (WITH_PSCORE) include(external/snappy) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index a3fbb008fe4..b99ab6b5a7f 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -82,6 +82,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> { platform::errors::Unimplemented("platform::XPUPlace is not supported")); } + inline ::DLContext operator()(const platform::NPUPlace &place) const { + PADDLE_THROW( + platform::errors::Unimplemented("platform::NPUPlace is not supported")); + } + inline ::DLContext operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ::DLContext ctx; diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0acc8a55fa9..101991d2c1b 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -453,6 +453,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, #else PADDLE_THROW( platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle")); +#endif + } else if (platform::is_npu_place(place_)) { +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(ascendrc): Support garbage collector on NPUPlace + VLOG(4) << "Skip NPU gc because it is not implemented now."; +#else + PADDLE_THROW(platform::errors::Unimplemented( + "No NPU gc found in CPU/GPU/XPU paddle")); #endif } } diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index c8b6c764255..8dfbd3c268b 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -86,8 +86,9 @@ StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamCreate(&stream_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream_)); + callback_manager_.reset( + new platform::StreamCallbackManager(stream_)); #endif - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); } StreamGarbageCollector::~StreamGarbageCollector() { diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 97800865af8..572c79d21a0 100644 --- 
a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -117,7 +117,8 @@ class StreamGarbageCollector : public GarbageCollector { private: gpuStream_t stream_; - std::unique_ptr callback_manager_; + std::unique_ptr> + callback_manager_; }; class CUDAPinnedGarbageCollector : public GarbageCollector { diff --git a/paddle/fluid/framework/library_type.h b/paddle/fluid/framework/library_type.h index 4307e51862d..8fe314cf5f1 100644 --- a/paddle/fluid/framework/library_type.h +++ b/paddle/fluid/framework/library_type.h @@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) { return LibraryType::kPlain; } else if (s == std::string("XPU")) { return LibraryType::kPlain; + } else if (s == std::string("NPU")) { + return LibraryType::kPlain; } else if (s == std::string("CUDA")) { return LibraryType::kPlain; } else { diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 472c6f40826..4c529329761 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -304,6 +304,9 @@ struct OpKernelRegistrarFunctorEx &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { + PADDLE_ENFORCE(places.size() > 0 && !is_npu_place(places[0]), + platform::errors::Unavailable( + "NPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index 54f77981306..101463756c0 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -125,25 +125,54 @@ TEST(Tensor, MutableData) { float* p2 = nullptr; // initialization p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p1_holder = src_tensor.Holder(); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); auto p2_holder = src_tensor.Holder(); EXPECT_NE(p2, nullptr); EXPECT_NE(p1_holder.get(), p2_holder.get()); // set src_tensor a new dim with same size // momery block is supposed to be unchanged p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + EXPECT_EQ(p1, p2); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + float* p1 = nullptr; + float* p2 = nullptr; + // initialization + p1 = src_tensor.mutable_data(framework::make_ddim({1, 2, 3}), + platform::NPUPlace(0)); + auto p1_holder = src_tensor.Holder(); + EXPECT_NE(p1, nullptr); + // set src_tensor a new dim with large size + // momery is supposed to be re-allocated + p2 = src_tensor.mutable_data(framework::make_ddim({3, 1024}), + platform::NPUPlace(0)); + auto p2_holder = src_tensor.Holder(); + EXPECT_NE(p2, nullptr); + EXPECT_NE(p1_holder.get(), p2_holder.get()); + // set src_tensor a new dim with same size + // momery block is supposed to be unchanged + p1 = src_tensor.mutable_data(framework::make_ddim({2, 2, 3}), + platform::NPUPlace(0)); + EXPECT_EQ(p1, p2); + // set 
src_tensor a new dim with smaller size + // momery block is supposed to be unchanged + p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::NPUPlace(0)); EXPECT_EQ(p1, p2); } #endif @@ -179,7 +208,17 @@ TEST(Tensor, ShareDataWith) { framework::Tensor src_tensor; framework::Tensor dst_tensor; src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + dst_tensor.ShareDataWith(src_tensor); + ASSERT_EQ(src_tensor.data(), dst_tensor.data()); + } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + framework::Tensor dst_tensor; + src_tensor.mutable_data(framework::make_ddim({2, 3, 4}), + platform::NPUPlace(0)); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -216,7 +255,34 @@ TEST(Tensor, Slice) { { framework::Tensor src_tensor; src_tensor.mutable_data(framework::make_ddim({6, 9}), - platform::CUDAPlace()); + platform::CUDAPlace(0)); + framework::Tensor slice_tensor = src_tensor.Slice(2, 6); + framework::DDim slice_dims = slice_tensor.dims(); + ASSERT_EQ(arity(slice_dims), 2); + EXPECT_EQ(slice_dims[0], 4); + EXPECT_EQ(slice_dims[1], 9); + + uintptr_t src_data_address = + reinterpret_cast(src_tensor.data()); + uintptr_t src_mutable_data_address = + reinterpret_cast(src_tensor.mutable_data( + src_tensor.dims(), platform::CUDAPlace(0))); + uintptr_t slice_data_address = + reinterpret_cast(slice_tensor.data()); + uintptr_t slice_mutable_data_address = + reinterpret_cast(slice_tensor.mutable_data( + slice_tensor.dims(), platform::CUDAPlace(0))); + EXPECT_EQ(src_data_address, src_mutable_data_address); + EXPECT_EQ(slice_data_address, slice_mutable_data_address); + EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); + } +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + { + framework::Tensor src_tensor; + src_tensor.mutable_data(framework::make_ddim({6, 9}), + platform::NPUPlace(0)); framework::Tensor slice_tensor = src_tensor.Slice(2, 6); framework::DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -227,12 +293,12 @@ TEST(Tensor, Slice) { reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast(src_tensor.mutable_data( - src_tensor.dims(), platform::CUDAPlace())); + src_tensor.dims(), platform::NPUPlace(0))); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast(slice_tensor.mutable_data( - slice_tensor.dims(), platform::CUDAPlace())); + slice_tensor.dims(), platform::NPUPlace(0))); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index c6ac30a3698..d6882b25d22 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -97,6 +97,42 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + // TODO(zhiqiu): handle different condition like CUDA code below + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + 
} + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + stream); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data async from " << src_place << " to " + << dst_place; + return; + } + auto stream = + reinterpret_cast(ctx).stream(); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + stream); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -304,6 +340,35 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, "Copy from %s to %s is not supported.", src_place, dst_place)); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_cpu_place(dst_place)) { /* npu -> cpu*/ + memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_cpu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* cpu -> npu*/ + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size, + nullptr); + } + else if (platform::is_npu_place(src_place) && // NOLINT + platform::is_npu_place(dst_place)) { /* npu -> npu*/ + if (src_ptr == dst_ptr) { + VLOG(3) << "Skip copy the same data sync from " << src_place << " to " + << dst_place; + return; + } + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src_place), src_ptr, size, + nullptr); + } + else { // NOLINT + PADDLE_THROW(platform::errors::Unimplemented( + "Copy from %s to %s is not supported.", src_place, dst_place)); + } +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) else if (platform::is_cuda_pinned_place(src_place) && // NOLINT platform::is_cuda_pinned_place(dst_place)) { @@ -431,6 +496,13 @@ class AnyVisitor : public boost::static_visitor { return GetResultHelper(out, gpu); } + bool GetResult(const framework::Tensor& out, + const platform::NPUPlace& npu) const { + PADDLE_THROW( + platform::errors::Unimplemented("Not supported on place (%s) ", npu)); + // return GetResultHelper(out, npu); + } + bool GetResult(const framework::Tensor& out, const platform::CPUPlace& cpu) const { return *out.data(); @@ -633,6 +705,10 @@ struct BothFalseVisitor : public boost::static_visitor<> { #endif } + void VisitorImpl(const platform::NPUPlace& npu) const { + // TODO(zhiqiu) + } + void VisitorImpl(const platform::CPUPlace& cpu) const { int num = in_.numel(); const bool* in_ptr = in_.data(); diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index fd0f98784ce..868d920f13c 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -157,6 +157,14 @@ void TensorFromVector(const std::vector& src, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + 
else if (platform::is_npu_place(dst_place)) { // NOLINT + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } template @@ -194,6 +202,14 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, reinterpret_cast(ctx).stream()); } #endif +#ifdef PADDLE_WITH_ASCEND_CL + else if (platform::is_npu_place(src.place())) { // NOLINT + memory::Copy( + dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), + src_ptr, size, + reinterpret_cast(ctx).stream()); + } +#endif } template diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index df5ff750c99..64f5a9e0cc8 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -115,6 +115,23 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void operator()(const platform::NPUPlace& place) { + // TODO(zhiqiu): SUPPORT it + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#else + void operator()(const platform::NPUPlace& place) { + PADDLE_THROW(platform::errors::PermissionDenied( + "Gradient accumulation on place (%s) " + "is not supported in imperative mode", + place)); + } +#endif + // there is NO blas in CUDAPinnedPlace void operator()(const platform::CUDAPinnedPlace& place) { PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 565797d51dd..2ea047fa13c 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -27,6 +27,10 @@ if (WITH_ROCM) cc_test(thread_local_allocator_test SRCS thread_local_allocator_test.cc DEPS thread_local_allocator) endif() +if (WITH_ASCEND_CL) + cc_library(npu_allocator SRCS npu_allocator.cc DEPS allocator npu_info) +endif() + cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) if (WITH_GPU OR WITH_ROCM) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index cbeb263b5f4..730efa5c646 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -32,6 +32,7 @@ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif +#include "paddle/fluid/platform/npu_info.h" DEFINE_int64( gpu_allocator_retry_time, 10000, @@ -66,6 +67,11 @@ class AllocatorFacadePrivate { InitNaiveBestFitCUDAAllocator(platform::CUDAPlace(dev_id)); } InitNaiveBestFitCUDAPinnedAllocator(); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + for (int dev_id = 0; dev_id < platform::GetNPUDeviceCount(); ++dev_id) { + InitNaiveBestFitNPUAllocator(platform::NPUPlace(dev_id)); + } #endif break; } @@ -185,6 +191,12 @@ class AllocatorFacadePrivate { } #endif +#ifdef PADDLE_WITH_ASCEND_CL + void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) { + allocators_[p] = std::make_shared(p); + } +#endif + class ZeroSizeAllocator : public Allocator { public: explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index 0ada2cafcc1..3e88d61783c 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ 
b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -19,7 +19,10 @@ #include "gflags/gflags.h" #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" +#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" @@ -110,6 +113,7 @@ size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); } +// For kunlun XPU template <> void *Alloc(const platform::XPUPlace &place, size_t size) { #ifdef PADDLE_WITH_XPU @@ -219,6 +223,135 @@ size_t Used(const platform::XPUPlace &place) { #endif } +// For Ascend NPU +#ifdef PADDLE_WITH_ASCEND_CL +class NPUBuddyAllocatorList { + private: + NPUBuddyAllocatorList() : devices_(platform::GetSelectedNPUDevices()) { + auto npu_num = devices_.size(); + allocators_.resize(npu_num); + init_flags_.reserve(npu_num); + for (size_t i = 0; i < npu_num; ++i) { + init_flags_.emplace_back(new std::once_flag()); + } + } + + static NPUBuddyAllocatorList *CreateNewInstance() { + return new NPUBuddyAllocatorList(); + } + + public: + static NPUBuddyAllocatorList *Instance() { + static auto *instance = CreateNewInstance(); + return instance; + } + + BuddyAllocator *Get(int npu_id) { + auto pos = std::distance( + devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); + PADDLE_ENFORCE_LT(pos, devices_.size(), + platform::errors::OutOfRange( + "The index exceeds the size of devices, the size of " + "devices is %d, the index is %d", + devices_.size(), pos)); + + std::call_once(*init_flags_[pos], [this, pos] { + platform::SetNPUDeviceId(devices_[pos]); + allocators_[pos].reset(new BuddyAllocator( + std::unique_ptr( + new detail::NPUAllocator(devices_[pos])), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize())); + VLOG(10) << "\n\nNOTE:\n" + << "You can set GFlags environment variable " + << "'FLAGS_fraction_of_gpu_memory_to_use' " + << "or 'FLAGS_initial_gpu_memory_in_mb' " + << "or 'FLAGS_reallocate_gpu_memory_in_mb' " + << "to change the memory size for GPU usage.\n" + << "Current 'FLAGS_fraction_of_gpu_memory_to_use' value is " + << FLAGS_fraction_of_gpu_memory_to_use + << ". Current 'FLAGS_initial_gpu_memory_in_mb' value is " + << FLAGS_initial_gpu_memory_in_mb + << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " + << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; + }); + + return allocators_[pos].get(); + } + + private: + std::vector devices_; + std::vector> init_flags_; + std::vector> allocators_; +}; + +BuddyAllocator *GetNPUBuddyAllocator(int npu_id) { + return NPUBuddyAllocatorList::Instance()->Get(npu_id); +} +#endif + +template <> +size_t Used(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Used(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void *Alloc(const platform::NPUPlace &place, size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + auto *buddy_allocator = GetNPUBuddyAllocator(place.device); + auto *ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + platform::NPUDeviceGuard(place.device); + size_t avail, total; + platform::NPUMemoryUsage(&avail, &total); + PADDLE_THROW(platform::errors::ResourceExhausted( + "Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize " + "%s, GpuMaxChunkSize %s, GPU memory used: %s.", + string::HumanReadableSize(size), place.device, + string::HumanReadableSize(avail), string::HumanReadableSize(total), + string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), + string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), + string::HumanReadableSize(Used(place)))); + } else { + if (FLAGS_init_allocated_mem) { + aclrtMemset(ptr, size, 0xEF, size); + } + } + VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); + return ptr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +void Free(const platform::NPUPlace &place, void *p, + size_t size) { +#ifdef PADDLE_WITH_ASCEND_CL + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); + GetNPUBuddyAllocator(place.device)->Free(p); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +template <> +uint64_t Release(const platform::NPUPlace &place) { +#ifdef PADDLE_WITH_ASCEND_CL + return GetNPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'NPUPlace' is not supported in CPU only device.")); +#endif +} + +// For CUDA #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class GPUBuddyAllocatorList { private: diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc index 37da748ee9c..1fe85dd699a 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -61,6 +61,22 @@ TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NaiveBestFitAllocatorTest, NpuAlloc) { + NaiveBestFitAllocator alloc{platform::NPUPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + sleep(10); + alloc.Release(platform::NPUPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::NPUPlace(0)); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc new file mode 100644 index 00000000000..4ecdee9bd03 --- /dev/null +++ 
b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/npu_allocator.h" +#include +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +bool NPUAllocator::IsAllocThreadSafe() const { return true; } +void NPUAllocator::FreeImpl(Allocation* allocation) { + PADDLE_ENFORCE_EQ( + BOOST_GET_CONST(platform::NPUPlace, allocation->place()), place_, + platform::errors::PermissionDenied( + "NPU memory is freed in incorrect device. This may be a bug")); + platform::RecordedNPUFree(allocation->ptr(), allocation->size(), + place_.device); + delete allocation; +} + +Allocation* NPUAllocator::AllocateImpl(size_t size) { + std::call_once(once_flag_, + [this] { platform::SetNPUDeviceId(place_.device); }); + + void* ptr; + auto result = platform::RecordedNPUMalloc(&ptr, size, place_.device); + if (LIKELY(result == ACL_ERROR_NONE)) { + return new Allocation(ptr, size, platform::Place(place_)); + } + + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, place_.device); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "Or set environment variable `FLAGS_gpu_memory_limit_mb` to a larger " + "value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the maximum " + "GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please decrease the batch size of your model. %s\n\n", + place_.device, string::HumanReadableSize(size), place_.device, + string::HumanReadableSize(avail), place_.device, err_msg)); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h new file mode 100644 index 00000000000..738ec5d3ce1 --- /dev/null +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOLINT +#include "paddle/fluid/memory/allocation/allocator.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class NPUAllocator : public Allocator { + public: + explicit NPUAllocator(const platform::NPUPlace& place) : place_(place) {} + + bool IsAllocThreadSafe() const override; + + protected: + void FreeImpl(Allocation* allocation) override; + Allocation* AllocateImpl(size_t size) override; + + private: + platform::NPUPlace place_; + std::once_flag once_flag_; +}; + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/detail/CMakeLists.txt b/paddle/fluid/memory/detail/CMakeLists.txt index fcae741db36..e9631ee739b 100644 --- a/paddle/fluid/memory/detail/CMakeLists.txt +++ b/paddle/fluid/memory/detail/CMakeLists.txt @@ -6,6 +6,8 @@ if(WITH_GPU) nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) elseif(WITH_ROCM) hip_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info place) +elseif(${WITH_ASCEND_CL}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info npu_info place) else() cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info place) endif() diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 50c0b58f3a1..55436f451a4 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -21,6 +21,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(reallocate_gpu_memory_in_mb); #endif +#ifdef PADDLE_WITH_ASCEND_CL +DECLARE_uint64(reallocate_gpu_memory_in_mb); +#endif namespace paddle { namespace memory { @@ -235,6 +238,21 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( } } #endif +#ifdef PADDLE_WITH_ASCEND_CL + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the allocation size for gpu for the first allocation. + allocate_bytes = std::max(platform::NPUInitAllocSize(), request_bytes); + } else { + // Compute the re-allocation size, we store the re-allocation size when + // user set FLAGS_reallocate_gpu_memory_in_mb to fix value. + if (realloc_size_ == 0 || FLAGS_reallocate_gpu_memory_in_mb == 0ul) { + realloc_size_ = platform::NPUReallocSize(); + } + allocate_bytes = std::max(realloc_size_, request_bytes); + } + } +#endif // Allocate a new block void* p = system_allocator_->Alloc(&index, allocate_bytes); diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 15e93deffcc..135c3b6d04f 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -26,6 +26,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" namespace paddle { namespace memory { diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 2dc3e73af24..290f3d5d1bc 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -19,14 +19,16 @@ limitations under the License. */ #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif +#include +#include + #include "gflags/gflags.h" #include "gtest/gtest.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_uint64(initial_gpu_memory_in_mb); DECLARE_uint64(reallocate_gpu_memory_in_mb); @@ -342,6 +344,32 @@ TEST(BuddyAllocator, Release) { } #endif +#ifdef PADDLE_WITH_ASCEND_CL +TEST(BuddyAllocator, NpuFraction) { + // In a 16 GB machine, the pool size will be about 160 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.005; + FLAGS_fraction_of_gpu_memory_to_use = 0.92; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new NPUAllocator(0)), + platform::NPUMinChunkSize(), platform::NPUMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 10 << 20); + buddy_allocator.Release(); + + // Greater than max chunk size + TestBuddyAllocator(&buddy_allocator, 300 << 20, + /* use_system_allocator = */ true); + TestBuddyAllocator(&buddy_allocator, 1 * static_cast(1 << 30), + /* use_system_allocator = */ true); +} +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 38baf6c24ba..c733ba5c68c 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -29,6 +29,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#include "paddle/fluid/platform/npu_info.h" + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -247,6 +249,68 @@ bool CUDAPinnedAllocator::UseGpu() const { return false; } #endif +#ifdef PADDLE_WITH_ASCEND_CL +void* NPUAllocator::Alloc(size_t* index, size_t size) { + if (size <= 0) return nullptr; + + void* p; + auto result = platform::RecordedNPUMalloc(&p, size, npu_id_); + + if (result == ACL_ERROR_NONE) { + *index = 0; + npu_alloc_size_ += size; + return p; + } else { + size_t avail, total, actual_avail, actual_total; + bool is_limited = platform::RecordedNPUMemGetInfo( + &avail, &total, &actual_avail, &actual_total, npu_id_); + + std::string err_msg; + if (is_limited) { + auto limit_size = (total >> 20); + err_msg = string::Sprintf( + "\n 3) Set environment variable `FLAGS_gpu_memory_limit_mb` to a " + "larger value. 
Currently `FLAGS_gpu_memory_limit_mb` is %d, so the " + "maximum GPU memory usage is limited to %d MB.\n" + " The command is `export FLAGS_gpu_memory_limit_mb=xxx`.", + limit_size, limit_size); + } + + PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( + "\n\nOut of memory error on NPU %d. " + "Cannot allocate %s memory on NPU %d, " + "available memory is only %s.\n\n" + "Please check whether there is any other process using NPU %d.\n" + "1. If yes, please stop them, or start PaddlePaddle on another NPU.\n" + "2. If no, please try one of the following suggestions:\n" + " 1) Decrease the batch size of your model.\n" + " 2) FLAGS_fraction_of_gpu_memory_to_use is %.2lf now, " + "please set it to a higher value but less than 1.0.\n" + " The command is " + "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", + npu_id_, string::HumanReadableSize(size), npu_id_, + string::HumanReadableSize(avail), npu_id_, + FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + } +} + +void NPUAllocator::Free(void* p, size_t size, size_t index) { + VLOG(4) << "Free " << p << " size " << size; + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be 0, index is %d", index)); + PADDLE_ENFORCE_GE(npu_alloc_size_, size, + platform::errors::InvalidArgument( + "The size of memory (%d) to free exceeds the size of " + "allocated gpu memory (%d)", + size, npu_alloc_size_)); + npu_alloc_size_ -= size; + + platform::RecordedNPUFree(p, size, npu_id_); +} + +bool NPUAllocator::UseGpu() const { return true; } +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e332bb670da..26711ae4070 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -66,6 +66,22 @@ class CUDAPinnedAllocator : public SystemAllocator { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUAllocator : public SystemAllocator { + public: + explicit NPUAllocator(int npu_id) : npu_id_(npu_id) {} + + virtual void* Alloc(size_t* index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t npu_alloc_size_ = 0; + int npu_id_; +}; +#endif + } // namespace detail } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index 13854d771a0..ead188341da 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -85,3 +85,11 @@ TEST(GPUAllocator, AllocFailure) { } } #endif + +#ifdef PADDLE_WITH_ASCEND_CL +TEST(NPUAllocator, Alloc) { + paddle::memory::detail::NPUAllocator a(0); + TestAllocator(&a, 1 << 20); + TestAllocator(&a, 1); +} +#endif diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 6f252e1bd0d..d9a4503cc1e 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -196,6 +196,85 @@ void Copy(platform::XPUPlace dst_place, } #endif +#ifdef PADDLE_WITH_ASCEND_CL +template <> +void Copy(platform::NPUPlace dst_place, + void* dst, + platform::CPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(dst_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + if (stream) { + 
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); + } +} + +template <> +void Copy(platform::CPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + platform::SetNPUDeviceId(src_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by thream(" << stream << ")"; + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); + } else { + platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); + } +} + +template <> +void Copy(platform::NPUPlace dst_place, + void* dst, + platform::NPUPlace src_place, + const void* src, size_t num, + aclrtStream stream) { + if (UNLIKELY(num == 0)) return; + + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; + if (dst_place == src_place) { + platform::SetNPUDeviceId(src_place.device); + if (stream) { + platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } else { + if (!platform::NPUCanAccessPeer(dst_place.device, dst_place.device)) { + PADDLE_THROW(platform::errors::Unavailable( + "Peer access between NPU places is not allowed.")); + } + if (stream) { + // TODO(zhiqiu): support peer access? + platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU"); + platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, + stream); + } else { + platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); + platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); + } + } +} +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K diff --git a/paddle/fluid/memory/memcpy.h b/paddle/fluid/memory/memcpy.h index 25490f28b65..c630437224c 100644 --- a/paddle/fluid/memory/memcpy.h +++ b/paddle/fluid/memory/memcpy.h @@ -52,7 +52,27 @@ void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, gpuStream_t stream); +#endif +#ifdef PADDLE_WITH_ASCEND_CL +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or NPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or NPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream NPU stream. + * + * \note For NPU memory copy, NPU stream need to be specified + * for asynchronously memory copy. 
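A hedged usage sketch of the NPU `Copy` overload documented above (pointer and size names are placeholders; it assumes NPU device 0 is selected and an `aclrtStream` has already been created). Per the memcpy.cc implementation shown above, passing a stream gives an asynchronous copy, while passing `nullptr` falls back to the synchronous `NPUMemcpySync` path:

    paddle::platform::NPUPlace npu(0);
    paddle::platform::CPUPlace cpu;
    // Host -> device, asynchronous on `stream`.
    paddle::memory::Copy(npu, dev_ptr, cpu, host_ptr, num_bytes, stream);
    // Device -> host, synchronous because no stream is given.
    paddle::memory::Copy(cpu, host_ptr, npu, dev_ptr, num_bytes, nullptr);
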
+ * + */ +template +void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, + aclrtStream stream); #endif + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ed878727532..dac8c7b03e5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -123,6 +123,11 @@ if (WITH_ASCEND) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} ascend_wrapper) endif() +if (WITH_ASCEND_CL) + cc_library(npu_op_runner SRCS npu_op_runner.cc DEPS operator npu_info) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} npu_op_runner) +endif() + # FIXME(typhoonzero): operator deps may not needed. # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) # op_library(array_to_lod_tensor_op DEPS lod_rank_table_op) diff --git a/paddle/fluid/operators/elementwise/CMakeLists.txt b/paddle/fluid/operators/elementwise/CMakeLists.txt index 06ca98e526e..216a3f79d6f 100644 --- a/paddle/fluid/operators/elementwise/CMakeLists.txt +++ b/paddle/fluid/operators/elementwise/CMakeLists.txt @@ -8,3 +8,7 @@ register_operators(DEPS op_version_registry) cc_test(test_elementwise_add_op_inplace SRCS test_elementwise_add_op_inplace.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) cc_test(test_elementwise_div_grad_grad SRCS test_elementwise_div_grad_grad.cc DEPS op_registry elementwise_div_op scope device_context enforce executor) cc_test(test_elementwise_add_grad_grad SRCS test_elementwise_add_grad_grad.cc DEPS op_registry elementwise_add_op scope device_context enforce executor) + +if(WITH_ASCEND_CL) +cc_test(elementwise_op_npu_test SRCS elementwise_op_npu_test.cc DEPS op_registry elementwise_add_op elementwise_sub_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc new file mode 100644 index 00000000000..1e7e5e02c01 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ElementwiseAddNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_add, + ops::ElementwiseAddNPUKernel); +#endif diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc new file mode 100644 index 00000000000..3a2a2164708 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_OP(elementwise_sub); +USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + ctx.Wait(); + float expected; + if (op_type == "elementwise_add") { + expected = 3.0; + } else if (op_type == "elementwise_sub") { + expected = -1.0; + } + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 
0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +template +void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto dout = scope->Var("DOut"); + auto tensor_dout = dout->GetMutable(); + tensor_dout->Resize({2, 3, 5}); + + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + tensor_x->Resize({2, 3, 5}); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + tensor_y->Resize({1, 5}); + + auto dx = scope->Var("DX"); + auto tensor_dx = dx->GetMutable(); + + auto dy = scope->Var("DY"); + auto tensor_dy = dy->GetMutable(); + + std::vector init_dout; + for (int64_t i = 0; i < tensor_dout->numel(); ++i) { + init_dout.push_back(static_cast(1.0)); + } + + TensorFromVector(init_dout, ctx, tensor_dout); + tensor_dout->Resize({2, 3, 5}); + + ctx.Wait(); + + // run + f::AttributeMap attrs; + auto op = f::OpRegistry::CreateOp( + op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}}, + {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + + std::vector dx_vec; + TensorToVector(*tensor_dx, ctx, &dx_vec); + + std::vector dy_vec; + TensorToVector(*tensor_dy, ctx, &dy_vec); + + ctx.Wait(); + float expected_x, expected_y; + if (op_type == "elementwise_add_grad") { + expected_x = 1.0; + expected_y = 6.0; + } else if (op_type == "elementwise_sub_grad") { + expected_x = 1.0; + expected_y = -6.0; + } + + for (uint32_t i = 0; i < dx_vec.size(); i++) { + EXPECT_EQ(dx_vec[i], static_cast(expected_x)); + } + for (uint32_t i = 0; i < dy_vec.size(); i++) { + EXPECT_EQ(dy_vec[i], static_cast(expected_y)); + } +} + +TEST(elementwise_add, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_add"); +} + +TEST(elementwise_sub, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); +} + +TEST(elementwise_sub, NPU_fp16) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx, "elementwise_sub"); +} + +TEST(elementwise_sub_grad, NPU) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + CompareGrad(&scope, ctx, "elementwise_sub_grad"); +} diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc new file mode 100644 index 00000000000..e47c38daee8 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -0,0 +1,171 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/elementwise/elementwise_sub_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ElementwiseSubNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("Sub", {*x, *y}, {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class ElementwiseSubGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + + dx->mutable_data(ctx.GetPlace()); + dy->mutable_data(ctx.GetPlace()); + + // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with + // default axis=-1? + // So, the sub_grad should do reduce if needed. + // For example, the shape of each variable in elementwise_sub: + // x, dx: [2, 3, 5] + // y, dy: [1, 5] + // out, dout: [2, 3, 5] + // Then, out = x - y => dx = dout, dy = -dout + // And, the shape of dy can be computed by two stages reduce, + // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. + // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. + + auto stream = + ctx.template device_context() + .stream(); + // For dx + // stage 1 + auto reduce_ndim = dout->dims().size() - dx->dims().size(); + std::vector axes; + for (auto i = 0; i < reduce_ndim; ++i) { + axes.push_back(i); + } + auto tmp_dout = dout; + Tensor reduced_dout(dx->type()); + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = &reduced_dout; + } + + // stage 2 + axes.clear(); + for (auto i = 0; i < dx->dims().size(); ++i) { + if (dx->dims()[i] == 1) { + axes.push_back(i); + } + } + if (axes.size() != 0) { + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + } else { + framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + } + + // For dy + // stage 1 + reduce_ndim = dout->dims().size() - dy->dims().size(); + axes.clear(); + for (auto i = 0; i < reduce_ndim; ++i) { + axes.push_back(i); + } + tmp_dout = dout; + Tensor reduced_dy(dy->type()); + + if (axes.size() != 0) { + std::vector reduced_dout_dims; + for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { + reduced_dout_dims.push_back(dout->dims()[i]); + } + reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); + reduced_dout.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); + tmp_dout = &reduced_dout; + } + + // stage 2 + axes.clear(); + auto* tmp_dy = tmp_dout; + for (auto i = 0; i < dy->dims().size(); ++i) { + if (dy->dims()[i] == 1) { + axes.push_back(i); 
+ } + } + if (axes.size() != 0) { + reduced_dy.Resize(dy->dims()); + reduced_dy.mutable_data(ctx.GetPlace()); + auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy}, + {{"axes", axes}, {"keep_dims", true}}); + runner.Run(stream); + tmp_dy = &reduced_dy; + } + + // stage 3, negative + auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + elementwise_sub, + ops::ElementwiseSubNPUKernel, + ops::ElementwiseSubNPUKernel); + +REGISTER_OP_NPU_KERNEL( + elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, + ops::ElementwiseSubGradNPUKernel); +#endif diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 5242d03c11c..68179a68574 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -149,6 +149,13 @@ void set_constant_with_place( PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); } +template <> +void set_constant_with_place( + const platform::DeviceContext& context, framework::Tensor* tensor, + float value) { + PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); +} + template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc new file mode 100644 index 00000000000..7af6de52241 --- /dev/null +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -0,0 +1,260 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/npu_op_runner.h" + +#include +#include + +#include +#include +#include + +#include "acl/acl.h" +#include "acl/acl_op_compiler.h" + +#include "paddle/fluid/framework/framework.pb.h" + +namespace paddle { +namespace operators { + +static std::map + DTYPE_2_ACL_DTYPE = { + {framework::proto::VarType::BOOL, ACL_BOOL}, + {framework::proto::VarType::INT16, ACL_INT16}, + {framework::proto::VarType::INT32, ACL_INT32}, + {framework::proto::VarType::INT64, ACL_INT64}, + {framework::proto::VarType::FP16, ACL_FLOAT16}, + {framework::proto::VarType::FP32, ACL_FLOAT}, + {framework::proto::VarType::FP64, ACL_DOUBLE}, +}; + +static std::map DATA_LAYOUT_2_ACL_FORMAT = { + {DataLayout::kNCHW, ACL_FORMAT_NCHW}, + {DataLayout::kNHWC, ACL_FORMAT_NHWC}, + {DataLayout::kAnyLayout, ACL_FORMAT_ND}, +}; + +aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype) { + auto iter = DTYPE_2_ACL_DTYPE.find(dtype); + PADDLE_ENFORCE_NE(iter, DTYPE_2_ACL_DTYPE.end(), + platform::errors::NotFound( + "The data type (%s) can not convert to ACL data type.", + framework::DataTypeToString(dtype))); + return iter->second; +} + +aclFormat ConvertToNpuFormat(DataLayout layout) { + auto iter = DATA_LAYOUT_2_ACL_FORMAT.find(layout); + PADDLE_ENFORCE_NE( + iter, DATA_LAYOUT_2_ACL_FORMAT.end(), + platform::errors::NotFound( + "The data type (%s) can not convert to ACL data type.", layout)); + return iter->second; +} + +NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) { + attr_ = aclopCreateAttr(); +} + +NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector &inputs, + const std::vector &outputs, + const AttributeMap &attrs) + : op_type_(op_type) { + attr_ = aclopCreateAttr(); + AddInputs(inputs); + AddOutputs(outputs); + AddAttrs(attrs); +} + +NpuOpRunner::~NpuOpRunner() { + // TODO(zhiqiu): handle free +} + +const std::string &NpuOpRunner::Type() { return op_type_; } + +NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, + const Attribute &attr) { + if (attr.type() == typeid(bool)) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr))); + } else if (attr.type() == typeid(int)) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int, attr))); + + } else if (attr.type() == typeid(int64_t)) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrInt(attr_, name.c_str(), BOOST_GET_CONST(int64_t, attr))); + } else if (attr.type() == typeid(float)) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrFloat(attr_, name.c_str(), BOOST_GET_CONST(float, attr))); + } else if (attr.type() == typeid(std::vector)) { + auto a = BOOST_GET_CONST(std::vector, attr); + std::vector cast_a; + for (auto it : a) { + cast_a.push_back(static_cast(it)); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrListBool( + attr_, name.c_str(), cast_a.size(), cast_a.data())); + } else if (attr.type() == typeid(std::vector)) { + auto a = BOOST_GET_CONST(std::vector, attr); + std::vector cast_a; + for (auto it : a) { + cast_a.push_back(static_cast(it)); + } + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrListInt(attr_, name.c_str(), cast_a.size(), cast_a.data())); + } else if (attr.type() == typeid(std::vector)) { + auto a = BOOST_GET_CONST(std::vector, attr); + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrListInt(attr_, name.c_str(), a.size(), a.data())); + } else if (attr.type() == typeid(std::vector)) { + auto a = BOOST_GET_CONST(std::vector, attr); + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrListFloat(attr_, name.c_str(), a.size(), 
a.data())); + } else if (attr.type() == typeid(std::string)) { + auto a = BOOST_GET_CONST(std::string, attr); + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrString(attr_, name.c_str(), a.c_str())); + } else if (attr.type() == typeid(std::vector)) { + auto a = BOOST_GET_CONST(std::vector, attr); + std::vector s; + for (auto &it : a) { + s.push_back(it.data()); + } + PADDLE_ENFORCE_NPU_SUCCESS( + aclopSetAttrListString(attr_, name.c_str(), s.size(), s.data())); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Can not convert attribubte '%s' to convert to aclopAttr", name)); + } + return *this; +} + +NpuOpRunner &NpuOpRunner::AddAttrs(const AttributeMap &attrs) { + for (const auto &pair : attrs) { + AddAttr(pair.first, pair.second); + } + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) { + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(tensor)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(tensor)); + return *this; +} + +NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { + // create aclTensorDesc + output_descs_.emplace_back(CreateTensorDesc(tensor)); + // create aclDataBuffer + output_buffers_.emplace_back(CreateDataBuffer(tensor)); + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInputs(const std::vector &tensors) { + for (auto tensor : tensors) { + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(tensor)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(tensor)); + } + return *this; +} + +NpuOpRunner &NpuOpRunner::AddOutputs(const std::vector &tensors) { + for (auto tensor : tensors) { + // create aclTensorDesc + output_descs_.emplace_back(CreateTensorDesc(tensor)); + // create aclDataBuffer + output_buffers_.emplace_back(CreateDataBuffer(tensor)); + } + return *this; +} + +aclTensorDesc *NpuOpRunner::GetInputDesc(size_t index) { + PADDLE_ENFORCE_LT(index, input_descs_.size(), + platform::errors::OutOfRange( + "The index should be less than the size of inputs of " + "operator %s, but got index is %d and size is %d", + Type(), index, input_descs_.size())); + return input_descs_[index]; +} + +aclTensorDesc *NpuOpRunner::GetOutputDesc(size_t index) { + PADDLE_ENFORCE_LT(index, output_descs_.size(), + platform::errors::OutOfRange( + "The index should be less than the size of output of " + "operator %s, but got index is %d and size is %d", + Type(), index, output_descs_.size())); + return output_descs_[index]; +} + +std::vector &NpuOpRunner::GetInputDescs() { + return input_descs_; +} + +std::vector &NpuOpRunner::GetOutputDescs() { + return output_descs_; +} + +std::vector &NpuOpRunner::GetInputBuffers() { + return input_buffers_; +} + +std::vector &NpuOpRunner::GetOutputBuffers() { + return output_buffers_; +} + +aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { + auto dtype = ConvertToNpuDtype(tensor.type()); + auto format = ConvertToNpuFormat(tensor.layout()); + auto dims = framework::vectorize(tensor.dims()); + + VLOG(4) << dtype << " " << dims.size() << " " << dims[0] << "," << dims[1] + << " " << format; + + auto *desc = aclCreateTensorDesc(dtype, dims.size(), dims.data(), format); + PADDLE_ENFORCE_NOT_NULL( + desc, platform::errors::External("Call aclCreateTensorDesc failed.")); + return desc; +} + +aclDataBuffer *NpuOpRunner::CreateDataBuffer(Tensor tensor) { + void *ptr = tensor.data(); + VLOG(4) << "ptr: " << ptr << ", size: " << tensor.memory_size(); + auto *buffer = aclCreateDataBuffer(ptr, tensor.memory_size()); + 
PADDLE_ENFORCE_NOT_NULL( + buffer, platform::errors::External("Call aclCreateDataBuffer failed.")); + return buffer; +} + +void NpuOpRunner::Run(aclrtStream stream) { + VLOG(4) << "op_type: " << op_type_; + VLOG(4) << "input_desc.size: " << input_descs_.size(); + VLOG(4) << "output_desc.size: " << output_descs_.size(); + VLOG(4) << "stream: " << stream; + VLOG(4) << "attr: " << attr_; + aclError ret = aclopCompileAndExecute( + op_type_.c_str(), input_descs_.size(), input_descs_.data(), + input_buffers_.data(), output_descs_.size(), output_descs_.data(), + output_buffers_.data(), attr_, ACL_ENGINE_SYS, ACL_COMPILE_SYS, NULL, + stream); + VLOG(4) << "after aclopCompileAndExecute: " << ret; + PADDLE_ENFORCE_NPU_SUCCESS(ret); +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h new file mode 100644 index 00000000000..c69d8441e5d --- /dev/null +++ b/paddle/fluid/operators/npu_op_runner.h @@ -0,0 +1,84 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include + +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DataLayout = framework::DataLayout; +using Attribute = framework::Attribute; +using AttributeMap = framework::AttributeMap; + +class NpuOpRunner { + public: + explicit NpuOpRunner(std::string op_type); + explicit NpuOpRunner(std::string op_type, + const std::vector &inputs = {}, + const std::vector &outputs = {}, + const AttributeMap &attrs = {}); + + ~NpuOpRunner(); + + const std::string &Type(); + + NpuOpRunner &AddAttr(const std::string &name, const Attribute &attr); + + NpuOpRunner &AddAttrs(const AttributeMap &attrs); + + NpuOpRunner &AddInput(const Tensor &tensor); + + NpuOpRunner &AddOutput(const Tensor &tensor); + + NpuOpRunner &AddInputs(const std::vector &tensors); + + NpuOpRunner &AddOutputs(const std::vector &tensors); + + aclTensorDesc *GetInputDesc(size_t index); + + aclTensorDesc *GetOutputDesc(size_t index); + + std::vector &GetInputDescs(); + + std::vector &GetOutputDescs(); + + std::vector &GetInputBuffers(); + + std::vector &GetOutputBuffers(); + + void Run(aclrtStream stream); + + private: + aclTensorDesc *CreateTensorDesc(Tensor tensor); + aclDataBuffer *CreateDataBuffer(Tensor tensor); + + private: + std::string op_type_; + std::vector input_buffers_; + std::vector output_buffers_; + std::vector input_descs_; + std::vector output_descs_; + aclopAttr *attr_{nullptr}; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 1e16008f36b..584dbd4756a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -76,6 +76,10 @@ if(WITH_ASCEND) cc_library(ascend_npu_info SRCS ascend_npu_info.cc DEPS gflags glog enforce atlas_acl) endif() +if(WITH_ASCEND_CL) + 
cc_library(npu_info SRCS npu_info.cc DEPS gflags glog enforce monitor ascendcl acl_op_compiler) +endif() + add_subdirectory(dynload) add_subdirectory(stream) @@ -91,11 +95,20 @@ IF(WITH_GPU OR WITH_ROCM) set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream) ENDIF() +IF(WITH_ASCEND_CL) + set(NPU_CTX_DEPS npu_stream npu_info) +ENDIF() + IF(WITH_MKLDNN) set(MKLDNN_CTX_DEPS mkldnn) ELSE() set(MKLDNN_CTX_DEPS) ENDIF() + +IF(WITH_ASCEND_CL) +cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +ENDIF() + IF(WITH_GPU) nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() @@ -105,6 +118,8 @@ ENDIF() IF(WITH_GPU OR WITH_ROCM) set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSEIF(WITH_ASCEND_CL) + set(STREAM_CALLBACK_DEPS stream_callback_manager) ELSE() set(STREAM_CALLBACK_DEPS) ENDIF() @@ -118,7 +133,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 22daaf101cf..a0ade3898c3 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -78,13 +78,13 @@ bool AllowTF32Cudnn() { return allow_tf32_cudnn; } DeviceContextPool* DeviceContextPool::pool = nullptr; platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { + VLOG(4) << "DeviceContextPool Get: " << place; auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW(platform::errors::Unimplemented( "Place %s is not supported. Please check that your paddle compiles " - "with WITH_GPU or WITH_XPU option or check that your train process " - "hold the " - "correct gpu_id if you use Executor.", + "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that " + "your train process set the correct device id if you use Executor.", place)); } return it->second.get().get(); @@ -145,6 +145,14 @@ DeviceContextPool::DeviceContextPool( PADDLE_THROW( platform::errors::Unimplemented("XPUPlace is not supported. Please " "re-compile with WITH_XPU option.")); +#endif + } else if (platform::is_npu_place(p)) { +#ifdef PADDLE_WITH_ASCEND_CL + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported. 
Please " + "re-compile with WITH_ASCEND_CL option.")); #endif } } @@ -229,8 +237,35 @@ Place XPUDeviceContext::GetPlace() const { return place_; } xpu::Context* XPUDeviceContext::x_context() const { return context_; } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_ASCEND_CL +NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) { + NPUDeviceGuard guard(place_.device); + // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device)); + // NOTE(zhiqiu): Usually, no need to create context explicitly, + // ACL creates a default context which contains 1 default stream + // and 1 sync strean after aclrtSetDevice. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetCurrentContext(&context_)); + stream_.reset(new stream::NPUStream(place)); +} + +NPUDeviceContext::~NPUDeviceContext() { + // NPUDeviceGuard guard(place_.device); + // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_)); +} +void NPUDeviceContext::Wait() const { + NPUDeviceGuard guard(place_.device); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); +} + +aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } + +Place NPUDeviceContext::GetPlace() const { return place_; } + +aclrtContext NPUDeviceContext::context() const { return context_; } +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { @@ -706,6 +741,5 @@ MKLDNNDeviceContext::BlobPtr_t MKLDNNDeviceContext::GetBlob( } #endif - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 02ad22f780f..face048f28e 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -57,6 +57,9 @@ limitations under the License. */ #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/stream/cuda_stream.h" #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/stream/npu_stream.h" +#endif #include "unsupported/Eigen/CXX11/Tensor" namespace Eigen { @@ -69,6 +72,11 @@ struct GpuDevice; #include "paddle/fluid/platform/xpu_info.h" #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "acl/acl.h" +#include "paddle/fluid/platform/npu_info.h" +#endif + namespace paddle { namespace platform { @@ -87,11 +95,13 @@ enum DeviceType { CPU = 0, CUDA = 1, XPU = 2, + NPU = 3, }; constexpr DeviceType kCPU = DeviceType::CPU; constexpr DeviceType kCUDA = DeviceType::CUDA; constexpr DeviceType kXPU = DeviceType::XPU; +constexpr DeviceType kNPU = DeviceType::NPU; class DeviceContext { public: @@ -163,8 +173,52 @@ struct DefaultDeviceContextType { }; #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDeviceContext : public DeviceContext { + public: + explicit NPUDeviceContext(NPUPlace place); + virtual ~NPUDeviceContext(); + Eigen::DefaultDevice* eigen_device() const { return nullptr; } + Place GetPlace() const override; + aclrtContext context() const; + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const override; + + /*! \brief Return npu stream in the device context. */ + aclrtStream stream() const; + +#ifdef PADDLE_WITH_ASCEND_HCCL + /*! \brief Return bkcl context. */ + HCCLContext_t hccl_context() const { return hccl_context_; } + + /*! \brief Set bkcl context. 
*/ + void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; } +#endif + + private: + NPUPlace place_; + aclrtContext context_; +#ifdef PADDLE_WITH_ASCEND_HCCL + HCCLContext_t hccl_context_; +#endif + + // Need to be the same with other DeviceContext, + // Eventhough eigen_device_ is not used in NPU + // NOTE(zhiqiu): why need? + std::unique_ptr eigen_device_; + std::shared_ptr stream_; + + DISABLE_COPY_AND_ASSIGN(NPUDeviceContext); +}; + +template <> +struct DefaultDeviceContextType { + using TYPE = NPUDeviceContext; +}; +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) class CudnnWorkspaceHandle; class EigenCudaStreamDevice; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index f5045ff004e..4828a97e4df 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_CUDA #include #include #include // NOLINT @@ -186,3 +187,5 @@ CUDNN_DNN_ROUTINE_EACH_R8(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) } // namespace dynload } // namespace platform } // namespace paddle + +#endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 47ade89ff2d..f0809d34d49 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -45,6 +45,10 @@ limitations under the License. */ #include // NOLINT #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "acl/acl.h" +#endif // PADDLE_WITH_ASCEND_CL + #include #include #include @@ -970,7 +974,6 @@ DEFINE_CUDA_STATUS_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); #endif - } // namespace details #define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ @@ -1204,5 +1207,41 @@ inline void retry_sleep(unsigned millisecond) { #undef DEFINE_CUDA_STATUS_TYPE #endif // PADDLE_WITH_HIP +#ifdef PADDLE_WITH_ASCEND_CL +namespace details { +template +struct NPUStatusType {}; + +#define DEFINE_NPU_STATUS_TYPE(type, success_value) \ + template <> \ + struct NPUStatusType { \ + using Type = type; \ + static constexpr Type kSuccess = success_value; \ + } + +DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); +} // namespace details + +inline std::string build_npu_error_msg(aclError stat) { + std::ostringstream sout; + sout << " ACL error, the error code is : " << stat << ". "; + return sout.str(); +} + +#define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __NPU_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::NPUStatusType< \ + __NPU_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_npu_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) +#endif // PADDLE_WITH_ASCEND_CL + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index fa77c0be037..83b9544d232 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -45,7 +45,10 @@ DEFINE_bool(check_nan_inf, false, "Checking whether operator produce NAN/INF or not. 
It will be " "extremely slow so please use this flag wisely."); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// NOTE(zhiqiu): better to share the flags, otherwise we will have too many +// flags. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) /** * CUDA related related FLAG @@ -84,8 +87,15 @@ DEFINE_string(selected_gpus, "", "share-memory only."); #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_ASCEND_CL) +DEFINE_string(selected_npus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (NPU). If you want to use " + "all visible devices, set this to empty string."); +#endif +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) /** * CUDNN related FLAG * Name: FLAGS_cudnn_deterministic @@ -377,7 +387,10 @@ DEFINE_double( "Default use 50% of CPU memory as the pinned_memory for PaddlePaddle," "reserve the rest for page tables, etc"); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +// NOTE(zhiqiu): better to share the flags, otherwise we will have too many +// flags. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) /** * Memory related FLAG diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 3769428c9df..2e66e3e36d0 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -102,6 +102,7 @@ static int GetCUDADeviceCountImpl() { } int GetCUDADeviceCount() { + // cache the count static auto dev_cnt = GetCUDADeviceCountImpl(); return dev_cnt; } diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ea89082733a..ac6988d350f 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -16,6 +16,8 @@ limitations under the License. */ #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/string/split.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -63,6 +65,7 @@ namespace framework { std::once_flag gflags_init_flag; std::once_flag glog_init_flag; +std::once_flag npu_init_flag; bool InitGflags(std::vector args) { bool successed = false; @@ -145,6 +148,17 @@ void InitDevices() { } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_XPU, but no XPU found in runtime."; } +#endif +#ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): use singleton to explicitly init and finalize ACL + platform::AclInstance::Instance(); // NOLINT + try { + // use user specified XPUs in single-node multi-process mode. 
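+    // For example (illustrative, not part of this change): launching with
+    // `export FLAGS_selected_npus=0,1` makes this process create places only
+    // for NPU 0 and NPU 1, while leaving the flag empty falls back to all
+    // devices reported by GetNPUDeviceCount().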
+ devices = platform::GetSelectedNPUDevices(); + } catch (const std::exception &exp) { + LOG(WARNING) + << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime."; + } #endif InitDevices(devices); } @@ -165,6 +179,9 @@ void InitDevices(const std::vector devices) { #endif #ifdef PADDLE_WITH_XPU places.emplace_back(platform::XPUPlace(devices[i])); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + places.emplace_back(platform::NPUPlace(devices[i])); #endif } places.emplace_back(platform::CPUPlace()); diff --git a/paddle/fluid/platform/monitor.cc b/paddle/fluid/platform/monitor.cc index 76554012bf5..1b44cb19654 100644 --- a/paddle/fluid/platform/monitor.cc +++ b/paddle/fluid/platform/monitor.cc @@ -35,3 +35,13 @@ DEFINE_INT_STATUS(STAT_gpu12_mem_size) DEFINE_INT_STATUS(STAT_gpu13_mem_size) DEFINE_INT_STATUS(STAT_gpu14_mem_size) DEFINE_INT_STATUS(STAT_gpu15_mem_size) + +// For Ascend NPU +DEFINE_INT_STATUS(STAT_npu0_mem_size) +DEFINE_INT_STATUS(STAT_npu1_mem_size) +DEFINE_INT_STATUS(STAT_npu2_mem_size) +DEFINE_INT_STATUS(STAT_npu3_mem_size) +DEFINE_INT_STATUS(STAT_npu4_mem_size) +DEFINE_INT_STATUS(STAT_npu5_mem_size) +DEFINE_INT_STATUS(STAT_npu6_mem_size) +DEFINE_INT_STATUS(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/monitor.h b/paddle/fluid/platform/monitor.h index b57fae9daac..0eb9448ce0f 100644 --- a/paddle/fluid/platform/monitor.h +++ b/paddle/fluid/platform/monitor.h @@ -187,3 +187,13 @@ class StatRegistry { USE_INT_STAT(STAT_gpu13_mem_size); \ USE_INT_STAT(STAT_gpu14_mem_size); \ USE_INT_STAT(STAT_gpu15_mem_size) + +#define USE_NPU_MEM_STAT \ + USE_INT_STAT(STAT_npu0_mem_size); \ + USE_INT_STAT(STAT_npu1_mem_size); \ + USE_INT_STAT(STAT_npu2_mem_size); \ + USE_INT_STAT(STAT_npu3_mem_size); \ + USE_INT_STAT(STAT_npu4_mem_size); \ + USE_INT_STAT(STAT_npu5_mem_size); \ + USE_INT_STAT(STAT_npu6_mem_size); \ + USE_INT_STAT(STAT_npu7_mem_size) diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc new file mode 100644 index 00000000000..69204363993 --- /dev/null +++ b/paddle/fluid/platform/npu_info.cc @@ -0,0 +1,409 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/npu_info.h" +#include +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/platform/lock_guard_ptr.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/monitor.h" +#include "paddle/fluid/string/split.h" + +DECLARE_double(fraction_of_gpu_memory_to_use); +DECLARE_uint64(initial_gpu_memory_in_mb); +DECLARE_uint64(reallocate_gpu_memory_in_mb); +DECLARE_bool(enable_cublas_tensor_op_math); +DECLARE_uint64(gpu_memory_limit_mb); +DECLARE_string(selected_npus); + +constexpr static float fraction_reserve_gpu_memory = 0.05f; + +USE_NPU_MEM_STAT; + +namespace paddle { +namespace platform { + +static int GetNPUDeviceCountImpl() { + uint32_t count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDeviceCount(&count)); + return count; +} + +int GetNPUDeviceCount() { + static auto dev_cnt = GetNPUDeviceCountImpl(); + return dev_cnt; +} + +int NPUCanAccessPeer(int src, int dst) { + int can = 0; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDeviceCanAccessPeer(&can, src, dst)); + return can; +} + +// For example, "1.0.1" +std::string GetNPURuntimeVersion(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + int major = 0, minor = 0, patch = 0; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetVersion(&major, &minor, &patch)); + return string::Sprintf("%d.%d.%d", major, minor, patch); +} + +int GetCurrentNPUDeviceId() { + int device_id; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtGetDevice(&device_id)); + return device_id; +} + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedNPUDevices() { + // use user specified NPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_npus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_npus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetNPUDeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + +void SetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. NPU count is: %d.", + id, GetNPUDeviceCount())); + // NOTE(zihqiu): It is recommended to call aclrtSetDevice and aclrtResetDevice + // pairly. + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(id)); +} + +void ResetNPUDeviceId(int id) { + PADDLE_ENFORCE_LT(id, GetNPUDeviceCount(), + platform::errors::InvalidArgument( + "Device id must be less than NPU count, " + "but received id is: %d. 
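+  // In this patch the pairing is handled by AclInstance: it calls
+  // SetNPUDeviceId for every selected device when it is constructed and
+  // aclrtResetDevice for each of them in Finalize().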
NPU count is: %d.", + id, GetNPUDeviceCount())); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtResetDevice(id)); +} + +void NPUMemoryUsage(size_t *available, size_t *total) { + size_t actual_available, actual_total; + RecordedNPUMemGetInfo(available, total, &actual_available, &actual_total, + platform::GetCurrentNPUDeviceId()); +} + +size_t NPUAvailableMemToAlloc() { + size_t total = 0; + size_t available = 0; + NPUMemoryUsage(&available, &total); + size_t reserving = + static_cast(fraction_reserve_gpu_memory * available); + // If available size is less than minimum chunk size, no usable memory exists + size_t available_to_alloc = available - reserving; + size_t min_chunk_size = NPUMinChunkSize(); + if (available_to_alloc < min_chunk_size) { + available_to_alloc = 0; + } + VLOG(10) << "NPU usage " << (available >> 20) << "M/" << (total >> 20) + << "M, " << (available_to_alloc >> 20) << "M available to allocate"; + return available_to_alloc; +} + +size_t NPUMaxAllocSize() { + return std::max(NPUInitAllocSize(), NPUReallocSize()); +} + +static size_t NPUAllocSize(bool realloc) { + size_t available_to_alloc = NPUAvailableMemToAlloc(); + PADDLE_ENFORCE_GT( + available_to_alloc, 0, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be + // allocated by fraction + size_t flag_mb = realloc ? FLAGS_reallocate_gpu_memory_in_mb + : FLAGS_initial_gpu_memory_in_mb; + size_t alloc_bytes = + (flag_mb > 0ul ? flag_mb << 20 : available_to_alloc * + FLAGS_fraction_of_gpu_memory_to_use); + PADDLE_ENFORCE_GE( + available_to_alloc, alloc_bytes, + platform::errors::ResourceExhausted("Not enough available NPU memory.")); + VLOG(10) << "Alloc size is " << (alloc_bytes >> 20) + << " MiB, is it Re-alloc: " << realloc; + return alloc_bytes; +} + +size_t NPUInitAllocSize() { return NPUAllocSize(/* realloc = */ false); } + +size_t NPUReallocSize() { return NPUAllocSize(/* realloc = */ true); } + +size_t NPUMinChunkSize() { + // Allow to allocate the minimum chunk size is 256 bytes. + return 1 << 8; +} + +size_t NPUMaxChunkSize() { + size_t max_chunk_size = NPUMaxAllocSize(); + VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M"; + return max_chunk_size; +} + +void NPUMemcpyAsync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count) { + dst_max_count = dst_max_count ? dst_max_count : count; + VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " " + << kind << " " << stream; + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream)); +} + +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count) { + // NOTE(zhiqiu): The default max_count is count + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); +} + +void NPUMemcpyPeerASync(void *dst, int dst_device, const void *src, + size_t count, enum aclrtMemcpyKind kind, + aclrtStream stream, size_t dst_max_count) { + dst_max_count = dst_max_count ? dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtMemcpyAsync(dst, dst_max_count, src, count, kind, stream)); +} + +void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count) { + // NOTE(zhiqiu): The default max_count is count + dst_max_count = dst_max_count ? 
dst_max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); +} + +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count) { + max_count = max_count ? max_count : count; + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtMemsetAsync(dst, max_count, value, count, stream)); +} + +void NPUStreamSync(aclrtStream stream) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); +} + +static void RaiseNonOutOfMemoryError(aclError *status) { + if (*status == ACL_ERROR_BAD_ALLOC) { + *status = ACL_ERROR_NONE; + } + PADDLE_ENFORCE_NPU_SUCCESS(*status); +} + +class RecordedNPUMallocHelper { + private: + explicit RecordedNPUMallocHelper(int dev_id, uint64_t limit_size = 0) + : dev_id_(dev_id), limit_size_(limit_size) { + if (NeedRecord()) { + mtx_.reset(new std::mutex()); + } + } + + DISABLE_COPY_AND_ASSIGN(RecordedNPUMallocHelper); + + public: + static RecordedNPUMallocHelper *Instance(int dev_id) { + std::call_once(once_flag_, [] { + int dev_cnt = GetNPUDeviceCount(); + instances_.reserve(dev_cnt); + for (int i = 0; i < dev_cnt; ++i) { + // NOTE(zhiqiu): share the flags with gpu, avoid more flags. + instances_.emplace_back( + new RecordedNPUMallocHelper(i, FLAGS_gpu_memory_limit_mb << 20)); + } + }); + + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::OutOfRange( + "Device id must be not less than 0, but got %d.", dev_id)); + PADDLE_ENFORCE_LT( + dev_id, instances_.size(), + platform::errors::OutOfRange("Device id %d exceeds npu card number %d.", + dev_id, instances_.size())); + return instances_[dev_id].get(); + } + + /** + * Try to allocate `size` npu memory. Only ACL_ERROR_BAD_ALLOC + * or ACL_ERROR_NONE would be returned. + */ + aclError Malloc(void **ptr, size_t size) { + LockGuardPtr lock(mtx_); + if (UNLIKELY(NeedRecord() && cur_size_ + size > limit_size_)) { + return ACL_ERROR_BAD_ALLOC; + } + + NPUDeviceGuard guard(dev_id_); + auto result = aclrtMalloc(ptr, size, ACL_MEM_MALLOC_HUGE_FIRST); + if (result == ACL_ERROR_NONE) { + if (NeedRecord()) { + cur_size_ += size; + } + STAT_INT_ADD("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size); + return result; + } else { + RaiseNonOutOfMemoryError(&result); + // Non out of memory error would be raised inside + // RaiseNonOutOfMemoryError. Therefore, we can + // return cudaErrorMemoryAllocation directly here. + return ACL_ERROR_BAD_ALLOC; + } + } + + /** + * Free gpu memory. Usually, free is not allowed to raise error. + * If it does raise error, the process should be crashed. 
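+   *
+   * The bookkeeping mirrors Malloc: when a memory limit is configured
+   * (NeedRecord() is true), `size` is subtracted from cur_size_ under the
+   * mutex and the per-device memory-size stat is decreased accordingly.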
+ */ + void Free(void *ptr, size_t size) { + NPUDeviceGuard guard(dev_id_); + auto result = aclrtFree(ptr); + PADDLE_ENFORCE_NPU_SUCCESS(result); + if (NeedRecord()) { + std::lock_guard guard(*mtx_); + cur_size_ -= size; + } + STAT_INT_SUB("STAT_npu" + std::to_string(dev_id_) + "_mem_size", size); + } + + bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total) { + { + NPUDeviceGuard guard(dev_id_); + auto result = aclrtGetMemInfo(ACL_HBM_MEM, actual_avail, actual_total); + if (result != ACL_ERROR_NONE) { + *actual_avail = 0; + } + RaiseNonOutOfMemoryError(&result); + } + + if (NeedRecord()) { + std::lock_guard guard(*mtx_); + *avail = std::min(*actual_avail, limit_size_ - cur_size_); + *total = std::min(*actual_total, limit_size_); + return *total < *actual_total; + } else { + *avail = *actual_avail; + *total = *actual_total; + return false; + } + } + + inline bool NeedRecord() const { return limit_size_ != 0; } + + uint64_t RecordedSize() const { + LockGuardPtr lock(mtx_); + return NeedRecord() ? cur_size_ : 0; + } + + uint64_t LimitSize() const { return limit_size_; } + + private: + const int dev_id_; + const uint64_t limit_size_; + uint64_t cur_size_{0}; + + mutable std::unique_ptr mtx_; + + static std::once_flag once_flag_; + static std::vector> instances_; +}; + +std::once_flag RecordedNPUMallocHelper::once_flag_; +std::vector> + RecordedNPUMallocHelper::instances_; + +aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id) { + return RecordedNPUMallocHelper::Instance(dev_id)->Malloc(ptr, size); +} + +void RecordedNPUFree(void *p, size_t size, int dev_id) { + return RecordedNPUMallocHelper::Instance(dev_id)->Free(p, size); +} + +bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id) { + return RecordedNPUMallocHelper::Instance(dev_id)->GetMemInfo( + avail, total, actual_avail, actual_total); +} + +uint64_t RecordedNPUMallocSize(int dev_id) { + return RecordedNPUMallocHelper::Instance(dev_id)->RecordedSize(); +} + +bool IsNPUMallocRecorded(int dev_id) { + return RecordedNPUMallocHelper::Instance(dev_id)->NeedRecord(); +} + +AclInstance::~AclInstance() {} + +AclInstance &AclInstance::Instance() { + static AclInstance instance; + return instance; +} + +AclInstance::AclInstance() { + PADDLE_ENFORCE_NPU_SUCCESS(aclInit(nullptr)); + VLOG(4) << "Call aclrtSetDevice "; + // NOTE(zhiqiu): why set devices here? + // Because ACL creates a default context which contains 2 streams + // when calling aclrtSetDeviceId, so usually we do not need to + // create contexts explicitly. And, for each device, aclrtSetDeviceId + // need to call parily with aclrtResetDeviceId to destory the default + // context. Here, we use this singleton and static instance to manage + // the devices to make sure they will be resetted before program exit. + devices_ = platform::GetSelectedNPUDevices(); + for (auto it = devices_.rbegin(); it != devices_.rend(); ++it) { + SetNPUDeviceId(*it); + VLOG(4) << "Call aclrtSetDevice " << *it; + } +} + +void AclInstance::Finalize() { + // NOTE(zhiqiu): DO NOT perform finalize in destructor + // to avoid problems caused by destructor order of static + // object. 
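+  // Finalize() is therefore expected to be called explicitly before process
+  // exit (its call site is outside this diff). It mirrors the constructor:
+  // every device set up there is reset via aclrtResetDevice, and aclFinalize
+  // undoes the aclInit done at construction.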
+ for (size_t i = 0; i < devices_.size(); ++i) { + auto status = aclrtResetDevice(devices_[i]); + VLOG(4) << "Call aclrtResetDevice " << devices_[i] + << " status = " << status; + } + auto status = aclFinalize(); + VLOG(4) << "Call aclFinalize, status = " << status; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/npu_info.h b/paddle/fluid/platform/npu_info.h new file mode 100644 index 00000000000..648b18531b2 --- /dev/null +++ b/paddle/fluid/platform/npu_info.h @@ -0,0 +1,156 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include + +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +//! Get the total number of NPU devices in system. +int GetNPUDeviceCount(); + +//! Get the runtime version of the ith NPU +std::string GetNPURuntimeVersion(int id); +//! Check if this device can access peer or not. +int NPUCanAccessPeer(int src, int dst); + +//! Get the current NPU device id in system. +int GetCurrentNPUDeviceId(); + +//! Get the current NPU stream. +int GetCurrentStream(); + +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedNPUDevices(); + +//! Set the NPU device id for next execution. +void SetNPUDeviceId(int device_id); + +//! Reset the NPU device id for next execution. +void ResetNPUDeviceId(int device_id); + +//! Get the memory usage of current NPU device. +void NPUMemoryUsage(size_t *available, size_t *total); + +//! Get the available memory to allocate, which is the size of available npu +//! minus reserving. +size_t NPUAvailableMemToAlloc(); + +//! Get the maximum allocation size of current NPU device. +size_t NPUMaxAllocSize(); + +//! Get the initial allocation size of current NPU device. +size_t NPUInitAllocSize(); + +//! Get the re-allocation size of current NPU device. +size_t NPUReallocSize(); + +//! Get the minimum chunk size for NPU buddy allocator. +size_t NPUMinChunkSize(); + +//! Get the maximum chunk size for NPU buddy allocator. +size_t NPUMaxChunkSize(); + +//! Copy memory from address src to dst asynchronously. +void NPUMemcpyAsync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, aclrtStream stream, + size_t dst_max_count = 0); + +//! Copy memory from address src to dst synchronously. +void NPUMemcpySync(void *dst, const void *src, size_t count, + enum aclrtMemcpyKind kind, size_t dst_max_count = 0); + +//! Set memory dst with value count size asynchronously +void NPUMemsetAsync(void *dst, int value, size_t count, aclrtStream stream, + size_t max_count = 0); + +//! Copy memory from one device to another device asynchronously. +void NPUMemcpyPeerAsync(void *dst, int dst_device, const void *src, + int src_device, size_t count, aclrtStream stream, + size_t max_count = 0); + +//! Copy memory from one device to another device synchronously. 
+void NPUMemcpyPeerSync(void *dst, int dst_device, const void *src, + int src_device, size_t count, size_t max_count = 0); + +//! Blocks until stream has completed all operations. +void NPUStreamSync(aclrtStream stream); + +//! aclrtMalloc with recorded info +aclError RecordedNPUMalloc(void **ptr, size_t size, int dev_id); + +//! aclrtFree with recorded info +void RecordedNPUFree(void *p, size_t size, int dev_id); + +//! Get available and total gpu memory with considering limitation +bool RecordedNPUMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail, + size_t *actual_total, int dev_id); + +//! Get recorded actrtMalloc size. If record is disabled, return 0. +uint64_t RecordedNPUMallocSize(int dev_id); + +bool IsNPUMallocRecorded(int dev_id); + +class NPUDeviceGuard { + public: + explicit inline NPUDeviceGuard(int dev_id) { + int prev_id = platform::GetCurrentNPUDeviceId(); + if (prev_id != dev_id) { + prev_id_ = prev_id; + platform::SetNPUDeviceId(dev_id); + } + } + + inline ~NPUDeviceGuard() { + if (prev_id_ != -1) { + platform::SetNPUDeviceId(prev_id_); + } + } + + NPUDeviceGuard(const NPUDeviceGuard &o) = delete; + NPUDeviceGuard &operator=(const NPUDeviceGuard &o) = delete; + + private: + int prev_id_{-1}; +}; + +class AclInstance { + public: + // NOTE(zhiiu): Commonly, exception in destructor is not recommended, so + // no PADDLE_ENFORCE here, call acl API directly. + ~AclInstance(); + AclInstance(const AclInstance &o) = delete; + const AclInstance &operator=(const AclInstance &o) = delete; + static AclInstance &Instance(); + void Finalize(); + + private: + // forbid calling default constructor + AclInstance(); + std::vector devices_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index b80d2fd1632..1cc9fd9fe76 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -33,6 +33,7 @@ class PlacePrinter : public boost::static_visitor<> { os_ << "CUDAPlace(" << p.device << ")"; } void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } + void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -49,6 +50,10 @@ bool is_xpu_place(const Place &p) { return boost::apply_visitor(IsXPUPlace(), p); } +bool is_npu_place(const Place &p) { + return boost::apply_visitor(IsNPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -67,6 +72,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return true; } else if (is_xpu_place(p1)) { return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); + } else if (is_npu_place(p1)) { + return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index e11ca4159e0..f20fac477d0 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -72,16 +72,31 @@ struct XPUPlace { int device; }; +struct NPUPlace { + NPUPlace() : NPUPlace(0) {} + explicit NPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const NPUPlace &o) const { return device == o.device; } + inline bool operator!=(const NPUPlace &o) const { return !(*this == o); } + inline bool operator<(const 
NPUPlace &o) const { return device < o.device; } + + int device; +}; + struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } - bool operator()(const CUDAPlace &gpu) const { return true; } + bool operator()(const NPUPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return true; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; struct IsCPUPlace : public boost::static_visitor { - bool operator()(const CPUPlace &cpu) const { return true; } + bool operator()(const CPUPlace &) const { return true; } bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; @@ -89,27 +104,38 @@ struct IsCPUPlace : public boost::static_visitor { struct IsCUDAPinnedPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; } }; struct IsXPUPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } - bool operator()(const XPUPlace &xpu) const { return true; } + bool operator()(const XPUPlace &) const { return true; } + bool operator()(const NPUPlace &) const { return false; } bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDAPinnedPlace &) const { return false; } }; -class Place - : public boost::variant { +struct IsNPUPlace : public boost::static_visitor { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const XPUPlace &) const { return false; } + bool operator()(const NPUPlace &) const { return true; } + bool operator()(const CUDAPlace &) const { return false; } + bool operator()(const CUDAPinnedPlace &) const { return false; } +}; + +class Place : public boost::variant { private: using PlaceBase = - boost::variant; + boost::variant; public: Place() = default; Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT + Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT : PlaceBase(cuda_pinned_place) {} @@ -126,6 +152,7 @@ using PlaceList = std::vector; bool is_gpu_place(const Place &); bool is_xpu_place(const Place &); +bool is_npu_place(const Place &); bool is_cpu_place(const Place &); bool is_cuda_pinned_place(const Place &); bool places_are_same_class(const Place &, const Place &); @@ -153,6 +180,16 @@ struct PlaceVisitorWrapper #endif } + typename Visitor::result_type operator()(const NPUPlace &npu) const { +#ifdef PADDLE_WITH_ASCEND + return visitor_(npu); +#else + PADDLE_THROW(platform::errors::Unavailable( + "Paddle is not compiled with NPU. 
Cannot visit npu device")); + return typename Visitor::result_type(); +#endif + } + typename Visitor::result_type operator()(const CUDAPlace &cuda) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return visitor_(cuda); diff --git a/paddle/fluid/platform/stream/CMakeLists.txt b/paddle/fluid/platform/stream/CMakeLists.txt index c0595eb415d..e1e3e49ce9c 100644 --- a/paddle/fluid/platform/stream/CMakeLists.txt +++ b/paddle/fluid/platform/stream/CMakeLists.txt @@ -1,3 +1,7 @@ IF(WITH_GPU OR WITH_ROCM) cc_library(cuda_stream SRCS cuda_stream.cc DEPS enforce boost) ENDIF() + +IF(WITH_ASCEND_CL) +cc_library(npu_stream SRCS npu_stream.cc DEPS enforce boost stream_callback_manager) +ENDIF() diff --git a/paddle/fluid/platform/stream/cuda_stream.cc b/paddle/fluid/platform/stream/cuda_stream.cc index fc51a08c2aa..6c6a47fadb5 100644 --- a/paddle/fluid/platform/stream/cuda_stream.cc +++ b/paddle/fluid/platform/stream/cuda_stream.cc @@ -49,8 +49,8 @@ bool CUDAStream::Init(const Place& place, const Priority& priority) { cudaStreamCreateWithPriority(&stream_, kDefaultFlag, 0)); #endif } - callback_manager_.reset(new StreamCallbackManager(stream_)); - VLOG(3) << "CUDAStream Init stream: " << stream_ + callback_manager_.reset(new StreamCallbackManager(stream_)); + VLOG(3) << "GPUStream Init stream: " << stream_ << ", priority: " << static_cast(priority); return true; } diff --git a/paddle/fluid/platform/stream/cuda_stream.h b/paddle/fluid/platform/stream/cuda_stream.h index d9375492519..46bbe94b080 100644 --- a/paddle/fluid/platform/stream/cuda_stream.h +++ b/paddle/fluid/platform/stream/cuda_stream.h @@ -101,7 +101,7 @@ class CUDAStream final { cudaStream_t stream_{nullptr}; #endif Priority priority_{Priority::kNormal}; - std::unique_ptr callback_manager_; + std::unique_ptr> callback_manager_; DISABLE_COPY_AND_ASSIGN(CUDAStream); }; diff --git a/paddle/fluid/platform/stream/npu_stream.cc b/paddle/fluid/platform/stream/npu_stream.cc new file mode 100644 index 00000000000..2664ac7194b --- /dev/null +++ b/paddle/fluid/platform/stream/npu_stream.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/stream/npu_stream.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace platform { +namespace stream { + +bool NPUStream::Init(const Place& place) { + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::InvalidArgument( + "NPU stream must be created using npu place.")); + place_ = place; + NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream_)); + callback_manager_.reset(new StreamCallbackManager(stream_)); + VLOG(3) << "NPUStream Init stream: " << stream_; + return true; +} + +void NPUStream::Destroy() { + NPUDeviceGuard guard(BOOST_GET_CONST(NPUPlace, place_).device); + Wait(); + WaitCallback(); + if (stream_) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream_)); + } + stream_ = nullptr; +} + +void NPUStream::Wait() const { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_)); +} + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream/npu_stream.h b/paddle/fluid/platform/stream/npu_stream.h new file mode 100644 index 00000000000..7e5d574acec --- /dev/null +++ b/paddle/fluid/platform/stream/npu_stream.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include + +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream_callback_manager.h" + +namespace paddle { +namespace platform { +namespace stream { + +#ifdef PADDLE_WITH_ASCEND_CL + +class NPUStream final { + public: + NPUStream() = default; + explicit NPUStream(const Place& place) { Init(place); } + virtual ~NPUStream() { Destroy(); } + + bool Init(const Place& place); + + template + void AddCallback(Callback&& callback) const { + callback_manager_->AddCallback(callback); + } + + template + void RecordEvent(aclrtEvent ev, Callback callback) const { + callback(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(ev, stream_)); + } + + void RecordEvent(aclrtEvent ev) const { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(ev, stream_)); + } + + void WaitEvent(aclrtEvent ev) const { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream_, ev)); + } + + void Wait() const; + void WaitCallback() const { callback_manager_->Wait(); } + + aclrtStream raw_stream() const { return stream_; } + void Destroy(); + + private: + Place place_; + aclrtStream stream_{nullptr}; + std::unique_ptr> callback_manager_; + + DISABLE_COPY_AND_ASSIGN(NPUStream); +}; + +#endif + +} // namespace stream +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index d6b106dc582..287c8fc37e0 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -21,11 +21,18 @@ namespace platform { #ifdef PADDLE_WITH_HIP static void StreamCallbackFunc(gpuStream_t stream, gpuError_t status, void *user_data) -#elif CUDA_VERSION >= 10000 -static void CUDART_CB StreamCallbackFunc(void *user_data) +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 + static void CUDART_CB StreamCallbackFunc(void *user_data) #else -static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) + static void CUDART_CB + StreamCallbackFunc(cudaStream_t stream, cudaError_t status, void *user_data) +#endif +#endif + +#if PADDLE_WITH_ASCEND_CL + static void StreamCallbackFunc(void *user_data) #endif { std::unique_ptr> func( @@ -33,10 +40,13 @@ static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, (*func)(); } -StreamCallbackManager::StreamCallbackManager(const gpuStream_t stream) +template +StreamCallbackManager::StreamCallbackManager(const Stream stream) : stream_(stream), thread_pool_(1) {} -void StreamCallbackManager::AddCallback(std::function callback) const { +template +void StreamCallbackManager::AddCallback( + std::function callback) const { auto *callback_func = new std::function(std::move(callback)); auto *func = new std::function([this, callback_func] { std::lock_guard lock(mtx_); @@ -45,23 +55,37 @@ void StreamCallbackManager::AddCallback(std::function callback) const { (*callback_func)(); }); }); + #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_CUDA_SUCCESS( hipStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); -#elif CUDA_VERSION >= 10000 +#endif +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10000 PADDLE_ENFORCE_CUDA_SUCCESS( cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); #else PADDLE_ENFORCE_CUDA_SUCCESS( cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); #endif +#endif + +#if PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func, 
+ ACL_CALLBACK_BLOCK, stream_)); +#endif } -void StreamCallbackManager::Wait() const { +template +void StreamCallbackManager::Wait() const { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream_)); -#else +#endif +#ifdef PADDLE_WITH_CUDA PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream_)); +#endif +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_)); #endif { std::lock_guard lock(mtx_); @@ -71,5 +95,15 @@ void StreamCallbackManager::Wait() const { } } +#ifdef PADDLE_WITH_CUDA +template struct StreamCallbackManager; +#endif +#ifdef PADDLE_WITH_HIP +template struct StreamCallbackManager; +#endif +#ifdef PADDLE_WITH_ASCEND_CL +template struct StreamCallbackManager; +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 56e8f83b5a5..1b960f188ec 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -37,9 +37,10 @@ namespace platform { // NOTE(zjl): clean StreamCallbackManager to make compilation faster // Make StreamCallbackManager thread-safe +template class StreamCallbackManager { public: - explicit StreamCallbackManager(const gpuStream_t stream); + explicit StreamCallbackManager(const Stream stream); ~StreamCallbackManager() = default; @@ -48,7 +49,7 @@ class StreamCallbackManager { void Wait() const; private: - const gpuStream_t stream_; + const Stream stream_; mutable ::ThreadPool thread_pool_; mutable std::mutex mtx_; mutable std::future last_future_; diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index e8ba16398d2..bc8d1e5b405 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -88,10 +88,17 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb); // others DECLARE_bool(sync_nccl_allreduce); #endif + #ifdef PADDLE_WITH_XPU // device management DECLARE_string(selected_xpus); #endif + +#ifdef PADDLE_WITH_ASCEND_CL +// device management +DECLARE_string(selected_npus); +#endif + #ifdef PADDLE_WITH_DISTRIBUTE DECLARE_int32(rpc_send_thread_num); DECLARE_int32(rpc_get_thread_num); @@ -374,6 +381,11 @@ static void RegisterGlobalVarGetterSetter() { #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); #endif + +#ifdef PADDLE_WITH_ASCEND_CL + REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_npus); +#endif + #ifdef PADDLE_WITH_DITRIBUTE REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_rpc_send_thread_num, FLAGS_rpc_get_thread_num, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 215c81a00e8..428c7c2420b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -107,6 +107,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_info.h" +#endif + #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_info.h" #endif @@ -163,6 +167,14 @@ bool IsCompiledWithXPU() { #endif } +bool IsCompiledWithNPU() { +#ifndef PADDLE_WITH_ASCEND_CL + return false; +#else + return true; +#endif +} + bool IsCompiledWithMKLDNN() { #ifndef PADDLE_WITH_MKLDNN return false; @@ -569,6 +581,11 @@ PYBIND11_MODULE(core_noavx, m) { make_ddim(x_dim), make_ddim(y_dim), -1)); }); +#ifdef PADDLE_WITH_ASCEND_CL + m.def("_npu_finalize", + []() { platform::AclInstance::Instance().Finalize(); }); +#endif + m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -641,6 +658,10 @@ PYBIND11_MODULE(core_noavx, m) { [](framework::Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); }) + .def("_alloc_float", + [](framework::Tensor &self, paddle::platform::NPUPlace &place) { + self.mutable_data(place); + }) .def("_alloc_double", [](framework::Tensor &self, paddle::platform::CPUPlace &place) { self.mutable_data(place); @@ -688,12 +709,19 @@ PYBIND11_MODULE(core_noavx, m) { return reinterpret_cast(self.mutable_data(place, type)); }) .def("_clear", &framework::Tensor::clear) + .def("_mutable_data", + [](framework::Tensor &self, paddle::platform::NPUPlace &place, + paddle::framework::proto::VarType::Type type) { + return reinterpret_cast(self.mutable_data(place, type)); + }) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) + .def("set", SetTensorFromPyArray, + py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, R"DOC( @@ -701,7 +729,7 @@ PYBIND11_MODULE(core_noavx, m) { Args: lod (numpy.ndarray): The data to set. - place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace): The place where the + place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the LoDTensor is to be set. zero_copy (bool, optional): Whether to share memory with the input numpy array. This parameter only works with CPUPlace. Default: False. @@ -1429,6 +1457,18 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::XPUDeviceContext(place); #endif }) + .def_static("create", + [](paddle::platform::NPUPlace& place) + -> paddle::platform::DeviceContext* { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW( + platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); +#else + return new paddle::platform::NPUDeviceContext(place); +#endif + }) .def_static("create", [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { @@ -1529,6 +1569,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_get_device_id", @@ -1598,6 +1639,7 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_XPU m.def("get_xpu_device_count", platform::GetXPUDeviceCount); #endif + py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. 
It represents a CPU device on which a tensor will be allocated and a model will run. @@ -1613,6 +1655,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_type", &PlaceIndex) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1650,6 +1693,8 @@ All parameter, weight, gradient are variables in Paddle. &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", @@ -1657,6 +1702,65 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); + // NPUPlace + py::class_(m, "NPUPlace", R"DOC( + NPUPlace is a descriptor of a device. + It represents a NPU device on which a tensor will be allocated and a model will run. + + Examples: + .. code-block:: python + import paddle + npu_place = paddle.NPUPlace(0) + + )DOC") + .def("__init__", + [](platform::NPUPlace &self, int dev_id) { +#ifdef PADDLE_WITH_ASCEND_CL + if (UNLIKELY(dev_id < 0)) { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), device id must be 0 or " + "positive integer", + dev_id); + std::exit(-1); + } + if (UNLIKELY(dev_id >= platform::GetNPUDeviceCount())) { + if (platform::GetNPUDeviceCount() == 0) { + LOG(ERROR) << "Cannot use NPU because there is no NPU " + "detected on your " + "machine."; + std::exit(-1); + } else { + LOG(ERROR) << string::Sprintf( + "Invalid NPUPlace(%d), must inside [0, %d), because NPU " + "number on your machine is %d", + dev_id, platform::GetNPUDeviceCount(), + platform::GetNPUDeviceCount()); + std::exit(-1); + } + } + new (&self) platform::NPUPlace(dev_id); +#else + LOG(ERROR) << string::Sprintf( + "Cannot use NPU because you have installed CPU/GPU version " + "PaddlePaddle.\n" + "If you want to use NPU, please try to install NPU version " + "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "If you only have CPU, please change NPUPlace(%d) to be " + "CPUPlace().\n", + dev_id); + std::exit(-1); +#endif + }) + .def("_type", &PlaceIndex) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) + .def("_equals", + &IsSamePlace) + .def("__str__", string::to_string); + py::class_(m, "Place") .def(py::init<>()) .def("_type", &PlaceIndex) @@ -1664,6 +1768,7 @@ All parameter, weight, gradient are variables in Paddle. .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) + .def("_equals", &IsSamePlace) .def("_equals", &IsSamePlace) .def("is_gpu_place", [](platform::Place &self) { return platform::is_gpu_place(self); }) @@ -1671,6 +1776,8 @@ All parameter, weight, gradient are variables in Paddle. [](platform::Place &self) { return platform::is_cpu_place(self); }) .def("is_xpu_place", [](platform::Place &self) { return platform::is_xpu_place(self); }) + .def("is_npu_place", + [](platform::Place &self) { return platform::is_npu_place(self); }) .def("is_cuda_pinned_place", [](platform::Place &self) { return platform::is_cuda_pinned_place(self); @@ -1683,6 +1790,10 @@ All parameter, weight, gradient are variables in Paddle. 
[](platform::Place &self) { return BOOST_GET_CONST(platform::XPUPlace, self).device; }) + .def("npu_device_id", + [](platform::Place &self) { + return BOOST_GET_CONST(platform::NPUPlace, self).device; + }) .def("set_place", [](platform::Place &self, const platform::Place &other) { self = other; }) .def("set_place", @@ -1702,6 +1813,10 @@ All parameter, weight, gradient are variables in Paddle. const platform::CUDAPinnedPlace &cuda_pinned_place) { self = cuda_pinned_place; }) + .def("set_place", + [](platform::Place &self, const platform::NPUPlace &npu_place) { + self = npu_place; + }) .def("__repr__", string::to_string) .def("__str__", string::to_string); @@ -1726,6 +1841,9 @@ All parameter, weight, gradient are variables in Paddle. .def("run", [](OperatorBase &self, const Scope &scope, const platform::XPUPlace &place) { self.Run(scope, place); }) + .def("run", + [](OperatorBase &self, const Scope &scope, + const platform::NPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, const platform::CUDAPlace &place) { self.Run(scope, place); }) @@ -1828,6 +1946,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); + m.def("is_compiled_with_npu", IsCompiledWithNPU); m.def("is_compiled_with_xpu", IsCompiledWithXPU); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("supports_bfloat16", SupportsBfloat16); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5f252170070..ab1dd8a180b 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -294,6 +294,22 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_npu_place(place)) { +#ifdef PADDLE_WITH_ASCEND_CL + platform::Place tmp_place = place; + platform::NPUDeviceGuard guard( + BOOST_GET_CONST(platform::NPUPlace, tmp_place).device); + auto dst = self->mutable_data(place); + platform::NPUMemcpySync(dst, array.data(), array.nbytes(), + ACL_MEMCPY_HOST_TO_DEVICE); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(place); + ctx.Wait(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version. " + "Please recompile or reinstall Paddle with NPU support.")); #endif } else { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 38ed76a87cd..a886f7a0298 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "gtest/gtest.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/platform/npu_info.h" int main(int argc, char** argv) { paddle::memory::allocation::UseAllocatorStrategyGFlag(); @@ -38,11 +39,13 @@ int main(int argc, char** argv) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ + defined(PADDLE_WITH_ASCEND_CL) envs.push_back("fraction_of_gpu_memory_to_use"); envs.push_back("initial_gpu_memory_in_mb"); envs.push_back("reallocate_gpu_memory_in_mb"); envs.push_back("allocator_strategy"); + envs.push_back("selected_gpus"); #elif __clang__ envs.push_back("use_mkldnn"); envs.push_back("initial_cpu_memory_in_mb"); @@ -61,6 +64,10 @@ int main(int argc, char** argv) { undefok.push_back("initial_cpu_memory_in_mb"); #endif +#if defined(PADDLE_WITH_ASCEND_CL) + envs.push_back("selected_npus"); +#endif + char* env_str = nullptr; if (envs.size() > 0) { std::string env_string = "--tryfromenv="; @@ -93,6 +100,10 @@ int main(int argc, char** argv) { int ret = RUN_ALL_TESTS(); +#ifdef PADDLE_WITH_ASCEND_CL + paddle::platform::AclInstance::Instance().Finalize(); +#endif + if (env_str) free(env_str); if (undefok_str) free(undefok_str); diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 02725751cb6..17bf2d544f3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -238,6 +238,7 @@ from .framework import ParamAttr #DEFINE_ALIAS from .framework import create_parameter #DEFINE_ALIAS from .framework import CPUPlace #DEFINE_ALIAS from .framework import CUDAPlace #DEFINE_ALIAS +from .framework import NPUPlace #DEFINE_ALIAS from .framework import CUDAPinnedPlace #DEFINE_ALIAS from .framework import grad #DEFINE_ALIAS @@ -262,6 +263,7 @@ from .device import set_device from .device import get_device from .device import is_compiled_with_cuda #DEFINE_ALIAS from .device import is_compiled_with_xpu +from .device import is_compiled_with_npu from .device import XPUPlace # from .tensor.tensor import Tensor #DEFINE_ALIAS # from .tensor.tensor import LoDTensor #DEFINE_ALIAS diff --git a/python/paddle/device.py b/python/paddle/device.py index 81b1dfcc745..d5e4406454b 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -32,12 +32,28 @@ __all__ = [ # 'cuda_places', # 'CUDAPinnedPlace', # 'CUDAPlace', - 'is_compiled_with_cuda' + 'is_compiled_with_cuda', + 'is_compiled_with_npu' ] _cudnn_version = None +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): `True` if NPU is supported, otherwise `False`. + + Examples: + .. code-block:: python + + import paddle + support_npu = paddle.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def is_compiled_with_xpu(): """ Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun @@ -165,6 +181,7 @@ def set_device(device): device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) + framework._set_expected_place(place) return place diff --git a/python/paddle/distributed/fleet/ascend_utils.py b/python/paddle/distributed/fleet/ascend_utils.py new file mode 100644 index 00000000000..23e812041c8 --- /dev/null +++ b/python/paddle/distributed/fleet/ascend_utils.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import json +import paddle +from paddle.distributed.fleet.launch_utils import get_cluster, logger, get_host_name_ip, DeviceMode + + +def _get_ascend_rankfile(rank_table_file_path): + """ + Args: + rank_table_file_path: ascend npu rank file json + { + "status": "completed", + "version": "1.0", + "server_count": "2", + "server_list": [ + { + "server_id": "192.168.24.217", + "device": [ + { + "device_id": "0", + "device_ip": "192.1.184.23", + "rank_id": "0" + }, + { + "device_id": "1", + "device_ip": "192.2.21.93", + "rank_id": "1" + } + ] + }, + { + "server_id": "192.168.26.177", + "device": [ + { + "device_id": "0", + "device_ip": "192.1.94.132", + "rank_id": "2" + }, + { + "device_id": "1", + "device_ip": "192.2.94.30", + "rank_id": "3" + } + ] + } + ] + } + + Returns: + node_ips: node ip list + device_count: number of npu per machine + + """ + json_data = None + with open(rank_table_file_path) as json_file: + json_data = json.load(json_file) + + node_ips = [] + device_count = 0 + server_list = json_data['server_list'] + for server in server_list: + node_ips.append(server['server_id']) + device_list = server['device'] + device_count = len(device_list) + + return node_ips, device_count + + +def get_cloud_cluster(rank_table_file=None, + device_mode=DeviceMode.ASCEND_NPU, + devices_per_proc=None, + start_port=6070): + """ + Args: + rank_table_file: string, ascend npu rank file path + device_mode: DeviceMode(Int) + devices_per_proc:list + start_port: the start port of current runtime env + """ + if rank_table_file: + # multi trainers + node_ips, device_count = _get_ascend_rankfile(rank_table_file) + node_index = os.environ.get("PADDLE_TRAINER_ID") + node_ip = None + if node_index is None: + _, node_ip = get_host_name_ip() + else: + node_ip = node_ips[int(node_index)] + + assert node_ip in node_ips, "Can't find your local ip {%s} in node_ips: {%s}" \ + % (node_ip, node_ips) + else: + # single trainer (single ascend card) + node_ips = ["127.0.0.1"] + node_ip = node_ips[0] + device_count = 1 + devices_per_proc = None + + if devices_per_proc is None: + devices_per_proc = [str(x) for x in range(device_count)] + + free_ports = [ + x for x in range(start_port, start_port + len(devices_per_proc)) + ] + + trainer_endpoints = [] + for ip in node_ips: + trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) + + return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, + devices_per_proc) diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index d6f4227a923..bd5b67005ba 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -73,6 +73,7 @@ from paddle.distributed.fleet import launch_utils # TODO(danleifeng): Don't import * from a module from paddle.distributed.fleet.launch_utils import * import paddle.distributed.fleet.cloud_utils as cloud_utils +import paddle.distributed.fleet.ascend_utils as ascend_utils def 
_print_arguments(args): @@ -120,7 +121,7 @@ see: http://www.paddlepaddle.org/documentation/docs/zh/1.6/user_guides/howto/tra default=None, help="It's for ascend npu training." "For example:" - "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one gpu." + "--ascend_npus=\"0,1,2,3\" will launch four training processes each bound to one npu." ) if fluid.core.is_compiled_with_cuda(): @@ -237,6 +238,13 @@ def launch_collective(args): cluster, pod = cloud_utils.get_cloud_cluster( args.ips, device_mode, devices_per_proc, start_port) logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: + # for ascend + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + devices_per_proc=devices_per_proc, + start_port=start_port) else: # trainers_num = 1 or not use paddlecloud ips="a,b" cluster, pod = get_cluster_from_args(args, device_mode, diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 2d2807bce28..9f6c186b353 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -593,8 +593,8 @@ def get_ascend_npus(npus): if npus is None: count = fluid.core.NPUDevice.get_device_count() if count <= 0: - return ret - ret = [x for x in range(count)] + return None + ret = [str(x) for x in range(count)] else: ret = [x.strip() for x in npus.split(',')] return ret diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index 978899604ea..824225fd776 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -214,7 +214,8 @@ class AscendOptimizer(Optimizer): parameter_list=None, no_grad_set=None, auto_dp=False, - rank_table_file=None): + rank_table_file=None, + precision_mode="must_keep_origin_dtype"): minimized = None if self.inner_opt: minimized = self.inner_opt.minimize( @@ -234,7 +235,7 @@ class AscendOptimizer(Optimizer): config = { "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", - "ge.exec.precision_mode": "must_keep_origin_dtype", + "ge.exec.precision_mode": precision_mode, } # if multi trainers if rank_table_file and fleet.world_size() > 1: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index f2ecaf48438..19b5e910db2 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -200,7 +200,8 @@ class AscendParserBase(object): def _accumulated_op_id(self): global global_cnt global_cnt += 1 - return "." + str(global_cnt) + name = "." 
+ str(global_cnt) + return name def _create_ge_tensor(self, shape, dtype, value): tensor_desc = core.GETensorDesc( @@ -1622,10 +1623,14 @@ class MulGradParser(AscendParserBase): "unsqueeze" + self._accumulated_op_id(), "Unsqueeze").set_input("x", y).set_attr_vec_int32("axes", [0]) + y_stack = core.GEOperatorFactory.create_operator( + "stack" + self._accumulated_op_id(), + "TileWithAxis").set_input("x", y_unsqueeze).set_attr_int32( + "axis", 0).set_attr_int32("tiles", shape_out_grad[0]) x_grad = core.GEOperatorFactory.create_operator( self.parser_name + self._accumulated_op_id(), "BatchMatMul").set_input("x1", out_grad).set_input( - "x2", y_unsqueeze).set_attr_bool( + "x2", y_stack).set_attr_bool( "adj_x1", False).set_attr_bool("adj_x2", True) y_grad = core.GEOperatorFactory.create_operator( self.parser_name + self._accumulated_op_id(), diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index ae341868785..6dd1478dc1f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -68,7 +68,8 @@ from .input import embedding, one_hot from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder -from .core import LoDTensor, LoDTensorArray, CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope +from .core import LoDTensor, LoDTensorArray, Scope, _Scope +from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace from .incubate import fleet from .incubate import data_generator from .transpiler import DistributeTranspiler, \ @@ -124,6 +125,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'XPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', + 'NPUPlace', 'Tensor', 'ParamAttr', 'WeightNormParamAttr', @@ -232,6 +234,16 @@ def __bootstrap__(): 'gpu_memory_limit_mb', 'conv2d_disable_cudnn', ] + + if core.is_compiled_with_npu(): + read_env_flags += [ + 'selected_npus', + 'fraction_of_gpu_memory_to_use', + 'initial_gpu_memory_in_mb', + 'reallocate_gpu_memory_in_mb', + 'gpu_memory_limit_mb', + ] + core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) # don't init_p2p when in unittest to save time. 
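
The pybind changes above expose paddle.NPUPlace and paddle.is_compiled_with_npu() to Python, and fluid registers the NPU runtime flags at bootstrap. The sketch below, modeled on the elementwise-add unit test added later in this patch, shows how that plumbing is meant to be driven from user code. It is only a sketch: it assumes a wheel built with PADDLE_WITH_ASCEND_CL and an NPU visible as device 0, and falls back to CPUPlace otherwise.

import numpy as np
import paddle

paddle.enable_static()

# Sketch only: NPUPlace(0) aborts on wheels built without NPU support,
# so guard on is_compiled_with_npu() first.
place = paddle.NPUPlace(0) if paddle.is_compiled_with_npu() else paddle.CPUPlace()

with paddle.static.program_guard(paddle.static.Program()):
    x = paddle.static.data(name="x", shape=[3], dtype="float32")
    y = paddle.static.data(name="y", shape=[3], dtype="float32")
    z = paddle.add(x, y)

    exe = paddle.static.Executor(place)
    z_val, = exe.run(feed={"x": np.array([2.0, 3.0, 4.0], dtype="float32"),
                           "y": np.array([1.0, 5.0, 2.0], dtype="float32")},
                     fetch_list=[z])
    print(z_val)  # expected: [3. 8. 6.]
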
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index da326ec074c..9c85cc6cd5d 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1213,6 +1213,7 @@ class Executor(object): # In distributed training, the compiled program is saved in Program._graph has_compiled_graph = isinstance(program._graph, compiler.CompiledProgram) + if has_compiled_graph: program._graph._compile(scope, self.place) # _graph in program does not support inference since the _graph is optimized diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index d5c01d20a91..499f0873dc3 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -6201,7 +6201,7 @@ def _get_paddle_place(place): if place is None: return place if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, - core.CUDAPinnedPlace, core.CUDAPlace)): + core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace)): return place if not isinstance(place, str): @@ -6211,9 +6211,11 @@ def _get_paddle_place(place): place = place.lower() if (place == "cpu"): return core.CPUPlace() + if (place == "device"): return core.Place() + # GPU avaliable_gpu_place = re.match(r'gpu:\d+', place) if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if not core.is_compiled_with_cuda(): @@ -6229,6 +6231,8 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.CUDAPlace(device_id) + + # XPU avaliable_xpu_place = re.match(r'xpu:\d+', place) if avaliable_xpu_place: if not core.is_compiled_with_xpu(): @@ -6239,9 +6243,22 @@ def _get_paddle_place(place): device_id = place_info_list[1] device_id = int(device_id) return core.XPUPlace(device_id) + + # NPU + avaliable_npu_place = re.match(r'npu:\d+', place) + if avaliable_npu_place: + if not core.is_compiled_with_npu(): + raise ValueError( + "The device should not be {}, since PaddlePaddle is " \ + "not compiled with NPU".format(avaliable_npu_place)) + place_info_list = place.split(':', 1) + device_id = place_info_list[1] + device_id = int(device_id) + return core.NPUPlace(device_id) + raise ValueError( - "paddle support CPUPlace, CUDAPlace,CUDAPinnedPlace and XPUPlace, Please check your Place Input" - ) + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace and NPUPlace, but received {}.". 
+ format(place)) def _get_paddle_place_list(places): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 792a2d32326..e8669fd2951 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -625,6 +625,10 @@ if (WITH_XPU) add_subdirectory(xpu) endif() +if (WITH_ASCEND_CL) + add_subdirectory(npu) +endif() + if (WITH_MKLDNN) add_subdirectory(mkldnn) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt new file mode 100644 index 00000000000..f71e04c09aa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) +endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py new file mode 100644 index 00000000000..47da4fdb23e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest, _set_use_system_allocator +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseAddOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_add" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.add(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): Test grad op after it is implemented. 
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.add(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.add(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([3., 8., 6.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected {}".format(z_value, z_expected)) + + def test_backward(self): + # TODO(ascendrc): Test backward after add grad npu op implemented. + pass + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestAddError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.add, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.add, x2, y2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py new file mode 100644 index 00000000000..8c6c7b46f49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + +SEED = 2021 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestElementwiseSubOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_sub" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_kernel_type(self): + self.use_mkldnn = False + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def init_dtype(self): + self.dtype = np.float32 + + def init_axis(self): + self.axis = 0 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + # TODO(ascendrc): For grad tests, OpTest raises FatalError:Segmentation fault + # when call op.run, which may be caused by system environment exception + # and the exact cause has not be located. 
+ # def test_check_grad_normal(self): + # self.check_grad_with_place( + # self.place, ['X', 'Y'], + # 'Out', + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_x(self): + # self.check_grad_with_place( + # self.place, ['Y'], + # 'Out', + # no_grad_set=set("X"), + # max_relative_error=0.006, + # check_dygraph=False) + # + # def test_check_grad_ingore_y(self): + # self.check_grad_with_place( + # self.place, ['X'], + # 'Out', + # no_grad_set=set("Y"), + # max_relative_error=0.006,check_dygraph=False) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractAPI(unittest.TestCase): + def test_name(self): + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[2, 3], dtype="float32") + y = paddle.static.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = paddle.subtract(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_static(self): + with paddle.static.program_guard(paddle.static.Program()): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + + x_reshape = paddle.reshape(x, [3, 1]) + y_reshape = paddle.reshape(y, [3, 1]) + z = paddle.subtract(x_reshape, y_reshape) + z = paddle.reshape(z, shape=[3]) + + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + x_value, y_value, z_value = exe.run(feed={"x": x_np, + "y": y_np}, + fetch_list=[x, y, z]) + + z_expected = np.array([1., -2., 2.]) + self.assertEqual( + (x_value == x_np).all(), + True, + msg="x_value = {}, but expected {}".format(x_value, x_np)) + self.assertEqual( + (y_value == y_np).all(), + True, + msg="y_value = {}, but expected {}".format(y_value, y_np)) + self.assertEqual( + (z_value == z_expected).all(), + True, + msg="z_value = {}, but expected {}".format(z_value, z_expected)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractError(unittest.TestCase): + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + # the input of elementwise_add must be Variable. 
+ x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, paddle.subtract, x1, y1) + + # the input dtype must be float16 or float32 or float64 or int32 or int64 + x2 = paddle.static.data( + name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = paddle.static.data( + name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, paddle.subtract, x2, y2) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestSubtractNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + c = paddle.assign(b) + z = paddle.subtract(sum, c) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + npu_pred, npu_loss = self._test(True) + cpu_pred, cpu_loos = self._test(False) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loos)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_npu_place.py b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py new file mode 100644 index 00000000000..3f71fad2b9c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_npu_place.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import numpy as np +from paddle.fluid import core + +paddle.enable_static() + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlace(unittest.TestCase): + def test(self): + p = core.Place() + p.set_place(paddle.NPUPlace(0)) + + self.assertTrue(p.is_npu_place()) + self.assertEqual(p.npu_device_id(), 0) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestNpuPlaceError(unittest.TestCase): + def test_static(self): + # NPU is not supported in ParallelExecutor + prog = paddle.static.Program() + with paddle.static.program_guard(prog): + + x_np = np.array([2, 3, 4]).astype('float32') + y_np = np.array([1, 5, 2]).astype('float32') + + x = paddle.static.data(name="x", shape=[3], dtype='float32') + y = paddle.static.data(name="y", shape=[3], dtype='float32') + z = paddle.add(x, y) + + compiled_prog = paddle.static.CompiledProgram(prog) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + + with self.assertRaisesRegex(RuntimeError, + "NPU is not supported in ParallelExecutor"): + exe.run(compiled_prog, feed={"x": x_np, "y": y_np}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index dff96a8cbc3..569c4316880 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -266,7 +266,10 @@ class OpTest(unittest.TestCase): np.random.seed(123) random.seed(124) - cls._use_system_allocator = _set_use_system_allocator(True) + if paddle.is_compiled_with_npu(): + cls._use_system_allocator = _set_use_system_allocator(False) + else: + cls._use_system_allocator = _set_use_system_allocator(True) @classmethod def tearDownClass(cls): @@ -298,6 +301,9 @@ class OpTest(unittest.TestCase): def is_rocm_op_test(): return core.is_compiled_with_rocm() + def is_npu_op_test(): + return hasattr(cls, "use_npu") and cls.use_npu == True + if not hasattr(cls, "op_type"): raise AssertionError( "This test do not have op_type in class attrs, " @@ -319,7 +325,8 @@ class OpTest(unittest.TestCase): and not hasattr(cls, 'exist_fp64_check_grad') \ and not is_xpu_op_test() \ and not is_mkldnn_op_test() \ - and not is_rocm_op_test(): + and not is_rocm_op_test() \ + and not is_npu_op_test(): raise AssertionError( "This test of %s op needs check_grad with fp64 precision." % cls.op_type) @@ -1216,7 +1223,8 @@ class OpTest(unittest.TestCase): # Check inplace for given op, its grad op, its grad_grad op, etc. # No effect on original OpTest # Currently not support ParallelExecutor on XPUPlace. 
- if not paddle.is_compiled_with_xpu(): + if not paddle.is_compiled_with_xpu( + ) and not paddle.is_compiled_with_npu(): self.check_inplace_output_with_place( place, no_check_set=no_check_set, inplace_atol=inplace_atol) diff --git a/python/paddle/fluid/tests/unittests/test_device.py b/python/paddle/fluid/tests/unittests/test_device.py index 195337e80de..08697a08044 100644 --- a/python/paddle/fluid/tests/unittests/test_device.py +++ b/python/paddle/fluid/tests/unittests/test_device.py @@ -15,54 +15,39 @@ from __future__ import print_function import unittest -from op_test import OpTest -import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework -import warnings -import paddle class TestStaticDeviceManage(unittest.TestCase): - def test_cpu_device(self): - paddle.set_device('cpu') + def _test_device(self, device_name, device_class): + paddle.set_device(device_name) + out1 = paddle.zeros(shape=[1, 3], dtype='float32') out2 = paddle.ones(shape=[1, 3], dtype='float32') out3 = paddle.concat(x=[out1, out2], axis=0) - exe = paddle.fluid.Executor() + + exe = paddle.static.Executor() exe.run(paddle.fluid.default_startup_program()) res = exe.run(fetch_list=[out3]) + device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CPUPlace), True) - self.assertEqual(device, "cpu") + self.assertEqual(isinstance(exe.place, device_class), True) + self.assertEqual(device, device_name) + + def test_cpu_device(self): + self._test_device("cpu", core.CPUPlace) def test_gpu_device(self): if core.is_compiled_with_cuda(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('gpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.CUDAPlace), True) - self.assertEqual(device, "gpu:0") + self._test_device("gpu:0", core.CUDAPlace) def test_xpu_device(self): if core.is_compiled_with_xpu(): - out1 = paddle.zeros(shape=[1, 3], dtype='float32') - out2 = paddle.ones(shape=[1, 3], dtype='float32') - out3 = paddle.concat(x=[out1, out2], axis=0) - paddle.set_device('xpu:0') - exe = paddle.fluid.Executor() - exe.run(paddle.fluid.default_startup_program()) - res = exe.run(fetch_list=[out3]) - device = paddle.get_device() - self.assertEqual(isinstance(exe.place, core.XPUPlace), True) - self.assertEqual(device, "xpu:0") + self._test_device("xpu:0", core.XPUPlace) class TestImperativeDeviceManage(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh new file mode 100644 index 00000000000..2e9c1e69953 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend2.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +RANK_TABLE_FILE_NAME="rank_table_file.json" +cat > ${RANK_TABLE_FILE_NAME} < Date: Fri, 9 Apr 2021 17:44:34 +0800 Subject: [PATCH 187/486] make high precision for avg_pool and adaptive_avg_pool when data_type is float16 (#31887) * make high precision for avg_pool --- paddle/fluid/operators/math/pooling.h | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 21d588cc01f..3547de0a4d7 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/hostdevice.h" #include "paddle/fluid/platform/macros.h" @@ -46,10 +47,22 @@ class MaxPool { template class AvgPool { + using MT = typename details::MPTypeTrait::Type; + MT intermediate_res; + public: - DEVICE inline T initial() { return static_cast(0); } - DEVICE inline void compute(const T& x, T* y) { *y += x; } - DEVICE inline void finalize(const T& pool_field, T* y) { *y /= pool_field; } + DEVICE inline T initial() { + intermediate_res = static_cast(0.0f); + return static_cast(0); + } + + DEVICE inline void compute(const T& x, T* y) { + intermediate_res += static_cast(x); + } + + DEVICE inline void finalize(const T& pool_field, T* y) { + *y = static_cast(intermediate_res / (static_cast(pool_field))); + } }; template -- GitLab From afa3720c2bcbdde66b6599e4efcd2735379d4033 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Sat, 10 Apr 2021 12:37:23 +0800 Subject: [PATCH 188/486] Ci py3 gcc5.4 (#32045) --- cmake/external/brpc.cmake | 2 +- tools/dockerfile/Dockerfile.centos | 16 +++------ tools/dockerfile/Dockerfile.ubuntu | 48 ++++++++++--------------- tools/dockerfile/build_scripts/build.sh | 21 ++++------- tools/dockerfile/ci_dockerfile.sh | 22 ++++++++---- 5 files changed, 45 insertions(+), 64 deletions(-) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 582c06e88c1..2d72b6eb56d 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -39,7 +39,7 @@ set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(gongwb): change to de newst repo when they changed. 
+ # TODO(gongwb): change to de newst repo when they changed GIT_REPOSITORY "https://github.com/wangjiawei04/brpc" GIT_TAG "e203afb794caf027da0f1e0776443e7d20c0c28e" PREFIX ${BRPC_SOURCES_DIR} diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 108d2e5705c..5e87804179f 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -19,6 +19,7 @@ RUN bash build_scripts/build.sh RUN bash build_scripts/install_nccl2.sh RUN bash build_scripts/install_trt.sh RUN rm -rf build_scripts +RUN ln -s /usr/local/ssl/include/openssl /usr/include # git 2.17.1 RUN wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz && \ @@ -47,26 +48,17 @@ RUN cd /opt && wget -q --no-check-certificate https://paddle-ci.cdn.bcebos.com/p RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install setuptools -U && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && \ +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install setuptools -U && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install setuptools -U && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install setuptools -U -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \ +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install -r /root/requirements.txt && \ go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' 
&& \ +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index f566e66a976..2cae7896d64 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -56,7 +56,7 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7.0 --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && ldconfig # Install Python3.8 @@ -65,16 +65,11 @@ RUN wget -q https://www.python.org/ftp/python/3.8.0/Python-3.8.0.tgz && \ CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && ldconfig -# Install Python3.5 -RUN wget -q https://www.python.org/ftp/python/3.5.1/Python-3.5.1.tgz && \ - tar -xzf Python-3.5.1.tgz && cd Python-3.5.1 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.5.1 --enable-shared > /dev/null && \ - make -j8 > /dev/null && make altinstall > /dev/null && ldconfig -ENV PATH=/usr/local/python3.5.1/include:${PATH} -ENV PATH=/usr/local/python3.5.1/bin:${PATH} -ENV LD_LIBRARY_PATH=/usr/local/python3.5.1/lib:${LD_LIBRARY_PATH} -ENV CPLUS_INCLUDE_PATH=/usr/local/python3.5.1/include/python3.5:$CPLUS_INCLUDE_PATH -RUN ln -sf /usr/local/python3.5.1/bin/python3.5 /usr/local/bin/python3 && ln -sf /usr/local/python3.5.1/bin/python3.5 /usr/bin/python3 +ENV PATH=/usr/local/python3.7.0/include:${PATH} +ENV PATH=/usr/local/python3.7.0/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/python3.7.0/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/python3.7.0/include/python3.7:$CPLUS_INCLUDE_PATH +RUN ln -sf /usr/local/python3.7.0/bin/python3.7 /usr/local/bin/python3 && ln -sf /usr/local/python3.7.0/bin/python3.7 /usr/bin/python3 RUN rm -r /root/python_build @@ -102,14 +97,14 @@ WORKDIR pip-20.0.1 RUN python setup.py install && \ python3.8 setup.py install && \ python3.7 setup.py install && \ - python3.6 setup.py install && \ - python3 setup.py install + python3.6 setup.py install WORKDIR /home RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-20.0.1.tar.gz && \ rm -r Python-$version setuptools-40.6.2 pip-20.0.1 # Install Go and glide +WORKDIR /home RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ tar -xz -C /usr/local && \ mkdir /root/gopath && \ @@ -143,10 +138,7 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. 
-RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ - pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ +RUN pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ @@ -155,37 +147,31 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip3.8 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip3.8 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip3.8 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \ - pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \ pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ +RUN pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.8 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.8 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' #For docstring checker -RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ - pip3.6 --no-cache-dir install pylint pytest astroid isort && \ +RUN pip3.6 --no-cache-dir install pylint pytest astroid isort && \ pip3.7 --no-cache-dir install pylint pytest astroid isort && \ pip3.8 --no-cache-dir install pylint pytest astroid isort && \ - pip --no-cache-dir install pylint pytest astroid isort LinkChecker + pip --no-cache-dir install pylint pytest astroid isort -RUN pip3 --no-cache-dir install coverage && \ - pip3.6 --no-cache-dir install coverage && \ +RUN pip3.6 --no-cache-dir install coverage && \ pip3.7 --no-cache-dir install coverage && \ pip3.8 --no-cache-dir install coverage && \ pip --no-cache-dir install coverage COPY ./python/requirements.txt /root/ -RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ - pip3.6 --no-cache-dir install -r /root/requirements.txt && \ +RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.7 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ pip --no-cache-dir install -r /root/requirements.txt @@ -193,7 +179,9 @@ RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y && \ - pip3 --no-cache-dir install certifi urllib3[secure] && \ + pip3.6 install --upgrade pip && \ + pip3.7 install --upgrade pip && \ + pip3.8 install --upgrade pip && \ pip3.6 --no-cache-dir install certifi urllib3[secure] && \ pip3.7 --no-cache-dir 
install certifi urllib3[secure] && \ pip3.8 --no-cache-dir install certifi urllib3[secure] && \ diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 7d5e0194432..41f6e18f547 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -24,15 +24,13 @@ set -ex # remove others to expedite build and reduce docker image size. The original # manylinux docker image project builds many python versions. # NOTE We added back 3.5.1, since auditwheel requires python 3.3+ -CPYTHON_VERSIONS="3.8.0 3.7.0 3.6.0 3.5.1 2.7.15" +CPYTHON_VERSIONS="3.8.0 3.7.0 3.6.0" # openssl version to build, with expected sha256 hash of .tar.gz # archive OPENSSL_ROOT=openssl-1.0.2g OPENSSL_HASH=b784b1b3907ce39abf4098702dade6365522a253ad1552e267a9a0e89594aa33 PATCHELF_HASH=f2aa40a6148cb3b0ca807a1bf836b081793e55ec9e5540a5356d800132be7e0a -CURL_ROOT=curl-7.49.1 -CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1 AUTOCONF_ROOT=autoconf-2.69 AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969 @@ -79,7 +77,6 @@ build_openssl $OPENSSL_ROOT $OPENSSL_HASH mkdir -p /opt/python build_cpythons $CPYTHON_VERSIONS -PY35_BIN=/opt/python/cp35-cp35m/bin PY36_BIN=/opt/python/cp36-cp36m/bin PY37_BIN=/opt/python/cp37-cp37m/bin PY38_BIN=/opt/python/cp38-cp38m/bin @@ -87,25 +84,19 @@ PY38_BIN=/opt/python/cp38-cp38m/bin # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running # python. ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib" +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib:$(dirname ${PY38_BIN})/lib" # Our openssl doesn't know how to find the system CA trust store # (https://github.com/pypa/manylinux/issues/53) # And it's not clear how up-to-date that is anyway # So let's just use the same one pip and everyone uses -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi -ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \ +LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY37_BIN})/lib" $PY37_BIN/pip install certifi +ln -s $($PY37_BIN/python -c 'import certifi; print(certifi.where())') \ /opt/_internal/certs.pem # If you modify this line you also have to modify the versions in the # Dockerfiles: export SSL_CERT_FILE=/opt/_internal/certs.pem -# Install newest curl -build_curl $CURL_ROOT $CURL_HASH -rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc -hash -r -curl --version -curl-config --features # Install patchelf (latest with unreleased bug fixes) # FIXME(typhoonzero): restore this when the link is fixed. 
@@ -117,8 +108,8 @@ curl-config --features yum install -y patchelf # Install latest pypi release of auditwheel -LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel -ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel +#LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel +#ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel # Clean up development headers and other unnecessary stuff for # final image diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index 15196e30516..e61a4eb3dbd 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -19,8 +19,14 @@ function make_ubuntu_dockerfile(){ sed "s//10.1-cudnn7-devel-ubuntu16.04/g" ./Dockerfile.ubuntu >${dockerfile_name} sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') - sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ + tar -xvf git-2.17.1.tar.gz \&\& \ + cd git-2.17.1 \&\& \ + ./configure --with-openssl --with-curl --prefix=/usr/local \&\& \ + make -j8 \&\& make install " ${dockerfile_name} + sed -i "${dockerfile_line}i RUN pip install wheel \&\& pip3 install PyGithub wheel \&\& pip3.7 install PyGithub " ${dockerfile_name} sed -i "s##WORKDIR /usr/bin \\ COPY tools/dockerfile/build_scripts /build_scripts \\ RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\ @@ -30,9 +36,9 @@ function make_ubuntu_dockerfile(){ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\ RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} - sed -i "s#bash /build_scripts/install_nccl2.sh#wget --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ + sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 --allow-change-held-packages #g" ${dockerfile_name} + RUN apt update \&\& apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 pigz --allow-change-held-packages #g" ${dockerfile_name} } @@ -41,12 +47,16 @@ function make_centos_dockerfile(){ sed "s//11.0-cudnn8-devel-centos7/g" Dockerfile.centos >${dockerfile_name} sed -i "s#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') - sed -i "${dockerfile_line}i RUN rm -f /usr/bin/cc && ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN yum install -y pigz graphviz" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN rm -f /usr/bin/cc && ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc" ${dockerfile_name} + 
sed -i "${dockerfile_line}i RUN rm -f /usr/bin/g++ && ln -s /usr/local/gcc-5.4/bin/g++ /usr/bin/g++" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN rm -f /usr/bin/c++ && ln -s /usr/local/gcc-5.4/bin/c++ /usr/bin/c++" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN rm -f /usr/bin/gcc && ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/gcc" ${dockerfile_name} sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so \\ RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/" ${dockerfile_name} sed -i $"${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \\ - RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} - sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" ${dockerfile_name} + RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN bash build_scripts/build.sh#g" ${dockerfile_name} } -- GitLab From f8bab5b0671b1ffb86d25c06774bc2d121c2a098 Mon Sep 17 00:00:00 2001 From: AshburnLee <1578034415@qq.com> Date: Sat, 10 Apr 2021 13:01:24 +0800 Subject: [PATCH 189/486] Optimize the performance of the forward of log_softmax when axis is -1 and dim <= 1024 (#31630) --- paddle/fluid/operators/log_softmax_op.cu | 170 +++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/paddle/fluid/operators/log_softmax_op.cu b/paddle/fluid/operators/log_softmax_op.cu index 02fca246d24..9136de38caf 100644 --- a/paddle/fluid/operators/log_softmax_op.cu +++ b/paddle/fluid/operators/log_softmax_op.cu @@ -12,7 +12,177 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/log_softmax_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" + +namespace paddle { +namespace operators { + +#define LAUNCH_WARP_FORWAR_COMPUTE(near_greater_power_of_two) \ + case near_greater_power_of_two: \ + ComputeLogSoftmaxForwardInWarp< \ + T, AccT, near_greater_power_of_two><<>>( \ + dst, src, outer_size, dim_size); \ + break; + +template +__device__ __forceinline__ T WarpReduceSum(T value) { +#pragma unroll + for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { + T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); + value = value + sum_val; + } + return value; +} + +template +__device__ __forceinline__ T WarpReduceMax(T value) { +#pragma unroll + for (int offset = KernelWarpSize / 2; offset > 0; offset /= 2) { + T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, value, offset); + value = max(value, max_val); + } + return value; +} + +int GetNearGreaterPowerOfTwo(int value) { + int log2_value = 0; + while ((1 << log2_value) < value) { + ++log2_value; + } + return 1 << log2_value; +} + +template +__global__ void ComputeLogSoftmaxForwardInWarp(T *dst, const T *src, + int batch_size, + int element_count) { + constexpr int near_greater_power_of_two = NearGreaterPowerOfTwo; + constexpr int kernel_warp_size = + (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; + constexpr int warp_iter = near_greater_power_of_two / kernel_warp_size; + int batch_id = blockDim.y * blockIdx.x + threadIdx.y; + + // set effective_warp_id as 1 when warps do effective work, + // when warps do ineffective work, effective_warp_id remains unchanged. + int effective_warp_id = batch_size - batch_id; + if (effective_warp_id > 1) effective_warp_id = 1; + + int thread_in_warp_idx = threadIdx.x; + + // 1.read data from global memory to registers + AccT elements[warp_iter]; + // set effective_element_count as the num of elements when warps do effective + // work + // set effective_element_count as 0, when warps do ineffective work + int effective_element_count = (effective_warp_id <= 0) ? 0 : element_count; + for (int it = 0; it < warp_iter; ++it) { + int element_index = thread_in_warp_idx + it * kernel_warp_size; + if (element_index < effective_element_count) { + elements[it] = + static_cast(src[batch_id * element_count + element_index]); + } else { + elements[it] = -std::numeric_limits::infinity(); + } + } + + // 2.compute max_value. For each thread, loop all registers to find max + AccT max_value = elements[0]; +#pragma unroll + for (int it = 1; it < warp_iter; ++it) { + max_value = (max_value > elements[it]) ? max_value : elements[it]; + } + max_value = WarpReduceMax(max_value); + + // 3.For each warp, accumulate all thread registers + AccT sum = 0.0f; +#pragma unroll + for (int it = 0; it < warp_iter; ++it) { + sum += std::exp(elements[it] - max_value); + } + sum = WarpReduceSum(sum); + + // 4.store result. + sum = std::log(sum); +#pragma unroll + for (int it = 0; it < warp_iter; ++it) { + int element_index = thread_in_warp_idx + it * kernel_warp_size; + if (element_index < element_count) { + dst[batch_id * element_count + element_index] = + static_cast(elements[it] - max_value - sum); + } else { + break; + } + } +} + +template +void LaunchSoftmaxForwardForLastAxis(T *dst, const T *src, int dim_size, + int outer_size, gpuStream_t stream) { + int threads_per_block = 128; + int near_greater_power_of_two = GetNearGreaterPowerOfTwo(dim_size); + int kernel_warp_size = + (near_greater_power_of_two < 32) ? 
near_greater_power_of_two : 32; + int warps_per_block = (threads_per_block / kernel_warp_size); + int blocks = (outer_size + warps_per_block - 1) / warps_per_block; + dim3 threads(kernel_warp_size, warps_per_block, 1); + + switch (near_greater_power_of_two) { + LAUNCH_WARP_FORWAR_COMPUTE(1); + LAUNCH_WARP_FORWAR_COMPUTE(2); + LAUNCH_WARP_FORWAR_COMPUTE(4); // dim_size: 3~4 + LAUNCH_WARP_FORWAR_COMPUTE(8); // dim_size: 5~8 + LAUNCH_WARP_FORWAR_COMPUTE(16); // dim_size: 9~16 + LAUNCH_WARP_FORWAR_COMPUTE(32); // dim_size: 17~32 + LAUNCH_WARP_FORWAR_COMPUTE(64); // dim_size: 33~64 + LAUNCH_WARP_FORWAR_COMPUTE(128); // dim_size 65~128 + LAUNCH_WARP_FORWAR_COMPUTE(256); // dim_size 129~256 + LAUNCH_WARP_FORWAR_COMPUTE(512); // dim_size 257~512 + LAUNCH_WARP_FORWAR_COMPUTE(1024); // dim_size 513~1024 + + default: + break; + } +} + +template +class LogSoftmaxKernel + : public framework::OpKernel { + using MPDType = typename details::MPTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext &context) const override { + const auto *x = context.Input("X"); + auto *out = context.Output("Out"); + const auto *input_data = x->data(); + auto *output_data = out->mutable_data(context.GetPlace()); + + const int rank = x->dims().size(); + const int axis = CanonicalAxis(context.Attr("axis"), rank); + + int dim_size = x->dims()[axis]; + int inner_size = 1; + for (int i = axis + 1; i < x->dims().size(); ++i) { + inner_size *= x->dims()[i]; + } + int outer_size = SizeToAxis(axis, x->dims()); + gpuStream_t stream = context.cuda_device_context().stream(); + + if (inner_size == 1 && dim_size <= 1024 && dim_size * sizeof(T) <= 4096) { + LaunchSoftmaxForwardForLastAxis(output_data, input_data, + dim_size, outer_size, stream); + } else { + LogSoftmaxFunctor()( + context.template device_context(), x, + out, axis); + } + } +}; + +} // operators +} // paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -- GitLab From a2387ef2e5df0b463b4a435bce31e212c5d51202 Mon Sep 17 00:00:00 2001 From: TTerror Date: Mon, 12 Apr 2021 09:26:07 +0800 Subject: [PATCH 190/486] fix concat_grad on kunlun (#32151) * fix concat_grad on kunlun * fix concat_grad on kunlun --- cmake/external/xpu.cmake | 2 +- paddle/fluid/operators/concat_op_xpu.cc | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 16c69a7b503..f846623602e 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -13,7 +13,7 @@ if(NOT XPU_SDK_ROOT) elseif(WITH_SUNWAY) SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/sunway/xpu_2021_01_13.tar.gz" CACHE STRING "" FORCE) else() - SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_03_30.tar.gz" CACHE STRING "" FORCE) + SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2021_04_09.tar.gz" CACHE STRING "" FORCE) endif() SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") diff --git a/paddle/fluid/operators/concat_op_xpu.cc b/paddle/fluid/operators/concat_op_xpu.cc index aa0002cc6d1..be299babdba 100644 --- a/paddle/fluid/operators/concat_op_xpu.cc +++ b/paddle/fluid/operators/concat_op_xpu.cc @@ -132,16 +132,14 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis = ComputeAxis(static_cast(axis), static_cast(ins[0]->dims().size())); // get output tensor that the name is not kEmptyVarName - std::vector outputs; - std::vector choose_idx; - int n = 0; + std::vector ptrs(outs.size()); for (size_t j = 0; j < 
outs.size(); ++j) { if (out_var_names[j] != framework::kEmptyVarName && outs[j]->numel() != 0UL) { outs[j]->mutable_data(ctx.GetPlace()); - outputs.push_back(outs[j]); - choose_idx.push_back(j); - n++; + ptrs[j] = outs[j]->data(); + } else { + ptrs[j] = nullptr; } } PADDLE_ENFORCE_GE(axis, 0, platform::errors::InvalidArgument( @@ -157,10 +155,10 @@ class ConcatGradXPUKernel : public framework::OpKernel { axis, out_grad->dims().size())); auto input_dims = ins[0]->dims(); - std::vector split_list(n); + std::vector split_list(ins.size()); std::vector xdims_list(input_dims.size()); int total_length = 0; - for (int i = 0; i < n; ++i) { + for (size_t i = 0; i < ins.size(); ++i) { split_list[i] = ins[i]->dims()[axis]; total_length += ins[i]->dims()[axis]; } @@ -172,11 +170,6 @@ class ConcatGradXPUKernel : public framework::OpKernel { } xdims_list[axis] = total_length; - std::vector ptrs(n); - for (int i = 0; i < n; ++i) { - ptrs[i] = outputs[i]->data(); - } - auto& dev_ctx = ctx.template device_context(); int r = xpu::split(dev_ctx.x_context(), out_grad->data(), ptrs, xdims_list, split_list, axis); -- GitLab From 80698cad4f1335ab88e6f14b3f3a5b3fb6c3bb28 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Mon, 12 Apr 2021 11:33:10 +0800 Subject: [PATCH 191/486] remove PYTHON_ABI, test=document_fix (#32190) --- tools/test_op_benchmark.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/test_op_benchmark.sh b/tools/test_op_benchmark.sh index 4f7288eb125..a4c905196c2 100644 --- a/tools/test_op_benchmark.sh +++ b/tools/test_op_benchmark.sh @@ -162,7 +162,6 @@ function compile_install_paddlepaddle { export BUILD_TYPE=Release export CUDA_ARCH_NAME=Auto export WITH_DISTRIBUTE=OFF - export PYTHON_ABI=cp37-cp37m export CMAKE_BUILD_TYPE=Release [ -d build ] && rm -rf build bash paddle/scripts/paddle_build.sh build $(nproc) -- GitLab From af374ae64c2ceb9f622be6e26a1eb43330540c80 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 12 Apr 2021 13:49:53 +0800 Subject: [PATCH 192/486] follow comments to refine PR 32144 (#32174) --- paddle/fluid/memory/allocation/npu_allocator.cc | 2 +- paddle/fluid/memory/allocation/npu_allocator.h | 2 +- paddle/fluid/platform/npu_info.cc | 2 +- paddle/fluid/platform/npu_info.h | 2 +- paddle/fluid/platform/stream/npu_stream.cc | 2 +- python/paddle/device.py | 4 +++- 6 files changed, 8 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/memory/allocation/npu_allocator.cc b/paddle/fluid/memory/allocation/npu_allocator.cc index 4ecdee9bd03..faf7ae6221c 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_allocator.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/fluid/memory/allocation/npu_allocator.h b/paddle/fluid/memory/allocation/npu_allocator.h index 738ec5d3ce1..bf668973505 100644 --- a/paddle/fluid/memory/allocation/npu_allocator.h +++ b/paddle/fluid/memory/allocation/npu_allocator.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc index 69204363993..090945239a3 100644 --- a/paddle/fluid/platform/npu_info.cc +++ b/paddle/fluid/platform/npu_info.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/platform/npu_info.h b/paddle/fluid/platform/npu_info.h index 648b18531b2..1f392f1a534 100644 --- a/paddle/fluid/platform/npu_info.h +++ b/paddle/fluid/platform/npu_info.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/platform/stream/npu_stream.cc b/paddle/fluid/platform/stream/npu_stream.cc index 2664ac7194b..1c3e153e58c 100644 --- a/paddle/fluid/platform/stream/npu_stream.cc +++ b/paddle/fluid/platform/stream/npu_stream.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/python/paddle/device.py b/python/paddle/device.py index d5e4406454b..20453998fb7 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -39,9 +39,11 @@ __all__ = [ _cudnn_version = None +# TODO: WITH_ASCEND_CL may changed to WITH_NPU or others in the future +# for consistent. def is_compiled_with_npu(): """ - Whether this whl package can be used to run the model on NPU. + Whether paddle was built with WITH_ASCEND_CL=ON to support Ascend NPU. Returns (bool): `True` if NPU is supported, otherwise `False`. -- GitLab From d8afe40737e202b04bc03c2111afe34230ec5afa Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Mon, 12 Apr 2021 14:26:41 +0800 Subject: [PATCH 193/486] Optimization of bilinear backward OP CUDA kernel. 
(#30950) --- paddle/fluid/operators/interpolate_v2_op.cu | 288 +++++++++++++++----- 1 file changed, 218 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 90abcaa8b47..9c19278ac4d 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -12,6 +12,8 @@ #include #include #include "paddle/fluid/operators/interpolate_v2_op.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" +#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" #include "paddle/fluid/platform/gpu_launch_config.h" @@ -302,81 +304,214 @@ __global__ void KeBilinearInterpFw( } template -__global__ void KeBilinearInterpBw( - T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, - const size_t input_w, const T* out, const size_t out_img_h, - const size_t out_img_w, const size_t output_h, const size_t output_w, - const size_t num_channels, const T ratio_h, const T ratio_w, - const bool align_corners, const int align_mode, - const DataLayout data_layout) { - int nthreads = output_h * output_w; - int tid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - bool align_flag = (align_mode == 0 && !align_corners); - for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; +__forceinline__ __device__ void PreCalculatorForInputIndex( + int* in_img_idx, int* in_img_idy, int* w_id, int* h_id, T* w1lambda, + T* h1lambda, T* w2lambda, T* h2lambda, T src_w, T src_h, const int in_img_w, + const int in_img_h) { + src_w = (src_w > 0) ? src_w : 0.f; + src_h = (src_h > 0) ? src_h : 0.f; + *in_img_idx = static_cast(src_w); + *in_img_idy = static_cast(src_h); + *w_id = (*in_img_idx < in_img_w - 1) ? 1 : 0; + *h_id = (*in_img_idy < in_img_h - 1) ? 1 : 0; + *w1lambda = src_w - *in_img_idx; + *h1lambda = src_h - *in_img_idy; + *w2lambda = 1.f - *w1lambda; + *h2lambda = 1.f - *h1lambda; +} - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; +/* Calculate the minimum of partial elements in a block */ +template +__inline__ __device__ T PartialBlockMin(T val, size_t threads_num_in_block, + unsigned mask) { + __shared__ T shared[WARP_SIZE]; + __shared__ T shared_last_val; + __shared__ int shared_last_idx; + int lane = threadIdx.x & 0x1f; + int wid = threadIdx.x >> 5; + int threshold = (threads_num_in_block & (-WARP_SIZE)); + + if (threadIdx.x < threshold) { + shared_last_idx = (threshold >> 5) - 1; + val = math::warpReduceMin(val, mask); + if (lane == 0) { + shared[wid] = val; } + } else { + shared_last_val = std::numeric_limits::max(); + platform::CudaAtomicMin(&shared_last_val, val); + shared[wid] = shared_last_val; + shared_last_idx = wid; + } + __syncthreads(); - int in_img_idy = align_flag ? ratio_h * (out_img_idy + 0.5) - 0.5 - : ratio_h * out_img_idy; - in_img_idy = (in_img_idy > 0) ? in_img_idy : 0; - int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0; - T src_h = ratio_h * (out_img_idy + 0.5) - 0.5; - src_h = (src_h > 0) ? 
src_h : 0; - T h1lambda = - align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy; - T h2lambda = 1.f - h1lambda; + if (threadIdx.x < threshold) { + val = (lane <= shared_last_idx) ? shared[lane] + : std::numeric_limits::max(); + val = math::warpReduceMin(val, mask); + shared_last_val = val; + } + __syncthreads(); + if (threadIdx.x >= threshold) { + val = shared_last_val; + } + return val; +} - int in_img_idx = align_flag ? ratio_w * (out_img_idx + 0.5) - 0.5 - : ratio_w * out_img_idx; - in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; - int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; - T src_w = ratio_w * (out_img_idx + 0.5) - 0.5; - src_w = (src_w > 0) ? src_w : 0; - T w1lambda = - align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx; - T w2lambda = 1.f - w1lambda; +template +__global__ void KeBilinearInterpBwShareMemory( + T* in, const int in_h, const int in_w, const T* __restrict__ out, + const int out_h, const int out_w, const int n, const int num_channels, + float ratio_h, float ratio_w, const T align_type_value, bool is_nchw) { + __shared__ T s_data[2][1024]; + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int out_chw = num_channels * out_h * out_w; + int nthreads = n * out_chw; - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / out_chw; + int out_id_w = tid % out_chw; + const int in_img_size = in_h * in_w; + const int out_img_size = out_h * out_w; + T value = out[out_id_h * out_chw + out_id_w]; + + int channel_id = out_id_w / out_img_size; + int out_img_idy = (out_id_w % out_img_size) / out_w; + int out_img_idx = tid % out_w; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + PreCalculatorForInputIndex(&in_img_idx, &in_img_idy, &w_id, &h_id, + &w1lambda, &h1lambda, &w2lambda, &h2lambda, + src_w, src_h, in_w, in_h); + + // top_left_index is just input_index. + int input_index = out_id_h * in_chw + channel_id * in_img_size + + in_img_idy * in_w + in_img_idx; + int top_right_index = input_index + w_id; + int bot_left_index = input_index + h_id * in_w; + int bot_right_index = input_index + h_id * in_w + w_id; + int in_top_min_index, in_bot_min_index; + + s_data[0][threadIdx.x] = 0.f; + s_data[1][threadIdx.x] = 0.f; + int remain = nthreads - (tid & (-blockDim.x)); + int in_top_max_index = math::blockReduceMax(top_right_index, FINAL_MASK); + int in_bot_max_index = math::blockReduceMax(bot_right_index, FINAL_MASK); + + if (remain > blockDim.x) { + in_top_min_index = math::blockReduceMin(input_index, FINAL_MASK); + in_bot_min_index = math::blockReduceMin(bot_left_index, FINAL_MASK); } else { - in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; + in_top_min_index = PartialBlockMin(input_index, remain, FINAL_MASK); + in_bot_min_index = PartialBlockMin(bot_left_index, remain, FINAL_MASK); } + int upper_limit_share_idx = (in_top_max_index - in_top_min_index) > + (in_bot_max_index - in_bot_min_index) + ? 
(in_top_max_index - in_top_min_index) + : (in_bot_max_index - in_bot_min_index); + if (h_id != 0) { + platform::CudaAtomicAdd(&s_data[0][input_index - in_top_min_index], + h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd(&s_data[1][bot_right_index - in_bot_min_index], + h1lambda * w1lambda * value); + } else { + platform::CudaAtomicAdd(&s_data[0][top_right_index - in_top_min_index], + (h2lambda + h1lambda) * w1lambda * value); + platform::CudaAtomicAdd(&s_data[1][bot_left_index - in_bot_min_index], + (h1lambda + h2lambda) * w2lambda * value); + } + __syncthreads(); - const T* out_pos = &out[out_id_h * output_w + out_id_w]; + if (threadIdx.x <= upper_limit_share_idx) { + platform::CudaAtomicAdd(&in[in_top_min_index + threadIdx.x], + s_data[0][threadIdx.x]); + platform::CudaAtomicAdd(&in[in_bot_min_index + threadIdx.x], + s_data[1][threadIdx.x]); + } + } +} - if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); - } else { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); +template +__global__ void KeBilinearInterpBw(T* in, const int in_h, const int in_w, + const T* __restrict__ out, const int out_h, + const int out_w, const int n, + const int num_channels, float ratio_h, + float ratio_w, const T align_type_value, + bool is_nchw) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int in_chw = in_h * in_w * num_channels; + int out_chw = num_channels * out_h * out_w; + int nthreads = n * out_chw; + + if (is_nchw) { + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / out_chw; + int out_id_w = tid % out_chw; + const int in_img_size = in_h * in_w; + const int out_img_size = out_h * out_w; + T value = out[out_id_h * out_chw + out_id_w]; + + int channel_id = out_id_w / out_img_size; + int out_img_idy = (out_id_w % out_img_size) / out_w; + int out_img_idx = tid % out_w; + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + PreCalculatorForInputIndex(&in_img_idx, &in_img_idy, &w_id, &h_id, + &w1lambda, &h1lambda, &w2lambda, &h2lambda, + src_w, src_h, in_w, in_h); + + T* in_pos = &in[out_id_h * in_chw + channel_id * in_img_size + + in_img_idy * in_w + in_img_idx]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w], + h1lambda * w2lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w + w_id], + h1lambda * w1lambda * value); + } + } else { + for (; tid < nthreads; tid += stride) { + int out_id_h = tid / out_chw; + int out_id_w = tid % out_chw; + const int in_img_size = in_h * in_w; + const int out_img_size = out_h * out_w; + T value = out[out_id_h * out_chw + out_id_w]; + + int out_img_idy = out_id_w / (out_w * num_channels); + int out_img_idx = out_id_w % (out_w * 
num_channels) / num_channels; + int channel_id = tid % num_channels; + + int in_img_idx, in_img_idy, w_id, h_id; + T w1lambda, h1lambda, w2lambda, h2lambda; + T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value; + T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value; + PreCalculatorForInputIndex(&in_img_idx, &in_img_idy, &w_id, &h_id, + &w1lambda, &h1lambda, &w2lambda, &h2lambda, + src_w, src_h, in_w, in_h); + + T* in_pos = &in[out_id_h * in_chw + in_img_idy * in_w * num_channels + + in_img_idx * num_channels + channel_id]; + platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * value); platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], - h1lambda * w2lambda * out_pos[0]); + h2lambda * w1lambda * value); + platform::CudaAtomicAdd(&in_pos[h_id * in_w * num_channels], + h1lambda * w2lambda * value); platform::CudaAtomicAdd( - &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], - h1lambda * w1lambda * out_pos[0]); + &in_pos[h_id * in_w * num_channels + w_id * num_channels], + h1lambda * w1lambda * value); } } } @@ -1373,7 +1508,6 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, int out_hw = out_h * out_w; int in_chw = c * in_hw; int out_chw = c * out_hw; - int pixelNum = n * out_chw; platform::GpuLaunchConfig config = @@ -1386,11 +1520,25 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); } else if ("bilinear" == interp_method) { - KeBilinearInterpBw<<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, - data_layout); + const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; + bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; + bool optimize_flag = false; + optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) + ? true + : ((in_h == 1 && in_w == 1) ? 
true : false); + + if (optimize_flag & is_nchw) { + KeBilinearInterpBwShareMemory< + T><<>>( + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, + ratio_h, ratio_w, align_type_value, is_nchw); + } else { + KeBilinearInterpBw<<>>( + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, n, c, + ratio_h, ratio_w, align_type_value, is_nchw); + } } else if ("bicubic" == interp_method) { KeBicubicInterpBw<<>>( -- GitLab From bd2a4e23fad95fbd10f5ea33f1ab922b65be6515 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Mon, 12 Apr 2021 14:32:24 +0800 Subject: [PATCH 194/486] [ROCM] fix some unittests (#32129) * [ROCM] fix test_gru_rnn_op * [ROCM] fix test_expand_op * [ROCM] fix test_cross_entropy_loss * [ROCM] fix test_conv_nn_grad * [ROCM] fix test_bilinear_tensor_product_op * [ROCM] fix elementwise_op_function * [ROCM] fix test_lstm_cudnn_op * [ROCM] fix test_gpu_package_without_gpu_device * [ROCM] fix test_gru_unit_op * [ROCM] fix test_imperative_optimizer * [ROCM] fix rnn * [ROCM] fix group_norm_op * [ROCM] fix test_pool3d_api * [ROCM] fix test_pool3d_op --- .../elementwise/elementwise_op_function.h | 4 + paddle/fluid/operators/group_norm_op.cu | 8 + paddle/fluid/operators/miopen_lstm_cache.h | 7 +- paddle/fluid/platform/miopen_helper.h | 7 +- .../test_bilinear_tensor_product_op.py | 11 +- .../tests/unittests/test_conv_nn_grad.py | 32 ++-- .../unittests/test_cross_entropy_loss.py | 150 ++++++++++-------- .../fluid/tests/unittests/test_expand_op.py | 13 +- .../test_gpu_package_without_gpu_device.py | 5 +- .../fluid/tests/unittests/test_gru_rnn_op.py | 23 ++- .../fluid/tests/unittests/test_gru_unit_op.py | 20 ++- .../unittests/test_imperative_optimizer.py | 12 +- .../unittests/test_imperative_optimizer_v2.py | 12 +- .../tests/unittests/test_lstm_cudnn_op.py | 48 ++++-- .../fluid/tests/unittests/test_pool3d_op.py | 18 ++- python/paddle/nn/layer/rnn.py | 3 +- 16 files changed, 239 insertions(+), 134 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index c69baadb3c2..32e49cf3996 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -39,7 +39,11 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/cuda_primitives.h" +#ifdef __HIPCC__ +constexpr int ELEMWISE_MAX_BLOCK_DIM = 256; +#else constexpr int ELEMWISE_MAX_BLOCK_DIM = 1024; +#endif #define BLOCK_X 32 #define BLOCK_Y 32 #endif diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 45d97723a3e..18a248f5531 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -174,7 +174,11 @@ class GroupNormKernel int imsize = (data_layout == DataLayout::kNCHW ? x_dims[2] * x_dims[3] : x_dims[1] * x_dims[2]); +#ifdef __HIPCC__ + int block_size = std::max(std::min(256, imsize), 64); +#else int block_size = std::min(1024, imsize); +#endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); GroupNormForwardGetMeanAndVar<<>>( @@ -348,7 +352,11 @@ class GroupNormGradKernel int imsize = (data_layout == DataLayout::kNCHW ? 
x_dims[2] * x_dims[3] : x_dims[1] * x_dims[2]); +#ifdef __HIPCC__ + int block_size = std::max(std::min(256, imsize), 64); +#else int block_size = std::min(1024, imsize); +#endif dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); int flags = diff --git a/paddle/fluid/operators/miopen_lstm_cache.h b/paddle/fluid/operators/miopen_lstm_cache.h index 7c0faa86be0..a357e6e5af6 100644 --- a/paddle/fluid/operators/miopen_lstm_cache.h +++ b/paddle/fluid/operators/miopen_lstm_cache.h @@ -75,10 +75,11 @@ class ScopedRNNBase { dropout_state, seed_, state_size); // ------------------- miopen rnn descriptors --------------------- - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor( - rnn_desc_.desc(), hidden_size_, num_layers_, miopenRNNlinear, + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetRNNDescriptor_V2( + rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), + miopenRNNlinear, is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, miopenLSTM, - miopenRNNNoBias, miopenRNNdefault, miopen_type)); + miopenRNNwithBias, miopenRNNdefault, miopen_type)); // ------------------- miopen weights_size --------------------- size_t weights_size_; diff --git a/paddle/fluid/platform/miopen_helper.h b/paddle/fluid/platform/miopen_helper.h index 435d28d518d..46c7da83970 100644 --- a/paddle/fluid/platform/miopen_helper.h +++ b/paddle/fluid/platform/miopen_helper.h @@ -434,9 +434,10 @@ class ScopedPoolingDescriptor { "The size of kernel and strides should be equal. But " "received size of kernel is %d, size of strides is %d.", kernel.size(), strides.size())); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSet2dPoolingDescriptor( - desc_, GetPoolingMode(mode), kernel[0], kernel[1], pads[0], pads[1], - strides[0], strides[1])); + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::miopenSetNdPoolingDescriptor( + desc_, GetPoolingMode(mode), kernel.size(), + const_cast(kernel.data()), const_cast(pads.data()), + const_cast(strides.data()))); return desc_; } diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py index ba9db2c104f..60e9d0a26b3 100644 --- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py +++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py @@ -42,11 +42,12 @@ class TestBilinearTensorProductOp(OpTest): size0 = 5 size1 = 4 size2 = 5 - a = np.random.random((batch_size, size0)).astype("float64") - b = np.random.random((batch_size, size1)).astype("float64") - w = np.random.random((size2, size0, size1)).astype("float64") - bias = np.random.random((1, size2)).astype("float64") - output = np.zeros((batch_size, size2)).astype("float64") + dtype = "float32" if fluid.core.is_compiled_with_rocm() else "float64" + a = np.random.random((batch_size, size0)).astype(dtype) + b = np.random.random((batch_size, size1)).astype(dtype) + w = np.random.random((size2, size0, size1)).astype(dtype) + bias = np.random.random((1, size2)).astype(dtype) + output = np.zeros((batch_size, size2)).astype(dtype) for i in range(size2): w_i = w[i, :, :] output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1) diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index 7aa3d0d1686..d5f49919bc9 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -30,7 +30,7 @@ class 
TestConvDoubleGradCheck(unittest.TestCase): def func(self, place): shape = [2, 4, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d(x, 2, 1, groups=1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -57,7 +57,7 @@ class TestConvDoubleGradCheck(unittest.TestCase): def func(self, place): shape = [2, 4, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d(x, 2, 1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -82,7 +82,7 @@ class TestConvDoubleGradCheckTest1(unittest.TestCase): def func(self, place): shape = [2, 3, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d(x, 2, 1, padding=1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -107,7 +107,7 @@ class TestConv3DDoubleGradCheck(unittest.TestCase): def func(self, place): shape = [2, 4, 3, 4, 2] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d(x, 2, 1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -132,7 +132,7 @@ class TestConv3DDoubleGradCheckTest1(unittest.TestCase): def func(self, place): shape = [2, 4, 5, 3, 2] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d(x, 2, 1, padding=1, bias_attr=False) x_arr = np.random.uniform(-1, 1, shape).astype(dtype) @@ -157,7 +157,7 @@ class TestConv2DoubleGradCheck_AsyPadding(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d( input=x, @@ -188,7 +188,7 @@ class TestConv2DoubleGradCheck_PaddingSAME(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d( input=x, @@ -219,7 +219,7 @@ class TestConv2DoubleGradCheck_PaddingVALID(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d( input=x, @@ -250,7 +250,7 @@ class TestConv2DoubleGradCheck_ChannelLast(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d( input=x, @@ -283,7 +283,7 @@ class TestConv2DoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv2d( input=x, @@ -316,7 +316,7 @@ class TestConv3DDoubleGradCheck_AsyPadding(unittest.TestCase): def func(self, place): shape = [2, 2, 2, 2, 2] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if 
fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d( input=x, @@ -347,7 +347,7 @@ class TestConv3DoubleGradCheck_PaddingSAME(unittest.TestCase): def func(self, place): shape = [2, 2, 2, 2, 2] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d( input=x, @@ -379,7 +379,7 @@ class TestConv3DoubleGradCheck_PaddingVALID(unittest.TestCase): def func(self, place): shape = [2, 2, 3, 3, 2] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d( input=x, @@ -410,7 +410,7 @@ class TestConv3DDoubleGradCheck_ChannelLast(unittest.TestCase): def func(self, place): shape = [2, 2, 2, 2, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d( input=x, @@ -443,7 +443,7 @@ class TestConv3DDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase): def func(self, place): shape = [2, 2, 2, 2, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) y = layers.conv3d( input=x, @@ -476,7 +476,7 @@ class TestDepthWiseConvDoubleGradCheck(unittest.TestCase): def func(self, place): shape = [2, 4, 3, 3] eps = 0.005 - dtype = np.float64 + dtype = np.float32 if fluid.core.is_compiled_with_rocm() else np.float64 x = layers.data('x', shape, False, dtype) # condition of depthwise conv: diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index 1a5e4b28355..ea44e23da24 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -191,12 +191,16 @@ def cross_entropy_soft_2d(softmax, class CrossEntropyLoss(unittest.TestCase): + def setUp(self): + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' ###test for deprecated softmax_with_cross_entropy def test_softmax_with_cross_entropy(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 4 @@ -248,7 +252,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_1d(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 4 @@ -296,9 +301,9 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[self.N, self.C], dtype='float64') + name='input', shape=[self.N, self.C], dtype=self.dtype) label = fluid.data( - name='label', shape=[self.N, self.C], dtype='float64') + name='label', shape=[self.N, self.C], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction=self.reduction, soft_label=True) @@ -321,7 +326,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_1d_weight(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = 
np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 4 @@ -376,10 +382,10 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[self.N, self.C], dtype='float64') + name='input', shape=[self.N, self.C], dtype=self.dtype) label = fluid.data( - name='label', shape=[self.N, self.C], dtype='float64') - weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + name='label', shape=[self.N, self.C], dtype=self.dtype) + weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=self.reduction, soft_label=True) @@ -403,7 +409,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_1d_mean(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 4 @@ -451,9 +458,9 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[self.N, self.C], dtype='float64') + name='input', shape=[self.N, self.C], dtype=self.dtype) label = fluid.data( - name='label', shape=[self.N, self.C], dtype='float64') + name='label', shape=[self.N, self.C], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction=self.reduction, soft_label=True) @@ -475,7 +482,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_1d_weight_mean(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 4 @@ -523,10 +531,10 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[self.N, self.C], dtype='float64') + name='input', shape=[self.N, self.C], dtype=self.dtype) label = fluid.data( - name='label', shape=[self.N, self.C], dtype='float64') - weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + name='label', shape=[self.N, self.C], dtype=self.dtype) + weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=self.reduction, soft_label=True) @@ -549,7 +557,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_2d(self): self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 3 @@ -604,11 +613,11 @@ class CrossEntropyLoss(unittest.TestCase): input = fluid.data( name='input', shape=[self.N, self.H, self.W, self.C], - dtype='float64') + dtype=self.dtype) label = fluid.data( name='label', shape=[self.N, self.H, self.W, self.C], - dtype='float64') + dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction=self.reduction, soft_label=True) @@ -631,7 +640,8 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_soft_2d_weight_mean(self): 
self.numeric_stable_mode = False self.soft_label = True - self.dtype = np.float64 + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.axis = -1 self.ignore_index = -100 #should not be changed self.N = 3 @@ -685,12 +695,12 @@ class CrossEntropyLoss(unittest.TestCase): input = fluid.data( name='input', shape=[self.N, self.H, self.W, self.C], - dtype='float64') + dtype=self.dtype) label = fluid.data( name='label', shape=[self.N, self.H, self.W, self.C], - dtype='float64') - weight = fluid.data(name='weight', shape=[self.C], dtype='float64') + dtype=self.dtype) + weight = fluid.data(name='weight', shape=[self.C], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=self.reduction, soft_label=True) @@ -713,7 +723,7 @@ class CrossEntropyLoss(unittest.TestCase): ###soft_label test end def test_cross_entropy_loss_1d_with_mean_ignore(self): - input_np = np.random.random([2, 4]).astype(np.float64) + input_np = np.random.random([2, 4]).astype(self.dtype) label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) paddle.enable_static() prog = fluid.Program() @@ -721,7 +731,7 @@ class CrossEntropyLoss(unittest.TestCase): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[2, 4], dtype='float64') + input = fluid.data(name='input', shape=[2, 4], dtype=self.dtype) label = fluid.data(name='label', shape=[2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(ignore_index=0) ret = cross_entropy_loss(input, label) @@ -752,20 +762,20 @@ class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_1d_with_weight_mean_ignore(self): N = 100 C = 200 - input_np = np.random.random([N, C]).astype(np.float64) + input_np = np.random.random([N, C]).astype(self.dtype) label_np = np.random.randint(0, C, size=(N)).astype(np.int64) - weight_np = np.random.random([C]).astype(np.float64) + weight_np = np.random.random([C]).astype(self.dtype) paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[N, C], dtype='float64') + input = fluid.data(name='input', shape=[N, C], dtype=self.dtype) label = fluid.data(name='label', shape=[N], dtype='int64') weight = fluid.data( name='weight', shape=[C], - dtype='float64') #weight for each class + dtype=self.dtype) #weight for each class cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, ignore_index=0) ret = cross_entropy_loss(input, label) @@ -798,20 +808,20 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_mean(self): - input_np = np.random.random([2, 4]).astype(np.float64) + input_np = np.random.random([2, 4]).astype(self.dtype) label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) - weight_np = np.random.random([4]).astype(np.float64) #shape:C + weight_np = np.random.random([4]).astype(self.dtype) #shape:C paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[2, 4], dtype='float64') + input = fluid.data(name='input', shape=[2, 4], dtype=self.dtype) 
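The float32-on-ROCm switch above recurs through the conv double-grad and cross-entropy cases; a minimal standalone sketch of the pattern follows, with an illustrative helper name that is not part of the patch (the float32 fallback presumably reflects weaker float64 support or looser precision in the ROCm kernels, which the patch itself does not state).

import numpy as np
import paddle.fluid as fluid

def rocm_friendly_dtype(as_string=False):
    # ROCm builds run these cases in float32; CUDA and CPU builds keep float64.
    if fluid.core.is_compiled_with_rocm():
        return 'float32' if as_string else np.float32
    return 'float64' if as_string else np.float64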
label = fluid.data(name='label', shape=[2], dtype='int64') weight = fluid.data( name='weight', shape=[4], - dtype='float64') #weight for each class + dtype=self.dtype) #weight for each class cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight) ret = cross_entropy_loss(input, label) @@ -842,18 +852,18 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 - weight_np = np.random.random([200]).astype(np.float64) #C + weight_np = np.random.random([200]).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') + weight = fluid.data(name='weight', shape=[200], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='sum') ret = cross_entropy_loss(input, label) @@ -882,9 +892,9 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 - weight_np = np.random.random([200]).astype(np.float64) #C + weight_np = np.random.random([200]).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() @@ -892,9 +902,9 @@ class CrossEntropyLoss(unittest.TestCase): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') + weight = fluid.data(name='weight', shape=[200], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='none') @@ -926,18 +936,18 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_none_func(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N - weight_np = np.random.random([200]).astype(np.float64) #C + weight_np = np.random.random([200]).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') - weight = 
fluid.data(name='weight', shape=[200], dtype='float64') + weight = fluid.data(name='weight', shape=[200], dtype=self.dtype) ret = paddle.nn.functional.cross_entropy( input, label, weight=weight, reduction='none') @@ -967,18 +977,18 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_mean(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 - weight_np = np.random.random([200]).astype(np.float64) #C + weight_np = np.random.random([200]).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[100], dtype='float64') + weight = fluid.data(name='weight', shape=[100], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss() ret = cross_entropy_loss(input, label) exe = fluid.Executor(place) @@ -1000,7 +1010,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 paddle.enable_static() prog = fluid.Program() @@ -1008,7 +1018,7 @@ class CrossEntropyLoss(unittest.TestCase): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='sum') @@ -1033,7 +1043,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) #N,C + input_np = np.random.random([100, 200]).astype(self.dtype) #N,C label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 paddle.enable_static() prog = fluid.Program() @@ -1041,7 +1051,7 @@ class CrossEntropyLoss(unittest.TestCase): place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') + input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype) label = fluid.data(name='label', shape=[100], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='none') @@ -1068,10 +1078,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_none(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW1 - weight_np = 
np.random.random(size=(3, )).astype(np.float64) #C + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() @@ -1080,9 +1090,9 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='none') ret = cross_entropy_loss(input, label) @@ -1113,10 +1123,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_mean(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW - weight_np = np.random.random(size=(3, )).astype(np.float64) #C + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() @@ -1124,9 +1134,9 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='mean') ret = cross_entropy_loss(input, label) @@ -1155,10 +1165,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_sum(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW - weight_np = np.random.random(size=(3, )).astype(np.float64) #C + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C paddle.enable_static() prog = fluid.Program() @@ -1167,9 +1177,9 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='sum') ret = cross_entropy_loss(input, label) @@ -1198,7 +1208,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_none(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW paddle.enable_static() @@ -1208,7 +1218,7 @@ class 
CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='none') @@ -1237,7 +1247,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_mean(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW paddle.enable_static() @@ -1247,7 +1257,7 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='mean') @@ -1276,7 +1286,7 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_sum(self): - input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype) #NHWC label_np = np.random.randint( 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW paddle.enable_static() @@ -1286,7 +1296,7 @@ class CrossEntropyLoss(unittest.TestCase): ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[2, 2, 2, 3], dtype='float64') + name='input', shape=[2, 2, 2, 3], dtype=self.dtype) label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='sum') diff --git a/python/paddle/fluid/tests/unittests/test_expand_op.py b/python/paddle/fluid/tests/unittests/test_expand_op.py index a325ffe1d0e..edda6da655d 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_op.py @@ -27,8 +27,10 @@ class TestExpandOpRank1(OpTest): def setUp(self): self.op_type = "expand" self.init_data() + self.dtype = "float32" if fluid.core.is_compiled_with_rocm( + ) else "float64" - self.inputs = {'X': np.random.random(self.ori_shape).astype("float64")} + self.inputs = {'X': np.random.random(self.ori_shape).astype(self.dtype)} self.attrs = {'expand_times': self.expand_times} output = np.tile(self.inputs['X'], self.expand_times) self.outputs = {'Out': output} @@ -79,13 +81,16 @@ class TestExpandOpRank1_tensor_attr(OpTest): def setUp(self): self.op_type = "expand" self.init_data() + self.dtype = "float32" if fluid.core.is_compiled_with_rocm( + ) else "float64" + expand_times_tensor = [] for index, ele in enumerate(self.expand_times): expand_times_tensor.append(("x" + str(index), np.ones( (1)).astype('int32') * ele)) self.inputs = { - 'X': np.random.random(self.ori_shape).astype("float64"), + 'X': np.random.random(self.ori_shape).astype(self.dtype), 'expand_times_tensor': expand_times_tensor, } self.attrs = {"expand_times": self.infer_expand_times} @@ -123,9 +128,11 @@ class TestExpandOpRank1_tensor(OpTest): def setUp(self): self.op_type = "expand" self.init_data() + self.dtype = "float32" if fluid.core.is_compiled_with_rocm( + ) else "float64" self.inputs = { - 
'X': np.random.random(self.ori_shape).astype("float64"), + 'X': np.random.random(self.ori_shape).astype(self.dtype), 'ExpandTimes': np.array(self.expand_times).astype("int32"), } self.attrs = {} diff --git a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py index d854372bbc6..2b51bec9cb0 100644 --- a/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py +++ b/python/paddle/fluid/tests/unittests/test_gpu_package_without_gpu_device.py @@ -26,7 +26,10 @@ from paddle.fluid import core class TestGPUPackagePaddle(unittest.TestCase): def test_import_paddle(self): if core.is_compiled_with_cuda(): - os.environ['CUDA_VISIBLE_DEVICES'] = '' + if core.is_compiled_with_rocm(): + os.environ['HIP_VISIBLE_DEVICES'] = '' + else: + os.environ['CUDA_VISIBLE_DEVICES'] = '' test_file = 'test_no_gpu_run_rand.py' with open(test_file, 'w') as wb: cmd_test = """ diff --git a/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py b/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py index eb1fed81cbe..9f18ec9843d 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_rnn_op.py @@ -44,8 +44,9 @@ class TestGRUOp(OpTest): def setUp(self): self.op_type = "rnn" - self.dtype = "float64" - self.sequence_length = np.array( + self.dtype = "float32" if core.is_compiled_with_rocm() else "float64" + self.sequence_length = None if core.is_compiled_with_rocm( + ) else np.array( [12, 11, 10, 9, 8, 7, 6, 5], dtype=np.int32) self.num_layers = 1 self.is_bidirec = False @@ -83,6 +84,24 @@ class TestGRUOp(OpTest): output, last_hidden = rnn1(input, sequence_length=self.sequence_length) + if core.is_compiled_with_rocm(): + + def rocm_rnn_get_place(): + places = [core.CUDAPlace(0)] + return places + + self._get_places = rocm_rnn_get_place + + if self.is_bidirec: + for i in range(0, len(flat_w), 4): + flat_w[i + 1], flat_w[i + 2] = flat_w[i + 2], flat_w[i + 1] + + for i in range(len(flat_w)): + w = np.split(flat_w[i][1], 3, 0) + w = [w[1], w[0], w[2]] + w = np.concatenate(w) + flat_w[i] = (flat_w[i][0], w) + init_h = np.zeros((self.num_layers * self.direction_num, batch_size, self.hidden_size)).astype(self.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py index a570e266072..74afa7db289 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py @@ -121,12 +121,12 @@ class TestGRUUnitOp(OpTest): self.op_type = 'gru_unit' self.inputs = { 'Input': np.random.uniform( - -0.1, 0.1, (batch_size, frame_size * 3)).astype('float64'), + -0.1, 0.1, (batch_size, frame_size * 3)).astype(self.dtype), 'HiddenPrev': np.random.uniform( - -0.1, 0.1, (batch_size, frame_size)).astype('float64'), + -0.1, 0.1, (batch_size, frame_size)).astype(self.dtype), 'Weight': np.random.uniform( -1. / math.sqrt(frame_size), 1. 
/ math.sqrt(frame_size), - (frame_size, frame_size * 3)).astype('float64'), + (frame_size, frame_size * 3)).astype(self.dtype), } self.attrs = { 'activation': GRUActivationType.tanh, @@ -161,12 +161,14 @@ class TestGRUUnitOp(OpTest): else: h = u * c + (1 - u) * h_p self.outputs = { - 'Gate': g.astype('float64'), - 'ResetHiddenPrev': r_h_p.astype('float64'), - 'Hidden': h.astype('float64') + 'Gate': g.astype(self.dtype), + 'ResetHiddenPrev': r_h_p.astype(self.dtype), + 'Hidden': h.astype(self.dtype) } def setUp(self): + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.set_inputs() self.set_outputs() @@ -179,6 +181,8 @@ class TestGRUUnitOp(OpTest): class TestGRUUnitOpOriginMode(TestGRUUnitOp): def setUp(self): + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.set_inputs(origin_mode=True) self.set_outputs(origin_mode=True) @@ -189,7 +193,7 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp): frame_size = self.frame_size super(TestGRUUnitOpWithBias, self).set_inputs() self.inputs['Bias'] = np.random.uniform( - -0.1, 0.1, (1, frame_size * 3)).astype('float64') + -0.1, 0.1, (1, frame_size * 3)).astype(self.dtype) self.attrs = { 'activation': GRUActivationType.identity, 'gate_activation': GRUActivationType.sigmoid, @@ -207,6 +211,8 @@ class TestGRUUnitOpWithBias(TestGRUUnitOp): class TestGRUUnitOpWithBiasOriginMode(TestGRUUnitOpWithBias): def setUp(self): + self.dtype = 'float32' if fluid.core.is_compiled_with_rocm( + ) else 'float64' self.set_inputs(origin_mode=True) self.set_outputs(origin_mode=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index cd019c92075..36c4d67bf2d 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -190,10 +190,18 @@ class TestImperativeOptimizerBase(unittest.TestCase): for key, value in six.iteritems(static_param_init_value): self.assertTrue(np.allclose(value, dy_param_init_value[key])) - self.assertTrue(np.allclose(static_out, dy_out)) + if core.is_compiled_with_rocm(): + self.assertTrue(np.allclose(static_out, dy_out, atol=1e-3)) + else: + self.assertTrue(np.allclose(static_out, dy_out)) for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value, dy_param_value[key])) + if core.is_compiled_with_rocm(): + self.assertTrue( + np.allclose( + value, dy_param_value[key], atol=1e-3)) + else: + self.assertTrue(np.allclose(value, dy_param_value[key])) class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index e3d82888f61..eac627d1b5b 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -207,10 +207,18 @@ class TestImperativeOptimizerBase(unittest.TestCase): for key, value in six.iteritems(static_param_init_value): self.assertTrue(np.allclose(value, dy_param_init_value[key])) - self.assertTrue(np.allclose(static_out, dy_out)) + if core.is_compiled_with_rocm(): + self.assertTrue(np.allclose(static_out, dy_out, atol=1e-3)) + else: + self.assertTrue(np.allclose(static_out, dy_out)) for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value, dy_param_value[key])) + if 
core.is_compiled_with_rocm(): + self.assertTrue( + np.allclose( + value, dy_param_value[key], atol=1e-3)) + else: + self.assertTrue(np.allclose(value, dy_param_value[key])) class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): diff --git a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py index 82443f8c549..372b8d0d4d2 100644 --- a/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lstm_cudnn_op.py @@ -390,8 +390,10 @@ class TestCUDNNLstmOp(OpTest): def setUp(self): self.op_type = "cudnn_lstm" - self.dtype = np.float64 - self.sequence_length = np.array([12, 11, 10, 9, 8], dtype=np.int32) + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 + self.sequence_length = None if core.is_compiled_with_rocm( + ) else np.array( + [12, 11, 10, 9, 8], dtype=np.int32) self.num_layers = 1 self.set_attrs() @@ -447,6 +449,13 @@ class TestCUDNNLstmOp(OpTest): hidden_size)).astype(self.dtype) state_out = np.ndarray((300)).astype("uint8") + if core.is_compiled_with_rocm(): + for i in range(len(flat_w)): + w = np.split(flat_w[i][1], 4, 0) + w = [w[0], w[1], w[3], w[2]] + w = np.concatenate(w) + flat_w[i] = (flat_w[i][0], w) + self.inputs = { 'Input': input, 'WeightList': flat_w, @@ -454,6 +463,13 @@ class TestCUDNNLstmOp(OpTest): 'InitC': init_c, 'SequenceLength': self.sequence_length } + if self.sequence_length is None: + self.inputs = { + 'Input': input, + 'WeightList': flat_w, + 'InitH': init_h, + 'InitC': init_c, + } self.attrs = { 'dropout_prob': 0.0, 'is_bidirec': False, @@ -474,8 +490,12 @@ class TestCUDNNLstmOp(OpTest): def test_output_with_place(self): place = core.CUDAPlace(0) - self.check_output_with_place( - place, no_check_set=['Reserve', 'StateOut']) + if core.is_compiled_with_rocm(): + self.check_output_with_place( + place, atol=1e-5, no_check_set=['Reserve', 'StateOut']) + else: + self.check_output_with_place( + place, no_check_set=['Reserve', 'StateOut']) def test_grad_with_place(self): place = core.CUDAPlace(0) @@ -496,14 +516,13 @@ class TestCUDNNlstmAPI(unittest.TestCase): hidden_size = 20 dropout_prob = 0.0 num_layers = 1 + dtype = 'float32' if core.is_compiled_with_rocm() else 'float64' input = fluid.data( - name='input', - shape=[seq_len, batch_size, hidden_size], - dtype='float64') + name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype) init_h = layers.fill_constant([num_layers, batch_size, hidden_size], - 'float64', 0.0) + dtype, 0.0) init_c = layers.fill_constant([num_layers, batch_size, hidden_size], - 'float64', 0.0) + dtype, 0.0) rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, hidden_size, num_layers, dropout_prob, False) @@ -526,14 +545,13 @@ class TestCUDNNlstmAPI(unittest.TestCase): hidden_size = 20 dropout_prob = 0.0 num_layers = 2 + dtype = 'float32' if core.is_compiled_with_rocm() else 'float64' input = fluid.data( - name='input', - shape=[seq_len, batch_size, hidden_size], - dtype='float64') + name='input', shape=[seq_len, batch_size, hidden_size], dtype=dtype) init_h = layers.fill_constant([num_layers, batch_size, hidden_size], - 'float64', 0.0) + dtype, 0.0) init_c = layers.fill_constant([num_layers, batch_size, hidden_size], - 'float64', 0.0) + dtype, 0.0) rnn_out, last_h, last_c = layers.lstm(input, init_h, init_c, seq_len, hidden_size, num_layers, dropout_prob, False, True) @@ -541,7 +559,7 @@ class TestCUDNNlstmAPI(unittest.TestCase): 
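Both RNN tests above reshuffle the flattened weights before running the fused kernel on ROCm: the GRU case splits each weight into three gate blocks and swaps the first two, while the LSTM case splits into four and swaps the last two. That reads as the MIOpen path expecting a different gate order than cuDNN, but the patch does not state the reason, so treat it as a hedged interpretation. The permutation itself, as a standalone sketch:

import numpy as np

def reorder_gate_blocks(flat_w, order):
    # Split the flattened weight into equal gate blocks along axis 0 and permute them.
    blocks = np.split(flat_w, len(order), 0)
    return np.concatenate([blocks[i] for i in order])

gru_w = reorder_gate_blocks(np.arange(12.0), [1, 0, 2])      # order used in test_gru_rnn_op.py above
lstm_w = reorder_gate_blocks(np.arange(16.0), [0, 1, 3, 2])  # order used in test_lstm_cudnn_op.py above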
exe.run(fluid.default_startup_program()) input_i = np.random.uniform( low=-0.1, high=0.1, size=(seq_len, batch_size, - hidden_size)).astype("float64") + hidden_size)).astype(dtype) out = exe.run(fluid.default_main_program(), feed={'input': input_i}, fetch_list=[rnn_out, last_h, last_c, 'cudnn_lstm_0.w_0']) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index d618875835f..40b9be9ee4f 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -224,7 +224,7 @@ class TestPool3D_Op(OpTest): def setUp(self): self.op_type = "pool3d" self.init_kernel_type() - self.dtype = np.float64 + self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 self.init_test_case() self.padding_algorithm = "EXPLICIT" self.init_paddings() @@ -277,9 +277,16 @@ class TestPool3D_Op(OpTest): return if self.has_cudnn() and self.pool_type != "max": place = core.CUDAPlace(0) - self.check_grad_with_place(place, set(['X']), 'Out') + if core.is_compiled_with_rocm(): + self.check_grad_with_place( + place, set(['X']), 'Out', max_relative_error=1e-2) + else: + self.check_grad_with_place(place, set(['X']), 'Out') elif self.pool_type != "max": - self.check_grad(set(['X']), 'Out') + if core.is_compiled_with_rocm(): + self.check_grad(set(['X']), 'Out', max_relative_error=1e-2) + else: + self.check_grad(set(['X']), 'Out') def init_data_format(self): self.data_format = "NCDHW" @@ -400,7 +407,10 @@ def create_test_cudnn_fp16_class(parent): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-3) + if core.is_compiled_with_rocm(): + self.check_output_with_place(place, atol=1e-2) + else: + self.check_output_with_place(place, atol=1e-3) cls_name = "{0}_{1}".format(parent.__name__, "CUDNNFp16Op") TestCUDNNFp16Case.__name__ = cls_name diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index a899f18f521..0cefb89340a 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -1053,7 +1053,8 @@ class RNNBase(LayerList): initial_states, paddle.fluid.framework.Variable) else initial_states - if self.could_use_cudnn: + if self.could_use_cudnn and (not fluid.core.is_compiled_with_rocm() or + sequence_length is None): # Add CPU kernel and dispatch in backend later return self._cudnn_impl(inputs, initial_states, sequence_length) -- GitLab From bb3b79067e44ef8effba4d9790c9a4aa9c3555f1 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 12 Apr 2021 15:32:47 +0800 Subject: [PATCH 195/486] [CustomOp]Fix description of supporting MacOS (#32192) --- python/paddle/utils/cpp_extension/cpp_extension.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 83dc1d2582b..ab528cdb0c0 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -75,7 +75,7 @@ def setup(**attr): .. note:: - 1. Currently we support Linux and Windows platfrom. MacOS is supporting... + 1. Currently we support Linux, MacOS and Windows platfrom. 2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. @@ -745,7 +745,7 @@ def load(name, .. note:: - 1. 
Currently we support Linux and Windows platfrom. MacOS is supporting... + 1. Currently we support Linux, MacOS and Windows platfrom. 2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. -- GitLab From 8dacfb5e9a5e6b88d16cba4f3c4a85b668941947 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 12 Apr 2021 16:12:15 +0800 Subject: [PATCH 196/486] Optimize the process of obtaining prec_list on windows (#32123) * test,test,notest,test=windows_ci * test,notest,test=windows_ci * test,notest,test=windows_ci * test,notest,test=windows_ci * remove test code * delete some unnecessary logs * fix format error * turn on added ut check on windows --- tools/check_added_ut.sh | 9 +------ tools/get_pr_ut.py | 3 +-- tools/windows/get_prec_ut_list.py | 42 +++++++++++++++++++++++++++++++ tools/windows/run_unittests.sh | 19 +++----------- 4 files changed, 47 insertions(+), 26 deletions(-) create mode 100644 tools/windows/get_prec_ut_list.py diff --git a/tools/check_added_ut.sh b/tools/check_added_ut.sh index 618236f75bf..7301e9954e9 100644 --- a/tools/check_added_ut.sh +++ b/tools/check_added_ut.sh @@ -50,19 +50,12 @@ cd prec_build if [[ "$SYSTEM" == "Linux" ]] || [[ "$SYSTEM" == "Darwin" ]];then bash $PADDLE_ROOT/paddle/scripts/paddle_build_pre.sh cmake_gen_in_current_dir >prebuild.log 2>&1 elif [[ "$SYSTEM" == "Windows_NT" ]];then - bash $PADDLE_ROOT/win_cmake.sh + bash $PADDLE_ROOT/win_cmake.sh >prec_build.log 2>&1 fi ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/br-ut cd $PADDLE_ROOT/build ctest -N | awk -F ':' '{print $2}' | sed '/^$/d' | sed '$d' | sed 's/ //g' | grep 'test' > $PADDLE_ROOT/pr-ut cd $PADDLE_ROOT -echo "=================================" -echo "br-ut" -cat $PADDLE_ROOT/br-ut -echo "=================================" -echo "pr-ut" -cat $PADDLE_ROOT/pr-ut -echo "=================================" grep -F -x -v -f br-ut pr-ut > $PADDLE_ROOT/added_ut if [[ "$SYSTEM" == 'Linux' ]];then sort pr-ut |uniq -d > $PADDLE_ROOT/duplicate_ut diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index 58d7d2c0d6b..001f380049f 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -265,8 +265,7 @@ class PRChecker(object): '.cu'): if f.find('test_') != -1 or f.find('_test') != -1: print('PREC {} need check new ut'.format(f)) - if current_system != "Windows": - check_added_ut = True + check_added_ut = True elif self.is_only_comment(f): ut_list.append('nomap_comment_placeholder') else: diff --git a/tools/windows/get_prec_ut_list.py b/tools/windows/get_prec_ut_list.py new file mode 100644 index 00000000000..ce5b2b6d205 --- /dev/null +++ b/tools/windows/get_prec_ut_list.py @@ -0,0 +1,42 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
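For orientation, the helper defined in the new file below narrows the full Windows unit-test list down to the cases flagged for precision testing by get_pr_ut.py; a hedged usage sketch (the direct import is illustrative, in CI run_unittests.sh invokes the script as a command with two newline-separated strings):

# Assumes the new module below is importable from the current directory.
from get_prec_ut_list import get_prec_ut_list

all_cases = "test_a\ntest_b\ntest_c"
prec_cases = "test_b"
get_prec_ut_list(all_cases, prec_cases)
# prints the skip notice for test_a and test_c, then prints test_prec_ut and test_b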
+"""To get a list of prec ut """ + +import sys + + +def get_prec_ut_list(all_test_cases, prec_test_cases): + """Select the ut that needs to be executed""" + all_test_cases_list = all_test_cases.strip().split("\n") + prec_test_cases_list = prec_test_cases.strip().split("\n") + all_test_cases_list_new = [item.rstrip() for item in all_test_cases_list] + prec_test_cases_list_new = [item.rstrip() for item in prec_test_cases_list] + + if len(prec_test_cases) == 0: + return "\n".join(all_test_cases_list) + + case_to_run = ['test_prec_ut'] + for case in all_test_cases_list_new: + if case in prec_test_cases_list_new: + case_to_run.append(case) + else: + print("{} will not run in PRECISION_TEST mode.".format(case)) + for case in case_to_run: + print(case) + + +if __name__ == '__main__': + all_test_cases = sys.argv[1] + prec_test_cases = sys.argv[2] + get_prec_ut_list(all_test_cases, prec_test_cases) diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 8d52c1b84ae..c05907fb899 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -224,29 +224,16 @@ if [ ${WITH_GPU:-OFF} == "ON" ];then if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python ${PADDLE_ROOT}/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then - set +x echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list` - set -x fi fi set +e if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then - UT_list_prec='' - re=$(cat ut_list|awk -F ' ' '{print }' | awk 'BEGIN{ all_str=""}{if (all_str==""){all_str=$1}else{all_str=all_str"$|^"$1}} END{print "^"all_str"$"}') - for case in $UT_list; do - flag=$(echo $case|grep -oE $re) - if [ -n "$flag" ];then - if [ -z "$UT_list_prec" ];then - UT_list_prec=$case - else - UT_list_prec=$UT_list_prec'\n'$case - fi - else - echo $case "won't run in PRECISION_TEST mode." 
- fi - done + UT_list_res=$(python ${PADDLE_ROOT}/tools/windows/get_prec_ut_list.py "$UT_list" "$precision_cases" ) + UT_list_prec=$(echo "${UT_list_res}" | grep -v 'PRECISION_TEST') + UT_list_prec_info=$(echo "${UT_list_res}" | grep 'PRECISION_TEST') UT_list=$UT_list_prec fi set -e -- GitLab From 4b5cb22fb9d3aa034866acfdc54bd1843684c228 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Mon, 12 Apr 2021 17:12:39 +0800 Subject: [PATCH 197/486] [Rocm] fix python test of multinomial (#32158) * [Rocm] fix python test of multinomial * [Rocm] fix python test of multinomial * [Rocm] fix python test of multinomial * [Rocm] fix python test of multinomial --- python/paddle/tensor/random.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index ba7ca417382..5aca87c1507 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -135,6 +135,9 @@ def multinomial(x, num_samples=1, replacement=False, name=None): """ + assert core.is_compiled_with_rocm() == False, ( + "multinomial op is not supported on ROCM yet.") + if in_dygraph_mode(): return core.ops.multinomial(x, 'num_samples', num_samples, 'replacement', replacement) -- GitLab From 0624ea568b1067eb6dc4139c85f0778149f526fe Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 12 Apr 2021 21:19:28 +0800 Subject: [PATCH 198/486] polish custom api content for performence (#32209) --- .../utils/cpp_extension/extension_utils.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 06596c0fae8..30ff3f81ca7 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -793,22 +793,29 @@ def _custom_api_content(op_name): params_str, ins_str, attrs_str, outs_str = _get_api_inputs_str(op_name) API_TEMPLATE = textwrap.dedent(""" + from paddle.fluid.core import VarBase + from paddle.fluid.framework import in_dygraph_mode, _dygraph_tracer from paddle.fluid.layer_helper import LayerHelper def {op_name}({inputs}): - helper = LayerHelper("{op_name}", **locals()) - # prepare inputs and outputs ins = {ins} attrs = {attrs} outs = {{}} out_names = {out_names} - for out_name in out_names: - # Set 'float32' temporarily, and the actual dtype of output variable will be inferred - # in runtime. - outs[out_name] = helper.create_variable(dtype='float32') - helper.append_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) + # The output variable's dtype use default value 'float32', + # and the actual dtype of output variable will be inferred in runtime. + if in_dygraph_mode(): + for out_name in out_names: + outs[out_name] = VarBase() + _dygraph_tracer().trace_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) + else: + helper = LayerHelper("{op_name}", **locals()) + for out_name in out_names: + outs[out_name] = helper.create_variable(dtype='float32') + + helper.append_op(type="{op_name}", inputs=ins, outputs=outs, attrs=attrs) res = [outs[out_name] for out_name in out_names] -- GitLab From 4a09c1a1ed8456865395627c6d37f75c657c1030 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ren=20Wei=20=28=E4=BB=BB=E5=8D=AB=29?= Date: Tue, 13 Apr 2021 10:28:09 +0800 Subject: [PATCH 199/486] run the sample codes added by `add_sample_code` in ops.py (#31863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * skip paddle.Tensor. 
* some file may not exists. such as version.py, it's generated by setup.py * debug mode * add unittests for sampcd_processor.py * add test cases for sampcd_processor * add test cases for sampcd_processor * add testcases * add test cases * add testcases * add testcases * refactor, add testcases * add import * all files map to pool. dont split manually * __all__ += another list * add testcases * add testcases * handle个锤子啊 * this line should not removed https://github.com/wadefelix/Paddle/commit/882e7f7c3be6c2415f58550f82be338b84f0c0ef#diff-cb0679475bf60202fd803ae05b9146989437c3f787d1502616be6c71c69d0fb1 * print -> logger * regulate the logging infomation * regulate the logging infomation * logger to file * logger * threads or subprocesses number config * follow the good code style don't touch wlist.json * run test_sampcd_processor.py, it's a unittest for sampcd_processor.py * update unittest for sampcd_processor.py test=document_fix --- tools/check_file_diff_approvals.sh | 10 + tools/sampcd_processor.py | 383 +++++++++++++++++-------- tools/test_sampcd_processor.py | 439 +++++++++++++++++++++++++++++ 3 files changed, 713 insertions(+), 119 deletions(-) create mode 100644 tools/test_sampcd_processor.py diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index f3bf3ea508b..05466883e58 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -53,6 +53,7 @@ API_FILES=("CMakeLists.txt" "python/paddle/fluid/tests/unittests/white_list/check_op_sequence_batch_1_input_white_list.py" "python/paddle/fluid/tests/unittests/white_list/no_grad_set_white_list.py" "tools/wlist.json" + "tools/sampcd_processor.py" "paddle/scripts/paddle_build.bat" "tools/windows/run_unittests.sh" "tools/parallel_UT_rule.py" @@ -79,6 +80,12 @@ function add_failed(){ echo_list="${echo_list[@]}$1" } +function run_test_sampcd_processor() { + CUR_PWD=$(pwd) + cd ${PADDLE_ROOT}/tools + python test_sampcd_processor.py + cd ${CUR_PWD} +} if [[ $git_files -gt 19 || $git_count -gt 999 ]];then echo_line="You must have Dianhai approval for change 20+ files or add than 1000+ lines of content.\n" @@ -136,6 +143,9 @@ for API_FILE in ${API_FILES[*]}; do elif [ "${API_FILE}" == "tools/wlist.json" ];then echo_line="You must have one TPM (jzhang533) approval for the api whitelist for the tools/wlist.json.\n" check_approval 1 29231 + elif [ "${API_FILE}" == "tools/sampcd_processor.py" ];then + echo_line="test_sampcd_processor.py will be executed for changed sampcd_processor.py.\n" + run_test_sampcd_processor elif [ "${API_FILE}" == "python/paddle/distributed/fleet/__init__.py" ]; then echo_line="You must have (fuyinno4 (Recommend), raindrops2sea) approval for ${API_FILE} changes" check_approval 1 35824027 38231817 diff --git a/tools/sampcd_processor.py b/tools/sampcd_processor.py index ce0490d487f..fde01329340 100644 --- a/tools/sampcd_processor.py +++ b/tools/sampcd_processor.py @@ -22,6 +22,10 @@ import inspect import paddle import paddle.fluid import json +import argparse +import shutil +import re +import logging """ please make sure to run in the tools path usage: python sample_test.py {arg1} @@ -33,6 +37,26 @@ for example, you can run cpu version python2 testing like this: """ +logger = logging.getLogger() +if logger.handlers: + console = logger.handlers[ + 0] # we assume the first handler is the one we want to configure +else: + console = logging.StreamHandler() + logger.addHandler(console) +console.setFormatter( + logging.Formatter( + "%(asctime)s - %(funcName)s:%(lineno)d 
- %(levelname)s - %(message)s")) + +RUN_ON_DEVICE = 'cpu' +GPU_ID = 0 +methods = [] +whl_error = [] +API_DEV_SPEC_FN = 'paddle/fluid/API_DEV.spec' +API_PR_SPEC_FN = 'paddle/fluid/API_PR.spec' +API_DIFF_SPEC_FN = 'dev_pr_diff_api.spec' +SAMPLECODE_TEMPDIR = 'samplecode_temp' + def find_all(srcstr, substr): """ @@ -98,9 +122,13 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): Returns: result: True or False + name(str): the name of the API. + msg(str): messages """ + global GPU_ID, RUN_ON_DEVICE, SAMPLECODE_TEMPDIR result = True + msg = None def sampcd_header_print(name, sampcd, htype, hname): """ @@ -113,7 +141,8 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): hname(str): the name of the hint banners , e.t. def hname. flushed. """ - print_header(htype, hname) + print(htype, " name:", hname) + print("-----------------------") print("Sample code ", str(y), " extracted for ", name, " :") print(sampcd) print("----example code check----\n") @@ -122,11 +151,9 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): sampcd_begins = find_all(srccom, " code-block:: python") if len(sampcd_begins) == 0: - print_header(htype, hname) - ''' - detect sample codes using >>> to format - and consider this situation as wrong - ''' + # detect sample codes using >>> to format and consider this situation as wrong + print(htype, " name:", hname) + print("-----------------------") if srccom.find("Examples:") != -1: print("----example code check----\n") if srccom.find(">>>") != -1: @@ -164,23 +191,22 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): sampcd_to_write.append(cdline[min_indent:]) sampcd = '\n'.join(sampcd_to_write) - if sys.argv[1] == "cpu": - sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd - if sys.argv[1] == "gpu": - sampcd = '\nimport os\n' + 'os.environ["CUDA_VISIBLE_DEVICES"] = "0"\n' + sampcd + if RUN_ON_DEVICE == "cpu": + sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = ""\n' + sampcd + if RUN_ON_DEVICE == "gpu": + sampcd = '\nimport os\nos.environ["CUDA_VISIBLE_DEVICES"] = "{}"\n'.format( + GPU_ID) + sampcd sampcd += '\nprint(' + '\"' + name + ' sample code is executed successfully!\")' - if len(sampcd_begins) > 1: - tfname = name + "_example_" + str(y) + ".py" - else: - tfname = name + "_example" + ".py" - tempf = open("samplecode_temp/" + tfname, 'w') - tempf.write(sampcd) - tempf.close() + tfname = os.path.join(SAMPLECODE_TEMPDIR, '{}_example{}'.format( + name, '.py' if len(sampcd_begins) == 1 else '_{}.py'.format(y))) + logging.info('running %s', tfname) + with open(tfname, 'w') as tempf: + tempf.write(sampcd) if platform.python_version()[0] == "2": - cmd = ["python", "samplecode_temp/" + tfname] + cmd = ["python", tfname] elif platform.python_version()[0] == "3": - cmd = ["python3", "samplecode_temp/" + tfname] + cmd = ["python3", tfname] else: print("Error: fail to parse python version!") result = False @@ -199,11 +225,12 @@ def sampcd_extract_and_run(srccom, name, htype="def", hname=""): print("Error Raised from Sample Code ", name, " :\n") print(err) print(msg) + logging.warning('%s error: %s', tfname, err) + logging.warning('%s msg: %s', tfname, msg) result = False # msg is the returned code execution report - #os.remove("samplecode_temp/" + tfname) - return result + return result, name, msg def single_defcom_extract(start_from, srcls, is_class_begin=False): @@ -264,12 +291,7 @@ def single_defcom_extract(start_from, srcls, is_class_begin=False): return fcombody -def 
print_header(htype, name): - print(htype, " name:", name) - print("-----------------------") - - -def srccoms_extract(srcfile, wlist): +def srccoms_extract(srcfile, wlist, methods): """ Given a source file ``srcfile``, this function will extract its API(doc comments) and run sample codes in the @@ -278,12 +300,15 @@ def srccoms_extract(srcfile, wlist): Args: srcfile(file): the source file wlist(list): white list + methods(list): only elements of this list considered. Returns: result: True or False + error_methods: the methods that failed. """ process_result = True + error_methods = [] srcc = srcfile.read() # 2. get defs and classes header line number # set file pointer to its beginning @@ -292,8 +317,8 @@ def srccoms_extract(srcfile, wlist): # 1. fetch__all__ list allidx = srcc.find("__all__") - srcfile_new = srcfile.name - srcfile_new = srcfile_new.replace('.py', '') + logger.debug('processing %s, methods: %s', srcfile.name, str(methods)) + srcfile_new, _ = os.path.splitext(srcfile.name) srcfile_list = srcfile_new.split('/') srcfile_str = '' for i in range(4, len(srcfile_list)): @@ -323,15 +348,27 @@ def srccoms_extract(srcfile, wlist): if '' in alllist: alllist.remove('') api_alllist_count = len(alllist) + logger.debug('found %d items: %s', api_alllist_count, str(alllist)) api_count = 0 handled = [] # get src contents in layers/ops.py if srcfile.name.find("ops.py") != -1: for i in range(0, len(srcls)): - if srcls[i].find("__doc__") != -1: - opname = srcls[i][:srcls[i].find("__doc__") - 1] + opname = None + opres = re.match(r"^(\w+)\.__doc__", srcls[i]) + if opres is not None: + opname = opres.group(1) + else: + opres = re.match( + r"^add_sample_code\(globals\(\)\[\"(\w+)\"\]", srcls[i]) + if opres is not None: + opname = opres.group(1) + if opname is not None: if opname in wlist: + logger.info('%s is in the whitelist, skip it.', opname) continue + else: + logger.debug('%s\'s docstring found.', opname) comstart = i for j in range(i, len(srcls)): if srcls[j].find("\"\"\"") != -1: @@ -341,51 +378,73 @@ def srccoms_extract(srcfile, wlist): opcom += srcls[j] if srcls[j].find("\"\"\"") != -1: break + result, _, _ = sampcd_extract_and_run(opcom, opname, "def", + opname) + if not result: + error_methods.append(opname) + process_result = False api_count += 1 handled.append( opname) # ops.py also has normal formatted functions # use list 'handled' to mark the functions have been handled here # which will be ignored in the following step + # handled what? 
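The two regular expressions above are the new entry points for ops.py, where operator docstrings are assigned to generated functions rather than written inline; a small, self-contained check with made-up sample lines (the operator name is illustrative, not quoted from ops.py):

import re

line_a = 'abs.__doc__ = r"""..."""'
line_b = 'add_sample_code(globals()["abs"], r"""...""")'

print(re.match(r"^(\w+)\.__doc__", line_a).group(1))                             # abs
print(re.match(r"^add_sample_code\(globals\(\)\[\"(\w+)\"\]", line_b).group(1))  # abs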
+ logger.debug('%s already handled.', str(handled)) for i in range(0, len(srcls)): if srcls[i].startswith( 'def '): # a function header is detected in line i f_header = srcls[i].replace(" ", '') fn = f_header[len('def'):f_header.find('(')] # function name if "%s%s" % (srcfile_str, fn) not in methods: + logger.info( + '[file:%s, function:%s] not in methods list, skip it.', + srcfile_str, fn) continue if fn in handled: continue if fn in alllist: api_count += 1 if fn in wlist or fn + "@" + srcfile.name in wlist: + logger.info('[file:%s, function:%s] skip by wlist.', + srcfile_str, fn) continue fcombody = single_defcom_extract(i, srcls) if fcombody == "": # if no comment - print_header("def", fn) + print("def name:", fn) + print("-----------------------") print("WARNING: no comments in function ", fn, ", but it deserves.") continue else: - if not sampcd_extract_and_run(fcombody, fn, "def", fn): + result, _, _ = sampcd_extract_and_run(fcombody, fn, + "def", fn) + if not result: + error_methods.append(fn) process_result = False if srcls[i].startswith('class '): c_header = srcls[i].replace(" ", '') cn = c_header[len('class'):c_header.find('(')] # class name if '%s%s' % (srcfile_str, cn) not in methods: + logger.info( + '[file:%s, class:%s] not in methods list, skip it.', + srcfile_str, cn) continue if cn in handled: continue if cn in alllist: api_count += 1 if cn in wlist or cn + "@" + srcfile.name in wlist: + logger.info('[file:%s, class:%s] skip by wlist.', + srcfile_str, cn) continue # class comment classcom = single_defcom_extract(i, srcls, True) if classcom != "": - if not sampcd_extract_and_run(classcom, cn, "class", - cn): - + result, _, _ = sampcd_extract_and_run(classcom, cn, + "class", cn) + if not result: + error_methods.append(cn) process_result = False else: print("WARNING: no comments in class itself ", cn, @@ -410,10 +469,19 @@ def srccoms_extract(srcfile, wlist): if '%s%s' % ( srcfile_str, name ) not in methods: # class method not in api.spec + logger.info( + '[file:%s, func:%s] not in methods, skip it.', + srcfile_str, name) continue if mn.startswith('_'): + logger.info( + '[file:%s, func:%s] startswith _, it\'s private method, skip it.', + srcfile_str, name) continue if name in wlist or name + "@" + srcfile.name in wlist: + logger.info( + '[file:%s, class:%s] skip by wlist.', + srcfile_str, name) continue thismethod = [thisl[indent:] ] # method body lines @@ -434,22 +502,38 @@ def srccoms_extract(srcfile, wlist): thismtdcom = single_defcom_extract(0, thismethod) if thismtdcom != "": - if not sampcd_extract_and_run( - thismtdcom, name, "method", name): + result, _, _ = sampcd_extract_and_run( + thismtdcom, name, "method", name) + if not result: + error_methods.append(name) process_result = False + else: + logger.warning('__all__ not found in file:%s', srcfile.name) - return process_result + return process_result, error_methods def test(file_list): + global methods # readonly process_result = True for file in file_list: with open(file, 'r') as src: - if not srccoms_extract(src, wlist): + if not srccoms_extract(src, wlist, methods): process_result = False return process_result +def run_a_test(tc_filename): + """ + execute a sample code-block. + """ + global methods # readonly + process_result = True + with open(tc_filename, 'r') as src: + process_result, error_methods = srccoms_extract(src, wlist, methods) + return process_result, tc_filename, error_methods + + def get_filenames(): ''' this function will get the modules that pending for check. 
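    # In outline (the body continues in the next hunk): get_filenames() reads the API
    # names that get_incrementapi() wrote to dev_pr_diff_api.spec, resolves each one to
    # its module (names that cannot be resolved go into whl_error), turns the module
    # path into a ../python/<package>/<module>.py filename, and records the dotted
    # method names in the global `methods` list that srccoms_extract() later matches
    # against.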
@@ -460,12 +544,12 @@ def get_filenames(): ''' filenames = [] - global methods + global methods # write global whl_error methods = [] whl_error = [] get_incrementapi() - API_spec = 'dev_pr_diff_api.spec' + API_spec = API_DIFF_SPEC_FN with open(API_spec) as f: for line in f.readlines(): api = line.replace('\n', '') @@ -474,17 +558,30 @@ def get_filenames(): except AttributeError: whl_error.append(api) continue + except SyntaxError: + logger.warning('line:%s, api:%s', line, api) + # paddle.Tensor. + continue if len(module.split('.')) > 1: filename = '../python/' + # work for .so? module_py = '%s.py' % module.split('.')[-1] for i in range(0, len(module.split('.')) - 1): filename = filename + '%s/' % module.split('.')[i] filename = filename + module_py else: filename = '' - print("\nWARNING:----Exception in get api filename----\n") - print("\n" + api + ' module is ' + module + "\n") - if filename != '' and filename not in filenames: + logger.warning("WARNING: Exception in getting api:%s module:%s", + api, module) + if filename in filenames: + continue + elif not filename: + logger.warning('filename invalid: %s', line) + continue + elif not os.path.exists(filename): + logger.warning('file not exists: %s', filename) + continue + else: filenames.append(filename) # get all methods method = '' @@ -496,9 +593,9 @@ def get_filenames(): name = '%s.%s' % (api.split('.')[-2], api.split('.')[-1]) else: name = '' - print("\nWARNING:----Exception in get api methods----\n") - print("\n" + line + "\n") - print("\n" + api + ' method is None!!!' + "\n") + logger.warning( + "WARNING: Exception when getting api:%s, line:%s", api, + line) for j in range(2, len(module.split('.'))): method = method + '%s.' % module.split('.')[j] method = method + name @@ -508,26 +605,27 @@ def get_filenames(): return filenames +def get_api_md5(path): + api_md5 = {} + API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")), + path) + with open(API_spec) as f: + for line in f.readlines(): + api = line.split(' ', 1)[0] + md5 = line.split("'document', ")[1].replace(')', '').replace('\n', + '') + api_md5[api] = md5 + return api_md5 + + def get_incrementapi(): ''' this function will get the apis that difference between API_DEV.spec and API_PR.spec. ''' - - def get_api_md5(path): - api_md5 = {} - API_spec = '%s/%s' % (os.path.abspath(os.path.join(os.getcwd(), "..")), - path) - with open(API_spec) as f: - for line in f.readlines(): - api = line.split(' ', 1)[0] - md5 = line.split("'document', ")[1].replace(')', '').replace( - '\n', '') - api_md5[api] = md5 - return api_md5 - - dev_api = get_api_md5('paddle/fluid/API_DEV.spec') - pr_api = get_api_md5('paddle/fluid/API_PR.spec') - with open('dev_pr_diff_api.spec', 'w') as f: + global API_DEV_SPEC_FN, API_PR_SPEC_FN, API_DIFF_SPEC_FN ## readonly + dev_api = get_api_md5(API_DEV_SPEC_FN) + pr_api = get_api_md5(API_PR_SPEC_FN) + with open(API_DIFF_SPEC_FN, 'w') as f: for key in pr_api: if key in dev_api: if dev_api[key] != pr_api[key]: @@ -538,7 +636,7 @@ def get_incrementapi(): f.write('\n') -def get_wlist(): +def get_wlist(fn="wlist.json"): ''' this function will get the white list of API. 
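    # A sketch of the wlist.json layout this function expects, inferred from the
    # fixture in test_sampcd_processor.py further below (other fields are not
    # guaranteed):
    #   {
    #     "wlist_dir":      [{"name": "<dir>", "annotation": ""}, ...],   # -> wlist_file
    #     "wlist_api":      [{"name": "<api>", "annotation": ""}, ...],   # -> wlist
    #     "wlist_temp_api": ["<api name>", ...],                          # -> wlist
    #     "gpu_not_white":  ["<op name>", ...]                            # -> gpu_not_white
    #   }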
@@ -551,7 +649,7 @@ def get_wlist(): wlist_file = [] # only white on CPU gpu_not_white = [] - with open("wlist.json", 'r') as load_f: + with open(fn, 'r') as load_f: load_dict = json.load(load_f) for key in load_dict: if key == 'wlist_dir': @@ -567,31 +665,77 @@ def get_wlist(): return wlist, wlist_file, gpu_not_white -wlist, wlist_file, gpu_not_white = get_wlist() +arguments = [ + # flags, dest, type, default, help + ['--gpu_id', 'gpu_id', int, 0, 'GPU device id to use [0]'], + ['--logf', 'logf', str, None, 'file for logging'], + ['--threads', 'threads', int, 0, 'sub processes number'], +] -if len(sys.argv) < 2: - print("Error: inadequate number of arguments") - print('''If you are going to run it on - "CPU: >>> python sampcd_processor.py cpu - "GPU: >>> python sampcd_processor.py gpu - ''') - sys.exit("lack arguments") -else: - if sys.argv[1] == "gpu": + +def parse_args(): + """ + Parse input arguments + """ + global arguments + parser = argparse.ArgumentParser(description='run Sample Code Test') + # parser.add_argument('--cpu', dest='cpu_mode', action="store_true", + # help='Use CPU mode (overrides --gpu)') + # parser.add_argument('--gpu', dest='gpu_mode', action="store_true") + parser.add_argument('--debug', dest='debug', action="store_true") + parser.add_argument('mode', type=str, help='run on device', default='cpu') + for item in arguments: + parser.add_argument( + item[0], dest=item[1], help=item[4], type=item[2], default=item[3]) + + if len(sys.argv) == 1: + args = parser.parse_args(['cpu']) + return args + # parser.print_help() + # sys.exit(1) + + args = parser.parse_args() + return args + + +if __name__ == '__main__': + args = parse_args() + if args.debug: + logger.setLevel(logging.DEBUG) + if args.logf: + logfHandler = logging.FileHandler(args.logf) + logfHandler.setFormatter( + logging.Formatter( + "%(asctime)s - %(funcName)s:%(lineno)d - %(levelname)s - %(message)s" + )) + logger.addHandler(logfHandler) + + wlist, wlist_file, gpu_not_white = get_wlist() + + if args.mode == "gpu": + GPU_ID = args.gpu_id + logger.info("using GPU_ID %d", GPU_ID) for _gnw in gpu_not_white: wlist.remove(_gnw) - elif sys.argv[1] != "cpu": - print("Unrecognized argument:'", sys.argv[1], "' , 'cpu' or 'gpu' is ", - "desired\n") + elif args.mode != "cpu": + logger.error("Unrecognized argument:%s, 'cpu' or 'gpu' is desired.", + args.mode) sys.exit("Invalid arguments") - print("API check -- Example Code") - print("sample_test running under python", platform.python_version()) - if not os.path.isdir("./samplecode_temp"): - os.mkdir("./samplecode_temp") - cpus = multiprocessing.cpu_count() + RUN_ON_DEVICE = args.mode + logger.info("API check -- Example Code") + logger.info("sample_test running under python %s", + platform.python_version()) + + if os.path.exists(SAMPLECODE_TEMPDIR): + if not os.path.isdir(SAMPLECODE_TEMPDIR): + os.remove(SAMPLECODE_TEMPDIR) + os.mkdir(SAMPLECODE_TEMPDIR) + else: + os.mkdir(SAMPLECODE_TEMPDIR) + filenames = get_filenames() if len(filenames) == 0 and len(whl_error) == 0: - print("-----API_PR.spec is the same as API_DEV.spec-----") + logger.info("-----API_PR.spec is the same as API_DEV.spec-----") exit(0) rm_file = [] for f in filenames: @@ -600,51 +744,52 @@ else: rm_file.append(f) filenames.remove(f) if len(rm_file) != 0: - print("REMOVE white files: %s" % rm_file) - print("API_PR is diff from API_DEV: %s" % filenames) - one_part_filenum = int(math.ceil(len(filenames) / cpus)) - if one_part_filenum == 0: - one_part_filenum = 1 - divided_file_list = [ - filenames[i:i + 
one_part_filenum] - for i in range(0, len(filenames), one_part_filenum) - ] - - po = multiprocessing.Pool() - results = po.map_async(test, divided_file_list) + logger.info("REMOVE white files: %s", rm_file) + logger.info("API_PR is diff from API_DEV: %s", filenames) + + threads = multiprocessing.cpu_count() + if args.threads: + threads = args.threads + po = multiprocessing.Pool(threads) + # results = po.map_async(test, divided_file_list) + results = po.map_async(run_a_test, filenames) po.close() po.join() result = results.get() # delete temp files - for root, dirs, files in os.walk("./samplecode_temp"): - for fntemp in files: - os.remove("./samplecode_temp/" + fntemp) - os.rmdir("./samplecode_temp") + if not args.debug: + shutil.rmtree(SAMPLECODE_TEMPDIR) - print("----------------End of the Check--------------------") + logger.info("----------------End of the Check--------------------") if len(whl_error) != 0: - print("%s is not in whl." % whl_error) - print("") - print("Please check the whl package and API_PR.spec!") - print("You can follow these steps in order to generate API.spec:") - print("1. cd ${paddle_path}, compile paddle;") - print("2. pip install build/python/dist/(build whl package);") - print( + logger.info("%s is not in whl.", whl_error) + logger.info("") + logger.info("Please check the whl package and API_PR.spec!") + logger.info("You can follow these steps in order to generate API.spec:") + logger.info("1. cd ${paddle_path}, compile paddle;") + logger.info("2. pip install build/python/dist/(build whl package);") + logger.info( "3. run 'python tools/print_signatures.py paddle > paddle/fluid/API.spec'." ) for temp in result: - if not temp: - print("") - print("In addition, mistakes found in sample codes.") - print("Please check sample codes.") - print("----------------------------------------------------") + if not temp[0]: + logger.info("In addition, mistakes found in sample codes: %s", + temp[1]) + logger.info("error_methods: %s", str(temp[2])) + logger.info("----------------------------------------------------") exit(1) else: + has_error = False for temp in result: - if not temp: - print("Mistakes found in sample codes.") - print("Please check sample codes.") - exit(1) - print("Sample code check is successful!") + if not temp[0]: + logger.info("In addition, mistakes found in sample codes: %s", + temp[1]) + logger.info("error_methods: %s", str(temp[2])) + has_error = True + if has_error: + logger.info("Mistakes found in sample codes.") + logger.info("Please check sample codes.") + exit(1) + logger.info("Sample code check is successful!") diff --git a/tools/test_sampcd_processor.py b/tools/test_sampcd_processor.py new file mode 100644 index 00000000000..d8f47d1af58 --- /dev/null +++ b/tools/test_sampcd_processor.py @@ -0,0 +1,439 @@ +#! python + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
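# The tests below exercise the extraction helpers imported from sampcd_processor.py
# against small temporary fixtures; given the relative paths they build (for example
# paddle/fluid/API_PR.spec one level up, and samplecode_temp/), they are presumably
# meant to be run from the tools/ directory, e.g. `python -m unittest test_sampcd_processor`.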
+ +import unittest +import os +import tempfile +import shutil +import sys +import importlib +from sampcd_processor import find_all +from sampcd_processor import check_indent +from sampcd_processor import sampcd_extract_and_run +from sampcd_processor import single_defcom_extract +from sampcd_processor import srccoms_extract +from sampcd_processor import get_api_md5 +from sampcd_processor import get_incrementapi +from sampcd_processor import get_wlist + + +class Test_find_all(unittest.TestCase): + def test_find_none(self): + self.assertEqual(0, len(find_all('hello', 'world'))) + + def test_find_one(self): + self.assertListEqual([0], find_all('hello', 'hello')) + + def test_find_two(self): + self.assertListEqual([1, 15], + find_all(' hello, world; hello paddle!', 'hello')) + + +class Test_check_indent(unittest.TestCase): + def test_no_indent(self): + self.assertEqual(0, check_indent('hello paddle')) + + def test_indent_4_spaces(self): + self.assertEqual(4, check_indent(' hello paddle')) + + def test_indent_1_tab(self): + self.assertEqual(4, check_indent("\thello paddle")) + + +class Test_sampcd_extract_and_run(unittest.TestCase): + def setUp(self): + if not os.path.exists('samplecode_temp/'): + os.mkdir('samplecode_temp/') + + def test_run_a_defs_samplecode(self): + comments = """ + Examples: + .. code-block:: python + print(1+1) + """ + funcname = 'one_plus_one' + res, name, msg = sampcd_extract_and_run(comments, funcname) + self.assertTrue(res) + self.assertEqual(funcname, name) + + def test_run_a_def_no_code(self): + comments = """ + placeholder + """ + funcname = 'one_plus_one' + res, name, msg = sampcd_extract_and_run(comments, funcname) + self.assertFalse(res) + self.assertEqual(funcname, name) + + def test_run_a_def_raise_expection(self): + comments = """ + placeholder + Examples: + .. code-block:: python + print(1/0) + """ + funcname = 'one_plus_one' + res, name, msg = sampcd_extract_and_run(comments, funcname) + self.assertFalse(res) + self.assertEqual(funcname, name) + + +class Test_single_defcom_extract(unittest.TestCase): + def test_extract_from_func(self): + defstr = ''' +import os +def foo(): + """ + foo is a function. + """ + pass +def bar(): + pass +''' + comm = single_defcom_extract( + 2, defstr.splitlines(True), is_class_begin=False) + self.assertEqual(" foo is a function.\n", comm) + pass + + def test_extract_from_func_with_no_docstring(self): + defstr = ''' +import os +def bar(): + pass +''' + comm = single_defcom_extract( + 2, defstr.splitlines(True), is_class_begin=False) + self.assertEqual('', comm) + pass + + def test_extract_from_class(self): + defstr = r''' +import os +class Foo(): + """ + Foo is a class. + second line. + """ + pass + def bar(): + pass +def foo(): + pass +''' + comm = single_defcom_extract( + 2, defstr.splitlines(True), is_class_begin=True) + rcomm = """ Foo is a class. + second line. 
+""" + self.assertEqual(rcomm, comm) + pass + + def test_extract_from_class_with_no_docstring(self): + defstr = ''' +import os +class Foo(): + pass + def bar(): + pass +def foo(): + pass +''' + comm = single_defcom_extract( + 0, defstr.splitlines(True), is_class_begin=True) + self.assertEqual('', comm) + + +class Test_get_api_md5(unittest.TestCase): + def setUp(self): + self.api_pr_spec_filename = os.path.abspath( + os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec')) + with open(self.api_pr_spec_filename, 'w') as f: + f.write("\n".join([ + """one_plus_one (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of one_plus_one'))""", + """two_plus_two (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of two_plus_two'))""", + """three_plus_three (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of three_plus_three'))""", + """four_plus_four (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of four_plus_four'))""", + ])) + + def tearDown(self): + os.remove(self.api_pr_spec_filename) + pass + + def test_get_api_md5(self): + res = get_api_md5('paddle/fluid/API_PR.spec') + self.assertEqual("'md5sum of one_plus_one'", res['one_plus_one']) + self.assertEqual("'md5sum of two_plus_two'", res['two_plus_two']) + self.assertEqual("'md5sum of three_plus_three'", + res['three_plus_three']) + self.assertEqual("'md5sum of four_plus_four'", res['four_plus_four']) + + +class Test_get_incrementapi(unittest.TestCase): + def setUp(self): + self.api_pr_spec_filename = os.path.abspath( + os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec')) + with open(self.api_pr_spec_filename, 'w') as f: + f.write("\n".join([ + """one_plus_one (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of one_plus_one'))""", + """two_plus_two (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of two_plus_two'))""", + """three_plus_three (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of three_plus_three'))""", + """four_plus_four (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of four_plus_four'))""", + ])) + self.api_dev_spec_filename = os.path.abspath( + os.path.join(os.getcwd(), "..", 'paddle/fluid/API_DEV.spec')) + with open(self.api_dev_spec_filename, 'w') as f: + f.write("\n".join([ + """one_plus_one (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', 'md5sum of one_plus_one'))""", + ])) + self.api_diff_spec_filename = os.path.abspath( + os.path.join(os.getcwd(), "dev_pr_diff_api.spec")) + + def tearDown(self): + os.remove(self.api_pr_spec_filename) + os.remove(self.api_dev_spec_filename) + os.remove(self.api_diff_spec_filename) + + def test_it(self): + get_incrementapi() + with open(self.api_diff_spec_filename, 'r') as f: + lines = f.readlines() + self.assertCountEqual( + ["two_plus_two\n", "three_plus_three\n", "four_plus_four\n"], + lines) + + +class Test_get_wlist(unittest.TestCase): + def setUp(self): + self.tmpDir = tempfile.mkdtemp() + self.wlist_filename = os.path.join(self.tmpDir, 'wlist.json') + with open(self.wlist_filename, 'w') as f: + f.write(r''' +{ + "wlist_dir":[ + { + "name":"../python/paddle/fluid/contrib", + "annotation":"" + }, + { + "name":"../python/paddle/verison.py", + "annotation":"" + } + ], + "wlist_api":[ + { + "name":"xxxxx", + "annotation":"not a real api, just for example" + } + ], + "wlist_temp_api":[ + 
"to_tensor", + "save_persistables@dygraph/checkpoint.py" + ], + "gpu_not_white":[ + "deformable_conv" + ] +} +''') + + def tearDown(self): + os.remove(self.wlist_filename) + shutil.rmtree(self.tmpDir) + + def test_get_wlist(self): + wlist, wlist_file, gpu_not_white = get_wlist(self.wlist_filename) + self.assertCountEqual( + ["xxxxx", "to_tensor", + "save_persistables@dygraph/checkpoint.py"], wlist) + self.assertCountEqual([ + "../python/paddle/fluid/contrib", + "../python/paddle/verison.py", + ], wlist_file) + self.assertCountEqual(["deformable_conv"], gpu_not_white) + + +class Test_srccoms_extract(unittest.TestCase): + def setUp(self): + self.tmpDir = tempfile.mkdtemp() + sys.path.append(self.tmpDir) + self.api_pr_spec_filename = os.path.abspath( + os.path.join(os.getcwd(), "..", 'paddle/fluid/API_PR.spec')) + with open(self.api_pr_spec_filename, 'w') as f: + f.write("\n".join([ + """one_plus_one (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "one_plus_one"))""", + """two_plus_two (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "two_plus_two"))""", + """three_plus_three (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "three_plus_three"))""", + """four_plus_four (ArgSpec(args=[], varargs=None, keywords=None, defaults=(,)), ('document', "four_plus_four"))""", + ])) + + def tearDown(self): + sys.path.remove(self.tmpDir) + shutil.rmtree(self.tmpDir) + os.remove(self.api_pr_spec_filename) + + def test_from_ops_py(self): + filecont = ''' +def add_sample_code(obj, docstr): + pass + +__unary_func__ = [ + 'exp', +] + +__all__ = [] +__all__ += __unary_func__ +__all__ += ['one_plus_one'] + +def exp(): + pass +add_sample_code(globals()["exp"], r""" +Examples: + .. code-block:: python + import paddle + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = paddle.exp(x) + print(out) + # [0.67032005 0.81873075 1.10517092 1.34985881] +""") + +def one_plus_one(): + return 1+1 + +one_plus_one.__doc__ = """ + placeholder + + Examples: + .. code-block:: python + print(1+1) +""" + +__all__ += ['two_plus_two'] +def two_plus_two(): + return 2+2 +add_sample_code(globals()["two_plus_two"], """ + Examples: + .. code-block:: python + print(2+2) +""") +''' + pyfilename = os.path.join(self.tmpDir, 'ops.py') + with open(pyfilename, 'w') as pyfile: + pyfile.write(filecont) + self.assertTrue(os.path.exists(pyfilename)) + utsp = importlib.import_module('ops') + print('testing srccoms_extract from ops.py') + methods = ['one_plus_one', 'two_plus_two', 'exp'] + # os.remove("samplecode_temp/" "one_plus_one_example.py") + self.assertFalse( + os.path.exists("samplecode_temp/" + "one_plus_one_example.py")) + with open(pyfilename, 'r') as pyfile: + res, error_methods = srccoms_extract(pyfile, [], methods) + self.assertTrue(res) + self.assertTrue( + os.path.exists("samplecode_temp/" + "one_plus_one_example.py")) + os.remove("samplecode_temp/" "one_plus_one_example.py") + self.assertTrue( + os.path.exists("samplecode_temp/" + "two_plus_two_example.py")) + os.remove("samplecode_temp/" "two_plus_two_example.py") + self.assertTrue(os.path.exists("samplecode_temp/" "exp_example.py")) + os.remove("samplecode_temp/" "exp_example.py") + + def test_from_not_ops_py(self): + filecont = ''' +__all__ = [ + 'one_plus_one' +] + +def one_plus_one(): + """ + placeholder + + Examples: + .. 
code-block:: python + print(1+1) + """ + return 1+1 + +''' + pyfilename = os.path.join(self.tmpDir, 'opo.py') + with open(pyfilename, 'w') as pyfile: + pyfile.write(filecont) + utsp = importlib.import_module('opo') + methods = ['one_plus_one'] + with open(pyfilename, 'r') as pyfile: + res, error_methods = srccoms_extract(pyfile, [], methods) + self.assertTrue(res) + self.assertTrue( + os.path.exists("samplecode_temp/" + "one_plus_one_example.py")) + os.remove("samplecode_temp/" "one_plus_one_example.py") + + def test_with_empty_wlist(self): + """ + see test_from_ops_py + """ + pass + + def test_with_wlist(self): + filecont = ''' +__all__ = [ + 'four_plus_four', + 'three_plus_three' + ] + +def four_plus_four(): + """ + placeholder + + Examples: + .. code-block:: python + print(4+4) + """ + return 4+4 +def three_plus_three(): + """ + placeholder + + Examples: + .. code-block:: python + print(3+3) + """ + return 3+3 + +''' + pyfilename = os.path.join(self.tmpDir, 'three_and_four.py') + with open(pyfilename, 'w') as pyfile: + pyfile.write(filecont) + utsp = importlib.import_module('three_and_four') + methods = ['four_plus_four', 'three_plus_three'] + with open(pyfilename, 'r') as pyfile: + res, error_methods = srccoms_extract(pyfile, ['three_plus_three'], + methods) + self.assertTrue(res) + self.assertTrue( + os.path.exists("samplecode_temp/four_plus_four_example.py")) + os.remove("samplecode_temp/" "four_plus_four_example.py") + self.assertFalse( + os.path.exists("samplecode_temp/three_plus_three_example.py")) + + +# https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/ops.py +# why? unabled to use the ast module. emmmmm + +if __name__ == '__main__': + unittest.main() -- GitLab From fdf63b4e31ed6ef6026c8b17e6266c28ec41e3ad Mon Sep 17 00:00:00 2001 From: jiangcheng Date: Tue, 13 Apr 2021 10:33:03 +0800 Subject: [PATCH 200/486] optimize check_finite_and_unscale_op by fused kernel, test=develop (#31954) --- .../amp/check_finite_and_unscale_op.cu | 105 ++++++++++++++---- 1 file changed, 84 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 6840e4847c4..2c3a9c366e4 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -26,18 +26,48 @@ __global__ void InverseAndMemset(const T* s, T* o, bool* found_inf) { } template -__global__ void CheckFiniteAndUnscale(const T* in, const MT* scale, int num, - bool* found_inf, T* out) { - const int idx = threadIdx.x + blockIdx.x * blockDim.x; - - if (idx < num) { - MT val = static_cast(in[idx]) * (*scale); +__global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, + int64_t size, int64_t* starts, + bool* found_inf, T** outs) { + const int64_t tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t num = s_starts[size]; + int pre_xs_index = 0; + bool t_found_inf = false; + const MT t_scale = *scale; + for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { + // get the xs's index of thread + int xs_index = pre_xs_index; + while (idx < s_starts[xs_index]) xs_index++; + // avoid some tensor's numel is zero + while (idx >= s_starts[xs_index]) xs_index++; + pre_xs_index = xs_index - 1; + + // get in data and out 
data + const T* in = xs[pre_xs_index]; + T* out = outs[pre_xs_index]; + int64_t in_idx = idx - s_starts[pre_xs_index]; + + // Unscale + MT val = static_cast(in[in_idx]) * t_scale; T narrow_val = static_cast(val); - out[idx] = narrow_val; + out[in_idx] = narrow_val; + + // CheckFinite if (!isfinite(narrow_val)) { - *found_inf = true; + t_found_inf = true; } } + if (t_found_inf) { + *found_inf = true; + } } template @@ -63,20 +93,53 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { InverseAndMemset<<<1, 1, 0, dev_ctx.stream()>>>( scale_data, inverse_scale_v, found_inf_data); - for (size_t i = 0; i < xs.size(); ++i) { - const auto* x = xs[i]; - auto* out = outs[i]; - const T* x_data = x->data(); - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - - int num = x->numel(); - int block = 1024; - int grid = (num + block - 1) / block; - VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<>>( - x_data, inverse_scale_v, num, found_inf_data, out_data); - VLOG(3) << "finish kernel"; + size_t xs_size = xs.size(); + // calculate each tensor's start index and copy to device + auto h_starts_tensor = + memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); + + auto d_starts_tensor = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + + h_starts[0] = 0; + for (int i = 1; i <= xs_size; i++) { + // the start index value of each tensor is + // the sum of previous tensor's size + h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); + } + int64_t total_num = h_starts[xs_size]; + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, platform::CPUPlace(), h_starts, + (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + + // copy each tensor's data address to device + auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + const T** h_xs = reinterpret_cast(h_mem->ptr()); + T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; + + auto d_mem = memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + const T** d_xs = reinterpret_cast(d_mem->ptr()); + T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; + + for (size_t i = 0; i < xs_size; ++i) { + h_xs[i] = xs[i]->data(); + h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, + platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), + dev_ctx.stream()); + + // Launch Kernel + int block = 1024; + int block_num = block * 20; // each thread deal with 20 number + int grid = (total_num + block_num - 1) / block_num; + VLOG(3) << "launch kernel"; + CheckFiniteAndUnscale<<< + grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); + VLOG(3) << "finish kernel"; } }; } // namespace operators -- GitLab From 693c7629eb948116f9cdf433175f52f7c8951713 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Tue, 13 Apr 2021 12:09:41 +0800 Subject: [PATCH 201/486] [ROCM] fix depth conv2d in rocm, test=develop (#32170) --- paddle/fluid/operators/conv_cudnn_op.cu | 9 ++++++++- paddle/fluid/operators/math/depthwise_conv.cu | 3 +-- python/paddle/fluid/layers/nn.py | 4 ++++ python/paddle/fluid/tests/unittests/test_conv2d_op.py | 11 +++++++++++ python/paddle/nn/functional/conv.py | 9 ++++++++- python/paddle/nn/layer/conv.py | 11 +++++++---- 6 files changed, 39 insertions(+), 8 deletions(-) diff --git 
a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 39e9d37ddc6..ab535e341f7 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -1363,7 +1363,14 @@ REGISTER_OP_KERNEL( conv2d_grad_grad, CUDNN, plat::CUDAPlace, paddle::operators::CUDNNConvDoubleGradOpKernel, paddle::operators::CUDNNConvDoubleGradOpKernel); - +// ROCM has limit thread in depthwise_conv.cu and willl result in accuracy issue +// Use depthwise_conv2d in MIOPEN to resolve this issue +REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvOpKernel, + paddle::operators::CUDNNConvOpKernel); +REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNConvGradOpKernel, + paddle::operators::CUDNNConvGradOpKernel); REGISTER_OP_CUDA_KERNEL( depthwise_conv2d_grad_grad, paddle::operators::CUDNNConvDoubleGradOpKernel, diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 5fd543b5c6c..7c5f59fab0d 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -919,11 +919,10 @@ class DepthwiseConvFunctor Date: Tue, 13 Apr 2021 14:12:24 +0800 Subject: [PATCH 202/486] add layer.to api (#32040) * add layer.to api * add layer.to api * add layer.to api * add the doc for Layer.to * add input type checking * modify assert and import bug * format code style * format code style * make place support str type * add SetGradVarBase method to set the gradient after conversion * modify argument palce to device * modify argument palce to device * modify doc of layers.to API * add xpuplace to device argument --- paddle/fluid/imperative/layer.h | 4 + paddle/fluid/pybind/imperative.cc | 14 +++ python/paddle/device.py | 48 ++++---- python/paddle/fluid/dygraph/layers.py | 109 ++++++++++++++++++ .../fluid/tests/unittests/test_base_layer.py | 67 +++++++++++ 5 files changed, 220 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index f87db415768..e43921636d9 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -108,6 +108,10 @@ class VarBase { void ClearGradVarBase() { grad_var_ = nullptr; } + void SetGradVarBase(VarBase& grad_var) { + MutableGradVarBase()->CopyFrom(grad_var, true); + } + const std::shared_ptr& MutableGradVarBase() { if (grad_var_ == nullptr) { if (auto grad_var_wrapper = var_->GetGradVar()) { diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 4ab507fe367..68c6b855572 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -1032,6 +1032,10 @@ void BindImperative(py::module *m_ptr) { return std::shared_ptr(nullptr); }, py::return_value_policy::copy) + .def("_set_grad_ivar", + [](imperative::VarBase &self, imperative::VarBase &grad) { + self.SetGradVarBase(grad); + }) .def("_is_sparse", [](imperative::VarBase &self) { return self.Var().IsType(); @@ -1278,6 +1282,16 @@ void BindImperative(py::module *m_ptr) { return new_var; }, py::return_value_policy::copy) + .def("_copy_to", + [](const std::shared_ptr &self, + const platform::Place &place, bool blocking) { + auto new_var = self->NewVarBase(place, blocking); + if (!blocking) { + IncreaseVarbaseReferenceCountUntilCopyComplete(self, place); + } + return new_var; + }, + py::return_value_policy::copy) .def("value", [](imperative::VarBase &self) { return self.MutableVar(); }, 
py::return_value_policy::reference) .def_property("name", &imperative::VarBase::Name, diff --git a/python/paddle/device.py b/python/paddle/device.py index 20453998fb7..035d240e713 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -119,28 +119,7 @@ def get_cudnn_version(): return _cudnn_version -def set_device(device): - """ - Paddle supports running calculations on various types of devices, including CPU, GPU and XPU. - They are represented by string identifiers. This function can specify the global device - which the OP will run. - - Parameters: - device(str): This parameter determines the specific running device. - It can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the - index of the GPUs or XPUs. - - Examples: - - .. code-block:: python - - import paddle - - paddle.set_device("cpu") - x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') - x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') - data = paddle.stack([x1,x2], axis=1) - """ +def _convert_to_place(device): lower_device = device.lower() if lower_device == 'cpu': place = core.CPUPlace() @@ -183,7 +162,32 @@ def set_device(device): device_id = device_info_list[1] device_id = int(device_id) place = core.XPUPlace(device_id) + return place + +def set_device(device): + """ + Paddle supports running calculations on various types of devices, including CPU, GPU and XPU. + They are represented by string identifiers. This function can specify the global device + which the OP will run. + + Parameters: + device(str): This parameter determines the specific running device. + It can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. + + Examples: + + .. code-block:: python + + import paddle + + paddle.set_device("cpu") + x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') + x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') + data = paddle.stack([x1,x2], axis=1) + """ + place = _convert_to_place(device) framework._set_expected_place(place) return place diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 36637abc6d0..b4959764742 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -36,6 +36,7 @@ from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope from paddle.fluid.framework import in_dygraph_mode from paddle.fluid.framework import _current_expected_place as _get_device +from paddle.fluid.dygraph import no_grad import paddle.utils.deprecated as deprecated __all__ = ['Layer'] @@ -1343,6 +1344,114 @@ class Layer(core.Layer): for param, state in matched_param_state: _set_var(param, state) + def _apply(self, func, device, dtype, blocking): + for layer in self.children(): + layer._apply(func, device, dtype, blocking) + + for key, param in self._parameters.items(): + if param is not None: + with no_grad(): + param_applied = func(param, device, dtype, blocking) + assert param.is_leaf + param_applied.stop_gradient = param.stop_gradient + self._parameters[key] = param_applied + + if param.grad is not None: + with no_grad(): + grad_applied = func(param._grad_ivar(), device, dtype, + blocking) + + grad_applied.stop_gradient = param._grad_ivar( + ).stop_gradient + self._parameters[key]._set_grad_ivar(grad_applied) + + for key, buf in self._buffers.items(): + self._buffers[key] = func(buf, device, dtype, blocking) + + def to(self, device=None, dtype=None, blocking=None): + ''' + Cast the parameters and buffers of Layer by the give device, dtype and 
blocking. + + Parameters: + device(str|paddle.CPUPlace()|paddle.CUDAPlace()|paddle.CUDAPinnedPlace()|paddle.XPUPlace()|None, optional): The device of the Layer which want to be stored. + If None, the device is the same with the original Tensor. If device is string, it can be ``cpu``, ``gpu:x`` and ``xpu:x``, where ``x`` is the + index of the GPUs or XPUs. Default: None. + + dtype(str|core.VarDesc.VarType|None, optional): The type of the data. If None, the dtype is the same with the original Tensor. Default: None. + + blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be + asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + linear=paddle.nn.Linear(2, 2) + linear.weight + #Parameter containing: + #Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[-0.32770029, 0.38653070], + # [ 0.46030545, 0.08158520]]) + + linear.to(dtype='float64') + linear.weight + #Tenor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=False, + # [[-0.32770029, 0.38653070], + # [ 0.46030545, 0.08158520]]) + + linear.to(device='cpu') + linear.weight + #Tensor(shape=[2, 2], dtype=float64, place=CPUPlace, stop_gradient=False, + # [[-0.32770029, 0.38653070], + # [ 0.46030545, 0.08158520]]) + linear.to(device=paddle.CUDAPinnedPlace(), blocking=False) + linear.weight + #Tensor(shape=[2, 2], dtype=float64, place=CUDAPinnedPlace, stop_gradient=False, + # [[-0.04989364, -0.56889004], + # [ 0.33960250, 0.96878713]]) + + + ''' + + if device is None and dtype is None and blocking is None: + return + + if device is not None: + if isinstance(device, str): + device = paddle.device._convert_to_place(device) + elif isinstance(device, (core.CPUPlace, core.CUDAPlace, + core.CUDAPinnedPlace, core.XPUPlace)): + pass + else: + raise ValueError( + "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " + + type(device).__name__) + + if blocking is None: + blocking = True + else: + assert isinstance( + blocking, + bool), "blocking value error, must be the True, False or None" + + def transform(t, device, dtype, blocking): + if device is None: + device = t.place + if dtype is None: + dtype = t.dtype + + new_t = t._copy_to(device, blocking) + if dtype is not None and dtype != t.dtype: + new_t = new_t.cast(dtype=dtype) + + return new_t + + self._apply(transform, device, dtype, blocking) + # [aliases] Compatible with old method names set_dict = set_state_dict load_dict = set_state_dict diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 31879dae0da..e6e15575f2c 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -331,5 +331,72 @@ class TestModifiedBuffer(unittest.TestCase): np.array_equal(dy_outs[i].numpy(), st_outs[i].numpy())) +class TestLayerTo(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.linear = paddle.nn.Linear(2, 2) + self.new_grad = np.random.random([2, 2]) + self.linear.weight._set_grad_ivar(paddle.to_tensor(self.new_grad)) + buffer = paddle.to_tensor([0.0], dtype='float32') + self.linear.register_buffer("buf_name", buffer, persistable=True) + + sublayer = paddle.nn.Conv1D(3, 2, 3) + self.linear.add_sublayer(1, sublayer) + + def 
test_to_api(self): + self.linear.to(dtype='double') + self.assertEqual(self.linear.weight.dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + self.assertEqual(self.linear.buf_name.dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + self.assertTrue(np.allclose(self.linear.weight.grad, self.new_grad)) + self.assertTrue(self.linear.weight._grad_ivar().dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + + self.linear.to() + self.assertEqual(self.linear.weight.dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + self.assertEqual(self.linear.buf_name.dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + self.assertTrue(np.allclose(self.linear.weight.grad, self.new_grad)) + self.assertTrue(self.linear.weight._grad_ivar().dtype, + paddle.fluid.core.VarDesc.VarType.FP64) + + if paddle.fluid.is_compiled_with_cuda(): + self.linear.to(device=paddle.CUDAPlace(0)) + self.assertTrue(self.linear.weight.place.is_gpu_place()) + self.assertEqual(self.linear.weight.place.gpu_device_id(), 0) + self.assertTrue(self.linear.buf_name.place.is_gpu_place()) + self.assertEqual(self.linear.buf_name.place.gpu_device_id(), 0) + self.assertTrue(self.linear.weight._grad_ivar().place.is_gpu_place( + )) + self.assertEqual( + self.linear.weight._grad_ivar().place.gpu_device_id(), 0) + + self.linear.to(device='gpu:0') + self.assertTrue(self.linear.weight.place.is_gpu_place()) + self.assertEqual(self.linear.weight.place.gpu_device_id(), 0) + self.assertTrue(self.linear.buf_name.place.is_gpu_place()) + self.assertEqual(self.linear.buf_name.place.gpu_device_id(), 0) + self.assertTrue(self.linear.weight._grad_ivar().place.is_gpu_place( + )) + self.assertEqual( + self.linear.weight._grad_ivar().place.gpu_device_id(), 0) + + self.linear.to(device=paddle.CPUPlace()) + self.assertTrue(self.linear.weight.place.is_cpu_place()) + self.assertTrue(self.linear.buf_name.place.is_cpu_place()) + self.assertTrue(self.linear.weight._grad_ivar().place.is_cpu_place()) + + self.linear.to(device='cpu') + self.assertTrue(self.linear.weight.place.is_cpu_place()) + self.assertTrue(self.linear.buf_name.place.is_cpu_place()) + self.assertTrue(self.linear.weight._grad_ivar().place.is_cpu_place()) + + self.assertRaises(ValueError, self.linear.to, device=1) + + self.assertRaises(AssertionError, self.linear.to, blocking=1) + + if __name__ == '__main__': unittest.main() -- GitLab From 7ab47e8d660e9b0d1c3e9cd4eab246faac6710a4 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 13 Apr 2021 18:10:02 +0800 Subject: [PATCH 203/486] Fix prec on windows for long args (#32218) * fix error for long args * remove unneccessary code --- paddle/scripts/paddle_build.bat | 16 ---------------- tools/windows/get_prec_ut_list.py | 5 ++++- tools/windows/run_unittests.sh | 16 +++++++++------- 3 files changed, 13 insertions(+), 24 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 14e62d6761f..20c8794ba63 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -462,27 +462,11 @@ dir %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin dir %THIRD_PARTY_PATH:/=\%\install\warpctc\bin pip install requests -python %work_dir%\tools\get_quick_disable_lt.py > Output -if %errorlevel%==0 ( - set /p disable_ut_quickly= Date: Tue, 13 Apr 2021 19:26:38 +0800 Subject: [PATCH 204/486] add statistics_UT_resource.sh for imporving UT parallel level (#32220) --- tools/statistics_UT_resource.sh | 58 +++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tools/statistics_UT_resource.sh diff 
--git a/tools/statistics_UT_resource.sh b/tools/statistics_UT_resource.sh new file mode 100644 index 00000000000..a6f1f264c4c --- /dev/null +++ b/tools/statistics_UT_resource.sh @@ -0,0 +1,58 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +unset GREP_OPTIONS +rm ./run_detail.log +rm ./UT_resource.log +rm ./UT_resource_sort.log +rm ./while_list.log + +export LD_LIBRARY_PATH="$PWD/python/paddle/libs;$LD_LIBRARY_PATH" +export CUDA_VISIBLE_DEVICES=0,1 + +test_cases=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') +use_memory_base=$(nvidia-smi -q -i 0 | grep "Used" | head -1 | grep -o "[0-9]*") +for unittest in $test_cases +do + use_memory=0 + gpu_utilization=0 + memory_utilization=0 + ctest -R "^${unittest}$" --repeat-until-fail 5 -j 1 & + PID=$! + echo -e "******************************************************" + echo -e "[$unittest]: PID:$PID \n" + while [[ $(ps aux | awk '{print $2}' | grep "^$PID$" | grep -v "grep" | wc -l) -ge 1 ]] + do + use_memory_current=$(nvidia-smi -q -i 0 | grep "Used" | head -1 | grep -o "[0-9]*") + if [[ $use_memory_current -gt $use_memory ]];then + use_memory=$use_memory_current + fi + memory_utilization_current=$(nvidia-smi -q -i 0 | grep "Memory" | sed -n '3p' | grep -o "[0-9]*") + if [[ $memory_utilization_current -gt $memory_utilization ]];then + memory_utilization=$memory_utilization_current + fi + + gpu_utilization_current=$(nvidia-smi -q -i 0 | grep "Gpu" | grep -o "[0-9]*") + if [[ $gpu_utilization_current -gt $gpu_utilization ]];then + gpu_utilization=$gpu_utilization_current + fi + done + use_memory=`expr $use_memory - $use_memory_base` + echo -e " use_memory:$use_memory \n memory_utilization:$memory_utilization \n gpu_utilization:$gpu_utilization\n" + echo -e "[$unittest]: \n use_memory:$use_memory \n memory_utilization:$memory_utilization \n gpu_utilization:$gpu_utilization\n" >> run_detail.log + echo -e "$unittest:$use_memory:$memory_utilization:$gpu_utilization" >> UT_resource.log +done + +sort -r -n -k 2 -t : UT_resource.log > UT_resource_sort.log +cat UT_resource_sort.log | awk -F ':' '{print $1}' > while_list.log -- GitLab From b9e543f885114d38f96058bcb5a98a1f5e5e1d0a Mon Sep 17 00:00:00 2001 From: lidanqing Date: Tue, 13 Apr 2021 14:40:13 +0200 Subject: [PATCH 205/486] upgrade to oneDNN2.2.1 (fix when prim descriptor or attr contain NaN) (#32227) --- cmake/external/mkldnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 884219d8dd8..fb1d4d9d56d 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -20,7 +20,7 @@ SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) -SET(MKLDNN_TAG 72efa005effb49595933e033cc732f215ef0445a) +SET(MKLDNN_TAG f58682cd8bd0615f41d879f8afc8f1511ab42d24) # Introduce variables: # * CMAKE_INSTALL_LIBDIR -- GitLab From cb81826a6eaaf63d8ad07ebc39ad6790bc75e3a3 Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Tue, 13 Apr 2021 20:41:57 +0800 Subject: [PATCH 206/486] extend multiclass_nms unittest timeout threshold (#32214) * extend multiclass_nms unittest timeout threshold * adjust timeout to 200s * temporarily disable multiclass_nms trt op teller --- paddle/fluid/inference/tensorrt/op_teller.cc | 1 - python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b681b098c8c..179f534acef 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -114,7 +114,6 @@ struct SimpleOpTypeSetTeller : public Teller { "yolo_box", "roi_align", "affine_channel", - "multiclass_nms", "nearest_interp", "anchor_generator", }; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index dfec1cc7572..3ebed017775 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -32,4 +32,5 @@ if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_activation_pass PROPERTIES TIMEOUT 120) set_tests_properties(test_trt_conv_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_trt_multiclass_nms_op PROPERTIES TIMEOUT 200) endif() -- GitLab From 4281eb496d574933893a925290a3eeddaa18e983 Mon Sep 17 00:00:00 2001 From: XGZhang <46363693+XGZhang11@users.noreply.github.com> Date: Wed, 14 Apr 2021 10:40:39 +0800 Subject: [PATCH 207/486] add new post-quant methods (#32208) --- .../post_training_quantization.py | 207 ++++++++++++++---- .../slim/quantization/quantization_pass.py | 51 ++++- .../test_post_training_quantization_mnist.py | 60 +++++ ..._post_training_quantization_mobilenetv1.py | 44 ++++ .../slim/tests/test_quantization_pass.py | 18 +- 5 files changed, 334 insertions(+), 46 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index aba6005f0cf..bc2e2dc9b65 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -55,7 +55,7 @@ def _set_variable_data(scope, place, var_name, np_value): Set the value of var node by name, if the node exits, ''' assert isinstance(np_value, np.ndarray), \ - 'The type of value should be numpy array.' + 'The type of value should be numpy array.' 
var_node = scope.find_var(var_name) if var_node != None: tensor = var_node.get_tensor() @@ -138,8 +138,10 @@ class PostTrainingQuantization(object): batch_size=10, batch_nums=None, algo="KL", + hist_percent=0.99999, quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"], is_full_quantize=False, + bias_correction=False, activation_bits=8, weight_bits=8, activation_quantize_type='range_abs_max', @@ -180,7 +182,13 @@ class PostTrainingQuantization(object): get the KL threshold for quantized activations and get the abs_max value for quantized weights. If algo='abs_max', get the abs max value for activations and weights. If algo= 'min_max', get the min - and max value for quantized activations and weights. Default is KL. + and max value for quantized activations and weights. If algo='avg', + get the average value among the max values for activations. If + algo= 'hist', get the value of 'hist_percent' quantile as the threshold. + If algo='mse', get the value which makes the quantization mse loss + minimal. Default is KL. + hist_percent(float, optional): The threshold of algo 'hist' for activations. + Default is 0.99999. quantizable_op_type(list[str], optional): List the type of ops that will be quantized. Default is ["conv2d", "depthwise_conv2d", "mul"]. @@ -188,6 +196,8 @@ class PostTrainingQuantization(object): apply quantization to all supported quantizable op type. If set is_full_quantized as False, only apply quantization to the op type according to the input quantizable_op_type. + bias_correction(bool, optional): If set as True, use the bias correction + method of https://arxiv.org/abs/1810.05723. Default is False. activation_bits(int): quantization bit number for activation. weight_bits(int, optional): quantization bit number for weights. activation_quantize_type(str): quantization type for activation, @@ -255,7 +265,9 @@ class PostTrainingQuantization(object): 'range_abs_max', 'moving_average_abs_max', 'abs_max' ] self._support_weight_quantize_type = ['abs_max', 'channel_wise_abs_max'] - self._support_algo_type = ['KL', 'abs_max', 'min_max'] + self._support_algo_type = [ + 'KL', 'hist', 'avg', 'mse', 'abs_max', 'min_max' + ] self._dynamic_quantize_op_type = ['lstm'] self._support_quantize_op_type = \ list(set(QuantizationTransformPass._supported_quantizable_op_type + @@ -270,7 +282,7 @@ class PostTrainingQuantization(object): "cannot be None in the same time." assert batch_size > 0, "The batch_size should be greater than 0." assert algo in self._support_algo_type, \ - "The algo should be KL, abs_max or min_max." + "The algo should be KL, hist, mse, avg, abs_max or min_max." 
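        # In brief, for each quantized activation the new algos work as follows (see the
        # _sample_* helpers below): 'avg' takes the per-sample max(|x|), averages it within
        # a batch and then over all calibration batches; 'hist' collects a histogram of |x|
        # and uses the `hist_percent` quantile as the threshold; 'mse' searches
        # scale = s * max(|x|) for s from 0.3 to 1.0 in steps of 0.02 and keeps the scale
        # with the smallest quantize-dequantize MSE.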
assert activation_quantize_type in self._support_activation_quantize_type, \ "The activation_quantize_type ({}) should in ({}).".format( activation_quantize_type, self._support_activation_quantize_type) @@ -279,6 +291,7 @@ class PostTrainingQuantization(object): weight_quantize_type, self._support_weight_quantize_type) # Save input params + self._bias_correction = bias_correction self._executor = executor self._scope = global_scope() if scope == None else scope self._model_dir = model_dir @@ -289,6 +302,7 @@ class PostTrainingQuantization(object): self._batch_size = batch_size self._batch_nums = batch_nums self._algo = algo + self._hist_percent = hist_percent self._activation_bits = activation_bits self._weight_bits = weight_bits self._activation_quantize_type = activation_quantize_type @@ -314,17 +328,21 @@ class PostTrainingQuantization(object): self._quantized_weight_var_name = set() self._quantized_act_var_name = set() self._weight_op_pairs = {} - # The vars for alog = KL + # The vars for alog = KL or hist self._sampling_act_abs_min_max = {} self._sampling_act_histogram = {} self._sampling_data = {} - self._quantized_var_kl_threshold = {} + self._quantized_var_threshold = {} self._histogram_bins = 2048 # The vars for algo = min_max self._quantized_var_min = {} self._quantized_var_max = {} - # The vars for algo = abs_max - self._quantized_var_abs_max = {} + # The vars for algo = avg + self._quantized_var_avg = {} + # The best loss of algo = mse + self._best_mse_loss = {} + # The threshold for algo = abs_max, mse or avg + self._quantized_threshold = {} def quantize(self): ''' @@ -341,7 +359,7 @@ class PostTrainingQuantization(object): self._collect_target_varnames() self._set_activation_persistable() - if self._algo == "KL": + if self._algo in ["KL", "hist"]: _logger.info("Preparation stage ...") batch_id = 0 for data in self._data_loader(): @@ -374,13 +392,14 @@ class PostTrainingQuantization(object): if self._batch_nums and batch_id >= self._batch_nums: break _logger.info("Finish sampling stage, all batch: " + str(batch_id)) - self._reset_activation_persistable() - - if self._algo == "KL": - self._calculate_kl_threshold() - - if self._algo in ["KL", "abs_max"]: + if self._algo == 'avg': + for var_name in self._quantized_act_var_name: + self._quantized_threshold[var_name] = \ + np.array(self._quantized_var_avg[var_name]).mean() + if self._algo in ["KL", "hist"]: + self._calculate_kl_hist_threshold() + if self._algo in ["KL", "abs_max", "hist", "avg", "mse"]: self._update_program() else: self._save_input_threhold() @@ -526,14 +545,84 @@ class PostTrainingQuantization(object): ''' if self._algo == "abs_max": self._sample_abs_max() + elif self._algo == "avg": + self._sample_avg() elif self._algo == "min_max": self._sample_min_max() - elif self._algo == "KL": + elif self._algo == "mse": + self._sample_mse() + elif self._algo in ["KL", "hist"]: self._sample_histogram() + def _sample_mse(self): + if self._quantized_threshold == {}: + for var_name in self._quantized_weight_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + if self._weight_quantize_type == "abs_max": + abs_max_value = float(np.max(np.abs(var_tensor))) + elif self._weight_quantize_type == "channel_wise_abs_max": + abs_max_value = [] + if self._weight_op_pairs[ + var_name] in _channelwise_quant_axis1_ops: + for i in range(var_tensor.shape[1]): + abs_max_value.append( + float(np.max(np.abs(var_tensor[:, i])))) + else: + for i in range(var_tensor.shape[0]): + abs_max_value.append( + 
float(np.max(np.abs(var_tensor[i])))) + self._quantized_threshold[var_name] = abs_max_value + _logger.info("MSE searching stage ...") + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + var_tensor = var_tensor.flatten() + abs_max_value = float(np.max(np.abs(var_tensor))) + s = 0.3 + if var_name not in self._best_mse_loss: + self._best_mse_loss[var_name] = float('inf') + while s <= 1.0: + scale = s * abs_max_value + s += 0.02 + bins = 2**(self._activation_bits - 1) - 1 + quant_dequant_var = np.round( + np.clip(var_tensor, 0.0, scale) / scale * + bins) / bins * scale + mse_loss = ((var_tensor - quant_dequant_var)**2).mean() + if mse_loss <= self._best_mse_loss[var_name]: + self._best_mse_loss[var_name] = mse_loss + self._quantized_threshold[var_name] = scale + + def _sample_avg(self): + if self._quantized_threshold == {}: + for var_name in self._quantized_weight_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + if self._weight_quantize_type == "abs_max": + abs_max_value = float(np.max(np.abs(var_tensor))) + elif self._weight_quantize_type == "channel_wise_abs_max": + abs_max_value = [] + if self._weight_op_pairs[ + var_name] in _channelwise_quant_axis1_ops: + for i in range(var_tensor.shape[1]): + abs_max_value.append( + float(np.max(np.abs(var_tensor[:, i])))) + else: + for i in range(var_tensor.shape[0]): + abs_max_value.append( + float(np.max(np.abs(var_tensor[i])))) + self._quantized_threshold[var_name] = abs_max_value + + for var_name in self._quantized_act_var_name: + var_tensor = _load_variable_data(self._scope, var_name) + abs_max_value = float(np.max(np.abs(var_tensor))) + if (var_name not in self._quantized_var_avg): + self._quantized_var_avg[var_name] = [] + abs_avg_value = float(np.mean(np.max( \ + np.abs(var_tensor.reshape(var_tensor.shape[0], -1)), axis=(1)))) + self._quantized_var_avg[var_name].append(abs_avg_value) + continue + def _sample_abs_max(self): - # Only calculate abs_max value for weight for once - if self._quantized_var_abs_max == {}: + if self._quantized_threshold == {}: for var_name in self._quantized_weight_var_name: var_tensor = _load_variable_data(self._scope, var_name) if self._weight_quantize_type == "abs_max": @@ -549,14 +638,14 @@ class PostTrainingQuantization(object): for i in range(var_tensor.shape[0]): abs_max_value.append( float(np.max(np.abs(var_tensor[i])))) - self._quantized_var_abs_max[var_name] = abs_max_value + self._quantized_threshold[var_name] = abs_max_value for var_name in self._quantized_act_var_name: var_tensor = _load_variable_data(self._scope, var_name) abs_max_value = float(np.max(np.abs(var_tensor))) - if (var_name not in self._quantized_var_abs_max) or \ - (abs_max_value > self._quantized_var_abs_max[var_name]): - self._quantized_var_abs_max[var_name] = abs_max_value + if (var_name not in self._quantized_threshold) or \ + (abs_max_value > self._quantized_threshold[var_name]): + self._quantized_threshold[var_name] = abs_max_value def _sample_min_max(self): if self._quantized_var_min == {} and self._quantized_var_max == {}: @@ -646,12 +735,12 @@ class PostTrainingQuantization(object): [], bins=self._histogram_bins, range=(min_val, max_val)) self._sampling_act_histogram[var_name] = [hist, hist_edeges] - def _calculate_kl_threshold(self): + def _calculate_kl_hist_threshold(self): ''' - Calculate the KL threshold of quantized variables. + Calculate the KL or hist threshold of quantized variables. 
''' - _logger.info("Calculate KL threshold ...") - assert self._algo == "KL", "The algo should be KL to calculate kl threshold." + _logger.info("Calculate {} threshold ...".format(self._algo)) + assert self._algo in ["KL", "hist"], "The algo should be KL or hist." # Abs_max threshold for weights for var_name in self._quantized_weight_var_name: @@ -669,18 +758,22 @@ class PostTrainingQuantization(object): for i in range(weight_data.shape[0]): weight_threshold.append( float(np.max(np.abs(weight_data[i])))) - self._quantized_var_kl_threshold[var_name] = weight_threshold + self._quantized_var_threshold[var_name] = weight_threshold for var_name in self._quantized_act_var_name: hist, hist_edeges = self._sampling_act_histogram[var_name] - self._quantized_var_kl_threshold[var_name] = \ - self._get_kl_scaling_factor(hist, hist_edeges) + if self._algo == "KL": + self._quantized_var_threshold[var_name] = \ + self._get_kl_scaling_factor(hist, hist_edeges) + elif self._algo == "hist": + self._quantized_var_threshold[var_name] = \ + self._get_hist_scaling_factor(hist, hist_edeges) def _update_program(self): ''' Use QuantizationTransformPass and AddQuantDequantPass to insert fake_quantize, fake_dequantize and fake_quant_dequant op. - Besides, save all kl threshold to the scale var node. + Besides, save all threshold to the scale var node. ''' _logger.info("Update the program ...") graph = IrGraph(core.Graph(self._program.desc), for_test=True) @@ -711,11 +804,11 @@ class PostTrainingQuantization(object): quantizable_op_type=minor_quantizable_op_types) add_quant_dequant_pass.apply(graph) - # save abs_max or KL threshold to scale var node - if self._algo == "KL": - scale_dict = self._quantized_var_kl_threshold + # save threshold to scale var node + if self._algo in ["KL", "hist"]: + scale_dict = self._quantized_var_threshold else: - scale_dict = self._quantized_var_abs_max + scale_dict = self._quantized_threshold for key, val in scale_dict.items(): _set_variable_data( self._scope, @@ -734,6 +827,7 @@ class PostTrainingQuantization(object): freeze_pass = QuantizationFreezePass( scope=self._scope, place=self._place, + bias_correction=self._bias_correction, weight_bits=self._weight_bits, activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, @@ -761,20 +855,28 @@ class PostTrainingQuantization(object): out_var_name + " is not the output of the op" if self._algo == "KL": # For compatibility, we save output threshold by two methods. - save_info(op_node, out_var_name, - self._quantized_var_kl_threshold, "out_threshold", - "post_kl") + save_info(op_node, out_var_name, self._quantized_var_threshold, + "out_threshold", "post_kl") save_info( - op_node, out_var_name, self._quantized_var_kl_threshold, + op_node, out_var_name, self._quantized_var_threshold, argname_index[0] + str(argname_index[1]) + "_threshold", "post_kl") - elif self._algo == "abs_max": - save_info(op_node, out_var_name, self._quantized_var_abs_max, - "out_threshold", "post_abs_max") + elif self._algo == "hist": + # For compatibility, we save output threshold by two methods. 
+ save_info(op_node, out_var_name, self._quantized_var_threshold, + "out_threshold", "post_hist") + save_info( + op_node, out_var_name, self._quantized_var_threshold, + argname_index[0] + str(argname_index[1]) + "_threshold", + "post_hist") + + elif self._algo in ["avg", "abs_max", "mse"]: + save_info(op_node, out_var_name, self._quantized_threshold, + "out_threshold", "post_" + str(self._algo)) + save_info( + op_node, out_var_name, self._quantized_threshold, + argname_index[0] + str(argname_index[1]) + "_threshold", + "post_" + str(self._algo)) elif self._algo == "min_max": save_info(op_node, out_var_name, self._quantized_var_min, "out_min", "post_min_max") @@ -817,10 +919,27 @@ op._set_attr("quantization_type", quantization_type) op._set_attr("bit_length", self._weight_bits) - def _get_kl_scaling_factor(self, hist, hist_edeges, num_quantized_bins=255): + def _get_hist_scaling_factor(self, hist, hist_edges): + ''' + Using the hist method to get the scaling factor. + ''' + threshold_rate = self._hist_percent + hist = hist / float(sum(hist)) + hist_sum = 0 + hist_index = 0 + for i in range(len(hist)): + hist_sum += hist[i] + if hist_sum >= threshold_rate: + hist_index = i + 1 + break + bin_width = hist_edges[1] - hist_edges[0] + return (hist_index - 0.5) * bin_width + + def _get_kl_scaling_factor(self, hist, hist_edeges): ''' Using the KL-divergenc method to get the more precise scaling factor. ''' + num_quantized_bins = 2**(self._activation_bits - 1) - 1 ending_iter = self._histogram_bins - 1 starting_iter = int(ending_iter * 0.7) bin_width = hist_edeges[1] - hist_edeges[0] diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 3f9ff7295dd..79aad8c8bc5 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1070,6 +1070,7 @@ class QuantizationFreezePass(object): def __init__(self, scope, place, + bias_correction=False, weight_bits=8, activation_bits=8, weight_quantize_type='abs_max', @@ -1085,6 +1086,8 @@ scope(fluid.Scope): scope is used to get the weight tensor values. place(fluid.CPUPlace|fluid.CUDAPlace|str): place is used to restore the weight tensors. If it's string, It can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. + bias_correction(bool): whether to use bias correction for post-training quantization. + https://arxiv.org/abs/1810.05723. weight_bits(int): quantization bit number for weights. activation_bits(int): quantization bit number for activation. weight_quantize_type(str): quantization type for weights, support 'abs_max' and @@ -1098,6 +1101,7 @@ assert place is not None, \ 'The place cannot be set None.'
self._scope = scope + self._bias_correction = bias_correction self._place = _get_paddle_place(place) self._weight_bits = weight_bits self._activation_bits = activation_bits @@ -1154,7 +1158,10 @@ class QuantizationFreezePass(object): else: quant_axis = 0 quantized_param_v = self._quant( - param_v, scale_v, self._weight_bits, quant_axis) + param_v.copy(), scale_v, self._weight_bits, quant_axis) + if self._bias_correction == True: + quantized_param_v = self._bias_correction_w( + param_v, quantized_param_v, scale_v, quant_axis) self._restore_var(input_arg_name, quantized_param_v) self._remove_fake_quant_and_dequant_op(graph, op_node) @@ -1373,6 +1380,8 @@ class QuantizationFreezePass(object): if isinstance(scale, list): for i, s in enumerate(scale): + if s == 0.0: + s = 1e-8 if quant_axis == 0: x[i] = _clip(x[i], s) x[i] = np.round(x[i] / s * bnt) @@ -1384,6 +1393,46 @@ class QuantizationFreezePass(object): x = np.round(x / scale * bnt) return x + def _bias_correction_w(self, x, x_quant, scale_v, quant_axis): + ''' + Bias correction for weight + ''' + eps = 1e-8 + bnt = (1 << (self._weight_bits - 1)) - 1 + x_dequant = x_quant.copy() + if isinstance(scale_v, list): + if quant_axis == 0: + for i, s in enumerate(scale_v): + x_dequant[i] = x_dequant[i] * s / bnt + quant_bias = x - x_dequant + mean_bias = quant_bias.reshape(quant_bias.shape[0], -1).mean(-1) + std_orig = x.reshape(x.shape[0], -1).std(-1) + std_quant = x_dequant.reshape(x_dequant.shape[0], -1).std(-1) + std_bias = std_orig / (std_quant + eps) + else: + for i, s in enumerate(scale_v): + x_dequant[:, i] = x_quant[:, i] * s / bnt + quant_bias = x - x_dequant + mean_bias = np.array([ + quant_bias[:, i].mean() for i in range(quant_bias.shape[1]) + ]) + std_orig = np.array([x[:, i].std() for i in range(x.shape[1])]) + std_quant = np.array( + [x_dequant[:, i].std() for i in range(x_dequant.shape[1])]) + std_bias = std_orig / (std_quant + eps) + else: + x_dequant = x_quant * scale_v / bnt + mean_bias = (x - x_dequant).mean() + std_bias = x.std() / (x_dequant.std() + eps) + if mean_bias.ndim == 1: + std_bias = np.resize(std_bias, x.shape) + mean_bias = np.resize(mean_bias, x.shape) + + x_dequant = (mean_bias + x_dequant) * std_bias + quantized_param_v = self._quant(x_dequant, scale_v, self._weight_bits, + quant_axis) + return quantized_param_v + class ConvertToInt8Pass(object): def __init__(self, scope, place, quantizable_op_type=None): diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index 3ea1c84f976..da5c5d6dc94 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -204,6 +204,66 @@ class TestPostTrainingKLForMnist(TestPostTrainingQuantization): quant_iterations) +class TestPostTraininghistForMnist(TestPostTrainingQuantization): + def test_post_training_hist(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "hist" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, 
is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingmseForMnist(TestPostTrainingQuantization): + def test_post_training_mse(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "mse" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingavgForMnist(TestPostTrainingQuantization): + def test_post_training_avg(self): + model_name = "mnist_model" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_model.tar.gz" + data_md5 = "be71d3997ec35ac2a65ae8a145e2887c" + algo = "avg" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + class TestPostTrainingAbsMaxForMnist(TestPostTrainingQuantization): def test_post_training_abs_max(self): model_name = "mnist_model" diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 18389d9433b..71611048610 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -328,6 +328,50 @@ class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization): diff_threshold) +class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization): + def test_post_training_avg_mobilenetv1(self): + model = "MobileNet-V1" + algo = "avg" + data_urls = [ + 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + ] + data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + quantizable_op_type = [ + "conv2d", + "depthwise_conv2d", + "mul", + ] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.025 + self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold) + + +class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization): + def test_post_training_hist_mobilenetv1(self): + model = "MobileNet-V1" + algo = "hist" + data_urls = [ + 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + ] + data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + quantizable_op_type = [ + "conv2d", + "depthwise_conv2d", + "mul", + ] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.025 + self.run_test(model, algo, data_urls, data_md5s, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold) + + class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization): def 
test_post_training_abs_max_mobilenetv1(self): model = "MobileNet-V1" diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index 768a9ba7cfc..790213d4b02 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -257,6 +257,7 @@ class TestQuantizationFreezePass(unittest.TestCase): use_cuda, seed, activation_quant_type, + bias_correction=False, weight_quant_type='abs_max', for_ci=True, quant_skip_pattern='skip_quant'): @@ -355,7 +356,8 @@ class TestQuantizationFreezePass(unittest.TestCase): # Freeze graph for inference, but the weight of fc/conv is still float type. freeze_pass = QuantizationFreezePass( - scope=scope, place=place, weight_quantize_type=weight_quant_type) + scope=scope, place=place, bias_correction=bias_correction, \ + weight_quantize_type=weight_quant_type) freeze_pass.apply(test_graph) if not for_ci: marked_nodes = set() @@ -472,6 +474,13 @@ class TestQuantizationFreezePass(unittest.TestCase): def test_freeze_graph_cuda_static(self): if fluid.core.is_compiled_with_cuda(): with fluid.unique_name.guard(): + self.freeze_graph( + True, + seed=1, + activation_quant_type='range_abs_max', + bias_correction=True, + weight_quant_type='abs_max', + for_ci=True) self.freeze_graph( True, seed=1, @@ -496,6 +505,13 @@ class TestQuantizationFreezePass(unittest.TestCase): activation_quant_type='moving_average_abs_max', weight_quant_type='channel_wise_abs_max', for_ci=True) + self.freeze_graph( + True, + seed=1, + activation_quant_type='moving_average_abs_max', + bias_correction=True, + weight_quant_type='channel_wise_abs_max', + for_ci=True) def test_freeze_graph_cpu_static(self): with fluid.unique_name.guard(): -- GitLab From f4b2ce44ba3ae4a9bfd8e69002b21417c60758e5 Mon Sep 17 00:00:00 2001 From: Thomas Young <35565423+HexToString@users.noreply.github.com> Date: Wed, 14 Apr 2021 10:50:43 +0800 Subject: [PATCH 208/486] fix expand op lack of float16 (#32238) --- python/paddle/fluid/layers/nn.py | 3 ++- python/paddle/tensor/manipulation.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c4f4754cc77..565c134ae9d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10332,7 +10332,8 @@ def expand(x, expand_times, name=None): inputs = {"X": [x]} attrs = {} check_variable_and_dtype( - x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand') + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'expand') check_type(expand_times, 'expand_times', (list, tuple, Variable), 'expand') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == True: raise ValueError( diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 377435a5000..696775434b9 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -1432,7 +1432,8 @@ def expand(x, shape, name=None): 'Elements in shape must be 1-D Tensors or integers.') check_variable_and_dtype( - x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand') + x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'expand') check_type(shape, 'shape', (list, tuple, Variable), 'expand') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError("When the data type of input 'x' for expand is bool, " -- GitLab 
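Usage sketch for the post-training quantization changes above (illustrative only, not part of any patch in this series): it shows how the new algo='hist'/'avg'/'mse' choices, the hist_percent knob and bias_correction might be combined. The model path, the calibration reader (calib_reader) and the save call are assumptions based on the surrounding API rather than code introduced by these patches.

    import paddle
    import paddle.fluid as fluid
    from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

    paddle.enable_static()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # `./fp32_infer_model` and `calib_reader` are placeholders: a saved float32
    # inference model directory and a Python generator yielding calibration samples.
    ptq = PostTrainingQuantization(
        executor=exe,
        model_dir="./fp32_infer_model",
        sample_generator=calib_reader,
        batch_size=10,
        batch_nums=10,
        algo="hist",                  # new: also 'avg' or 'mse', besides 'KL', 'abs_max', 'min_max'
        hist_percent=0.99999,         # new: quantile used when algo='hist'
        bias_correction=True,         # new: weight bias correction (arXiv:1810.05723)
        quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"])
    ptq.quantize()
    ptq.save_quantized_model("./int8_infer_model")  # assumed save path

With algo='hist' the activation threshold is taken at the hist_percent quantile of the sampled histogram; algo='mse' searches scales between 0.3 and 1.0 of the absolute maximum in steps of 0.02 and keeps the one with the smallest quantization error; bias_correction is applied later by QuantizationFreezePass when the weights are frozen.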
From 95939b522c39d85fa90916325e96b4e275892502 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Wed, 14 Apr 2021 11:00:30 +0800 Subject: [PATCH 209/486] add common dtypes as paddle's dtypes (#32012) * add common dtypes as paddle's dtypes * import paddle.fluid.core_avx.VarDesc.VarType as paddle.dtype --- python/paddle/__init__.py | 13 +++++++++++ python/paddle/framework/dtype.py | 40 ++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 python/paddle/framework/dtype.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 17bf2d544f3..861839256a3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -31,6 +31,19 @@ from .fluid.dygraph import monkey_patch_math_varbase monkey_patch_variable() monkey_patch_math_varbase() import paddle.framework +from .framework.dtype import dtype as dtype +from paddle.framework.dtype import uint8 +from paddle.framework.dtype import int8 +from paddle.framework.dtype import int16 +from paddle.framework.dtype import int32 +from paddle.framework.dtype import int64 +from paddle.framework.dtype import float16 +from paddle.framework.dtype import float32 +from paddle.framework.dtype import float64 +from paddle.framework.dtype import bfloat16 +from paddle.framework.dtype import bool +from paddle.framework.dtype import complex64 +from paddle.framework.dtype import complex128 from .framework import VarBase as Tensor import paddle.compat import paddle.distributed diff --git a/python/paddle/framework/dtype.py b/python/paddle/framework/dtype.py new file mode 100644 index 00000000000..3eeaa6e74ec --- /dev/null +++ b/python/paddle/framework/dtype.py @@ -0,0 +1,40 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +__all__ = [ + "dtype", "uint8", "int8", "int16", "int32", "int64", "bfloat16", "float16", + "float32", "float64", "complex64", "complex128", "bool" +] + +from ..fluid.core import VarDesc + +dtype = VarDesc.VarType +dtype.__qualname__ = "dtype" +dtype.__module__ = "paddle" + +uint8 = VarDesc.VarType.UINT8 +int8 = VarDesc.VarType.INT8 +int16 = VarDesc.VarType.INT16 +int32 = VarDesc.VarType.INT32 +int64 = VarDesc.VarType.INT64 + +float32 = VarDesc.VarType.FP32 +float64 = VarDesc.VarType.FP64 +float16 = VarDesc.VarType.FP16 +bfloat16 = VarDesc.VarType.BF16 + +complex64 = VarDesc.VarType.COMPLEX64 +complex128 = VarDesc.VarType.COMPLEX128 + +bool = VarDesc.VarType.BOOL -- GitLab From 279b653ce982cdb7a608851c93d080b5d1579b45 Mon Sep 17 00:00:00 2001 From: xiegegege <46314656+xiegegege@users.noreply.github.com> Date: Wed, 14 Apr 2021 11:10:35 +0800 Subject: [PATCH 210/486] Add model benchmark ci (#32247) --- paddle/scripts/paddle_build.sh | 7 ++++ tools/test_model_benchmark.sh | 62 ++++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 tools/test_model_benchmark.sh diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 2df9e0198ee..822f8058557 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1830,6 +1830,10 @@ function test_op_benchmark() { bash ${PADDLE_ROOT}/tools/test_op_benchmark.sh } +function test_model_benchmark() { + bash ${PADDLE_ROOT}/tools/test_model_benchmark.sh +} + function summary_check_problems() { set +x local check_style_code=$1 @@ -2024,6 +2028,9 @@ function main() { test_op_benchmark) test_op_benchmark ;; + test_model_benchmark) + test_model_benchmark + ;; *) print_usage exit 1 diff --git a/tools/test_model_benchmark.sh b/tools/test_model_benchmark.sh new file mode 100644 index 00000000000..720bb334790 --- /dev/null +++ b/tools/test_model_benchmark.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +function compile_install_paddle { + export CUDA_ARCH_NAME=Auto + export PY_VERSION=3.7 + export WITH_DISTRIBUTE=OFF + export WITH_GPU=ON + export WITH_TENSORRT=OFF + export WITH_TESTING=OFF + export WITH_UNITY_BUILD=ON + bash -x paddle/scripts/paddle_build.sh build + [ $? -ne 0 ] && echo "build paddle failed." && exit 1 + pip uninstall -y paddlepaddle_gpu + pip install build/python/dist/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl + [ $? -ne 0 ] && echo "install paddle failed." && exit 1 +} + +function prepare_data { + cd ${cache_dir} + if [ -d "benchmark_data" ];then + echo -e "benchmark_data exist!" 
+ else + mkdir benchmark_data + cd benchmark_data + mkdir dataset + cd dataset + wget --no-proxy -q https://paddle-qa.bj.bcebos.com/benchmark_data/Bert.zip + unzip Bert.zip + wget --no-proxy -q https://paddle-qa.bj.bcebos.com/benchmark_data/imagenet100_data.zip + unzip imagenet100_data.zip + fi +} + +function run_model_benchmark { + cd ${cache_dir}/benchmark_data + if [ -d "benchmark" ];then rm -rf benchmark + fi + git clone --recurse-submodules=PaddleClas --recurse-submodules=PaddleNLP https://github.com/paddlepaddle/benchmark.git + export data_path=${cache_dir}/benchmark_data/dataset + export BENCHMARK_ROOT=${cache_dir}/benchmark_data/benchmark + cd ${BENCHMARK_ROOT}/scripts/benchmark_ci + bash model_ci.sh +} + +compile_install_paddle +prepare_data +run_model_benchmark -- GitLab From 995b5f2c0f0135ea00119f4a247c9bd7385913c0 Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 14 Apr 2021 11:14:54 +0800 Subject: [PATCH 211/486] fix matrix_inverse_op with rocm (#32128) * fix matrix_inverse_op with rocm * fix matrix_inverse_op with rocm * fix matrix_inverse_op with rocm * fix matrix_inverse_op with rocm --- paddle/fluid/operators/math/matrix_inverse.cc | 26 +-------------- .../fluid/operators/math/matrix_inverse.cu.cc | 5 +++ paddle/fluid/operators/math/matrix_inverse.h | 32 +++++++++++++++++++ 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/math/matrix_inverse.cc b/paddle/fluid/operators/math/matrix_inverse.cc index 25bc5d725e1..60481491cb4 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cc @@ -23,34 +23,10 @@ namespace math { template class MatrixInverseFunctor { - using Matrix = - Eigen::Matrix; - using EigenMatrixMap = Eigen::Map; - using ConstEigenMatrixMap = Eigen::Map; - public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& a, framework::Tensor* a_inv) { - const auto& mat_dims = a.dims(); - const int rank = mat_dims.size(); - int n = mat_dims[rank - 1]; - int batch_size = rank > 2 ? a.numel() / (n * n) : 1; - - const T* a_ptr = a.data(); - T* a_inv_ptr = a_inv->mutable_data(context.GetPlace()); - - for (int i = 0; i < batch_size; ++i) { - ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); - EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n); - Eigen::PartialPivLU lu; - lu.compute(mat); - - const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); - PADDLE_ENFORCE_GT( - min_abs_pivot, static_cast(0), - platform::errors::InvalidArgument("Input is not invertible.")); - mat_inv.noalias() = lu.inverse(); - } + compute_inverse_eigen(context, a, a_inv); } }; diff --git a/paddle/fluid/operators/math/matrix_inverse.cu.cc b/paddle/fluid/operators/math/matrix_inverse.cu.cc index 7f5df114680..5deedf084c6 100644 --- a/paddle/fluid/operators/math/matrix_inverse.cu.cc +++ b/paddle/fluid/operators/math/matrix_inverse.cu.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/math/blas.h" namespace paddle { @@ -32,6 +33,7 @@ class MatrixInverseFunctor { public: void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& a, framework::Tensor* a_inv) { +#ifndef PADDLE_WITH_HIP const auto& mat_dims = a.dims(); const int rank = mat_dims.size(); int n = mat_dims[rank - 1]; @@ -111,6 +113,9 @@ class MatrixInverseFunctor { "non-singular matrix", i, info[i], info[i])); } +#else + compute_inverse_eigen(context, a, a_inv); +#endif } }; diff --git a/paddle/fluid/operators/math/matrix_inverse.h b/paddle/fluid/operators/math/matrix_inverse.h index f0baf0b250e..fb58b483666 100644 --- a/paddle/fluid/operators/math/matrix_inverse.h +++ b/paddle/fluid/operators/math/matrix_inverse.h @@ -15,6 +15,8 @@ limitations under the License. */ #pragma once #include +#include "Eigen/Core" +#include "Eigen/LU" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -22,6 +24,36 @@ namespace paddle { namespace operators { namespace math { +template +void compute_inverse_eigen(const DeviceContext& context, + const framework::Tensor& a, + framework::Tensor* a_inv) { + using Matrix = + Eigen::Matrix; + using EigenMatrixMap = Eigen::Map; + using ConstEigenMatrixMap = Eigen::Map; + const auto& mat_dims = a.dims(); + const int rank = mat_dims.size(); + int n = mat_dims[rank - 1]; + int batch_size = rank > 2 ? a.numel() / (n * n) : 1; + + const T* a_ptr = a.data(); + T* a_inv_ptr = a_inv->mutable_data(context.GetPlace()); + + for (int i = 0; i < batch_size; ++i) { + ConstEigenMatrixMap mat(a_ptr + i * n * n, n, n); + EigenMatrixMap mat_inv(a_inv_ptr + i * n * n, n, n); + Eigen::PartialPivLU lu; + lu.compute(mat); + + const T min_abs_pivot = lu.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT( + min_abs_pivot, static_cast(0), + platform::errors::InvalidArgument("Input is not invertible.")); + mat_inv.noalias() = lu.inverse(); + } +} + template class MatrixInverseFunctor { public: -- GitLab From 22ea4c30c2679a89e6800b7c1cdc2c68ff4e55cb Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Wed, 14 Apr 2021 11:21:49 +0800 Subject: [PATCH 212/486] Delete grpc.cmake/distribeted/distributed_ops (#32166) * Delete grpc.cmake/distribeted/distributed_ops * reset operators/CMakeLists.txt * rm test_transpiler_ops.py * del test_transpiler_ops.py --- cmake/external/grpc.cmake | 77 -- .../operators/collective/allreduce_op.cc | 2 +- .../operators/collective/allreduce_op.cu.cc | 2 +- .../operators/distributed/CMakeLists.txt | 76 -- .../async_sparse_param_update_recorder.cc | 27 - .../async_sparse_param_update_recorder.h | 186 ---- ...async_sparse_param_update_recorder_test.cc | 97 -- .../operators/distributed/brpc/brpc_client.cc | 462 -------- .../operators/distributed/brpc/brpc_client.h | 174 --- .../distributed/brpc/brpc_rdma_pool.cc | 86 -- .../distributed/brpc/brpc_rdma_pool.h | 56 - .../distributed/brpc/brpc_sendrecvop_utils.cc | 224 ---- .../distributed/brpc/brpc_sendrecvop_utils.h | 49 - .../distributed/brpc/brpc_serde_test.cc | 175 ---- .../operators/distributed/brpc/brpc_server.cc | 417 -------- .../operators/distributed/brpc/brpc_server.h | 53 - .../brpc/brpc_variable_response.cc | 75 -- .../distributed/brpc/brpc_variable_response.h | 67 -- .../distributed/collective_client.cc | 57 - .../operators/distributed/collective_client.h | 104 -- .../distributed/collective_server.cc | 68 -- 
.../operators/distributed/collective_server.h | 116 -- .../distributed/collective_server_test.cc | 131 --- .../operators/distributed/communicator.cc | 989 ------------------ .../operators/distributed/communicator.h | 490 --------- .../distributed/communicator_common.h | 91 -- .../distributed/communicator_test.cc | 106 -- .../fluid/operators/distributed/distributed.h | 36 - .../operators/distributed/distributed_pb.h | 30 - .../grpc/grpc_bytebuffer_stream.cc | 92 -- .../distributed/grpc/grpc_bytebuffer_stream.h | 174 --- .../operators/distributed/grpc/grpc_client.cc | 671 ------------ .../operators/distributed/grpc/grpc_client.h | 321 ------ .../operators/distributed/grpc/grpc_serde.cc | 190 ---- .../operators/distributed/grpc/grpc_serde.h | 69 -- .../distributed/grpc/grpc_serde_test.cc | 224 ---- .../operators/distributed/grpc/grpc_server.cc | 720 ------------- .../operators/distributed/grpc/grpc_server.h | 93 -- .../operators/distributed/grpc/grpc_service.h | 145 --- .../grpc/grpc_variable_response.cc | 344 ------ .../distributed/grpc/grpc_variable_response.h | 67 -- .../distributed/heart_beat_monitor.cc | 97 -- .../distributed/heart_beat_monitor.h | 127 --- .../distributed/heart_beat_monitor_test.cc | 54 - .../operators/distributed/large_scale_kv.cc | 26 - .../operators/distributed/large_scale_kv.h | 848 --------------- .../distributed/parameter_prefetch.cc | 311 ------ .../distributed/parameter_prefetch.h | 53 - .../operators/distributed/parameter_recv.cc | 248 ----- .../operators/distributed/parameter_recv.h | 37 - .../operators/distributed/parameter_send.cc | 331 ------ .../operators/distributed/parameter_send.h | 35 - .../distributed/proto_encoder_helper.h | 146 --- .../operators/distributed/request_handler.h | 261 ----- .../distributed/request_handler_impl.cc | 354 ------- .../distributed/request_handler_impl.h | 198 ---- .../fluid/operators/distributed/rpc_client.cc | 32 - .../fluid/operators/distributed/rpc_client.h | 143 --- .../fluid/operators/distributed/rpc_server.cc | 242 ----- .../fluid/operators/distributed/rpc_server.h | 149 --- .../operators/distributed/rpc_server_test.cc | 344 ------ .../operators/distributed/send_recv.proto.in | 88 -- .../operators/distributed/sendrecvop_utils.cc | 117 --- .../operators/distributed/sendrecvop_utils.h | 110 -- .../operators/distributed/varhandle_test.cc | 50 - .../distributed/variable_response.cc | 271 ----- .../operators/distributed/variable_response.h | 155 --- .../operators/distributed_ops/CMakeLists.txt | 38 - .../operators/distributed_ops/allreduce_op.cc | 80 -- .../distributed_ops/allreduce_op.cu.cc | 25 - .../operators/distributed_ops/allreduce_op.h | 90 -- .../operators/distributed_ops/broadcast_op.cc | 79 -- .../distributed_ops/broadcast_op.cu.cc | 91 -- .../distributed_ops/checkpoint_notify_op.cc | 117 --- .../distributed_lookup_table_op.cc | 156 --- .../distributed_lookup_table_op.cu.cc | 22 - .../distributed_lookup_table_op.h | 66 -- .../operators/distributed_ops/fake_init_op.cc | 81 -- .../distributed_ops/fetch_barrier_op.cc | 105 -- .../distributed_ops/fl_listen_and_serv_op.cc | 284 ----- .../distributed_ops/fl_listen_and_serv_op.h | 107 -- .../distributed_ops/gen_nccl_id_op.cc | 313 ------ .../distributed_ops/listen_and_serv_op.cc | 636 ----------- .../distributed_ops/listen_and_serv_op.h | 135 --- .../lookup_sparse_table_fuse_adam_op.cc | 158 --- .../lookup_sparse_table_fuse_adam_op.h | 142 --- .../lookup_sparse_table_fuse_sgd_op.cc | 125 --- .../lookup_sparse_table_fuse_sgd_op.h | 105 -- 
.../lookup_sparse_table_grad_split_op.cc | 79 -- .../lookup_sparse_table_grad_split_op.h | 97 -- .../lookup_sparse_table_init_op.cc | 147 --- .../lookup_sparse_table_merge_op.cc | 84 -- .../lookup_sparse_table_merge_op.h | 78 -- .../lookup_sparse_table_read_op.cc | 133 --- .../lookup_sparse_table_write_op.cc | 116 -- .../operators/distributed_ops/merge_ids_op.cc | 134 --- .../operators/distributed_ops/merge_ids_op.h | 112 -- .../operators/distributed_ops/prefetch_op.cc | 119 --- .../operators/distributed_ops/recv_op.cc | 153 --- .../operators/distributed_ops/recv_save_op.cc | 328 ------ .../distributed_ops/ref_by_trainer_id_op.cc | 99 -- .../ref_by_trainer_id_op.cu.cc | 26 - .../distributed_ops/ref_by_trainer_id_op.h | 53 - .../distributed_ops/send_and_recv_op.cc | 98 -- .../distributed_ops/send_barrier_op.cc | 120 --- .../operators/distributed_ops/send_op.cc | 160 --- .../distributed_ops/send_recv_op_test.cc | 257 ----- .../distributed_ops/send_recv_util.h | 73 -- .../distributed_ops/sparse_tensor_load_op.cc | 217 ---- .../distributed_ops/split_byref_op.cc | 103 -- .../distributed_ops/split_byref_op.cu.cc | 19 - .../distributed_ops/split_byref_op.h | 43 - .../operators/distributed_ops/split_ids_op.cc | 96 -- .../operators/distributed_ops/split_ids_op.h | 127 --- .../distributed_ops/test_send_nccl_id.cc | 107 -- .../fluid/operators/split_selected_rows_op.cc | 99 -- .../fluid/operators/split_selected_rows_op.cu | 19 - .../fluid/operators/split_selected_rows_op.h | 108 -- .../fluid/tests/unittests/CMakeLists.txt | 2 - .../unittests/test_split_selected_rows_op.py | 137 --- .../tests/unittests/test_transpiler_ops.py | 143 --- 121 files changed, 2 insertions(+), 19099 deletions(-) delete mode 100644 cmake/external/grpc.cmake delete mode 100644 paddle/fluid/operators/distributed/CMakeLists.txt delete mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc delete mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h delete mode 100644 paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_client.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_client.h delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_server.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_server.h delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc delete mode 100644 paddle/fluid/operators/distributed/brpc/brpc_variable_response.h delete mode 100644 paddle/fluid/operators/distributed/collective_client.cc delete mode 100644 paddle/fluid/operators/distributed/collective_client.h delete mode 100644 paddle/fluid/operators/distributed/collective_server.cc delete mode 100644 paddle/fluid/operators/distributed/collective_server.h delete mode 100644 paddle/fluid/operators/distributed/collective_server_test.cc delete mode 100644 paddle/fluid/operators/distributed/communicator.cc delete mode 100644 paddle/fluid/operators/distributed/communicator.h delete mode 100644 paddle/fluid/operators/distributed/communicator_common.h 
delete mode 100644 paddle/fluid/operators/distributed/communicator_test.cc delete mode 100644 paddle/fluid/operators/distributed/distributed.h delete mode 100644 paddle/fluid/operators/distributed/distributed_pb.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_client.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_client.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_server.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_server.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_service.h delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc delete mode 100644 paddle/fluid/operators/distributed/grpc/grpc_variable_response.h delete mode 100644 paddle/fluid/operators/distributed/heart_beat_monitor.cc delete mode 100644 paddle/fluid/operators/distributed/heart_beat_monitor.h delete mode 100644 paddle/fluid/operators/distributed/heart_beat_monitor_test.cc delete mode 100644 paddle/fluid/operators/distributed/large_scale_kv.cc delete mode 100644 paddle/fluid/operators/distributed/large_scale_kv.h delete mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.cc delete mode 100644 paddle/fluid/operators/distributed/parameter_prefetch.h delete mode 100644 paddle/fluid/operators/distributed/parameter_recv.cc delete mode 100644 paddle/fluid/operators/distributed/parameter_recv.h delete mode 100644 paddle/fluid/operators/distributed/parameter_send.cc delete mode 100644 paddle/fluid/operators/distributed/parameter_send.h delete mode 100644 paddle/fluid/operators/distributed/proto_encoder_helper.h delete mode 100644 paddle/fluid/operators/distributed/request_handler.h delete mode 100644 paddle/fluid/operators/distributed/request_handler_impl.cc delete mode 100644 paddle/fluid/operators/distributed/request_handler_impl.h delete mode 100644 paddle/fluid/operators/distributed/rpc_client.cc delete mode 100644 paddle/fluid/operators/distributed/rpc_client.h delete mode 100644 paddle/fluid/operators/distributed/rpc_server.cc delete mode 100644 paddle/fluid/operators/distributed/rpc_server.h delete mode 100644 paddle/fluid/operators/distributed/rpc_server_test.cc delete mode 100644 paddle/fluid/operators/distributed/send_recv.proto.in delete mode 100644 paddle/fluid/operators/distributed/sendrecvop_utils.cc delete mode 100644 paddle/fluid/operators/distributed/sendrecvop_utils.h delete mode 100644 paddle/fluid/operators/distributed/varhandle_test.cc delete mode 100644 paddle/fluid/operators/distributed/variable_response.cc delete mode 100644 paddle/fluid/operators/distributed/variable_response.h delete mode 100644 paddle/fluid/operators/distributed_ops/CMakeLists.txt delete mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.cu.cc delete mode 100644 paddle/fluid/operators/distributed_ops/allreduce_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/broadcast_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/broadcast_op.cu.cc delete mode 100644 
paddle/fluid/operators/distributed_ops/checkpoint_notify_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.cu.cc delete mode 100644 paddle/fluid/operators/distributed_ops/distributed_lookup_table_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/fake_init_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/fetch_barrier_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/fl_listen_and_serv_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/gen_nccl_id_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/listen_and_serv_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_adam_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_fuse_sgd_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_grad_split_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_grad_split_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_init_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_merge_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_merge_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_read_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/lookup_sparse_table_write_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/merge_ids_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/merge_ids_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/prefetch_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/recv_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/recv_save_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.cu.cc delete mode 100644 paddle/fluid/operators/distributed_ops/ref_by_trainer_id_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/send_and_recv_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/send_barrier_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/send_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/send_recv_op_test.cc delete mode 100644 paddle/fluid/operators/distributed_ops/send_recv_util.h delete mode 100644 paddle/fluid/operators/distributed_ops/sparse_tensor_load_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.cu.cc delete mode 100644 paddle/fluid/operators/distributed_ops/split_byref_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/split_ids_op.cc delete mode 100644 paddle/fluid/operators/distributed_ops/split_ids_op.h delete mode 100644 paddle/fluid/operators/distributed_ops/test_send_nccl_id.cc delete mode 100644 paddle/fluid/operators/split_selected_rows_op.cc delete mode 100644 
paddle/fluid/operators/split_selected_rows_op.cu delete mode 100644 paddle/fluid/operators/split_selected_rows_op.h delete mode 100644 python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py delete mode 100644 python/paddle/fluid/tests/unittests/test_transpiler_ops.py diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake deleted file mode 100644 index 536e95c1dc2..00000000000 --- a/cmake/external/grpc.cmake +++ /dev/null @@ -1,77 +0,0 @@ -# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -include (ExternalProject) - -SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) -SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) -SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) -SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) - -include(ProcessorCount) -ProcessorCount(NUM_OF_PROCESSOR) - -IF(APPLE) - SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin | sed "s/-Werror//g" | sh) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install) -ELSE() - SET(GRPC_CFLAGS "-Wno-error -std=c11 ${CLFAGS}") - SET(GRPC_CXXFLAGS "-Wno-error -std=c++11 ${CXXFLAGS}") - SET(BUILD_CMD make CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS} HAS_SYSTEM_PROTOBUF=false -s -j ${NUM_OF_PROCESSOR} static grpc_cpp_plugin) - SET(GRPC_INSTALL_CMD make prefix=${GRPC_INSTALL_DIR} install CFLAGS=${GRPC_CFLAGS} CXXFLAGS=${GRPC_CXXFLAGS}) -ENDIF() - -# FIXME(wuyi): do not build zlib cares protobuf twice, find a way to build grpc with them -ExternalProject_Add( - extern_grpc - DEPENDS protobuf zlib - # NOTE(wuyi): - # this package is generated by following steps: - # 1. git clone -b v1.8.x https://github.com/grpc/grpc.git - # 2. git submodule update --init - # 3. keep only zlib, cares, protobuf, boringssl under "third_party", - # checkout and clean other dirs under third_party - # 4. remove .git, and package the directory. - URL http://paddlepaddledeps.bj.bcebos.com/grpc-v1.10.x_paddle.tar.gz - URL_MD5 f5442d137ddccee252e194b1bc90f98c - PREFIX ${GRPC_SOURCES_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_IN_SOURCE 1 - # NOTE(yuyang18): - # Disable -Werror, otherwise the compile will fail in MacOS. - # It seems that we cannot configure that by make command. 
- # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} - INSTALL_COMMAND ${GRPC_INSTALL_CMD} -) - -ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") - -ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") -ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgpr.a") - -ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) -SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION - "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") - -include_directories(${GRPC_INCLUDE_DIR}) -ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/paddle/fluid/operators/collective/allreduce_op.cc b/paddle/fluid/operators/collective/allreduce_op.cc index 86f1c28a9dd..63b135a74cf 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/allreduce_op.cu.cc b/paddle/fluid/operators/collective/allreduce_op.cu.cc index 9b70f783990..fe2e4910552 100644 --- a/paddle/fluid/operators/collective/allreduce_op.cu.cc +++ b/paddle/fluid/operators/collective/allreduce_op.cu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/distributed_ops/allreduce_op.h" +#include "paddle/fluid/operators/collective/allreduce_op.h" namespace ops = paddle::operators; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt deleted file mode 100644 index c9db6148bc4..00000000000 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ /dev/null @@ -1,76 +0,0 @@ -return() - -if(WITH_GRPC) - set(cc_generic_services "false") -else() - set(cc_generic_services "true") -endif() -configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) - -cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) -cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) - -cc_library(heart_beat_monitor SRCS heart_beat_monitor.cc DEPS enforce simple_threadpool) -cc_library(large_scale_kv SRCS large_scale_kv.cc DEPS enforce simple_threadpool device_context) -cc_test(heart_beat_monitor_test SRCS heart_beat_monitor_test.cc DEPS heart_beat_monitor) - -# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files -set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") -if(WITH_GRPC) - set(GRPC_DEPS grpc++_unsecure grpc_unsecure gpr zlib protobuf) - set(GRPC_SRCS grpc/grpc_client.cc grpc/grpc_server.cc grpc/grpc_serde.cc grpc/grpc_bytebuffer_stream.cc grpc/grpc_variable_response.cc) - grpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${GRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder heart_beat_monitor large_scale_kv) - - set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) - - cc_test(grpc_serde_test SRCS grpc/grpc_serde_test.cc - DEPS ${RPC_DEPS} scope profiler math_function) - -else() - set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - - set(BRPC_DEPS brpc ssl crypto protobuf leveldb zlib) - - brpc_library(sendrecvop_rpc SRCS sendrecvop_utils.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc - variable_response.cc - collective_client.cc collective_server.cc - ${BRPC_SRCS} - PROTO send_recv.proto - DEPS lod_tensor selected_rows memory scope ${BRPC_DEPS}) - - set(RPC_DEPS sendrecvop_rpc ${BRPC_DEPS}) - cc_test(brpc_serde_test SRCS brpc/brpc_serde_test.cc - DEPS ${RPC_DEPS} gflags glog executor proto_desc lookup_sparse_table_read_op) -endif() - - -cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS ${RPC_DEPS} executor scope proto_desc lookup_sparse_table_read_op checkpoint_notify_op scale_op ) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) -cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory node) -cc_library(parameter_send SRCS 
parameter_send.cc DEPS sendrecvop_rpc memory) -cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) -cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator) -cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) -if(WITH_GPU OR WITH_ROCM) - cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_rpc executor ${RPC_DEPS} - selected_rows_functor scope math_function) -endif() -if(WITH_TESTING) - if(TEST rpc_server_test) - set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) - endif() - if(TEST heart_beat_monitor_test) - set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) - endif() -endif() diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc deleted file mode 100644 index 3f3b6b959e3..00000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; -std::unique_ptr - AsyncSparseParamUpdateRecorder::recorder_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h deleted file mode 100644 index 28a5f2ad6c7..00000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h +++ /dev/null @@ -1,186 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class ConcurrentSet { - public: - ConcurrentSet() : pool_(new ::ThreadPool(1)) {} - ~ConcurrentSet() {} - - std::future Update(const std::vector& rows) { - auto task = [this, rows] { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : rows) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "update ids -> " << sstream.str(); - } - for (auto row : rows) { - set_.insert(row); - } - }; - return pool_->enqueue(std::move(task)); - } - - std::future GetAndClear(std::vector* result) { - auto task = [this, &result] { - result->clear(); - for (auto& id : set_) { - result->push_back(id); - } - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& id : *result) { - sstream << id << ", "; - } - sstream << "]"; - VLOG(3) << "result ids size: " << result->size() << " " - << sstream.str(); - } - set_.clear(); - }; - return pool_->enqueue(std::move(task)); - } - - private: - std::unordered_set set_; - std::unique_ptr<::ThreadPool> pool_{nullptr}; -}; - -class AsyncSparseParamUpdateRecorder { - using TrainerToRows = std::vector>; - - public: - AsyncSparseParamUpdateRecorder( - int trainer_num, - const std::unordered_map& grad_to_param) - : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto& item : grad_to_param) { - sstream << item.first << ":" << item.second << ", "; - } - sstream << "]"; - VLOG(3) << "trainer_num: " << trainer_num - << " grad_to_param_: " << sstream.str(); - } - for (auto& iter : grad_to_param) { - param_to_grad_[iter.second] = iter.first; - auto& param_name = iter.second; - param_to_updated_rows_[param_name] = TrainerToRows(); - auto& trainer_to_rows = param_to_updated_rows_[param_name]; - for (auto i = 0; i < trainer_num; ++i) { - trainer_to_rows.emplace_back(new ConcurrentSet()); - } - } - } - - ~AsyncSparseParamUpdateRecorder() = default; - - void Update(const std::string& grad_name, - const std::vector& update_rows) { - VLOG(3) << "update grad: " << grad_name - << " row size: " << update_rows.size(); - auto& param_name = grad_to_param_.at(grad_name); - auto& trainer_to_rows = param_to_updated_rows_.at(param_name); - - std::vector> fs; - for (auto& set : trainer_to_rows) { - fs.push_back(set->Update(update_rows)); - } - for (auto& f : fs) { - f.wait(); - } - } - - void GetAndClear(const std::string& param_name, int trainer_id, - std::vector* result) { - VLOG(3) << "GetAndClear param: " << param_name - << " for trainer: " << trainer_id; - PADDLE_ENFORCE_LT( - trainer_id, trainer_num_, - platform::errors::InvalidArgument( - "The value of trainer_id: %s should less than trainer_num: %s.", - trainer_id, trainer_num_)); - param_to_updated_rows_.at(param_name)[trainer_id] - ->GetAndClear(result) - .wait(); - } - - bool HasParam(const std::string& param_name) { - return param_to_grad_.find(param_name) != param_to_grad_.end(); - } - - bool HasGrad(const std::string& grad_name) { - return grad_to_param_.find(grad_name) != grad_to_param_.end(); - } - - private: - const int trainer_num_; - std::unordered_map grad_to_param_; - std::unordered_map param_to_grad_; - std::unordered_map param_to_updated_rows_; - - // init recorder - public: - static void Init( - int trainer_num, - const 
std::unordered_map& grad_to_param) { - InitImpl(trainer_num, grad_to_param); - } - - static AsyncSparseParamUpdateRecorder* GetInstance() { - return recorder_.get(); - } - - private: - // Init is called by GetInstance. - static void InitImpl( - int trainer_num, - const std::unordered_map& grad_to_param) { - if (recorder_ == nullptr) { - recorder_.reset( - new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr recorder_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc deleted file mode 100644 index 2d78559625c..00000000000 --- a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -TEST(ConcurrentSet, All) { - ConcurrentSet concurrent_set; - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::vector> futures; - futures.push_back(concurrent_set.Update(in1)); - futures.push_back(concurrent_set.Update(in2)); - - for (auto &f : futures) { - f.wait(); - } - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - std::vector ret; - concurrent_set.GetAndClear(&ret).wait(); - - std::unordered_set out; - std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - concurrent_set.GetAndClear(&ret).wait(); - EXPECT_EQ(ret.size(), 0UL); -} - -TEST(AsyncSparseParamUpdateRecorder, All) { - std::unordered_map grad_to_param; - grad_to_param["grad1"] = "param1"; - grad_to_param["grad2"] = "param2"; - - int trainer_num = 10; - - AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); - std::vector in1 = {1, 2, 3, 4}; - std::vector in2 = {2, 3, 5, 6}; - - std::unordered_set in; - std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); - std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); - - recorder.Update("grad1", in1); - recorder.Update("grad1", in2); - - EXPECT_TRUE(recorder.HasParam("param1")); - EXPECT_TRUE(recorder.HasParam("param2")); - EXPECT_FALSE(recorder.HasParam("param3")); - - EXPECT_TRUE(recorder.HasGrad("grad1")); - EXPECT_TRUE(recorder.HasGrad("grad2")); - EXPECT_FALSE(recorder.HasGrad("grad3")); - - std::vector ret; - EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); - - for (int i = 0; i < trainer_num; ++i) { - std::vector ret; - std::unordered_set out; - - recorder.GetAndClear("param1", i, &ret); - std::copy(ret.begin(), 
ret.end(), std::inserter(out, out.begin())); - - EXPECT_EQ(in, out); - - recorder.GetAndClear("param1", i, &ret); - EXPECT_EQ(ret.size(), 0UL); - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc deleted file mode 100644 index b2a26089c86..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ /dev/null @@ -1,462 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); -DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); - -BRPCClient::~BRPCClient() { Wait(); } - -void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used by other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to send variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleSendResponse"; -} - -VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kSendRPC; - VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage request; - distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, - &cntl->request_attachment(), "", false, - trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - ch_ctx->stub->SendVariable(cntl, &request, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - req_count_++; - - return var_h; -} -void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. - ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get HandleFetchBarrierResponse %s, error text is %s.", - var_h->name(), cntl->ErrorText())); - var_h->Finish(false); - cls->DecreaseReqCount(); - return; - } - - var_h->Finish(true); - cls->DecreaseReqCount(); - - VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - VLOG(4) << "Finish HandleFetchBarrierResponse"; -} -void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, - BRPCClient* cls) { - // std::unique_ptr makes sure cntl/response will be deleted before returning. - std::unique_ptr cntl_guard(cntl); - std::unique_ptr response_guard(response); - - // this channel can be used other now. 
- ch_ptr->Push(ch_ctx); - - if (cntl->Failed()) { - PADDLE_THROW(platform::errors::Unavailable( - "Failed to get variable %s, error text is %s.", var_h->name(), - cntl->ErrorText())); - cls->DecreaseReqCount(); - var_h->Finish(false); - return; - } - - VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() - << ", varname: " << var_h->name() - << ", latency: " << cntl->latency_us() << "us"; - - framework::Variable* outvar = nullptr; - int trainer_id; - distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), - *var_h->ctx(), var_h->scope(), &outvar, - &trainer_id); - VLOG(4) << "Finish HandleGetResponse"; - cls->DecreaseReqCount(); - var_h->Finish(true); -} - -VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_var_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - const std::string method = kGetRPC; - VarHandlePtr var_h( - new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - platform::RecordRPCEvent record_event(method); - - if (method_name == kGetMonomerRPC) { - ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); - } else if (method_name == kGetNoBarrierRPC) { - ch_ctx->stub->GetVariableNoBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->GetVariable(cntl, &req, response, done); - } - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar(ep, ctx, scope, var_name_no_barrier, out_var_name, - kGetNoBarrierRPC, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, var_name, kGetMonomerRPC, - time_out); -} - -VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - return AsyncSendMessage(ep, kSendMonomerFetchBarrierRPC, var_name, time_out); -} - -VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, - time_out); -} - 
-VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch_ptr = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - - VarHandlePtr var_h( - new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - - framework::AsyncIO([=] { - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - auto* var = p_scope->FindVar(in_var_name_val); - sendrecv::VariableMessage req; - distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, - &cntl->request_attachment(), out_var_name_val, - false, 0, table_name_val); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - }); - - req_count_++; - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kBatchBarrierRPC, BATCH_BARRIER_MESSAGE, - time_out); -} - -VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); - cntl->set_timeout_ms(time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - const std::string method = kFetchBarrierRPC; - // var handle - VarHandlePtr var_h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - - platform::RecordRPCEvent record_event(method); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - ch_ctx->stub->GetVariable(cntl, &req, response, done); - - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -bool BRPCClient::Wait() { - VLOG(9) << "begin to brpcclient wait"; - { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); - } - VLOG(9) << "end to brpcclient wait"; - return true; -} - -ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { - VLOG(4) << "begin to GetChannel:" << ep; - { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - VLOG(4) << "end to GetChannel:" << ep; - return it->second; - } - } - - ChannelQueuePtr q(new framework::BlockingQueue()); - - brpc::ChannelOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.protocol = "baidu_std"; - // don't use pooled type. the server can't afford that. 
- options.connection_type = "single"; - options.connect_timeout_ms = 1000; - options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; - options.max_retry = FLAGS_max_retry; - - VLOG(1) << "create " << brpc_channel_num_per_server_ - << " brpc channels to pserver:" << ep; - - for (int i = 0; i < brpc_channel_num_per_server_; ++i) { - std::shared_ptr c(new ChannelContext()); - if (c->channel.Init(ep.c_str(), &options) != 0) { - PADDLE_THROW( - platform::errors::Unavailable("Failed to initialize channel.")); - return nullptr; - } - - c->stub.reset(new sendrecv::SendRecvService_Stub( - static_cast(&c->channel))); - q->Push(c); - } - - { - std::lock_guard guard(chan_mutex_); - channels_[ep] = q; - } - - VLOG(4) << "end to GetChannel:" << ep; - return q; -} - -VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - return AsyncSendMessage(ep, kSendCompleteRPC, COMPLETE_MESSAGE, time_out); -} - -void BRPCClient::SendComplete() { - for (auto& kv : channels_) { - AsyncSendComplete(kv.first); - } -} - -VarHandlePtr BRPCClient::AsyncSendVarMessage( - const std::string& ep, const std::string& method_name, - const sendrecv::VariableMessage& req, int64_t time_out) { - auto ch_ptr = GetChannel(ep); - auto ch_ctx = ch_ptr->Pop(); - - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); - - platform::RecordRPCEvent record_event(method_name); - - VarHandlePtr var_h( - new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); - - google::protobuf::Closure* done = brpc::NewCallback( - &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - - if (method_name == kCheckPointNotifyRPC) { - ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); - } else if (method_name == kSendMonomerFetchBarrierRPC) { - ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); - } else { - ch_ctx->stub->SendVariable(cntl, &req, response, done); - } - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - var_h->Wait(); - } - - return var_h; -} - -VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(message); - - return AsyncSendVarMessage(ep, method_name, req, time_out); -} - -VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_out_varname(dirname); - - return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h deleted file mode 100644 index 91f94b4c9d5..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include - -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace paddle { -namespace operators { -namespace distributed { - -struct ChannelContext { - brpc::Channel channel; - std::shared_ptr stub; -}; - -typedef std::shared_ptr ChannelContextPtr; -typedef std::shared_ptr> - ChannelQueuePtr; - -class BRPCClient : public RPCClient { - public: - BRPCClient() {} - virtual ~BRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - private: - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_var_name, const 
std::string& method_name, - const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); - - void Proceed(); - ChannelQueuePtr GetChannel(const std::string& ep); - - VarHandlePtr AsyncSendComplete(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline); - - VarHandlePtr AsyncSendMessage(const std::string& ep, - const std::string& method_name, - const std::string& message, int64_t time_out); - - VarHandlePtr AsyncSendVarMessage(const std::string& ep, - const std::string& method_name, - const sendrecv::VariableMessage& req, - int64_t time_out); - - friend void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, BRPCClient* cls); - - friend void HandleFetchBarrierResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response, - VarHandlePtr var_h, - ChannelQueuePtr ch_ptr, - ChannelContextPtr ch_ctx, - BRPCClient* cls); - void DecreaseReqCount() { - if (--req_count_ <= 0) { - sync_cond_.notify_all(); - } - } - - private: - std::unordered_map channels_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - - static constexpr int brpc_channel_num_per_server_ = 4; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(BRPCClient); -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc deleted file mode 100644 index 94f0b9919ac..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.cc +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#ifdef PADDLE_WITH_BRPC_RDMA - -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "brpc/channel.h" -#include "brpc/rdma/rdma_helper.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -RdmaMemPool& RdmaMemPool::Instance() { - static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); - return *g_rdma_mem_pool; -} - -void* RdmaMemPool::Find(const std::string& varname, int64_t size) { - pthread_rwlock_rdlock(&access_); - auto it = pool_.find(varname); - if (it == pool_.end()) { - pthread_rwlock_unlock(&access_); - return nullptr; - } - - auto info = it->second; - if (info.data_size != size) { - pthread_rwlock_unlock(&access_); - PADDLE_THROW(platform::errors::InvalidArgument( - "var:%s size:%ld != %ld", varname, size, info.data_size)); - return nullptr; - } - - pthread_rwlock_unlock(&access_); - return info.data; -} - -void RdmaMemPool::Register(const std::string& varname, void* data, - int64_t data_size) { - void* old = Find(varname, data_size); - if (old != nullptr) { - PADDLE_ENFORCE_EQ( - data, old, platform::errors::InvalidArgument("var:%s data:%ld != %ld", - varname, data, old)); - VLOG(7) << "Find on rdma:" << varname << " data:" << data - << " data_size:" << data_size; - return; - } - - VarInfo info; - info.data = data; - info.data_size = data_size; - - pthread_rwlock_wrlock(&access_); - pool_[varname] = info; - pthread_rwlock_unlock(&access_); - - if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { - PADDLE_THROW(platform::errors::Unavailable( - "Register memory for RDMA failed. Register %s data: %s data size %d " - "error.", - varname, data, data_size)); - } - - VLOG(4) << "register on rdma:" << varname << " data:" << data - << " data_size:" << data_size; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h deleted file mode 100644 index 156a93ec578..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifdef PADDLE_WITH_BRPC_RDMA - -#include // NOLINT -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -/* - * This class is used to avoid duplicated registion of brpc::rdma. 
- */ -class RdmaMemPool { - public: - static RdmaMemPool& Instance(); - RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} - - virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } - - void Register(const std::string& varname, void* data, int64_t size); - void* Find(const std::string& varname, int64_t size); - - private: - struct VarInfo { - void* data; - int64_t data_size; - - VarInfo() : data(nullptr), data_size(0) {} - }; - - private: - std::unordered_map pool_; - pthread_rwlock_t access_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle - -#endif diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc deleted file mode 100644 index 411c0f36deb..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include -#include // NOLINT - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_rdma_pool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class IOBufWriter { - public: - static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, - const char* v, int64_t vlen) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. 
Variable name is %s, length is %d.", - varname, vlen)); - } - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - iobuf->append(v, vlen); - } - - static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, - int64_t vlen, bool in_cuda_pinned, - void (*destroy)(void*), void* user_data) { - VLOG(7) << "AppendTCPZeroCopy " - << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - // FIXME(gongwb): use append_zerocopy - /* - if (in_cuda_pinned) { - iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); - } else { - iobuf->append_zerocopy(v, vlen, nullptr); - } - */ - iobuf->append(v, vlen); - destroy(user_data); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k - << " data:" << static_cast(const_cast(v)) - << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; - - iobuf->append(reinterpret_cast(&k), 4); - iobuf->append(reinterpret_cast(&vlen), 8); - - RdmaMemPool::Instance().Register( - varname, static_cast(const_cast(v)), vlen); - - // FIXME(gongwb): use append_zerocopy - // iobuf->append_zerocopy(v, vlen, nullptr); - iobuf->append(v, vlen); - destroy(user_data); - return; - } -#endif - - static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, - int k, const char* v, int64_t vlen, - bool in_cuda_pinned, void (*destroy)(void*), - void* user_data) { - if (vlen >= std::numeric_limits::max() || vlen < 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Variable lenght is invalid. Variable name is %s, length is %d.", - varname, vlen)); - } - -#ifdef PADDLE_WITH_BRPC_RDMA - IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, - destroy, user_data); -#else - IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, - user_data); -#endif - } -}; - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, int trainer_id, - const std::string& table_name) { - std::unique_ptr payload; - - request->set_varname(name); - request->set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. 
- if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request->set_profile(platform::kEnableProfiler); - } else { - request->set_profile(platform::kDisableProfiler); - } - } - if (!out_varname.empty()) { - request->set_out_varname(out_varname); - } - if (!table_name.empty()) { - request->set_table_name(table_name); - } - if (var->IsType()) { - request->set_type(::sendrecv::LOD_TENSOR); - payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); - } else if (var->IsType()) { - request->set_type(::sendrecv::SELECTED_ROWS); - payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request->set_type(::sendrecv::NCCL_ID); - const ncclUniqueId& uid = var->Get(); - // TODO(gongwb): use append_zero to avoid data copy. - IOBufWriter::Append(name, iobuf, - sendrecv::VariableMessage::kSerializedFieldNumber, - uid.internal, NCCL_UNIQUE_ID_BYTES); - return; -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS.", - var->Type())); - - // FIXME(gongwb): it seems that can use zero copy. - if (var_is_not_stable) { - IOBufWriter::Append( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size()); - } else { - if (platform::is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - true, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); -#endif - } else { - IOBufWriter::AppendZeroCopy( - name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, - static_cast(payload->ptr()), payload->memory_size(), - false, SerializeDestroyCallback, static_cast(payload.get())); - payload.release(); - } - } - - if (var->IsType()) { - auto* slr = var->GetMutable(); - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type: %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - IOBufWriter::Append(name, iobuf, - ::sendrecv::VariableMessage::kRowsFieldNumber, - reinterpret_cast(slr->rows().data()), - static_cast(rows_memory_size)); - } -} - -void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, - const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - operators::distributed::BRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(iobuf, meta), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h deleted file mode 100644 index a5bdc331eb2..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToIOBuf(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, VarMsg* request, - butil::IOBuf* iobuf, const std::string& out_varname, - bool var_is_not_stable, const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc deleted file mode 100644 index bcf20ad076b..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_serde_test.cc +++ /dev/null @@ -1,175 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "brpc/channel.h" -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/variable_response.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 564 * 128; - - // serialize var to IOBuf - { - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // desrialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); - } -} - -void RunTestLodTensor(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - butil::IOBuf iobuf; - sendrecv::VariableMessage msg; - int tensor_numel = 512 * 8 * 4 * 2; - { - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, - "", false); - } - - // check sendrecv::VariableMessage meta data - { - EXPECT_EQ(msg.varname(), "myvar"); - EXPECT_EQ(msg.type(), 0); - EXPECT_EQ(msg.dims()[0], 512); - EXPECT_EQ(msg.dims()[1], 8); - EXPECT_EQ(msg.dims()[2], 4); - EXPECT_EQ(msg.dims()[3], 2); - EXPECT_EQ(msg.lod_level(), 1); - EXPECT_EQ(msg.lod(0).lod_data(0), 1); - 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); - EXPECT_EQ(msg.lod(0).lod_data(2), 8); - } - - // deserialize - { - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::BRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(iobuf, msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - for (int i = 0; i < tensor_numel; ++i) - EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); - } -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.cc b/paddle/fluid/operators/distributed/brpc/brpc_server.cc deleted file mode 100644 index 5ca26f006bf..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.cc +++ /dev/null @@ -1,417 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#include -#include -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_sendrecvop_utils.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace sendrecv { - -namespace distributed = paddle::operators::distributed; - -typedef std::unordered_map - HandlerMap; - -class BRPCServiceImpl : public SendRecvService { - public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, - distributed::RPCServer* rpc_server) - : rpc_server_(rpc_server) { - VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); - auto it = rpc_call_map.find(distributed::kRequestSend); - if (it != rpc_call_map.end()) { - request_send_h_ = it->second; - send_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestSend))); - } - - it = rpc_call_map.find(distributed::kRequestGet); - if (it != rpc_call_map.end()) { - request_get_h_ = it->second; - get_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGet))); - } - - it = rpc_call_map.find(distributed::kRequestGetNoBarrier); - if (it != rpc_call_map.end()) { - request_getnobarrier_h_ = it->second; - getnobarrier_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestGetNoBarrier))); - } - - it = rpc_call_map.find(distributed::kRequestPrefetch); - if (it != rpc_call_map.end()) { - request_prefetch_h_ = it->second; - prefetch_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestCheckpoint); - if (it != rpc_call_map.end()) { - request_checkpoint_h_ = it->second; - checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( - rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); - if (it != rpc_call_map.end()) { - request_get_monomer_handler_h_ = it->second; - } - - it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); - if (it != rpc_call_map.end()) { - request_get_monomer_barrier_handler_h_ = it->second; - } - } - - virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - send_threads_->Run( - [=] { _SendVariable(cntl_butil, request, response, done); }); - } - - void _SendVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_send_h_, platform::errors::PreconditionNotMet( - "RequestSend handler should be registed first!")); - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestSend var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp(request_send_h_->scope(), - request_send_h_->dev_ctx(), - request_send_h_->distributed_mode()); - PADDLE_ENFORCE_EQ( - resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument("parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = resp.GetVar(); - int 
trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); - } - - void GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) override { - get_threads_->Run( - [=] { _GetVariable(cntl_butil, request, response, done); }); - } - - void GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - getnobarrier_threads_->Run( - [=] { _GetVariableNoBarrier(cntl_butil, request, response, done); }); - } - - void _GetVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_get_h_, platform::errors::PreconditionNotMet( - "RequestGet handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - VLOG(3) << "RequestGet varname:" << varname - << ", out_varname:" << out_varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - auto scope = request_get_h_->scope(); - paddle::framework::Variable* invar = nullptr; - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = nullptr; - - request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf(out_varname, outvar, - *request_get_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void _GetVariableNoBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_getnobarrier_h_, - platform::errors::PreconditionNotMet( - "RequestGetNoBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - std::string out_varname = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(3) << "RequestGetNoBarrier varname:" << varname - << ", out_varname:" << out_varname << ", trainer_id:" << trainer_id - << ", from:" << cntl->remote_side(); - - auto scope = request_getnobarrier_h_->scope(); - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_getnobarrier_h_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - distributed::SerializeToIOBuf( - out_varname, outvar, *request_getnobarrier_h_->dev_ctx(), response, - &cntl->response_attachment(), "", false); - } - } - - void PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - prefetch_threads_->Run( - [=] { _PrefetchVariable(cntl_butil, request, response, done); }); - } - - void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL(request_prefetch_h_, - platform::errors::PreconditionNotMet( - "kRequestPrefetch handler should be registed first!"); - - 
brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // prefetch process... - std::string in_var_name = request->varname(); - std::string out_var_name = request->out_varname(); - VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name - << ", out_var_name: " << out_var_name - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - distributed::BRPCVariableResponse resp( - request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); - - PADDLE_ENFORCE_EQ(resp.Parse(cntl->request_attachment(), *request), 0, - platform::errors::InvalidArgument( - "parse iobuf to tensor error!")); - - auto scope = resp.GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - std::string table_name = request->table_name(); - int trainer_id = request->trainer_id(); - paddle::framework::Variable* outvar = scope->Var(out_var_name); - - request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - distributed::SerializeToIOBuf(out_var_name, outvar, - *request_prefetch_h_->dev_ctx(), response, - &cntl->response_attachment(), "", true); - } - - void CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - checkpoint_notify_threads_->Run( - [=] { _CheckpointNotify(cntl_butil, request, response, done); }); - } - - void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) { - PADDLE_ENFORCE_NOT_NULL( - request_checkpoint_h_, - platform::errors::PreconditionNotMet( - "kRequestCheckpointNotify handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), - request_checkpoint_h_->dev_ctx()); - - auto scope = resp.GetMutableLocalScope(); - - std::string checkpoint_notify = request->varname(); - std::string checkpoint_dir = request->out_varname(); - int trainer_id = request->trainer_id(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir); - } - - void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, - VariableMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_handler_h_, - platform::errors::PreconditionNotMet( - "kRequestGetMonomerVariable handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - // proc request. 
- std::string varname = request->varname(); - VLOG(3) << "GetMonomerVariable " << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, - request->trainer_id()); - - if (outvar) { - distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, - &cntl->response_attachment(), "", false); - } - } - - void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, - const VariableMessage* request, VoidMessage* response, - google::protobuf::Closure* done) override { - PADDLE_ENFORCE_NOT_NULL( - request_get_monomer_barrier_handler_h_, - platform::errors::PreconditionNotMet( - "RequestGetMonomerBarrier handler should be registed first!")); - - brpc::ClosureGuard done_guard(done); - brpc::Controller* cntl = static_cast(cntl_butil); - - std::string varname = request->varname(); - VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname - << ", trainer_id:" << request->trainer_id() - << ", from:" << cntl->remote_side(); - - rpc_server_->WaitVarCond(varname); - distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); - - paddle::framework::Scope* scope = nullptr; - paddle::framework::Variable* invar = nullptr; - paddle::framework::Variable* outvar = nullptr; - - request_get_monomer_barrier_handler_h_->Handle( - varname, scope, invar, &outvar, request->trainer_id()); - } - - private: - distributed::RequestHandler* request_send_h_{nullptr}; - distributed::RequestHandler* request_get_h_{nullptr}; - distributed::RequestHandler* request_getnobarrier_h_{nullptr}; - distributed::RequestHandler* request_prefetch_h_{nullptr}; - distributed::RequestHandler* request_checkpoint_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; - distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; - - distributed::RPCServer* rpc_server_{nullptr}; - - // FIXME(gongwb): brpc should support process one rpc use one threadpool. - std::unique_ptr send_threads_; - std::unique_ptr get_threads_; - std::unique_ptr getnobarrier_threads_; - std::unique_ptr prefetch_threads_; - std::unique_ptr checkpoint_notify_threads_; -}; -} // namespace sendrecv - -namespace paddle { -namespace operators { -namespace distributed { - -void AsyncBRPCServer::StartServer() { - // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); - - // Add the service into server. Notice the second parameter, because the - // service is put on stack, we don't want server to delete it, otherwise - // use brpc::SERVER_OWNS_SERVICE. 
- if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to add service into BRPC server.")); - return; - } - - brpc::ServerOptions options; -#ifdef PADDLE_WITH_BRPC_RDMA - options.use_rdma = true; -#endif - options.idle_timeout_sec = idle_timeout_s_; - options.max_concurrency = max_concurrency_; - if (server_.Start(bind_address_.c_str(), &options) != 0) { - PADDDLE_THROW(platform::errors::Unavailable( - "Failed to start EchoServer %s.", bind_address_)); - return; - } - - butil::EndPoint ep = server_.listen_address(); - selected_port_ = ep.port; - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - server_.Join(); -} - -void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } - -void AsyncBRPCServer::WaitServerReady() { - VLOG(3) << "AsyncGRPCServer is wait server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(3) << "AsyncGRPCServer WaitSeverReady"; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_server.h b/paddle/fluid/operators/distributed/brpc/brpc_server.h deleted file mode 100644 index 78bbe5adc08..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_server.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include // NOLINT -#include // NOLINT -#include - -#include "brpc/server.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class AsyncBRPCServer final : public RPCServer { - public: - explicit AsyncBRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncBRPCServer() {} - void StartServer() override; - void WaitServerReady() override; - - private: - void ShutDownImpl() override; - - brpc::Server server_; - - static constexpr int idle_timeout_s_ = -1; - static constexpr int max_concurrency_ = 0; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - int ready_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc deleted file mode 100644 index 49521e8a770..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.cc +++ /dev/null @@ -1,75 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
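For reference, StartServer and WaitServerReady above coordinate through a mutex and condition variable so that callers block until the brpc server has bound its port. A minimal standalone sketch of that handshake, using only the C++ standard library (the ReadyFlag name is illustrative, not taken from the deleted sources):

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

class ReadyFlag {  // hypothetical name, mirrors ready_/mutex_ready_/condition_ready_ above
 public:
  // Called by the server thread once the listen port is bound.
  void SetReady() {
    {
      std::lock_guard<std::mutex> lock(mutex_);
      ready_ = true;
    }
    cv_.notify_all();
  }
  // Called by other threads; blocks until SetReady() has run.
  void Wait() {
    std::unique_lock<std::mutex> lock(mutex_);
    cv_.wait(lock, [this] { return ready_; });
  }

 private:
  std::mutex mutex_;
  std::condition_variable cv_;
  bool ready_ = false;
};

int main() {
  ReadyFlag flag;
  std::thread server([&] {
    // ... bind the port and start serving here ...
    flag.SetReady();
  });
  flag.Wait();  // returns only after the server thread signalled readiness
  std::cout << "server ready\n";
  server.join();
  return 0;
}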
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// - -#include "paddle/fluid/operators/distributed/brpc/brpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -namespace paddle { -namespace operators { -namespace distributed { - -namespace pb = ::google::protobuf; -using vr = ::sendrecv::VariableMessage; - -int BRPCVariableResponse::Parse(Source* source) { - pb::io::ZeroCopyInputStream* input_stream = source->contents(); - pb::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (1) { - unsigned int tag = 0; - if (!input.ReadLittleEndian32(&tag)) { - break; - } - - uint64_t num_bytes = 0; - if (!input.ReadLittleEndian64(&num_bytes)) { - break; - } - - int field = static_cast(tag); - int ret = field == 0 ? -1 : field; - switch (field) { - case vr::kSerializedFieldNumber: { - if (!ProcSerializedField(field, &input, num_bytes)) { - return ret; - } - break; - } - case vr::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return ret; - } - break; - } - default: { - PADDLE_THROW(platform::errors::Unavailable( - "not surpported %u fieldnumber", field)); - return ret; - } - } - } - - return 0; -} -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h deleted file mode 100644 index 6282f08a725..00000000000 --- a/paddle/fluid/operators/distributed/brpc/brpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
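The Parse loop above consumes records framed as a little-endian 32-bit field number, a little-endian 64-bit payload length, and then the payload bytes, dispatching on the field number. A small sketch of that framing in plain C++ (helper names are hypothetical; it assumes a little-endian host for brevity):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>

// Append one framed record: [field : u32 LE][length : u64 LE][payload bytes].
void AppendRecord(uint32_t field, const std::string& payload, std::string* buf) {
  uint64_t len = payload.size();
  buf->append(reinterpret_cast<const char*>(&field), sizeof(field));
  buf->append(reinterpret_cast<const char*>(&len), sizeof(len));
  buf->append(payload);
}

// Walk the buffer and report the field number and payload size of each record.
void ReadRecords(const std::string& buf) {
  size_t pos = 0;
  while (pos + sizeof(uint32_t) + sizeof(uint64_t) <= buf.size()) {
    uint32_t field = 0;
    uint64_t len = 0;
    std::memcpy(&field, buf.data() + pos, sizeof(field));
    std::memcpy(&len, buf.data() + pos + sizeof(field), sizeof(len));
    pos += sizeof(field) + sizeof(len);
    std::cout << "field " << field << ", " << len << " payload bytes\n";
    pos += len;  // the real code hands the payload to a type-specific handler here
  }
}

int main() {
  std::string buf;
  AppendRecord(/*field=*/7, "serialized tensor bytes", &buf);
  ReadRecords(buf);
  return 0;
}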
- -#pragma once - -#include - -#include "brpc/channel.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" - -#include "paddle/fluid/operators/distributed/distributed_pb.h" - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class BRPCSourceWrapper : public Source { - public: - explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return &source_; - } - - private: - butil::IOBufAsZeroCopyInputStream source_; -}; - -class BRPCVariableResponse : public VariableResponse { - public: - BRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~BRPCVariableResponse() {} - - // parse attachment from iobuf - int Parse(Source* source) override; - int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { - BRPCSourceWrapper wrapper(iobuf); - return VariableResponse::Parse(&wrapper, meta); - } -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc deleted file mode 100644 index fcd3e6abead..00000000000 --- a/paddle/fluid/operators/distributed/collective_client.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/collective_client.h" -#include -#include "gflags/gflags.h" - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { -std::once_flag CollectiveClient::init_flag_; -std::unique_ptr CollectiveClient::client_(nullptr); - -bool CollectiveClient::Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, - framework::Scope* scope, int64_t time_out) { - for (auto r : remote_vars) { - VLOG(50) << "begin gather from ep:" << r.String(); - scope->Var(r.var_name_)->GetMutable(); - VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( - r.ep_, ctx, *scope, r.var_name_, time_out); - } - - rpc_client_->Wait(); - - for (auto r : remote_vars) { - auto select_rows = - scope->FindVar(r.var_name_)->GetMutable(); - dst->push_back(select_rows); - - VLOG(4) << "gather from ep:" << r.String() - << ", select_rows:" << GetSelectedRowsInfo(*select_rows); - - rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); - } - - rpc_client_->Wait(); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h deleted file mode 100644 index e7d8bb8df98..00000000000 --- a/paddle/fluid/operators/distributed/collective_client.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class SelectedRows; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); - -namespace paddle { -namespace operators { -namespace distributed { - -inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { - std::stringstream ss; - ss << ", height:" << slr.height() << ", rows:["; - for (unsigned int i = 0; i < slr.rows().size(); i++) { - if (i != slr.rows().size() - 1) { - ss << slr.rows()[i] << ","; - } else { - ss << slr.rows()[i]; - } - } - ss << "], dims:" << slr.value().dims(); - return ss.str(); -} - -struct RemoteVar { - std::string ep_; - std::string var_name_; - int trainer_id_{0}; - - std::string String() { - std::stringstream ss; - ss << "ep:" << ep_ << ", var_name:" << var_name_ - << ", trainer_id:" << trainer_id_; - - return ss.str(); - } -}; - -class CollectiveClient { - public: - CollectiveClient() { - rpc_client_.reset(new RPCCLIENT_T()); - rpc_client_->InitImpl(); - } - virtual ~CollectiveClient() {} - - // note this function will retain the rank order. - bool Gather(const std::vector& remote_vars, - std::vector* dst, - const platform::DeviceContext& ctx, framework::Scope* scope, - int64_t time_out = FLAGS_rpc_deadline); - - static CollectiveClient* GetInstance() { - std::call_once(init_flag_, [&]() { - if (client_.get() == nullptr) { - client_.reset(new CollectiveClient()); - } - }); - return client_.get(); - } - - private: - std::unique_ptr rpc_client_; - - static std::once_flag init_flag_; - static std::unique_ptr client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc deleted file mode 100644 index cdd37742d2d..00000000000 --- a/paddle/fluid/operators/distributed/collective_server.cc +++ /dev/null @@ -1,68 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/distributed/collective_server.h" -#include - -DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag CollectiveServer::init_flag_; -std::shared_ptr CollectiveServer::collective_server_(nullptr); - -CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { - VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; - rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); -} - -void CollectiveServer::Stop() { - rpc_server_->ShutDown(); - server_thread_->join(); - loop_thread_->join(); -} - -void CollectiveServer::StartServer() { - get_monomer_handler_.reset(new GetMonomerHandler()); - get_monomer_handler_->SetRPCServer(rpc_server_.get()); - - get_barrier_handler_.reset(new GetMonomerBarrierHandler()); - get_barrier_handler_->SetRPCServer(rpc_server_.get()); - - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, - get_monomer_handler_.get(), - FLAGS_collective_get_thread_num); - rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, - get_barrier_handler_.get(), 1); - - server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); - rpc_server_->WaitServerReady(); - - loop_thread_.reset(new std::thread([&]() { - while (true) { - if (rpc_server_->IsExit()) { - LOG(WARNING) << "get exit!rpc_processor break!"; - break; - } - sleep(1); - } - VLOG(1) << "CollectiveServer loop_thread end"; - })); -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h deleted file mode 100644 index 49649232860..00000000000 --- a/paddle/fluid/operators/distributed/collective_server.h +++ /dev/null @@ -1,116 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class CollectiveServer; - -class GetMonomerHandler final : public RequestHandler { - public: - GetMonomerHandler() : RequestHandler(true) {} - virtual ~GetMonomerHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - *outvar = scope->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL( - outvar, platform::errors::NotFound("var: %s is not found.", var_name)); - - return true; - } -}; - -class GetMonomerBarrierHandler final : public RequestHandler { - public: - GetMonomerBarrierHandler() : RequestHandler(true) {} - virtual ~GetMonomerBarrierHandler() {} - bool Handle(const std::string& var_name, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override { - VLOG(50) << "GetMonomerHandler recv " << var_name; - - rpc_server_->IncreaseVarBarrier(var_name); - - return true; - } -}; - -class CollectiveServer final { - public: - explicit CollectiveServer(const std::string& end_point, int fan_in); - - virtual ~CollectiveServer() {} - - void StartServer(); - - static CollectiveServer* GetInstance(const std::string& end_point, - int fan_in) { - std::call_once(init_flag_, [&]() { - if (collective_server_.get() == nullptr) { - collective_server_.reset(new CollectiveServer(end_point, fan_in)); - collective_server_->StartServer(); - } - }); - - return collective_server_.get(); - } - - std::shared_ptr GetRPCServer() { return rpc_server_; } - - void Stop(); - - private: - std::unique_ptr get_monomer_handler_; - std::unique_ptr get_barrier_handler_; - - std::shared_ptr rpc_server_; - std::shared_ptr server_thread_; - std::shared_ptr loop_thread_; - - bool ready_{false}; - - static std::once_flag init_flag_; - static std::shared_ptr collective_server_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc deleted file mode 100644 index 92b2eb4b51e..00000000000 --- a/paddle/fluid/operators/distributed/collective_server_test.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
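GetInstance above guards construction with std::call_once so that concurrent callers share one server (or client) object. A minimal standalone sketch of that idiom (the Service name is illustrative):

#include <iostream>
#include <memory>
#include <mutex>
#include <thread>
#include <vector>

class Service {  // hypothetical stand-in for CollectiveServer / CollectiveClient
 public:
  static Service* GetInstance() {
    std::call_once(init_flag_, [] { instance_.reset(new Service()); });
    return instance_.get();
  }

 private:
  Service() { std::cout << "constructed once\n"; }
  static std::once_flag init_flag_;
  static std::unique_ptr<Service> instance_;
};

std::once_flag Service::init_flag_;
std::unique_ptr<Service> Service::instance_;

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i)
    threads.emplace_back([] { Service::GetInstance(); });  // prints exactly once
  for (auto& t : threads) t.join();
  return 0;
}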
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/operators/distributed/collective_client.h" -#include "paddle/fluid/operators/distributed/collective_server.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -} // namespace paddle - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -std::unique_ptr StartServer( - const std::string& ep, int fan_in, framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveServer* server = - distributed::CollectiveServer::GetInstance(ep, fan_in); - - auto rpc_server = server->GetRPCServer(); - rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, - scope, dev_ctx); - - std::cout << "StartServer return" << std::endl; - return std::unique_ptr(server); -} - -std::unique_ptr GenerateVars(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - auto* slr = var->GetMutable(); - slr->set_height(20000); - - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - - tensor->Resize(framework::make_ddim({3, 1024})); - tensor->mutable_data(place); - - paddle::operators::math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 3; ++i) rows->push_back(i); - - std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); - - return std::unique_ptr(scope); -} - -void Gather(const std::vector& vars, - platform::DeviceContext* dev_ctx) { - distributed::CollectiveClient* client = - distributed::CollectiveClient::GetInstance(); - - framework::Scope* scope = new framework::Scope(); - framework::Variable* var = scope->Var("var1"); - var->GetMutable(); - - std::vector dst; - client->Gather(vars, &dst, *dev_ctx, scope); - std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); - dev_ctx->Wait(); - - ASSERT_EQ(dst[0]->value().dims(), framework::make_ddim({3, 1024})); - ASSERT_EQ(dst[0]->height(), 20000); - ASSERT_EQ(dst[0]->rows().size(), static_cast(3)); - for (int i = 0; i < 3; i++) { - ASSERT_EQ(dst[0]->rows()[i], i); - } - - std::vector vec; - TensorToVector(dst[0]->value(), *dev_ctx, &vec); - for (size_t i = 0; i < 3 * 1024; i++) { - ASSERT_FLOAT_EQ(vec[i], 32.7); - } -} - -TEST(CollectiveServer, GPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - platform::CUDAPlace place; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - std::string ep = "127.0.0.1:7164"; - auto scope = GenerateVars(place); - - auto* v1 = scope->FindVar("var1"); - std::cout << "var1:" << v1 << std::endl; - - auto server = StartServer(ep, 2, scope.get(), &ctx); - auto rpc_server = server->GetRPCServer(); - - distributed::RemoteVar var; - var.ep_ = ep; - var.var_name_ = "var1"; - var.trainer_id_ = 0; - - std::vector vars{var}; - Gather(vars, &ctx); - Gather(vars, &ctx); - - std::cout << "begin WaitVarBarrier" << std::endl; - rpc_server->WaitVarBarrier("var1"); - rpc_server->ClearRegisteredVars(); - server->Stop(); - - scope.release(); - server.release(); -} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc deleted file mode 100644 
index 4ee27a64146..00000000000 --- a/paddle/fluid/operators/distributed/communicator.cc +++ /dev/null @@ -1,989 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/distributed/communicator.h" - -#include - -#include -#include // NOLINT -#include -#include // NOLINT -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using Tree = - std::map>>; -using RpcCtxMap = operators::distributed::RpcCtxMap; - -inline double GetCurrentUS() { - struct timeval time; - gettimeofday(&time, NULL); - return 1e+6 * time.tv_sec + time.tv_usec; -} - -Communicator::Communicator() {} - -std::once_flag Communicator::init_flag_; -std::shared_ptr Communicator::communicator_(nullptr); - -void AsyncCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - if (send_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be send, will not start send_thread"; - } else { - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - if (iter.first == STEP_COUNTER && !need_global_step_) continue; - send_varname_to_queue_[iter.first] = - std::make_shared>>( - send_queue_size_); - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - InitParams(); -} - -void AsyncCommunicator::InitParams() { RecvNoBarrier(); } - -AsyncCommunicator::~AsyncCommunicator() { - running_ = false; - if (main_thread_) main_thread_->join(); -} - -void AsyncCommunicator::SendGlobalStep(int batches) { - if (!need_global_step_) { - return; - } - - if (batches == 0) { - return; - } - - auto &var_name = STEP_COUNTER; - auto *out_var = send_scope_->Var(var_name); - auto *out_t = out_var->GetMutable(); - auto *data = out_t->mutable_data({1}, platform::CPUPlace()); - data[0] = static_cast(batches); - - auto &ctx = send_varname_to_ctx_.at(var_name); - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); -} - -void AsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - 
task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - std::vector> vars; - - int merged_var_num = 0; - int wait_times = 0; - while (merged_var_num < max_merge_var_num_) { - if (var_queue->Size() == 0) { - VLOG(4) << "wait_times -> " << wait_times; - if (wait_times >= send_wait_times_) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } else { - wait_times = 0; - - vars.push_back(var_queue->Pop()); - merged_var_num++; - } - } - auto before_merge = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - SendGlobalStep(merged_var_num); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge and send " << merged_var_num << " " << var_name - << " use time " << after_merge - before_merge; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << merged_var_num << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - after_merge; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void HalfAsyncCommunicator::SendByCommunicator() { - std::vector> task_futures; - task_futures.reserve(send_varname_to_ctx_.size()); - VLOG(3) << "run send graph"; - - int batches = BatchesCounter(); - if (batches <= 0) return; - - auto before_run_send_graph = GetCurrentUS(); - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - auto send_task = [this, batches, &var_name, &var_queue] { - VLOG(3) << var_name << " merge and send; "; - auto before_task = GetCurrentUS(); - std::vector> vars; - vars.reserve(batches); - - for (int i = 0; i < batches; ++i) { - vars.push_back(var_queue->Pop()); - } - - if (var_name == STEP_COUNTER) { - SendGlobalStep(batches); - auto end_task = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << end_task - before_task; - return; - } - - auto &ctx = send_varname_to_ctx_.at(var_name); - - auto before_merge = GetCurrentUS(); - MergeVars(var_name, vars, send_scope_.get(), ctx.merge_add); - auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << batches << " " << var_name << " use time " - << after_merge - before_merge; - - auto send_functor = distributed::ParameterSend(); - send_functor(ctx, *send_scope_, true, 1); - auto after_send = 
GetCurrentUS(); - VLOG(3) << "send " << var_name << " use time " - << after_send - before_task; - - if (var_name.rfind("@GRAD") != var_name.size() - 5) return; - - auto recv_param = var_name.substr(0, var_name.size() - 5); - if (recv_varname_to_ctx_.find(recv_param) == recv_varname_to_ctx_.end()) - return; - - auto recv_functor = distributed::ParameterRecv(); - recv_functor(recv_varname_to_ctx_.at(recv_param), *recv_scope_); - auto after_recv = GetCurrentUS(); - VLOG(3) << "recv " << recv_param << " use time " - << after_recv - after_send; - return; - }; - task_futures.emplace_back(send_threadpool_->enqueue(std::move(send_task))); - } - for (auto &task_f : task_futures) { - task_f.wait(); - } - auto after_run_send_graph = GetCurrentUS(); - - VLOG(3) << "run send graph use time " - << (after_run_send_graph - before_run_send_graph); -} - -void AsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void HalfAsyncCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - SendByCommunicator(); - BarrierSend(); - RecvByCommunicator(); - BarrierRecv(); - BarrierWeakUp(); - } - VLOG(3) << "communicator stopped, send thread exit"; -} - -void AsyncCommunicator::RecvByCommunicator() { - VLOG(3) << "parallel run recv graph"; - if (!running_) return; - RecvNoBarrier(); - VLOG(3) << "run recv graph use time"; -} - -void AsyncCommunicator::RecvNoBarrier() { - std::vector> task_futures; - task_futures.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto recv_task = [this, &iter] { - auto before_task = GetCurrentUS(); - auto &var_name = iter.first; - auto recv_functor = distributed::ParameterRecv(); - recv_functor(iter.second, *recv_scope_); - auto end_task = GetCurrentUS(); - VLOG(1) << "recv var " << var_name << " use time " - << (end_task - before_task); - }; - task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : task_futures) { - task.wait(); - } -} - -void AsyncCommunicator::Start() { - VLOG(3) << "Communicator start"; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - VLOG(3) << "start send thread and recv thread"; - waiting_ = true; - running_ = true; - BarrierTriggerReset(max_merge_var_num_); - // start send and recv thread - main_thread_.reset( - new std::thread(std::bind(&AsyncCommunicator::MainThread, this))); - } -} - -void AsyncCommunicator::Stop() { - VLOG(3) << "Communicator stop"; - running_ = false; - if (!communicator_) { - VLOG(0) << "Communicator is not inited, do nothing"; - } else { - if (main_thread_) { - VLOG(3) << "stop send thread"; - main_thread_->join(); - main_thread_.reset(nullptr); - } - } - VLOG(3) << "Communicator stop done"; -} - -void AsyncCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - - if (table_name == STEP_COUNTER && !need_global_step_) return; - 
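SendByCommunicator above batches pending gradients by popping at most max_merge_var_num_ entries from a per-variable queue and backing off for 10 ms, up to send_wait_times_ polls, when the queue runs dry. A simplified, single-threaded sketch of that bounded drain (the function name and the plain std::deque are stand-ins for the real blocking queue):

#include <chrono>
#include <deque>
#include <iostream>
#include <thread>
#include <vector>

// Pop up to `max_merge` items; give up after `max_waits` consecutive empty polls.
std::vector<int> DrainForMerge(std::deque<int>* queue, int max_merge, int max_waits) {
  std::vector<int> merged;
  int waits = 0;
  while (static_cast<int>(merged.size()) < max_merge) {
    if (queue->empty()) {
      if (++waits > max_waits) break;
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      continue;
    }
    waits = 0;
    merged.push_back(queue->front());
    queue->pop_front();
  }
  return merged;
}

int main() {
  std::deque<int> queue = {1, 2, 3};
  auto batch = DrainForMerge(&queue, /*max_merge=*/20, /*max_waits=*/5);
  std::cout << "merged " << batch.size() << " pending gradients\n";  // prints 3
  return 0;
}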
- auto before_send_op = GetCurrentUS(); - auto &queue = send_varname_to_queue_.at(table_name); - - if (table_name == STEP_COUNTER) { - auto tmp_var = std::make_shared(); - auto *tensor = tmp_var->GetMutable(); - tensor->Resize(framework::make_ddim({1})); - auto *out_d = tensor->mutable_data(platform::CPUPlace()); - out_d[0] = 1; - queue->Push(tmp_var); - } else { - PADDLE_ENFORCE_GE(var_names.size(), 1, - platform::errors::InvalidArgument( - "var_names.size() >= 1 is permitted")); - - auto *var = scope.FindVar(var_names[0]); - - PADDLE_ENFORCE_EQ( - var->IsInitialized(), true, - platform::errors::InvalidArgument("grad var should be inited")); - - auto tmp_var = std::make_shared(); - if (var->IsType()) { - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else if (var->IsType()) { - // push var into send queue by var_name - auto var_name = var_names[0]; - framework::CopyVariable(*var, tmp_var.get()); - queue->Push(tmp_var); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown var type to copy, only support LoDTensor/SelectedRows")); - } - } - auto after_send_op = GetCurrentUS(); - VLOG(3) << "send to " << table_name << " with queue size " << queue->Size() - << ", use time " << (after_send_op - before_send_op); -} - -void HalfAsyncCommunicator::Clean() { - for (auto &iter : send_varname_to_queue_) { - auto &var_name = iter.first; - auto &var_queue = iter.second; - - while (var_queue->Size() > 0) { - var_queue->Pop(); - } - - VLOG(3) << "clean var: " << var_name << " done"; - } -} - -int HalfAsyncCommunicator::BatchesCounter() { - while (running_) { - if (barrier_counter_.load() >= barrier_trigger_.load() && - barrier_trigger_.load() != 0) { - break; - } else { - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - } - } - - return barrier_counter_.load(); -} - -void HalfAsyncCommunicator::Barrier() { - barrier_counter_++; - - if (!running_) { - VLOG(3) << "Communicator is not running, release barrier"; - return; - } - - { - std::unique_lock lk(barrier_mutex_); - barrier_cond_.wait(lk, [this] { return (barrier_counter_ == 0); }); - } -} - -void HalfAsyncCommunicator::BarrierTriggerDecrement() { - barrier_trigger_--; - VLOG(3) << "BarrierTriggerDecrement decrement barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierTriggerReset(int initial_val) { - barrier_trigger_.store(initial_val); - - VLOG(3) << "BarrierTriggerReset reset barrier trigger to " - << barrier_trigger_.load(); -} - -void HalfAsyncCommunicator::BarrierWeakUp() { - barrier_counter_.store(0); - barrier_cond_.notify_all(); -} - -void SyncCommunicator::BarrierSend() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendBatchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } - - VLOG(4) << "BarrierSend with SyncCommunicator"; -} - -void SyncCommunicator::BarrierRecv() { - if (!running_) return; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id_); - - std::vector rets; - for (auto &ep : pserver_endpoints_) { - rets.push_back(rpc_client->AsyncSendFetchBarrier(ep)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::External( - "internal error in RPCClient")); - } 
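The half-async barrier above is a counter that trainer threads increment and then wait on; the communicator releases them by resetting the counter and notifying the condition variable (BarrierWeakUp, i.e. waking the waiters up). A minimal standalone rendezvous in the same shape (names are illustrative):

#include <atomic>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

std::atomic<int> counter{0};
std::mutex mu;
std::condition_variable cv;

void WorkerBarrier() {
  counter++;  // analogous to barrier_counter_++
  std::unique_lock<std::mutex> lk(mu);
  cv.wait(lk, [] { return counter.load() == 0; });  // released by the communicator
}

void CommunicatorRelease(int expected) {
  while (counter.load() < expected) std::this_thread::yield();  // BatchesCounter-style wait
  // ... merge and send the collected gradients here ...
  {
    std::lock_guard<std::mutex> lk(mu);
    counter.store(0);  // reset under the lock so no waiter misses the wakeup
  }
  cv.notify_all();
}

int main() {
  std::vector<std::thread> workers;
  for (int i = 0; i < 3; ++i) workers.emplace_back(WorkerBarrier);
  CommunicatorRelease(/*expected=*/3);
  for (auto& t : workers) t.join();
  std::cout << "all workers released\n";
  return 0;
}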
- - VLOG(4) << "BarrierRecv with SyncCommunicator"; -} - -void GeoCommunicator::InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) { - send_varname_to_ctx_ = std::move(send_varname_to_ctx); - recv_varname_to_ctx_ = std::move(recv_varname_to_ctx); - recv_scope_ = std::move(recv_scope); - - PADDLE_ENFORCE_GT( - send_varname_to_ctx.size(), 0, - platform::errors::InvalidArgument("send var contexts can not be zero")); - - send_scope_.reset(new Scope()); - for (auto &iter : send_varname_to_ctx_) { - auto &varname = iter.first; - - if (varname == STEP_COUNTER) { - send_varname_to_queue_[varname] = - std::make_shared>>( - send_queue_size_); - } else { - auto &send_ctx = iter.second; - - send_var_nums_ += send_ctx.splited_varnames.size(); - if (!send_ctx.is_sparse) { - continue; - } - int pserver_num = static_cast(send_ctx.epmap.size()); - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - sparse_id_queues_.insert( - std::pair>>>>( - send_ctx.splited_varnames[ep_idx], - std::make_shared< - BlockingQueue>>>( - send_queue_size_))); - } - } - } - send_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - - if (recv_varname_to_ctx.size() == 0) { - VLOG(0) << "nothing need to be received, will not start recv_thread"; - } else { - recv_threadpool_.reset(new ::ThreadPool(thread_pool_size_)); - } - - delta_scope_.reset(new Scope()); - old_scope_.reset(new Scope()); - pserver_scope_.reset(new Scope()); - - InitParams(); -} - -void GeoCommunicator::Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) { - waiting_ = false; - PADDLE_ENFORCE_EQ( - var_tables.size(), 1, - platform::errors::InvalidArgument("var_tables.size() == 1 is permitted")); - - auto table_name = var_tables[0]; - if (table_name == STEP_COUNTER) return; - - auto before_send = GetCurrentUS(); - size_t splited_var_nums = - send_varname_to_ctx_[table_name].splited_varnames.size(); - - std::unordered_map> ids_table; - - for (size_t j = 0; j < splited_var_nums; j++) { - ids_table.insert(std::pair>( - send_varname_to_ctx_[table_name].splited_varnames[j], - std::unordered_set())); - } - auto *var = scope.FindVar(var_names[0]); - auto &rows = var->Get().rows(); - - // insert ids which has not been record - for (size_t j = 0; j < rows.size(); j++) { - auto ep_idx = rows[j] % splited_var_nums; - ids_table.at(send_varname_to_ctx_[table_name].splited_varnames[ep_idx]) - .insert(rows[j]); - } - - auto before_push = GetCurrentUS(); - for (auto &iter : ids_table) { - auto &key = iter.first; - auto &sparse_ids_set = iter.second; - auto sparse_ids_vec = std::make_shared>(); - sparse_ids_vec->assign(sparse_ids_set.begin(), sparse_ids_set.end()); - sparse_id_queues_.at(key)->Push(sparse_ids_vec); - VLOG(3) << "push " << sparse_ids_vec->size() << " ids to " << key - << "'s queue"; - } - auto after_send = GetCurrentUS(); - VLOG(3) << "run send " << table_name << " op finish. 
using " - << (before_push - before_send) << "; " << (after_send - before_push); -} - -void GeoCommunicator::MainThread() { - VLOG(3) << "MainThread start and wait"; - - while (waiting_ && running_) { - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - VLOG(3) << "wait for running"; - } - - while (running_) { - std::vector> tasks; - tasks.reserve(send_var_nums_); - - for (auto &iter : send_varname_to_ctx_) { - auto &var_name = iter.first; - auto &send_ctx = iter.second; - int pserver_num = static_cast(send_ctx.epmap.size()); - if (send_ctx.is_sparse) { - for (int ep_idx = 0; ep_idx < pserver_num; ep_idx++) { - auto send_recv_task = [this, ep_idx, &var_name] { - auto before_send_sparse = GetCurrentUS(); - if (var_name == STEP_COUNTER) { - return; - } - auto send_varname = - send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx]; - auto sparse_ids = MergeSparseIds(send_varname); - if (sparse_ids.size() == 0) { - return; - } - SendSparse(var_name, ep_idx, sparse_ids); - auto after_send_sparse = GetCurrentUS(); - RecvSparse(var_name, ep_idx); - auto after_recv_sparse = GetCurrentUS(); - VLOG(3) - << "send recv " - << send_varname_to_ctx_.at(var_name).splited_varnames[ep_idx] - << " finish, using " << (after_send_sparse - before_send_sparse) - << " and " << (after_recv_sparse - after_send_sparse) - << "; total = " << (after_recv_sparse - before_send_sparse); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } else { - auto send_recv_task = [this, &var_name, &send_ctx] { - if (var_name == STEP_COUNTER) { - return; - } - SendDense(var_name); - RecvDense(var_name); - }; - tasks.emplace_back( - send_threadpool_->enqueue(std::move(send_recv_task))); - } - } - for (auto &task : tasks) { - task.wait(); - } - } -} - -std::vector GeoCommunicator::MergeSparseIds( - const std::string &send_varname) { - size_t merge_num = 0, wait_times = 0; - std::unordered_set sparse_ids; - while (merge_num < static_cast(max_merge_var_num_)) { - VLOG(3) << "Merge Number of " << send_varname << " = " << merge_num; - if (sparse_id_queues_.at(send_varname)->Size() > 0) { - wait_times = 0; - std::shared_ptr> pop_ids = - sparse_id_queues_.at(send_varname)->Pop(); - for (size_t j = 0; j < pop_ids->size(); j++) { - sparse_ids.insert(pop_ids->at(j)); - } - merge_num += 1; - VLOG(3) << "sparse_id_queues_(" << send_varname << ") pushed"; - } else if (sparse_id_queues_.at(send_varname)->Size() == 0) { - VLOG(3) << "wait_times -> " << wait_times; - if (wait_times >= static_cast(send_wait_times_)) { - break; - } - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - wait_times++; - continue; - } - } - std::vector res; - res.assign(sparse_ids.begin(), sparse_ids.end()); - return res; -} -void GeoCommunicator::SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids) { - auto &rpc_ctx = send_varname_to_ctx_.at(varname); - auto send_varname = rpc_ctx.splited_varnames[ep_idx]; - auto trainer_id = rpc_ctx.trainer_id; - auto endpoint = rpc_ctx.epmap[ep_idx]; - auto pserver_num = rpc_ctx.epmap.size(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - auto &t_latest = var_latest->Get(); - - auto dims1 = t_latest.dims()[1]; - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(send_varname); - auto *t_delta = var_delta->GetMutable(); - - auto *t_value = 
t_delta->mutable_value(); - t_value->mutable_data( - framework::make_ddim({static_cast(sparse_ids.size()), dims1}), - cpu_ctx.GetPlace()); - - std::vector *>> values; - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(sparse_ids, {"Param"}, &values); - - auto blas = math::GetBlas(cpu_ctx); - float coefficient = 1.0 / static_cast(trainers_); - - for (auto j = 0; j < static_cast(sparse_ids.size()); ++j) { - blas.VSUB(dims1, t_latest.data() + sparse_ids[j] * dims1, - values[j][0]->data(), t_value->data() + j * dims1); - blas.SCAL(dims1, coefficient, t_value->data() + j * dims1); - blas.VADD(dims1, values[j][0]->data(), t_value->data() + j * dims1, - values[j][0]->data()); - } - - std::vector send_rows; - send_rows.reserve(sparse_ids.size()); - for (auto idx : sparse_ids) { - send_rows.push_back(idx / pserver_num); - } - t_delta->set_height(rpc_ctx.height_sections[ep_idx]); - t_delta->set_rows(send_rows); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_send = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(trainer_id); - - auto ret = rpc_client->AsyncSendVar(endpoint, cpu_ctx_send, - *delta_scope_.get(), send_varname); - ret->Wait(); -} - -void GeoCommunicator::SendDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ(var_latest->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - PADDLE_ENFORCE_EQ(var_timestamp->IsInitialized(), true, - platform::errors::Unavailable( - "%s is not initialized, please check", varname)); - - auto &t_latest = var_latest->Get(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest.dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest.numel(), t_latest.data(), - t_timestamp->data(), t_delta->data()); - - float coefficient = 1.0 / static_cast(trainers_); - blas.SCAL(t_latest.numel(), coefficient, t_delta->data()); - - blas.VADD(t_latest.numel(), t_timestamp->data(), - t_delta->data(), t_timestamp->data()); - - auto &ctx = send_varname_to_ctx_.at(varname); - auto send = distributed::ParameterSend(); - send(ctx, *delta_scope_, true, 1); -} - -void GeoCommunicator::RecvByCommunicator() { return; } - -void GeoCommunicator::RecvSparse(const std::string &varname, int ep_idx) { - auto train_id = recv_varname_to_ctx_.at(varname).trainer_id; - auto endpoint = recv_varname_to_ctx_.at(varname).epmap[ep_idx]; - auto splited_var_name = - recv_varname_to_ctx_.at(varname).splited_varnames[ep_idx]; - auto pserver_num = recv_varname_to_ctx_.at(varname).epmap.size(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx_recv = *pool.Get(platform::CPUPlace()); - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(train_id); - - auto *var_psrever = pserver_scope_->Var(splited_var_name); - auto handle = rpc_client->AsyncGetVar(endpoint, cpu_ctx_recv, - *pserver_scope_.get(), splited_var_name, - splited_var_name, splited_var_name); - handle->Wait(); - - auto *var_latest = recv_scope_->FindVar(varname); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in 
pserver scope is not initialized, please check", varname)); - - std::vector ids; - ids.assign(var_psrever->Get().rows().begin(), - var_psrever->Get().rows().end()); - - for (size_t j = 0; j < ids.size(); j++) { - ids[j] = ids[j] * pserver_num + ep_idx; - } - - VLOG(3) << "RecvSparse receive var: " << splited_var_name - << " ids Size: " << ids.size(); - - auto t_psrever = var_psrever->Get().value(); - - std::vector *>> old_values; - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Get(ids, {"Param"}, &old_values); - - auto *t_latest = var_latest->GetMutable(); - - auto dims1 = t_latest->dims()[1]; - auto numel = ids.size() * dims1; - - std::vector v_delta; - v_delta.resize(numel); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto blas = math::GetBlas(cpu_ctx); - - for (auto j = 0; j < static_cast(ids.size()); ++j) { - blas.VSUB(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data(), v_delta.data() + j * dims1); - blas.VADD(dims1, t_latest->data() + ids[j] * dims1, - v_delta.data() + j * dims1, - t_latest->data() + ids[j] * dims1); - blas.VCOPY(dims1, t_psrever.data() + j * dims1, - old_values[j][0]->data()); - } -} - -void GeoCommunicator::RecvDense(const std::string &varname) { - auto *var_latest = recv_scope_->FindVar(varname); - auto *var_timestamp = old_scope_->FindVar(varname); - auto *var_psrever = pserver_scope_->Var(varname); - - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *pserver_scope_); - - PADDLE_ENFORCE_EQ( - var_psrever->IsInitialized(), true, - platform::errors::Unavailable( - "%s in pserver scope is not initialized, please check", varname)); - - auto t_psrever = var_psrever->Get(); - auto t_latest = var_latest->GetMutable(); - auto t_timestamp = var_timestamp->GetMutable(); - - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto *var_delta = delta_scope_->Var(varname); - auto *t_delta = var_delta->GetMutable(); - t_delta->mutable_data(t_latest->dims(), cpu_ctx.GetPlace()); - - auto blas = math::GetBlas(cpu_ctx); - blas.VSUB(t_latest->numel(), t_psrever.data(), - t_timestamp->data(), t_delta->data()); - blas.VADD(t_latest->numel(), t_latest->data(), t_delta->data(), - t_latest->data()); - blas.VCOPY(t_latest->numel(), t_psrever.data(), - t_timestamp->data()); -} - -void GeoCommunicator::InitParams() { - std::vector> tasks; - tasks.reserve(recv_varname_to_ctx_.size()); - - for (auto &iter : recv_varname_to_ctx_) { - auto &var_name = iter.first; - auto &recv_ctx = iter.second; - - auto recv_task = [this, &var_name, &recv_ctx] { - if (!recv_ctx.is_sparse) { - InitDense(var_name); - } - }; - tasks.emplace_back(send_threadpool_->enqueue(std::move(recv_task))); - } - - for (auto &task : tasks) { - task.wait(); - } - InitSparse(); -} - -void GeoCommunicator::InitDense(const std::string varname) { - auto &ctx = recv_varname_to_ctx_.at(varname); - auto recv = distributed::ParameterRecv(); - recv(ctx, *recv_scope_); - - auto *global_var = recv_scope_->FindVar(varname); - global_var->GetMutable(); - - auto *old_var = old_scope_->Var(varname); - old_var->GetMutable(); - - framework::CopyVariable(*global_var, old_var); - VLOG(1) << "init dense variable " << varname << " done"; -} - -void GeoCommunicator::InitSparse() { - auto sparse_metas = string::split_string(sparse_attrs_, "#"); - - std::vector metas; - std::vector dicts; - - for (auto &sparse_meta : sparse_metas) { - auto attrs = string::split_string(sparse_meta, ":"); - - auto meta = distributed::SparseMeta(); - 
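SendDense and RecvDense above implement the GeoSGD-style rule: each trainer ships delta = (latest - old) / trainers to the parameter server, folds the same delta into its own "old" copy, and on receive applies the server-side difference back into the latest parameters. A scalar-per-element sketch of the send side (plain vectors stand in for the BLAS VSUB/SCAL/VADD calls):

#include <iostream>
#include <vector>

int main() {
  const int trainers = 2;
  std::vector<float> latest = {1.0f, 2.0f, 3.0f};  // current local parameter
  std::vector<float> old = {0.5f, 1.5f, 2.5f};     // value at the last sync point
  std::vector<float> delta(latest.size());

  for (size_t i = 0; i < latest.size(); ++i) {
    delta[i] = (latest[i] - old[i]) / trainers;  // VSUB followed by SCAL above
    old[i] += delta[i];                          // VADD: remember what was sent
  }

  // `delta` is what would be shipped to the server for this variable.
  for (float d : delta) std::cout << d << " ";  // prints 0.25 0.25 0.25
  std::cout << "\n";
  return 0;
}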
meta.name = attrs[0]; - meta.value_names = {"Param"}; - - auto dic = string::split_string(attrs[1], ","); - dicts.push_back(std::stoi(dic[0])); - meta.value_dims = {std::stoi(dic[1])}; - meta.mode = distributed::Mode::training; - meta.grad_name = "none"; - meta.cached_varnames = {}; - meta.initializer_attrs = string::split_string(attrs[2]); - meta.entry = "none"; - - VLOG(3) << "add sparse meta: " << meta.ToString(); - metas.push_back(meta); - } - - LargeScaleKV::Init(metas); - - for (auto &meta : metas) { - auto &ctx = recv_varname_to_ctx_.at(meta.name); - auto recv = distributed::ParameterRecv(); - - auto *global_var = recv_scope_->FindVar(meta.name); - auto global_value = global_var->Get(); - auto rows = global_value.dims()[0]; - auto dim1 = global_value.dims()[1]; - - recv(ctx, *recv_scope_); - VLOG(1) << "recv " << meta.name << " with global scope for init"; - - auto n_rows = global_var->Get().dims()[0]; - - PADDLE_ENFORCE_EQ( - rows, n_rows, - platform::errors::InvalidArgument( - "global var: %s origin dim must equal recved rows", meta.name)); - - std::vector ids(rows); - std::iota(ids.begin(), ids.end(), 0); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - std::vector *>> values; - - ins->Get(meta.name)->Init(ids); - ins->Get(meta.name)->Get(ids, {"Param"}, &values); - - auto blas = math::GetBlas( - paddle::platform::CPUDeviceContext()); - - for (auto &id : ids) { - blas.VCOPY(dim1, global_value.data() + id * dim1, - values[id][0]->data()); - } - } - - VLOG(3) << "init sparse variable done"; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h deleted file mode 100644 index 4be3253d392..00000000000 --- a/paddle/fluid/operators/distributed/communicator.h +++ /dev/null @@ -1,490 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/split.h" - -DECLARE_bool(communicator_is_sgd_optimizer); - -namespace paddle { -namespace operators { -namespace distributed { - -using Scope = framework::Scope; -using Variable = framework::Variable; - -template -class BlockingQueue { - public: - explicit BlockingQueue(size_t capacity) : capacity_(capacity) { - PADDLE_ENFORCE_GT(capacity_, 0, - platform::errors::InvalidArgument( - "The capacity must be greater than 0.")); - } - - bool Push(const T &elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.push_back(elem); - } - cv_.notify_one(); - return true; - } - - bool Push(T &&elem) { - { - std::unique_lock lock(mutex_); - cv_.wait(lock, [&] { return queue_.size() < capacity_; }); - PADDLE_ENFORCE_LT( - queue_.size(), capacity_, - platform::errors::OutOfRange("The queue size: %s out of capacity:%s", - queue_.size(), capacity_)); - queue_.emplace_back(std::move(elem)); - } - cv_.notify_one(); - return true; - } - - T Pop() { - std::unique_lock lock(mutex_); - cv_.wait(lock, [=] { return !queue_.empty(); }); - T rc(std::move(queue_.front())); - queue_.pop_front(); - cv_.notify_one(); - return rc; - } - - size_t Cap() const { - std::lock_guard lock(mutex_); - return capacity_; - } - - size_t Size() const { - std::lock_guard lock(mutex_); - return queue_.size(); - } - - private: - const size_t capacity_; - std::deque queue_; - - mutable std::mutex mutex_; - std::condition_variable cv_; -}; - -template -using EigenVector = framework::EigenVector; - -template -inline void MergeVars(const std::string &var_name, - const std::vector> &vars, - Scope *scope, bool merge_add = true) { - PADDLE_ENFORCE_NE(vars.empty(), true, platform::errors::InvalidArgument( - "vector vars are empty.")); - auto cpu_place = platform::CPUPlace(); - auto &var0 = vars[0]; - auto *out_var = scope->Var(var_name); - if (var0->IsType()) { - auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims - << "; merge add: " << merge_add; - // init output tensor - auto *out_t = out_var->GetMutable(); - out_t->mutable_data(dims, cpu_place); - // check the input dims - for (auto &var : vars) { - auto &var_t = var->Get(); - PADDLE_ENFORCE_EQ( - var_t.dims(), dims, - platform::errors::InvalidArgument("vars should have the same dims")); - } - - // set output tensor to 0. 
- auto cpu_ctx = paddle::platform::CPUDeviceContext(); - math::SetConstant constant_functor; - constant_functor(cpu_ctx, out_t, static_cast(0)); - // sum all vars to out - auto result = EigenVector::Flatten(*out_t); - for (auto &var : vars) { - auto &in_t = var->Get(); - auto in = EigenVector::Flatten(in_t); - result.device(*cpu_ctx.eigen_device()) = result + in; - } - if (!merge_add) { - result.device(*cpu_ctx.eigen_device()) = - result / static_cast(vars.size()); - } - } else if (var0->IsType()) { - auto &slr0 = var0->Get(); - auto *out_slr = out_var->GetMutable(); - out_slr->mutable_rows()->clear(); - out_slr->mutable_value()->mutable_data({{}}, cpu_place); - std::vector inputs; - inputs.reserve(vars.size()); - for (auto &var : vars) { - inputs.push_back(&var->Get()); - } - auto dev_ctx = paddle::platform::CPUDeviceContext(); - if (merge_add) { - math::scatter::MergeAdd merge_add; - merge_add(dev_ctx, inputs, out_slr); - } else { - math::scatter::MergeAverage - merge_average; - merge_average(dev_ctx, inputs, out_slr); - } - - VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() - << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; - } else { - PADDLE_THROW(platform::errors::InvalidArgument("unsupported var type: %s!", - var0->Type())); - } -} - -using RpcCtxMap = std::unordered_map; -using SparseValue = std::unordered_map>; - -class Communicator { - public: - Communicator(); - - explicit Communicator(const std::map &envs_) { - for (auto &iter : envs_) { - envs[iter.first] = iter.second; - } - } - - virtual ~Communicator() {} - - virtual void Start() = 0; - - virtual void Stop() = 0; - - virtual bool IsRunning() { return running_; } - - virtual void Clean() {} - - virtual void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) = 0; - - virtual void RecvNoBarrier() {} - - virtual void Barrier() {} - - virtual void BarrierTriggerDecrement() {} - - virtual void BarrierTriggerReset(int init_counter) {} - - virtual void InitEnvs() = 0; - - virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) {} - - static Communicator *GetInstance() { return communicator_.get(); } - - static std::shared_ptr GetInstantcePtr() { - return communicator_; - } - - template - static Communicator *InitInstance( - const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, - recv_ctx, recv_scope, std::ref(envs)); - return communicator_.get(); - } - - // Init is called by InitInstance. 
- template - static void InitWithRpcCtx(const RpcCtxMap &send_ctx, - const RpcCtxMap &recv_ctx, Scope *recv_scope, - const std::map &envs) { - if (communicator_.get() == nullptr) { - communicator_.reset(new T(std::ref(envs))); - communicator_->InitEnvs(); - communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); - } - } - - protected: - bool running_ = false; - bool waiting_ = true; - static std::shared_ptr communicator_; - static std::once_flag init_flag_; - std::unordered_map envs; -}; - -class AsyncCommunicator : public Communicator { - public: - AsyncCommunicator() : Communicator() {} - - explicit AsyncCommunicator(const std::map &envs) - : Communicator(envs) {} - - ~AsyncCommunicator(); - - void InitEnvs() { - min_send_grad_num_before_recv_ = - std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "AsyncCommunicator Initialized"; - } - - void Start() override; - - void Stop() override; - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - - void InitParams(); - - virtual void MainThread(); - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - virtual void SendByCommunicator(); - virtual void SendGlobalStep(int batches); - - virtual void RecvByCommunicator(); - - virtual void RecvNoBarrier(); - - virtual void BarrierSend() {} - - virtual void BarrierRecv() {} - - virtual void BarrierWeakUp() {} - - protected: - int min_send_grad_num_before_recv_; - int thread_pool_size_; - int max_merge_var_num_; - int send_wait_times_; - int send_queue_size_; - int trainer_id_ = 0; - bool need_global_step_ = false; - - std::unordered_map>>> - send_varname_to_queue_; - RpcCtxMap send_varname_to_ctx_; - RpcCtxMap recv_varname_to_ctx_; - std::unique_ptr main_thread_{nullptr}; - Scope *recv_scope_; // should be global scope - std::unique_ptr send_scope_; // an independent scope - std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; - std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; - std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv -}; - -class HalfAsyncCommunicator : public AsyncCommunicator { - public: - HalfAsyncCommunicator() {} - - explicit HalfAsyncCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - VLOG(0) << "HalfAsyncCommunicator Initialized"; - } - - void MainThread() override; - - void SendByCommunicator() override; - - void Clean() override; - - void Barrier() override; - - void BarrierTriggerDecrement() override; - - void BarrierTriggerReset(int initial_val) override; - - int BatchesCounter(); - - void BarrierWeakUp(); - - protected: - // mutex for Wait for barrier - 
std::mutex barrier_mutex_; - std::condition_variable barrier_cond_; - std::atomic barrier_trigger_{0}; - std::atomic barrier_counter_{0}; -}; - -class SyncCommunicator : public HalfAsyncCommunicator { - public: - SyncCommunicator() : HalfAsyncCommunicator() {} - - explicit SyncCommunicator(const std::map &envs) - : HalfAsyncCommunicator(envs) {} - - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); - need_global_step_ = - static_cast(std::stoi(envs.at("need_global_step"))); - - trainer_id_ = std::stoi(envs.at("trainer_id")); - auto pserver_strings = envs.at("pserver_endpoints"); - pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); - VLOG(0) << "SyncCommunicator Initialized"; - } - - void BarrierSend(); - - void BarrierRecv(); - - private: - std::vector pserver_endpoints_{}; -}; - -class GeoCommunicator : public AsyncCommunicator { - public: - GeoCommunicator() : AsyncCommunicator() {} - - explicit GeoCommunicator(const std::map &envs) - : AsyncCommunicator(envs) {} - - void InitImpl(const RpcCtxMap &send_varname_to_ctx, - const RpcCtxMap &recv_varname_to_ctx, - Scope *recv_scope) override; - void MainThread() override; - void InitEnvs() { - min_send_grad_num_before_recv_ = 0; - - max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); - send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); - thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); - - send_queue_size_ = max_merge_var_num_; - trainers_ = std::stoi(envs.at("trainers")); - sparse_attrs_ = envs.at("sparse_attrs"); - VLOG(0) << "GeoCommunicator Initialized"; - } - - void Send(const std::vector &var_names, - const std::vector &var_tables, - const framework::Scope &scope) override; - - void SendByCommunicator() { return; } - - std::vector MergeSparseIds(const std::string &send_varname); - - void SendSparse(const std::string &varname, int ep_idx, - const std::vector &sparse_ids); - - void SendDense(const std::string &varname); - - void SendGlobalStep(int batches) override {} - - void RecvByCommunicator() override; - - void RecvSparse(const std::string &varname, int ep_idx); - - void RecvDense(const std::string &varname); - - void InitParams(); - - void InitSparse(); - - void InitDense(const std::string varname); - - private: - int trainers_; - std::string sparse_attrs_; - - // parameter for delta calc and send - std::shared_ptr delta_scope_; - - // parameter for storage the pserver param after last recv - std::shared_ptr old_scope_; - - // parameter on pserver - std::shared_ptr pserver_scope_; - - int send_var_nums_ = 0; - - std::unordered_map> old_sparses_; - - std::unordered_map< - std::string, - std::shared_ptr>>>> - sparse_id_queues_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_common.h b/paddle/fluid/operators/distributed/communicator_common.h deleted file mode 100644 index 122d904eba2..00000000000 --- a/paddle/fluid/operators/distributed/communicator_common.h +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -namespace paddle { -namespace operators { -namespace distributed { - -struct CommContext { - CommContext() = default; - - CommContext(const std::string &name, const std::vector &names, - const std::vector &emap, - const std::vector §ions, - const std::vector &origin_names, int id, - bool merge_add_ = true, bool is_sparse_ = true, - bool is_distributed_ = false) - : var_name(name), - splited_varnames(names), - epmap(emap), - height_sections(sections), - origin_varnames(origin_names), - trainer_id(id), - merge_add(merge_add_), - is_sparse(is_sparse_), - is_distributed(is_distributed_) {} - - CommContext(const CommContext &ctx) { - var_name = ctx.var_name; - splited_varnames = ctx.splited_varnames; - epmap = ctx.epmap; - height_sections = ctx.height_sections; - trainer_id = ctx.trainer_id; - merge_add = ctx.merge_add; - is_sparse = ctx.is_sparse; - origin_varnames = ctx.origin_varnames; - is_distributed = ctx.is_distributed; - } - - std::string print() const { - std::stringstream ss; - - ss << "varname: " << var_name << " trainer_id: " << trainer_id << " "; - - for (size_t i = 0; i < splited_varnames.size(); i++) { - ss << "slice varname: " << splited_varnames[i] << " ep: " << epmap[i] - << " section: " << height_sections[i] << " "; - } - - ss << "origin varnames: "; - for (size_t i = 0; i < origin_varnames.size(); i++) { - ss << origin_varnames[i] << " "; - } - - ss << " aggregation->add: " << merge_add << " "; - ss << " is_sparse: " << is_sparse << "\n"; - ss << " is_distributed: " << is_distributed << "\n"; - - return ss.str(); - } - - std::string var_name; - std::vector splited_varnames; - std::vector epmap; - std::vector height_sections; - std::vector origin_varnames; - int trainer_id; - bool merge_add; - bool is_sparse; - bool is_distributed; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc deleted file mode 100644 index 38b7c8b0031..00000000000 --- a/paddle/fluid/operators/distributed/communicator_test.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "paddle/fluid/operators/distributed/communicator.h" - -namespace paddle { -namespace operators { -namespace distributed { - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -TEST(communicator, merge_lod_tensors) { - auto cpu_place = platform::CPUPlace(); - auto dims = framework::make_ddim({2, 3}); - std::vector> in_vars; - float out_value = 0; - for (auto i = 0; i < 10; ++i) { - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *tensor = var->GetMutable(); - auto *data = tensor->mutable_data(dims, cpu_place); - for (auto j = 0; j < tensor->numel(); ++j) { - data[j] = static_cast(i); - } - out_value += static_cast(i); - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_tensor = scope->FindVar(out_name)->Get(); - auto *out_data = out_tensor.data(); - ASSERT_EQ(out_tensor.dims(), dims); - for (auto i = 0; i < out_tensor.numel(); ++i) { - ASSERT_EQ(out_data[i], out_value); - } -} - -TEST(communicator, merge_selected_rows) { - auto cpu_place = platform::CPUPlace(); - int64_t width = 10; - std::vector> in_vars; - const int64_t height = 100; - for (auto i = 0; i < 10; ++i) { - std::vector rows; - for (auto k = 0; k <= i; ++k) { - rows.push_back(k); - } - auto var = std::make_shared(); - in_vars.emplace_back(var); - auto *slr = var->GetMutable(); - slr->set_height(height); - slr->set_rows(rows); - auto dims = - framework::make_ddim({static_cast(rows.size()), width}); - auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); - for (size_t i = 0; i < rows.size(); ++i) { - for (auto j = 0; j < width; ++j) { - data[i * width + j] = static_cast(rows[i]); - } - } - } - const std::string out_name = "Out"; - std::unique_ptr scope; - scope.reset(new framework::Scope()); - scope->Var(out_name); - for (auto i = 0; i < 10; ++i) { - MergeVars(out_name, in_vars, scope.get()); - } - auto &out_slr = scope->FindVar(out_name)->Get(); - auto &out_t = out_slr.value(); - auto *out_data = out_t.data(); - ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); - std::vector out_values; - out_values.reserve(10); - for (auto i = 0; i < 10; ++i) { - out_values.push_back(static_cast(i * (10 - i))); - } - for (size_t i = 0; i < out_slr.rows().size(); ++i) { - ASSERT_EQ(out_slr.rows()[i], static_cast(i)); - for (auto j = 0; j < width; ++j) { - ASSERT_EQ(out_data[i * width + j], out_values[i]); - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/distributed.h b/paddle/fluid/operators/distributed/distributed.h deleted file mode 100644 index 5917c18fb0d..00000000000 --- a/paddle/fluid/operators/distributed/distributed.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
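As a distilled illustration of the dense-merge semantics exercised by the communicator_test.cc hunks above (MergeVars accumulates the gradient copies element-wise and divides by the number of inputs when merge_add is false), here is a minimal standalone sketch; MergeDense and the plain std::vector<float> stand-ins are hypothetical and only mirror that behaviour, they are not part of the Paddle API.

// Minimal sketch: dense merge semantics, assuming std::vector<float>
// stands in for the LoDTensor payload handled by MergeVars.
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> MergeDense(const std::vector<std::vector<float>>& ins,
                              bool merge_add) {
  std::vector<float> out(ins.front().size(), 0.0f);
  for (const auto& in : ins) {
    for (std::size_t i = 0; i < in.size(); ++i) out[i] += in[i];  // sum copies
  }
  if (!merge_add) {
    for (auto& v : out) v /= static_cast<float>(ins.size());  // average instead
  }
  return out;
}

int main() {
  std::vector<std::vector<float>> grads = {{1, 2, 3}, {1, 2, 3}, {1, 2, 3}};
  auto summed = MergeDense(grads, /*merge_add=*/true);     // {3, 6, 9}
  auto averaged = MergeDense(grads, /*merge_add=*/false);  // {1, 2, 3}
  std::cout << summed[0] << " " << averaged[0] << "\n";
  return 0;
}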
- -#pragma once - -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC -#include "paddle/fluid/operators/distributed/communicator.h" - -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer -#define RPCCLIENT_T paddle::operators::distributed::GRPCClient - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/brpc/brpc_client.h" -#include "paddle/fluid/operators/distributed/brpc/brpc_server.h" -#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer -#define RPCCLIENT_T paddle::operators::distributed::BRPCClient - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/distributed_pb.h b/paddle/fluid/operators/distributed/distributed_pb.h deleted file mode 100644 index f1c662be9af..00000000000 --- a/paddle/fluid/operators/distributed/distributed_pb.h +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#ifdef PADDLE_WITH_DISTRIBUTE - -#ifdef PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#else // PADDLE_WITH_GRPC - -#include "paddle/fluid/operators/distributed/send_recv.pb.h" - -#endif // PADDLE_WITH_GRPC - -#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc deleted file mode 100644 index 7d6756b4136..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
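The NOTE above concerns the ZeroCopyInputStream contract that GrpcByteBufferSource implements: Next() hands out internal buffers without copying, BackUp() returns bytes the caller did not consume, and ByteCount() reports how much was actually read. A minimal sketch of that contract using protobuf's own ArrayInputStream follows; the 12-byte cutoff and the payload string are arbitrary illustration values, not anything taken from this code.

// Minimal sketch of the ZeroCopyInputStream contract, shown with
// protobuf's ArrayInputStream instead of a grpc::ByteBuffer source.
#include <algorithm>
#include <iostream>
#include <string>

#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

int main() {
  const std::string payload = "tensor-bytes-go-here";
  google::protobuf::io::ArrayInputStream stream(
      payload.data(), static_cast<int>(payload.size()), /*block_size=*/8);

  const void* data = nullptr;
  int size = 0;
  std::string consumed;
  // Pretend a parser only needs the first 12 bytes of the stream.
  while (consumed.size() < 12 && stream.Next(&data, &size)) {
    int want = 12 - static_cast<int>(consumed.size());
    int take = std::min(size, want);
    consumed.append(static_cast<const char*>(data), take);
    if (take < size) stream.BackUp(size - take);  // hand unread bytes back
  }
  // ByteCount() reflects Next() minus BackUp(), i.e. 12 here.
  std::cout << consumed << " | bytes seen: " << stream.ByteCount() << "\n";
  return 0;
}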
- -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -GrpcByteBufferSource::GrpcByteBufferSource() {} - -bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { - cur_ = -1; - left_ = 0; - ptr_ = nullptr; - byte_count_ = 0; - bool ok = src.Dump(&slices_).ok(); - if (!ok) { - slices_.clear(); - } - return ok; -} - -bool GrpcByteBufferSource::Next(const void** data, int* size) { - // Use loop instead of if in case buffer contained empty slices. - while (left_ == 0) { - // Advance to next slice. - cur_++; - if (cur_ >= slices_.size()) { - return false; - } - const ::grpc::Slice& s = slices_[cur_]; - left_ = s.size(); - ptr_ = reinterpret_cast(s.begin()); - } - - *data = ptr_; - *size = left_; - byte_count_ += left_; - ptr_ += left_; - left_ = 0; - return true; -} - -void GrpcByteBufferSource::BackUp(int count) { - ptr_ -= count; - left_ += count; - byte_count_ -= count; -} - -bool GrpcByteBufferSource::Skip(int count) { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; -} - -google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { - return byte_count_; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h b/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h deleted file mode 100644 index 486870de7a5..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. 
- -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "grpc++/grpc++.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -struct grpc_byte_buffer; - -namespace grpc { -// A ZeroCopyInputStream that reads from grpc_byte_buffer -class ByteBuffer; - -class GrpcBufferReader final - : public ::google::protobuf::io::ZeroCopyInputStream { - typedef void (CoreCodegenInterface::*OldReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - typedef int (CoreCodegenInterface::*NewReaderInitAPI)( - grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); - void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - (g_core_codegen_interface->*ptr)(reader, buffer); - } - void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, - grpc_byte_buffer* buffer) { - int result = (g_core_codegen_interface->*ptr)(reader, buffer); - (void)result; - } - - public: - explicit GrpcBufferReader(grpc_byte_buffer* buffer) - : byte_count_(0), backup_count_(0) { - ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, - buffer); - } - ~GrpcBufferReader() override { - g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); - } - - bool Next(const void** data, int* size) override { - if (backup_count_ > 0) { - *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - - backup_count_; - GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); - *size = static_cast(backup_count_); - backup_count_ = 0; - return true; - } - if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, - &slice_)) { - return false; - } - g_core_codegen_interface->grpc_slice_unref(slice_); - *data = GRPC_SLICE_START_PTR(slice_); - // On win x64, int is only 32bit - GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); - byte_count_ += * size = static_cast(GRPC_SLICE_LENGTH(slice_)); - return true; - } - - void BackUp(int count) override { backup_count_ = count; } - - bool Skip(int count) override { - const void* data; - int size; - while (Next(&data, &size)) { - if (size >= count) { - BackUp(size - count); - return true; - } - // size < count; - count -= size; - } - // error or we have too large count; - return false; - } - - ::google::protobuf::int64 ByteCount() const override { - return byte_count_ - backup_count_; - } - - private: - int64_t byte_count_; - int64_t backup_count_; - grpc_byte_buffer_reader reader_; - grpc_slice slice_; -}; - -}; // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. -class GrpcByteBufferSource - : public ::google::protobuf::io::ZeroCopyInputStream { - public: - GrpcByteBufferSource(); - bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. - bool Next(const void** data, int* size) override; - void BackUp(int count) override; - bool Skip(int count) override; - ::google::protobuf::int64 ByteCount() const override; - - private: - std::vector<::grpc::Slice> slices_; - size_t cur_; // Current slice index. - int left_; // Number of bytes in slices_[cur_] left to yield. - const char* ptr_; // Address of next byte in slices_[cur_] to yield. 
- ::google::protobuf::int64 byte_count_; -}; - -class GrpcByteBufferSourceWrapper : public Source { - public: - explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) - : source_(source) {} - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - return source_; - } - - private: - GrpcByteBufferSource* source_; -}; - -class GrpcByteSource : public Source { - public: - explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} - ~GrpcByteSource() override { DeleteStream(); } - - typedef ::grpc::GrpcBufferReader Reader; - - ::google::protobuf::io::ZeroCopyInputStream* contents() override { - DeleteStream(); - stream_ = new (&space_) Reader(buffer_); - return stream_; - } - - private: - void DeleteStream() { - if (stream_) { - stream_->~Reader(); - } - } - - grpc_byte_buffer* buffer_; // Not owned - Reader* stream_ = nullptr; // Points into space_ if non-nullptr - char space_[sizeof(Reader)]; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc deleted file mode 100644 index 97a9c14e4f1..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ /dev/null @@ -1,671 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include - -#include "glog/logging.h" // For VLOG -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_client.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/platform/profiler.h" - -DEFINE_int32(rpc_client_threads, 2, ""); -DECLARE_bool(rpc_disable_reuse_port); - -namespace paddle { -namespace operators { -namespace distributed { - -void GRPCClient::InitImpl() { - // start the client process thread - // TODO(wuyi): can make this in a threadpool - client_threads_.resize(FLAGS_rpc_client_threads); - for (int i = 0; i < FLAGS_rpc_client_threads; i++) { - client_threads_[i].reset( - new std::thread(std::bind(&GRPCClient::Proceed, this))); - } -} - -void GRPCClient::SendComplete() { - std::unique_lock lk(completed_mutex_); - if (!completed_) { - for (auto& it : channels_) { - VLOG(3) << "send complete message to " << it.first; - this->AsyncSendComplete(it.first); - } - PADDLE_ENFORCE_EQ(this->Wait(), true, platform::errors::PreconditionNotMet( - "internal grpc service error.")); - completed_ = true; - } -} - -GRPCClient::~GRPCClient() { - stopped_ = true; - Wait(); - cq_.Shutdown(); - { - std::lock_guard guard(chan_mutex_); - for (auto& it : channels_) { - it.second.reset(); - } - channels_.clear(); - } - for (size_t i = 0; i < client_threads_.size(); i++) - client_threads_[i]->join(); -} - -VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendRPC; - - int retry_times_ = 0; - - while (true) { - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -void ProcGetResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& ret_msg) { - VLOG(4) << "ProcGetResponse"; - framework::Variable* outvar = nullptr; - // get response's trainer_id is not used - int trainer_id; - DeserializeFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -void ProcGetRecvResponse(const VarHandle& var_h, - const ::grpc::ByteBuffer& 
ret_msg) { - VLOG(4) << "ProcGetRecvResponse"; - framework::Variable* outvar = nullptr; - int trainer_id; - DeserializeRecvFromByteBuffer(ret_msg, *var_h.ctx(), var_h.scope(), &outvar, - &trainer_id); -} - -template -void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { - ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); - ::grpc::ByteBuffer tmp(&slice, 1); - result->Swap(&tmp); -} - -VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", table_name, - time_out); -} - -VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, int64_t time_out) { - std::string var_name_no_barrier = - string::Sprintf("%s%s", var_name, WITHOUT_BARRIER_MESSAGE); - - return _AsyncGetVar( - ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); -} - -VarHandlePtr GRPCClient::AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", "", - time_out); -} - -VarHandlePtr GRPCClient::_AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const std::string out_varname_val = out_varname; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - - VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, out_varname_val, table_name_val, s, method, - p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - req.set_table_name(table_name_val); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - 
std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string in_var_name_val = in_var_name; - const std::string out_var_name_val = out_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - - const std::string method = kPrefetchRPC; - int retry_times_ = 0; - - while (true) { - GetProcessor* s = new GetProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); - s->Prepare(h, kPrefetchTimeout); - - auto* var = p_scope->FindVar(in_var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req, out_var_name_val, - 0, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, static_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kBatchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, BATCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(BATCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - FetchBarrierProcessor* s = new FetchBarrierProcessor(ch); - const std::string method = kFetchBarrierRPC; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(FETCH_BARRIER_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, - const std::string& var_name, - int64_t time_out) { - const auto ch = GetChannel(ep); - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - 
const std::string method = kSendMonomerFetchBarrierRPC; - VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); - s->Prepare(h, time_out); - - VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; - - sendrecv::VariableMessage req; - req.set_varname(var_name); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, - int64_t time_out) { - const auto ch = GetChannel(ep); - - BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); - const std::string method = kSendCompleteRPC; - VarHandlePtr h(new VarHandle(ep, method, COMPLETE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_trainer_id(trainer_id_); - req.set_varname(COMPLETE_MESSAGE); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, - const std::string& dirname, - const std::string& varname, - const int mode, - int64_t time_out) { - const auto ch = GetChannel(ep); - - CheckpointNotifyProcessor* s = new CheckpointNotifyProcessor(ch); - - const std::string method = kCheckPointNotifyRPC; - - VarHandlePtr h( - new VarHandle(ep, method, CHECKPOINT_SAVE_MESSAGE, nullptr, nullptr)); - s->Prepare(h, time_out); - - sendrecv::VariableMessage req; - req.set_varname(varname); - req.set_table_name(std::to_string(mode)); - req.set_out_varname(dirname); - - platform::RecordRPCEvent record_event(method); - - auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string var_name_val = var_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kRequestNotify; - - SendProcessor* s = new SendProcessor(ch); - VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - - framework::Async([var_name_val, p_scope, p_ctx, s, method, h, this] { - auto* var = p_scope->FindVar(var_name_val); - - ::grpc::ByteBuffer req; - SerializeToByteBuffer(var_name_val, var, *p_ctx, &req, "", trainer_id_); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = nullptr; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/DistributeNotify", req, - &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - }); - req_count_++; - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - - return h; -} - -VarHandlePtr GRPCClient::AsyncSendAndRecv(const std::string& ep, - 
const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name, - int64_t time_out) { - const platform::DeviceContext* p_ctx = &ctx; - const std::string ep_val = ep; - const std::string send_var_name_val = send_var_name; - const std::string recv_var_name_val = recv_var_name; - const std::string table_name_val = table_name; - const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); - const std::string method = kSendAndRecvRPC; - VLOG(4) << "GRPCClient::SendAndRecv Begin ,Send_var_name: " - << send_var_name_val << " Recv_var_name: " << recv_var_name_val; - int retry_times_ = 0; - - while (true) { - SendAndRecvProcessor* s = new SendAndRecvProcessor(ch); - VarHandlePtr h( - new VarHandle(ep, method, send_var_name_val, p_ctx, p_scope)); - VarHandlePtr h_recv( - new VarHandle(ep, method, recv_var_name_val, p_ctx, p_scope)); - s->Prepare(h, time_out); - s->RecvPrepare(h_recv); - - framework::Async([send_var_name_val, recv_var_name_val, table_name_val, - p_scope, p_ctx, s, method, h, this] { - auto* send_var = p_scope->FindVar(send_var_name_val); - send_var->GetMutable()->set_lod({}); - ::grpc::ByteBuffer buf; - VLOG(4) << "SerializeToByteBuffer: send_var_name_val: " - << send_var_name_val - << " recv_var_name_val: " << recv_var_name_val; - SerializeToByteBuffer(send_var_name_val, send_var, *p_ctx, &buf, - recv_var_name_val, trainer_id_, table_name_val); - - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - - // stub context - s->response_call_back_ = ProcGetRecvResponse; - - platform::RecordRPCEvent record_event(method); - - auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendAndRecvVariable", - buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); - req_count_++; - - if (FLAGS_rpc_retry_times > 0 && retry_times_ < FLAGS_rpc_retry_times) { - h->Wait(); - if (h->should_retry) { - VLOG(3) << "rpc call failed, retry times " << retry_times_; - retry_times_++; - std::random_device rd; - std::this_thread::sleep_for(std::chrono::milliseconds(rd() % 5)); - continue; - } - } - - return h; - } -} - -bool GRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return (req_count_ == 0 || ok_ == false); }); - return ok_; -} - -inline bool ShouldRetry(const std::string& method, int error_code) { - if (method == kPrefetchRPC) { - return true; - } - - if (error_code == grpc::StatusCode::DEADLINE_EXCEEDED) { - return true; - } - - return false; -} - -void GRPCClient::Proceed() { - void* tag = nullptr; - bool ok = false; - - VLOG(3) << "GRPCClient Proceed begin"; - while (!stopped_ && cq_.Next(&tag, &ok)) { - BaseProcessor* c = static_cast(tag); - GPR_ASSERT(ok); - PADDLE_ENFORCE_NOT_NULL( - c, platform::errors::PreconditionNotMet("Make BaseProcessor failed.")); - - if (c->status_.ok()) { - VLOG(3) << c->GetVarHandlePtr()->String() << " process"; - c->Process(); - } else if (ShouldRetry(c->GetVarHandlePtr()->method(), - c->status_.error_code())) { - VLOG(0) << c->GetVarHandlePtr()->String() - << " meets grpc error, error_code:" << c->status_.error_code() - << " error_message:" << c->status_.error_message() - << " error_details:" << c->status_.error_details() - << " should retry!"; - c->GetVarHandlePtr()->should_retry = true; - c->Finish(false); - } else { - 
PADDLE_THROW(platform::errors::External( - "%s meets grpc error, error_code is %d, error message is %s, error " - "details is %s.", - c->GetVarHandlePtr()->String(), c->status_.error_code(), - c->status_.error_message(), c->status_.error_details())); - c->Finish(false); - } - - bool notify = false; - { - std::lock_guard lk(sync_mutex_); - req_count_--; - notify = (req_count_ <= 0 || !c->status_.ok()); - } - - delete c; - - if (notify) { - sync_cond_.notify_all(); - } - } - - // Last log message - // Avoid using VLOG() and LOG(): in the destructor of google::LogMessage() a - // static Mutex log_mutex is used for synchronization, which might have been - // destructed at this moment. - if (FLAGS_v >= 3) { - std::string msg("GRPCClient Proceed end"); - fwrite(msg.c_str(), msg.length(), 1, stderr); - } -} - -std::shared_ptr GRPCClient::GetChannel(const std::string& ep) { - std::lock_guard guard(chan_mutex_); - auto it = channels_.find(ep); - if (it != channels_.end()) { - return it->second; - } - - // Channel configurations: - grpc::ChannelArguments args; - args.SetInt(GRPC_ARG_MAX_RECONNECT_BACKOFF_MS, 2000); - if (FLAGS_rpc_disable_reuse_port) { - args.SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); - args.SetMaxSendMessageSize(std::numeric_limits::max()); - args.SetMaxReceiveMessageSize(std::numeric_limits::max()); - - auto ch = - grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args); - channels_[ep] = ch; - return ch; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h deleted file mode 100644 index 5885f944b60..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ /dev/null @@ -1,321 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include // NOLINT -#include // NOLINT -#include -#include -#include -#include -#include -#include // NOLINT -#include -#include // NOLINT -#include -#include - -#include "grpc++/channel.h" -#include "grpc++/generic/generic_stub.h" -#include "grpc++/grpc++.h" -#include "grpc++/support/byte_buffer.h" -#include "grpc++/support/slice.h" -#include "grpc/support/log.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN - -namespace grpc { -class Channel; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -void ProcGetRecvResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); - -class BaseProcessor { - public: - BaseProcessor() { context_ = nullptr; } - - virtual ~BaseProcessor() {} - - virtual void Prepare(VarHandlePtr h, int64_t time_out) { - var_h_ = h; - - context_.reset(new grpc::ClientContext()); - context_->set_wait_for_ready(true); - if (time_out) { - std::chrono::system_clock::time_point deadline = - std::chrono::system_clock::now() + - std::chrono::milliseconds(time_out); - context_->set_deadline(deadline); - } - } - - void Process() { - ProcessImpl(); - var_h_->Finish(true); - } - - VarHandlePtr GetVarHandlePtr() { return var_h_; } - bool Wait() { return var_h_->Wait(); } - void Finish(bool ok) { return var_h_->Finish(ok); } - virtual void ProcessImpl() = 0; - - std::unique_ptr context_; - grpc::Status status_; - - protected: - VarHandlePtr var_h_; -}; - -typedef std::function - RequestSendCallBack; - -class SendProcessor : public BaseProcessor { - public: - explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::GenericStub stub_g_; - ::grpc::ByteBuffer reply_; - RequestSendCallBack response_call_back_ = nullptr; -}; - -typedef std::function - RequestGetCallBack; - -class GetProcessor : public BaseProcessor { - public: - explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~GetProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_.get(), reply_); - } - } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; -}; - -class SendAndRecvProcessor : public BaseProcessor { - public: - explicit SendAndRecvProcessor(std::shared_ptr ch) - : BaseProcessor(), stub_g_(ch) {} - - virtual ~SendAndRecvProcessor() {} - - void ProcessImpl() override { - if (response_call_back_) { - response_call_back_(*var_h_recv_.get(), reply_); - var_h_recv_->Finish(true); - } - } - - void RecvPrepare(VarHandlePtr h_recv) { 
var_h_recv_ = h_recv; } - - ::grpc::ByteBuffer reply_; - ::grpc::GenericStub stub_g_; - RequestGetCallBack response_call_back_ = ProcGetResponse; - VarHandlePtr var_h_recv_; -}; - -class BatchBarrierProcessor : public BaseProcessor { - public: - explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~BatchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class FetchBarrierProcessor : public BaseProcessor { - public: - explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~FetchBarrierProcessor() {} - - void ProcessImpl() override {} - sendrecv::VariableMessage reply_; - std::unique_ptr stub_; -}; - -class CheckpointNotifyProcessor : public BaseProcessor { - public: - explicit CheckpointNotifyProcessor(std::shared_ptr ch) - : BaseProcessor() { - stub_ = sendrecv::SendRecvService::NewStub(ch); - } - - virtual ~CheckpointNotifyProcessor() {} - - void ProcessImpl() override {} - sendrecv::VoidMessage reply_; - std::unique_ptr stub_; -}; - -class GRPCClient : public RPCClient { - public: - GRPCClient() : ok_(true), completed_(false), stopped_(false) {} - virtual ~GRPCClient(); - - VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) override; - - VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendAndRecv(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& send_var_name, - const std::string& recv_var_name, - const std::string& table_name = "", 
- int64_t time_out = FLAGS_rpc_deadline) override; - - VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - - bool Wait() override; - - void SendComplete() override; - - void InitImpl() override; - - private: - void Proceed(); - - std::shared_ptr GetChannel(const std::string& ep); - VarHandlePtr _AsyncGetVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& method, - const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline); - - private: - grpc::CompletionQueue cq_; - std::unordered_map> channels_; - std::vector> client_threads_; - - // mutex for Wait client sync - std::mutex sync_mutex_; - std::condition_variable sync_cond_; - std::atomic req_count_{0}; - bool ok_; - - // mutex for GetChannel thread safety - std::mutex chan_mutex_; - DISABLE_COPY_AND_ASSIGN(GRPCClient); - - // mutex for sending complete message only once - std::mutex completed_mutex_; - bool completed_; - - volatile bool stopped_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc deleted file mode 100644 index 0fc9b695779..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc +++ /dev/null @@ -1,190 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_WITH_NCCL -#include -#endif -#ifdef PADDLE_WITH_RCCL -#include -#endif -#include -#include -#include "grpcpp/impl/codegen/byte_buffer.h" -#include "grpcpp/impl/codegen/slice.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/proto_encoder_helper.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, const std::string& out_name, - const int trainer_id, - const std::string& table_name) { - platform::RecordRPCEvent record_event("serial"); - VarMsg request; - TensorPayload* payload = nullptr; - - request.set_varname(name); - request.set_trainer_id(trainer_id); - // Note: normally the profiler is enabled in 1 trainer, hence only - // 1 trainer returns true for ShouldSendProfileState(). 
It tells PS - // servers the trainer's profiling state so that PS can follow the - // trainer. - if (platform::ShouldSendProfileState()) { - if (platform::IsProfileEnabled()) { - request.set_profile(platform::kEnableProfiler); - } else { - request.set_profile(platform::kDisableProfiler); - } - } - if (!out_name.empty()) { - request.set_out_varname(out_name); - } - if (!table_name.empty()) { - request.set_table_name(table_name); - } - if (var->IsType()) { - request.set_type(::sendrecv::LOD_TENSOR); - payload = new TensorPayload(GetTensorPayload(var, ctx, &request)); - } else if (var->IsType()) { - request.set_type(::sendrecv::SELECTED_ROWS); - payload = new TensorPayload(GetSelectedRowsPayload(var, ctx, &request)); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - } else if (var->IsType()) { - request.set_type(::sendrecv::NCCL_ID); -#endif - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Serialize does not support type: %s", typeid(var->Type()).name())); - } - std::string header; - request.AppendToString(&header); - auto buffer = std::unique_ptr(new char[1024]); - void* buf = buffer.get(); - ProtoEncodeHelper e(static_cast(buf), 1024); - e.WriteRawBytes(std::string(header.data(), header.size())); -// NCCLID is copied directly to the message, return bytebuffer -// with only one slice if serializing NCCLID. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (var->IsType()) { - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - NCCL_UNIQUE_ID_BYTES); - const ncclUniqueId& uid = var->Get(); - e.WriteRawBytes(std::string(uid.internal, NCCL_UNIQUE_ID_BYTES)); - - // for serialize NCCL_ID - ::grpc::Slice slices(e.size()); - memcpy(const_cast(slices.begin()), e.data(), e.size()); - ::grpc::ByteBuffer tmp(&slices, 1); - msg->Swap(&tmp); - return; - } -#endif - PADDLE_ENFORCE_NOT_NULL( - payload, - platform::errors::InvalidArgument( - "Not support type: %s, need to be LOD_TENSOR or SELECTED_ROWS", - var->Type())); - e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, - payload->memory_size()); - if (payload->memory_size() >= std::numeric_limits::max()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable %s length %d should less than %d.", name, - payload->memory_size(), std::numeric_limits::max())); - } - // steal reference of tensor data - ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows - int num_slices = 2; // only SelectedRows have rows buffer - slices[0] = ::grpc::Slice(e.size()); - memcpy(const_cast(slices[0].begin()), e.data(), e.size()); - slices[1] = ::grpc::Slice( - grpc_slice_new_with_user_data(payload->ptr(), payload->memory_size(), - SerializeDestroyCallback, payload), - ::grpc::Slice::STEAL_REF); - - if (var->IsType()) { - auto* slr = var->GetMutable(); - ProtoEncodeHelper e2(static_cast(buf), 128); - - PADDLE_ENFORCE_EQ(VectorElemName(slr->rows()), typeid(int64_t).name(), - platform::errors::InvalidArgument( - "Got wrong type %s, expect type: int64_t", - VectorElemName(slr->rows()))); - size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); - - e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); - slices[2] = ::grpc::Slice(e2.size()); - memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); - - slices[3] = ::grpc::Slice( - grpc_slice_new_with_user_data( - const_cast( - reinterpret_cast(slr->rows().data())), - rows_memory_size, [](void* backing) {}, - const_cast( - reinterpret_cast(slr->rows().data()))), - ::grpc::Slice::STEAL_REF); - num_slices = 4; - } - 
::grpc::ByteBuffer tmp(&slices[0], num_slices); - msg->Swap(&tmp); -} - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetVar(); - *trainer_id = resp.GetTrainerId(); -} - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id) { - platform::RecordRPCEvent record_event("deserial"); - operators::distributed::GRPCVariableResponse resp(scope, &ctx); - PADDLE_ENFORCE_EQ( - resp.Parse(msg), 0, - platform::errors::InvalidArgument("parse bytebuffer to tensor error!")); - *var = resp.GetRecvVar(); - *trainer_id = resp.GetTrainerId(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.h b/paddle/fluid/operators/distributed/grpc/grpc_serde.h deleted file mode 100644 index 932f3e2f069..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/port.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -typedef void (*DestroyCallback)(void*); - -void SerializeToByteBuffer(const std::string& name, framework::Variable* var, - const platform::DeviceContext& ctx, - ::grpc::ByteBuffer* msg, - const std::string& out_varname = std::string(), - const int trainer_id = 0, - const std::string& table_name = std::string()); - -void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -void DeserializeRecvFromByteBuffer(const ::grpc::ByteBuffer& msg, - const platform::DeviceContext& ctx, - const framework::Scope* scope, - framework::Variable** var, int* trainer_id); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc deleted file mode 100644 index d407a72938a..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_serde_test.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT - -#include "google/protobuf/text_format.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/string/printf.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace operators = paddle::operators; -namespace math = paddle::operators::math; -namespace memory = paddle::memory; - -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // serialize var to ByteBuffer - framework::Variable var; - auto* slr = var.GetMutable(); - slr->set_height(1000); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({564, 128})); - tensor->mutable_data(place); - int tensor_numel = 564 * 128; - math::set_constant(ctx, tensor, 32.7); - for (int i = 0; i < 564; ++i) rows->push_back(i); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - - // deserialize bytebuffer - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); - } - for (int i = 0; i < 564; ++i) { - EXPECT_EQ(rows_data[i], i); - } - - // deserialize zero-copy - // framework::Variable var2; - // operators::distributed::DeserializeFromByteBuffer(msg, ctx, &var2); - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - EXPECT_EQ(resp.Parse(msg), 0); - - framework::Variable* var2 = resp.GetVar(); - - auto* slr2 = var2->GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2->data()); - } - const int64_t* rows_data2 = rows2->data(); - - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - for (size_t i = 0; i < rows2->size(); ++i) { - EXPECT_EQ(rows_data2[i], static_cast(i)); - } - EXPECT_EQ(slr2->height(), 1000); -} - -void RunTestLodTensor(platform::Place place, int from_type = 0) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({512, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 
3, 8})); - tensor->set_lod(lod); - int tensor_numel = 512 * 8 * 4 * 2; - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); - - ::grpc::ByteBuffer msg; - operators::distributed::SerializeToByteBuffer("myvar", &var, ctx, &msg, - "outvar", 0, "table_name"); - EXPECT_GT(msg.Length(), static_cast(0)); - - // deserialize - std::vector<::grpc::Slice> slices; - (void)msg.Dump(&slices); - std::string tmp; - for (const auto& s : slices) { - tmp.append(reinterpret_cast(s.begin()), s.size()); - } - sendrecv::VariableMessage varmsg; - EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 512); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - - const float* tensor_data = - reinterpret_cast(varmsg.serialized().data()); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); - } - - // message binary - std::string str; - varmsg.SerializeToString(&str); - - // message bytebuffer - ::grpc::Slice slices_2[1]; - int num_slices = 1; - slices_2[0] = ::grpc::Slice(str.length()); - memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); - ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); - - // deserialize zero-copy - framework::Scope scope; - scope.Var("myvar"); - operators::distributed::GRPCVariableResponse resp(&scope, &ctx); - if (from_type == 0) { - EXPECT_EQ(resp.Parse(msg), 0); - } else { - EXPECT_EQ(resp.Parse(bytebuffer2), 0); - } - - framework::Variable* var2 = resp.GetVar(); - - auto tensor2 = var2->Get(); - float* tensor_data2 = nullptr; - framework::Tensor tmp_tensor; - - if (platform::is_gpu_place(ctx.GetPlace())) { - platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); - tensor_data2 = tmp_tensor.data(); - } else { - tensor_data2 = const_cast(tensor2.data()); - } - - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); -} - -TEST(LodTensor, Run) { - platform::CPUPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu(0); - RunTestLodTensor(gpu); - RunTestLodTensor(gpu, 1); -#endif -} - -TEST(SelectedRows, Run) { - platform::CPUPlace place; - RunSerdeTestSelectedRows(place); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - platform::CUDAPlace gpu; - RunSerdeTestSelectedRows(gpu); -#endif -} diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc deleted file mode 100644 index 912520d782d..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ /dev/null @@ -1,720 +0,0 @@ -/*Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include -#include - -#include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_server.h" - -namespace grpc { -class ChannelArguments; -} // namespace grpc -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace operators { -namespace distributed { -class GRPCVariableResponse; -} // namespace distributed -} // namespace operators -} // namespace paddle - -using ::grpc::ServerAsyncResponseWriter; - -DECLARE_bool(rpc_disable_reuse_port); -DECLARE_int32(rpc_retry_bind_port); - -namespace paddle { -namespace operators { -namespace distributed { - -enum CallStatus { PROCESS = 0, FINISH }; - -// reference: -// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server -class RequestBase { - public: - explicit RequestBase(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : service_(service), - cq_(cq), - status_(PROCESS), - request_handler_(request_handler), - req_id_(req_id) { - PADDLE_ENFORCE_NOT_NULL(cq_, platform::errors::InvalidArgument( - "ServerCompletionQueue cq are empty")); - } - virtual ~RequestBase() {} - virtual void Process() = 0; - - std::string Status2String(const std::string& method) { - std::string status = "Process"; - if (status_ == FINISH) { - status = "Finish"; - } - - std::ostringstream s; - s << method << " name:[" << GetReqName() << "]" - << ", ep:[" << ctx_.peer() << "]" - << " " << status << " using req_id:" << req_id_; - return s.str(); - } - - CallStatus Status() const { - std::lock_guard l(status_mu_); - return status_; - } - - template - void Finish(const T& reply, ServerAsyncResponseWriter* responder) { - std::lock_guard l(status_mu_); - status_ = FINISH; - responder->Finish(reply, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - } - virtual std::string GetReqName() = 0; - - protected: - mutable std::mutex status_mu_; - ::grpc::ServerContext ctx_; - GrpcService::AsyncService* service_; - ::grpc::ServerCompletionQueue* cq_; - CallStatus status_; - RequestHandler* request_handler_; - int req_id_; -}; - -class RequestSend final : public RequestBase { - public: - explicit RequestSend(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kSendVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestSend() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSend var_name:" << varname << " trainer: 
" << trainer_id; - - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestGet final : public RequestBase { - public: - explicit RequestGet(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = static_cast(distributed::GrpcMethod::kGetVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGet() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - std::string table_name = request_.table_name(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGet " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - tmp_scope_ = std::move(scope->NewTmpScope()); - request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, - trainer_id, out_varname, table_name); - - VLOG(1) << "before SerializeToByteBuffer"; - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - VLOG(1) << "after SerializeToByteBuffer"; - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - std::unique_ptr tmp_scope_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetNoBarrier final : public RequestBase { - public: - explicit RequestGetNoBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetVariableNoBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetNoBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - std::string out_varname = request_.out_varname(); - int trainer_id = request_.trainer_id(); - - VLOG(4) << "RequestGetNoBarrier " << out_varname << " from " << varname; - - auto scope = request_handler_->scope(); - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); - - if (outvar) { - SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), - &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -class RequestGetMonomerVariable final : public RequestBase { - public: - explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, - int req_id, RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerVariable() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. - std::string varname = request_.varname(); - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - auto scope = h.scope_; - auto invar = scope->FindVar(varname); - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - if (outvar) { - SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); - } - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestGetMonomerBarrier final : public RequestBase { - public: - explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id, - RPCServer* rpc_server) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - rpc_server_(rpc_server) { - auto method_id = - static_cast(distributed::GrpcMethod::kGetMonomerBarrier); - service_->RequestAsyncUnary( - method_id, &ctx_, &request_, &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestGetMonomerBarrier() {} - - std::string GetReqName() override { return request_.varname(); } - - void Process() override { - // proc request. 
- std::string varname = request_.varname(); - VLOG(4) << "RequestGetMonomerBarrier " << varname; - - rpc_server_->WaitVarCond(varname); - MonomerHandle h = rpc_server_->GetMonomer(varname); - - framework::Scope* scope = nullptr; - framework::Variable* invar = nullptr; - framework::Variable* outvar = nullptr; - - request_handler_->Handle(varname, scope, invar, &outvar, - request_.trainer_id()); - - Finish(reply_, &responder_); - } - - protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; - RPCServer* rpc_server_{nullptr}; -}; - -class RequestPrefetch final : public RequestBase { - public: - explicit RequestPrefetch(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), - responder_(&ctx_), - local_scope_(nullptr) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = - static_cast(distributed::GrpcMethod::kPrefetchVariable); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestPrefetch() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - // prefetch process... - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestPrefetch, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - // out var must be created in local scope! 
- framework::Variable* outvar = scope->Var(out_var_name); - - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; - framework::Scope* local_scope_; -}; - -class RequestCheckpointNotify final : public RequestBase { - public: - explicit RequestCheckpointNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx())); - int method_id = - static_cast(distributed::GrpcMethod::kCheckpointNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestCheckpointNotify() {} - - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - auto scope = request_->GetMutableLocalScope(); - - std::string checkpoint_notify = request_->Varname(); - std::string checkpoint_dir = request_->OutVarname(); - int trainer_id = request_->GetTrainerId(); - std::string table_name = request_->TableName(); - - VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify - << ", dir: " << checkpoint_dir; - - request_handler_->Handle(checkpoint_notify, scope, nullptr, nullptr, - trainer_id, checkpoint_dir, table_name); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - sendrecv::VoidMessage reply_; - ServerAsyncResponseWriter responder_; -}; - -class RequestNotify final : public RequestBase { - public: - explicit RequestNotify(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - int method_id = static_cast(distributed::GrpcMethod::kRequestNotify); - service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - virtual ~RequestNotify() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string varname = GetReqName(); - VLOG(4) << "RequestNotify var_name:" << varname; - - auto scope = request_->GetMutableLocalScope(); - auto invar = request_->GetVar(); - int trainer_id = request_->GetTrainerId(); - framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); - Finish(reply_, &responder_); - } - - protected: - sendrecv::VoidMessage reply_; - std::shared_ptr request_; - ServerAsyncResponseWriter responder_; -}; - -class RequestSendAndRecv final : public RequestBase { - public: - explicit RequestSendAndRecv(GrpcService::AsyncService* service, - ::grpc::ServerCompletionQueue* cq, - RequestHandler* request_handler, int req_id) - : RequestBase(service, cq, request_handler, req_id), responder_(&ctx_) { - request_.reset(new GRPCVariableResponse(request_handler->scope(), - request_handler->dev_ctx(), true)); - - int method_id = - static_cast(distributed::GrpcMethod::kRequestSendAndRecv); - - 
service_->RequestAsyncUnary( - method_id, &ctx_, request_.get(), &responder_, cq_, cq_, - reinterpret_cast(static_cast(req_id))); - } - - virtual ~RequestSendAndRecv() {} - std::string GetReqName() override { return request_->Varname(); } - - void Process() override { - std::string in_var_name = request_->Varname(); - std::string out_var_name = request_->OutVarname(); - std::string table_name = request_->TableName(); - int trainer_id = request_->GetTrainerId(); - - VLOG(4) << "RequestSendAndRecv, in_var_name: " << in_var_name - << " out_var_name: " << out_var_name << " trainer: " << trainer_id; - auto scope = request_->GetMutableLocalScope(); - auto invar = scope->FindVar(in_var_name); - framework::Variable* outvar = nullptr; - request_handler_->Handle(in_var_name, scope, invar, &outvar, trainer_id, - out_var_name, table_name); - SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(), - &reply_); - Finish(reply_, &responder_); - } - - protected: - std::shared_ptr request_; - ::grpc::ByteBuffer reply_; - ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; -}; - -void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is waiting server ready"; - std::unique_lock lock(this->mutex_ready_); - condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); - VLOG(4) << "AsyncGRPCServer WaitSeverReady"; -} - -// Define an option subclass in order to disable SO_REUSEPORT for the -// server socket. -// Come from: -// https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc -class NoReusePortOption : public ::grpc::ServerBuilderOption { - public: - void UpdateArguments(::grpc::ChannelArguments* args) override { - args->SetInt(GRPC_ARG_ALLOW_REUSEPORT, 0); - } - - void UpdatePlugins(std::vector>* - plugins) override {} -}; - -void AsyncGRPCServer::StartServer() { - for (int i = 0; i < FLAGS_rpc_retry_bind_port; i++) { - ::grpc::ServerBuilder builder; - std::unique_ptr service( - new GrpcService::AsyncService()); - builder.AddListeningPort(bind_address_, ::grpc::InsecureServerCredentials(), - &selected_port_); - - builder.SetMaxSendMessageSize(std::numeric_limits::max()); - builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); - if (FLAGS_rpc_disable_reuse_port) { - builder.SetOption( - std::unique_ptr<::grpc::ServerBuilderOption>(new NoReusePortOption)); - LOG(INFO) << "set FLAGS_rpc_disable_reuse_port"; - } - builder.RegisterService(service.get()); - - for (auto t : rpc_call_map_) { - rpc_cq_[t.first].reset(builder.AddCompletionQueue().release()); - } - - server_ = builder.BuildAndStart(); - if (selected_port_ != 0) { - LOG(INFO) << "Server listening on " << bind_address_ - << " successful, selected port: " << selected_port_; - service_.reset(service.release()); - break; - } - - LOG(WARNING) << "Server listening on " << bind_address_ - << " failed, selected port: " << selected_port_ - << ", retry after 3 seconds!"; - - sleep(3); - } - - PADDLE_ENFORCE_NE( - selected_port_, 0, - platform::errors::Unavailable("can't bind to address:%s", bind_address_)); - - std::function f = - std::bind(&AsyncGRPCServer::TryToRegisterNewOne, this, - std::placeholders::_1, std::placeholders::_2); - - for (auto& t : rpc_call_map_) { - auto& rpc_name = t.first; - auto& cq = rpc_cq_[rpc_name]; - auto threadnum = rpc_thread_num_[rpc_name]; - auto& reqs = rpc_reqs_[rpc_name]; - - reqs.reserve(kRequestBufSize); - - for (int i = 0; i < kRequestBufSize; i++) { - VLOG(6) << "TryToRegisterNewOne on RPC NAME: " << rpc_name << " 
I: " << i; - TryToRegisterNewOne(rpc_name, i); - } - - for (int i = 0; i < threadnum; i++) { - rpc_threads_[rpc_name].emplace_back(new std::thread(std::bind( - &AsyncGRPCServer::HandleRequest, this, cq.get(), rpc_name, f))); - VLOG(4) << t.first << " creates threads!"; - } - } - - { - std::lock_guard lock(this->mutex_ready_); - ready_ = 1; - } - condition_ready_.notify_all(); - - // wait server - server_->Wait(); - - for (auto& t : rpc_threads_) { - auto& threads = t.second; - for (size_t i = 0; i < threads.size(); ++i) { - threads[i]->join(); - VLOG(4) << t.first << " threads ends!"; - } - } -} - -void AsyncGRPCServer::ShutdownQueue() { - for (auto& t : rpc_cq_) { - t.second->Shutdown(); - VLOG(4) << t.first << " queue shutdown!"; - } -} - -void AsyncGRPCServer::ShutDownImpl() { - std::unique_lock lock(cq_mutex_); - is_shut_down_ = true; - ShutdownQueue(); - - VLOG(4) << "server_ shutdown!"; - server_->Shutdown(); -} - -void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, - int req_id) { - std::unique_lock lock(cq_mutex_); - if (is_shut_down_) { - VLOG(4) << "shutdown, do not TryToRegisterNewSendOne"; - return; - } - - VLOG(4) << "TryToRegisterNewOne on RPC NAME: " << rpc_name - << " REQ ID: " << req_id; - - auto& reqs = rpc_reqs_[rpc_name]; - auto& handler = rpc_call_map_[rpc_name]; - auto& cq = rpc_cq_[rpc_name]; - - RequestBase* b = nullptr; - if (rpc_name == kRequestSend) { - b = new RequestSend(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGet) { - b = new RequestGet(service_.get(), cq.get(), handler, req_id); - - } else if (rpc_name == kRequestGetNoBarrier) { - b = new RequestGetNoBarrier(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestGetMonomerVariable) { - b = new RequestGetMonomerVariable(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestGetMonomerBarrier) { - b = new RequestGetMonomerBarrier(service_.get(), cq.get(), handler, req_id, - this); - } else if (rpc_name == kRequestPrefetch) { - b = new RequestPrefetch(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestCheckpoint) { - b = new RequestCheckpointNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestNotify) { - b = new RequestNotify(service_.get(), cq.get(), handler, req_id); - } else if (rpc_name == kRequestSendAndRecv) { - b = new RequestSendAndRecv(service_.get(), cq.get(), handler, req_id); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("not supported rpc: %s", rpc_name)); - } - - reqs[req_id] = b; - - VLOG(4) << "TryToRegisterNewOne status:" << b->Status(); -} - -void AsyncGRPCServer::HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne) { - void* tag = NULL; - bool ok = false; - - while (true) { - VLOG(4) << "HandleRequest " << rpc_name << " wait next"; - if (!cq->Next(&tag, &ok)) { - VLOG(4) << "CompletionQueue " << rpc_name << " shutdown!"; - break; - } - - int req_id = static_cast(reinterpret_cast(tag)); - VLOG(4) << "HandleRequest " << rpc_name << ", req_id:" << req_id - << " get next"; - - auto& reqs = rpc_reqs_[rpc_name]; - RequestBase* base = nullptr; - { - PADDLE_ENFORCE_EQ( - (req_id >= 0 && req_id < kRequestBufSize), true, - platform::errors::OutOfRange("request id: %s out of bounds: [0, %s)", - req_id, kRequestBufSize)); - std::unique_lock lock(cq_mutex_); - base = reqs[req_id]; - } - - VLOG(3) << base->Status2String(rpc_name); - - // reference: - // 
https://github.com/tensorflow/tensorflow/issues/5596 - // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM - // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I - if (!ok) { - VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" - << " context:" << base->Status2String(rpc_name); - TryToRegisterNewOne(rpc_name, req_id); - delete base; - continue; - } - - switch (base->Status()) { - case PROCESS: { - base->Process(); - break; - } - case FINISH: { - TryToRegisterNewOne(rpc_name, req_id); - delete base; - break; - } - default: { assert(false); } - } - } -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.h b/paddle/fluid/operators/distributed/grpc/grpc_server.h deleted file mode 100644 index 3d68b7e8ceb..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include -#include -#include // NOLINT -#include -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/framework/blocking_queue.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_service.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" -#include "paddle/fluid/platform/profiler.h" - -namespace grpc { -class ServerCompletionQueue; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestBase; - -class AsyncGRPCServer final : public RPCServer { - public: - explicit AsyncGRPCServer(const std::string& address, int client_num) - : RPCServer(address, client_num), ready_(0) {} - - virtual ~AsyncGRPCServer() {} - void WaitServerReady() override; - void StartServer() override; - - private: - // HandleRequest needs to be thread-safe. 
- void HandleRequest( - ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, - std::function TryToRegisterNewOne); - - void TryToRegisterNewOne(const std::string& rpc_name, int req_id); - void ShutdownQueue(); - void ShutDownImpl() override; - - private: - static const int kRequestBufSize = 100; - - std::mutex cq_mutex_; - volatile bool is_shut_down_ = false; - - std::unique_ptr service_; - std::unique_ptr<::grpc::Server> server_; - - // condition of the sub program - std::condition_variable barrier_condition_; - - std::mutex mutex_ready_; - std::condition_variable condition_ready_; - - int ready_; - - std::map> rpc_cq_; - std::map>> rpc_threads_; - std::map> rpc_reqs_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_service.h b/paddle/fluid/operators/distributed/grpc/grpc_service.h deleted file mode 100644 index 10037c90853..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_service.h +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/platform/profiler.h" - -// NOTE: This method was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// method and did some modifications so that we can parse gRPC -// requests without too much copying of the tensor data. - -namespace grpc { -class CompletionQueue; -class Channel; -class RpcService; -class ServerCompletionQueue; -class ServerContext; - -// Support parsing/unparsing of tensorflow::VariableResponse. -// Wire-format is identical to RecvVariableResponse. 
-template <> -class SerializationTraits< - paddle::operators::distributed::GRPCVariableResponse> { - public: - static Status Serialize( - const paddle::operators::distributed::GRPCVariableResponse& msg, - grpc_byte_buffer** bp, bool* own_buffer) { - PADDLE_THROW(paddle::platform::errors::Unimplemented( - "SerializationTraits::Serialize not implemented!")); - return Status(); - } - static Status Deserialize( - grpc_byte_buffer* buffer, - paddle::operators::distributed::GRPCVariableResponse* msg, - int max_message_size = INT_MAX) { - if (buffer == nullptr) { - return Status(StatusCode::INTERNAL, "No payload"); - } - - Status result = g_core_codegen_interface->ok(); - if (result.ok()) { - paddle::operators::distributed::GrpcByteSource source(buffer); - int ret = msg->Parse(&source); - if (ret != 0) { - result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); - } - } - g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); - return result; - } -}; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum class GrpcMethod { - kSendVariable, - kGetVariable, - kPrefetchVariable, - kCheckpointNotify, - kGetVariableNoBarrier, - kGetMonomerVariable, - kGetMonomerBarrier, - kRequestNotify, - kRequestSendAndRecv, - // when you add new handler, change kGrpcNumMethods at the same time! -}; - -static const int kGrpcNumMethods = - static_cast(GrpcMethod::kRequestSendAndRecv) + 1; - -inline const char* GrpcMethodName(GrpcMethod id) { - switch (id) { - case GrpcMethod::kSendVariable: - return "/sendrecv.SendRecvService/SendVariable"; - case GrpcMethod::kGetVariable: - return "/sendrecv.SendRecvService/GetVariable"; - case GrpcMethod::kGetVariableNoBarrier: - return "/sendrecv.SendRecvService/GetVariableNoBarrier"; - case GrpcMethod::kGetMonomerVariable: - return "/sendrecv.SendRecvService/GetMonomerVariable"; - case GrpcMethod::kGetMonomerBarrier: - return "/sendrecv.SendRecvService/GetMonomerBarrier"; - case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendRecvService/PrefetchVariable"; - case GrpcMethod::kCheckpointNotify: - return "/sendrecv.SendRecvService/CheckpointNotify"; - case GrpcMethod::kRequestNotify: - return "/sendrecv.SendRecvService/DistributeNotify"; - case GrpcMethod::kRequestSendAndRecv: - return "/sendrecv.SendRecvService/SendAndRecvVariable"; - } - - // Shouldn't be reached. - PADDLE_THROW(platform::errors::InvalidArgument( - "Invalid id: not found valid method name")); - return nullptr; -} - -class GrpcService final { - public: - class AsyncService : public ::grpc::Service { - public: - AsyncService() { - for (int i = 0; i < kGrpcNumMethods; ++i) { - AddMethod(new ::grpc::internal::RpcServiceMethod( - GrpcMethodName(static_cast(i)), - ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); - ::grpc::Service::MarkMethodAsync(i); - } - } - virtual ~AsyncService() {} - - // Make RequestAsyncUnary public for grpc_call.h - using ::grpc::Service::RequestAsyncUnary; - }; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc deleted file mode 100644 index f7679e9fc92..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.cc +++ /dev/null @@ -1,344 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include - -#include "google/protobuf/io/coded_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_variable_response.h" -#include "paddle/fluid/operators/distributed/send_recv.pb.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/profiler.h" - -namespace google { -namespace protobuf { -namespace io { -class ZeroCopyInputStream; -} // namespace io -} // namespace protobuf -} // namespace google -namespace grpc { -class ByteBuffer; -} // namespace grpc - -namespace paddle { -namespace operators { -namespace distributed { - -enum WireType { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, -}; - -inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } - -inline WireType GetTagWireType(uint32_t tag) { - return static_cast(tag & 0x7); -} - -bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, - int* result) { - uint64_t v; - if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { - *result = static_cast(v); - return true; - } else { - return false; - } -} - -int GRPCVariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { - GrpcByteBufferSource source; - source.Init(byte_buffer); - GrpcByteBufferSourceWrapper r(&source); - - return Parse(&r); -} - -bool ParseLodData(::google::protobuf::io::CodedInputStream* input, - std::vector* lod) { - while (true) { - auto p = input->ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - - if (!p.second) { - return (tag == 0); - } - - switch (tag) { - case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { - uint64_t v; - if (wt == WIRETYPE_VARINT) { - if (!input->ReadVarint64(&v)) { - return false; - } - lod->push_back(v); - break; - } - - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input->ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input->CurrentPosition(); - while (input->CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input->ReadVarint64(&v)) { - return tag; - } - lod->push_back(v); - } - break; - } - - return false; - } - default: { return false; } - } - } - - return true; -} - -int GRPCVariableResponse::Parse(Source* source) { - ::google::protobuf::io::ZeroCopyInputStream* input_stream = - source->contents(); - ::google::protobuf::io::CodedInputStream input(input_stream); - input.SetTotalBytesLimit(INT_MAX, INT_MAX); - - while (true) { - auto p = input.ReadTagWithCutoff(127); - int tag = GetTagFieldNumber(p.first); - WireType wt = GetTagWireType(p.first); - if (!p.second) { - if (tag != 0) { - return -1; - } - return 0; - } - - switch (tag) { - case sendrecv::VariableMessage::kVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - 
return tag; - } - - meta_.set_varname(temp); - break; - } - case sendrecv::VariableMessage::kTypeFieldNumber: { - uint32_t v; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_type(static_cast<::sendrecv::VarType>(v)); - break; - } - case sendrecv::VariableMessage::kDataTypeFieldNumber: { - uint32_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint32(&v)) { - return tag; - } - - meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); - break; - } - case sendrecv::VariableMessage::kDimsFieldNumber: { - // not packed - if (wt == WIRETYPE_VARINT) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - break; - } - - // packed - if (wt == WIRETYPE_LENGTH_DELIMITED) { - int num_bytes = 0; - if (!input.ReadVarintSizeAsInt(&num_bytes)) { - return tag; - } - int start_pos = input.CurrentPosition(); - while (input.CurrentPosition() - start_pos < num_bytes) { - uint64_t v; - if (!input.ReadVarint64(&v)) { - return tag; - } - meta_.add_dims(v); - } - break; - } - return tag; - } - case sendrecv::VariableMessage::kLodLevelFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_lod_level(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kLodFieldNumber: { - int length = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &length)) { - return tag; - } - - std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = - input.IncrementRecursionDepthAndPushLimit(length); - - std::vector lod_data; - if (p.second < 0 || !ParseLodData(&input, &lod_data)) { - return tag; - } - - if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { - return tag; - } - - if (lod_data.size() == 0) { - break; - } - - auto lod = meta_.add_lod(); - for (uint32_t i = 0; i < lod_data.size(); i++) { - lod->add_lod_data(lod_data[i]); - } - break; - } - case sendrecv::VariableMessage::kSlrHeightFieldNumber: { - uint64_t v = 0; - if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { - return tag; - } - meta_.set_slr_height(static_cast(v)); - break; - } - case sendrecv::VariableMessage::kSerializedFieldNumber: { - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!ProcSerializedField(tag, &input, num_bytes)) { - return tag; - } - - break; - } - case sendrecv::VariableMessage::kRowsFieldNumber: { - PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || - meta_.type() == sendrecv::LOD_TENSOR) && - meta_.varname() != "", - platform::errors::PreconditionNotMet( - "meta info should be got first!")); - - int num_bytes = 0; - if (wt != WIRETYPE_LENGTH_DELIMITED || - !ReadVarintSizeAsInt(&input, &num_bytes)) { - return tag; - } - - if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { - return tag; - } - break; - } - case sendrecv::VariableMessage::kOutVarnameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_out_varname(temp); - break; - } - case sendrecv::VariableMessage::kProfileFieldNumber: { - uint64_t profiling = 0; - if (!input.ReadVarint64(&profiling)) { - return tag; - } - meta_.set_profile(profiling); - int64_t listener_id = platform::ListenerId(); - if (listener_id <= 0) { - break; - } - if (profiling == platform::kEnableProfiler && - !platform::IsProfileEnabled()) { 
- platform::EnableProfiler(platform::ProfilerState::kCPU); - } else if (profiling == platform::kDisableProfiler && - platform::IsProfileEnabled()) { - platform::DisableProfiler( - platform::EventSortingKey::kDefault, - string::Sprintf("%s_%lld", FLAGS_rpc_server_profile_path, - listener_id)); - } - break; - } - case sendrecv::VariableMessage::kTrainerIdFieldNumber: { - uint64_t trainer_id = 0; - if (!input.ReadVarint64(&trainer_id)) { - return tag; - } - meta_.set_trainer_id(trainer_id); - break; - } - case sendrecv::VariableMessage::kTableNameFieldNumber: { - uint32_t length; - if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { - return tag; - } - - std::string temp; - if (!input.ReadString(&temp, length)) { - return tag; - } - - meta_.set_table_name(temp); - break; - } - default: { - // Unknown tag, return unknown error. - return -1; - } - } - } - - return 0; -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h b/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h deleted file mode 100644 index 4d12b4a4bac..00000000000 --- a/paddle/fluid/operators/distributed/grpc/grpc_variable_response.h +++ /dev/null @@ -1,67 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "google/protobuf/io/coded_stream.h" -#include "google/protobuf/io/zero_copy_stream.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/operators/distributed/grpc/grpc_bytebuffer_stream.h" -#include "paddle/fluid/operators/distributed/variable_response.h" - -namespace grpc { -class ByteBuffer; -} // namespace grpc -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class GRPCVariableResponse : public VariableResponse { - public: - GRPCVariableResponse(const framework::Scope* scope, - const platform::DeviceContext* dev_ctx, - bool create_scope = false) - : VariableResponse(scope, dev_ctx, create_scope) {} - - virtual ~GRPCVariableResponse() {} - - int Parse(Source* source) override; - - // return: - // 0:ok. - // -1: unkown error. - // other: number of error field. 
- int Parse(const ::grpc::ByteBuffer& byte_buffer); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.cc b/paddle/fluid/operators/distributed/heart_beat_monitor.cc deleted file mode 100644 index 9f537f53348..00000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.cc +++ /dev/null @@ -1,97 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include - -namespace paddle { -namespace operators { -namespace distributed { - -DEFINE_int32(worker_update_interval_secs, 900, - " the longest time interval between the worker update variables"); - -inline int GetCurrentUS() { - // current date/time based on current system - time_t t = std::time(0); - int now = static_cast(t); - return now; -} - -void HeartBeatMonitor::Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status) { - if (status == UNINITED) { - LOG(WARNING) << "HeartBeatMonitor receive UNINITED status can not be used " - "in Update, something error"; - } - - if (!is_chief_) { - return; - } - - if ((be_monitored_var == be_monitored_var_ && status == RUNNING) || - status == COMPLETED) { - auto timestamp = GetCurrentUS(); - UnderMonitoredWorker& worker = worker_status_map_.at(worker_id); - - if (worker.status != COMPLETED) { - worker.status = status; - } - worker.timestamp = timestamp; - return; - } -} - -void HeartBeatMonitor::LostWorkerMonitor() { - VLOG(1) << "worker heartbeat monitor start at No.0 parameter server"; - while (running_) { - for (int id = 0; id < workers_; ++id) { - auto& worker = worker_status_map_.at(id); - - if (worker.status == UNINITED) { - VLOG(4) << "worker " << worker.id << " is under UNINITED"; - continue; - } - if (worker.status == COMPLETED) { - VLOG(4) << "worker " << worker.id << " is under COMPLETED"; - continue; - } - - auto timestamp = GetCurrentUS(); - - VLOG(4) << "worker " << worker.id << " status is " << worker.status - << " timestamp is " << worker.timestamp << " the interval is " - << timestamp - worker.timestamp; - - if (timestamp - worker.timestamp >= FLAGS_worker_update_interval_secs) { - PADDLE_THROW(platform::errors::ExecutionTimeout( - "the latest update of worker %d is %d secs ago, we doubt the " - "the worker is not alive and this may have a bad effect on the " - "fitting result, please check", - worker.id, FLAGS_worker_update_interval_secs)); - } - } - - std::this_thread::sleep_for(std::chrono::milliseconds(10 * 1000)); - } - VLOG(1) << "worker heartbeat monitor stopped, thread exit"; -} - -std::once_flag HeartBeatMonitor::init_flag_; -std::unique_ptr HeartBeatMonitor::monitor_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor.h b/paddle/fluid/operators/distributed/heart_beat_monitor.h deleted file mode 
100644 index d96433c318b..00000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor.h +++ /dev/null @@ -1,127 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum WorkerStatus { UNINITED = 0, RUNNING, COMPLETED }; - -struct UnderMonitoredWorker { - int id; - WorkerStatus status; - int timestamp; - - UnderMonitoredWorker() {} - - explicit UnderMonitoredWorker(int worker_id) { - this->id = worker_id; - this->status = UNINITED; - this->timestamp = 0; - } -}; - -class HeartBeatMonitor { - public: - explicit HeartBeatMonitor(int workers, bool is_chief, - std::string be_monitored_var) - : workers_(workers), - is_chief_(is_chief), - be_monitored_var_(be_monitored_var), - running_(true) { - PADDLE_ENFORCE_GT(workers, 0, platform::errors::InvalidArgument( - "workers must greater than 0.")); - - for (auto worker_id = 0; worker_id < workers; worker_id++) { - UnderMonitoredWorker worker(worker_id); - worker_status_map_[worker_id] = std::move(worker); - } - - // we define the No.0 pserver is the first parameter server - // only No.0 will check the heartbeat of all trainers - if (is_chief) { - monitor_thread_.reset(new std::thread( - std::bind(&HeartBeatMonitor::LostWorkerMonitor, this))); - } - } - - ~HeartBeatMonitor() { - running_ = false; - if (monitor_thread_) monitor_thread_->join(); - } - - static void Init(int workers, bool is_chief, std::string be_monitored_var) { - std::call_once(init_flag_, &HeartBeatMonitor::InitImpl, workers, is_chief, - be_monitored_var); - } - - static HeartBeatMonitor* GetInstance() { return monitor_.get(); } - - void Stop() { - running_ = false; - if (!monitor_) { - VLOG(0) << "HeartBeatMonitor is not inited, do nothing"; - } else { - if (monitor_thread_) { - monitor_thread_->join(); - monitor_thread_.reset(nullptr); - } - } - } - - void Update(const int worker_id, std::string be_monitored_var, - WorkerStatus status); - - void LostWorkerMonitor(); - - private: - // Init is called by GetInstance. 
- static void InitImpl(int workers, bool is_chief, - std::string be_monitored_var) { - if (monitor_ == nullptr) { - monitor_.reset(new HeartBeatMonitor(workers, is_chief, be_monitored_var)); - } - } - - static std::once_flag init_flag_; - static std::unique_ptr monitor_; - - int workers_; - bool is_chief_; - std::string be_monitored_var_; - std::unordered_map worker_status_map_; - std::unique_ptr monitor_thread_{nullptr}; - std::mutex mutex_; - bool running_ = false; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc b/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc deleted file mode 100644 index 8505023f63a..00000000000 --- a/paddle/fluid/operators/distributed/heart_beat_monitor_test.cc +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" - -#include "gtest/gtest.h" - -namespace paddle { -namespace operators { -namespace distributed { - -void run(HeartBeatMonitor* monitor) { monitor->LostWorkerMonitor(); } - -TEST(HeartBeatMonitor, All) { - int trainers = 10; - int pserver_id = 0; - std::string var = "nce_w@GRAD.block0"; - std::string var2 = "nce_w@GRAD.block2"; - - HeartBeatMonitor::Init(trainers, pserver_id == 0, var); - - auto* monitor = HeartBeatMonitor::GetInstance(); - - std::vector ids{1, 3, 5, 7}; - - for (auto& id : ids) { - monitor->Update(id, var, RUNNING); - } - - monitor->Update(9, var2, RUNNING); - monitor->Update(2, var, COMPLETED); - - std::thread t(run, monitor); - t.detach(); - - std::this_thread::sleep_for(std::chrono::milliseconds(15 * 1000)); - - monitor->Stop(); -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.cc b/paddle/fluid/operators/distributed/large_scale_kv.cc deleted file mode 100644 index d2673ed6ffb..00000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag LargeScaleKV::init_flag_; -std::shared_ptr LargeScaleKV::scale_kv_(nullptr); - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/large_scale_kv.h b/paddle/fluid/operators/distributed/large_scale_kv.h deleted file mode 100644 index da2281231fc..00000000000 --- a/paddle/fluid/operators/distributed/large_scale_kv.h +++ /dev/null @@ -1,848 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include -#include -#include -#include -#include "gflags/gflags.h" - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/rw_lock.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/port.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/string_helper.h" - -namespace paddle { -namespace operators { -namespace distributed { - -enum Mode { training, infer }; -enum InitType { uniform_random, fill_constant, gaussian_random }; - -inline std::vector bucket(const int v_size, const int b_size) { - int remainder = v_size % b_size; - int bucket = v_size / b_size; - std::vector ret_vec(b_size, bucket); - for (int i = 0; i < remainder; ++i) { - ret_vec[i] = ret_vec[i] + 1; - } - int cur_bucket = 0; - for (int &j : ret_vec) { - int tmp = j; - j = cur_bucket; - cur_bucket += tmp; - } - ret_vec.push_back(cur_bucket); - return ret_vec; -} - -class Initializer { - public: - Initializer() {} - - explicit Initializer(const std::vector &attrs) {} - - virtual float GetValue() = 0; - - virtual ~Initializer() {} - - protected: - std::string name_; - unsigned int seed_; -}; - -class UniformInitializer : public Initializer { - public: - explicit UniformInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - min_ = std::stof(attrs[2]); - max_ = std::stof(attrs[3]); - - dist_ = std::uniform_real_distribution(min_, max_); - random_engine_ = framework::GetCPURandomEngine(seed_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float min_; - float max_; - - std::shared_ptr random_engine_; - std::uniform_real_distribution dist_; -}; - -template -inline bool entry(const int count, const T threshold); - -template <> -inline bool entry(const int count, const std::string threshold) { - return true; -} - -template <> -inline bool entry(const int count, const 
int threshold) { - return count >= threshold; -} - -template <> -inline bool entry(const int count, const float threshold) { - UniformInitializer uniform = UniformInitializer({"0", "0", "1"}); - return uniform.GetValue() >= threshold; -} - -class GaussianInitializer : public Initializer { - public: - explicit GaussianInitializer(const std::vector &attrs) { - name_ = attrs[0]; - seed_ = static_cast(std::stoi(attrs[1])); - mean_ = std::stof(attrs[2]); - std_ = std::stof(attrs[3]); - - random_engine_ = framework::GetCPURandomEngine(seed_); - - dist_ = std::normal_distribution(mean_, std_); - } - - float GetValue() override { return dist_(*random_engine_); } - - private: - float std_; - float mean_; - - std::shared_ptr random_engine_; - std::normal_distribution dist_; -}; - -class FillConstantInitializer : public Initializer { - public: - explicit FillConstantInitializer(const std::vector &attrs) { - name_ = attrs[0]; - value_ = std::stof(attrs[1]); - } - - float GetValue() override { return value_; } - - private: - float value_; -}; - -struct SparseMeta { - std::string name; - std::string grad_name; - std::vector value_names; - std::vector value_dims; - std::vector cached_varnames; - std::vector initializer_attrs; - std::string entry; - Mode mode; - - std::string ToString() { - std::stringstream ss; - ss << "name: " << name << " "; - ss << "mode: " << mode << " "; - - for (int i = 0; i < static_cast(value_names.size()); i++) { - ss << "value_name: " << value_names[i] << " dim: " << value_dims[i] - << " "; - } - - ss << " grad var: " << grad_name; - - ss << " cached varnames: "; - for (int i = 0; i < static_cast(cached_varnames.size()); i++) { - ss << cached_varnames[i] << " "; - } - - ss << " initializer attrs: "; - for (int i = 0; i < static_cast(initializer_attrs.size()); i++) { - ss << initializer_attrs[i] << " "; - } - - ss << " entry attrs: " << entry; - - return ss.str(); - } -}; - -struct VALUE { - explicit VALUE(const std::vector &names) - : names_(names), count_(0), unseen_days_(0) { - values_.resize(names.size()); - for (int i = 0; i < static_cast(names.size()); i++) { - places[names[i]] = i; - } - } - - void set(std::vector> *values) { - values_ = std::move(*values); - } - - void set(const std::vector &names, - const std::vector> &values) { - for (int i = 0; i < static_cast(names.size()); i++) { - auto idx = places[names[i]]; - auto value = values[i]; - values_[idx].assign(value.begin(), value.end()); - } - } - - std::vector *> get() { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (auto &value : values_) { - pts.push_back(&value); - } - return pts; - } - - int fetch_count() { return ++count_; } - void reset_unseen_days() { unseen_days_ = 0; } - - void set_entry(bool is_entry) { is_entry_ = is_entry; } - - bool get_entry() { return is_entry_; } - - std::vector *> get(const std::vector names) { - auto pts = std::vector *>(); - pts.reserve(values_.size()); - - for (int i = 0; i < static_cast(names.size()); i++) { - pts.push_back(&(values_[places[names[i]]])); - } - return pts; - } - - std::vector names_; - int count_; - bool seen_after_last_save_; - int unseen_days_; - bool is_entry_; - std::vector> values_; - std::unordered_map places; -}; - -class ValueBlock { - public: - explicit ValueBlock(const std::vector value_names, - const std::vector value_dims, const Mode &mode, - const std::vector &init_attrs, - const std::string &entry_attr) - : value_names_(value_names), value_dims_(value_dims), mode_(mode) { - // for Initializer - for (size_t i = 0; i < 
value_names.size(); i++) { - auto name = value_names[i]; - auto slices = string::split_string(init_attrs[i], "&"); - - if (slices[0] == "gaussian_random") { - initializers_[name] = new GaussianInitializer(slices); - } else if (slices[0] == "fill_constant") { - initializers_[name] = new FillConstantInitializer(slices); - } else if (slices[0] == "uniform_random") { - initializers_[name] = new UniformInitializer(slices); - } else { - PADDLE_THROW( - platform::errors::InvalidArgument("%s can not be supported", name)); - } - } - - // for Entry - { - if (entry_attr == "none") { - entry_func_ = - std::bind(entry, std::placeholders::_1, "none"); - } else { - auto slices = string::split_string(entry_attr, "&"); - if (slices[0] == "count_filter") { - int threshold = std::stoi(slices[1]); - entry_func_ = std::bind(entry, std::placeholders::_1, threshold); - } else if (slices[0] == "probability") { - float threshold = std::stof(slices[1]); - entry_func_ = - std::bind(entry, std::placeholders::_1, threshold); - } - } - } - - rwlock_.reset(new framework::RWLock); - } - - ~ValueBlock() { - // for (auto init : initializers_) { - // delete init.second; - // initializers_.erase(init.first); - // } - // - // for (auto value : values_) { - // delete value.second; - // values_.erase(value.first); - // } - } - - void Init(const int64_t &id, std::vector> *values, - int count) { - if (Has(id)) { - PADDLE_THROW(platform::errors::AlreadyExists("id already exist, error")); - } - - if (values->size() != value_names_.size()) { - PADDLE_THROW( - platform::errors::AlreadyExists("values can not match, error")); - } - - auto value = new VALUE(value_names_); - value->set(values); - value->seen_after_last_save_ = true; - value->count_ = count; - values_[id] = value; - } - - std::vector *> Get( - const int64_t &id, const std::vector &value_names) { - rwlock_->RDLock(); - auto ret_values = values_.at(id)->get(value_names); - rwlock_->UNLock(); - return ret_values; - } - - void InitFromInitializer(const int64_t &id, - const std::vector &value_names) { - rwlock_->WRLock(); - - if (Has(id)) { - Update(id); - rwlock_->UNLock(); - return; - } - - auto rets = std::vector>(); - rets.resize(value_names_.size()); - - for (int i = 0; i < static_cast(value_names_.size()); i++) { - auto name = value_names_[i]; - auto *init = initializers_.at(name); - - auto dim = value_dims_[i]; - rets[i].resize(dim); - - for (int j = 0; j < static_cast(dim); j++) { - rets[i][j] = init->GetValue(); - } - } - - Init(id, &rets, 0); - Update(id); - rwlock_->UNLock(); - } - - bool GetEntry(const int64_t &id) { - rwlock_->RDLock(); - auto value = values_.at(id); - auto entry = value->get_entry(); - rwlock_->UNLock(); - return entry; - } - - void Set(const int64_t &id, const std::vector &value_names, - const std::vector> &values) { - rwlock_->WRLock(); - auto value = values_.at(id); - value->set(value_names, values); - rwlock_->UNLock(); - } - - void Update(const int64_t id) { - auto *value = values_.at(id); - value->reset_unseen_days(); - auto count = value->fetch_count(); - - if (!value->get_entry()) { - value->set_entry(entry_func_(count)); - } - } - - private: - bool Has(const int64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { - return false; - } else { - return true; - } - } - - public: - std::unordered_map values_; - - private: - std::vector value_names_; - std::vector value_dims_; - Mode mode_; - std::function entry_func_; - std::unordered_map initializers_; - std::unique_ptr rwlock_{nullptr}; -}; - -class SparseVariable { - 
public: - explicit SparseVariable(const SparseMeta &meta) { - meta_.name = meta.name; - meta_.mode = meta.mode; - meta_.value_names = meta.value_names; - meta_.value_dims = meta.value_dims; - meta_.grad_name = meta.grad_name; - meta_.cached_varnames = meta.cached_varnames; - meta_.initializer_attrs = meta.initializer_attrs; - meta_.entry = meta.entry; - - for (int i = 0; i < static_cast(meta_.value_names.size()); i++) { - values_dims_[meta_.value_names[i]] = meta_.value_dims[i]; - } - - for (size_t i = 0; i < shard_num_; i++) { - auto block = std::make_shared( - meta.value_names, meta.value_dims, meta.mode, meta.initializer_attrs, - meta.entry); - shard_blocks_.emplace_back(block); - } - - rwlock_.reset(new framework::RWLock); - } - - void Init(const std::vector &ids) { - rwlock_->RDLock(); - for (auto &id : ids) { - auto *block = GetShard(id); - block->InitFromInitializer(id, meta_.value_names); - } - rwlock_->UNLock(); - } - - void Get(const std::vector &ids, - const std::vector &value_names, - std::vector *>> *values) { - values->resize(ids.size()); - - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back( - framework::Async([begin, end, &values, &ids, &value_names, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto id_values = block->Get(id, value_names); - (*values)[x] = id_values; - } - })); - } - - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void GetEntry(const std::vector &ids, std::vector *values) { - auto buckets = bucket(ids.size(), 8); - std::vector> fs; - - for (int j = 0; j < 8; ++j) { - auto begin = buckets[j]; - auto end = buckets[j + 1]; - - fs.push_back(framework::Async([begin, end, &values, &ids, this]() { - for (int x = begin; x < end; x++) { - auto id = ids[x]; - auto *block = GetShard(id); - auto is_entry = block->GetEntry(id); - - if (!is_entry) { - values->push_back(id); - } - } - })); - } - for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); - } - - void Set(const std::vector &ids, - const std::vector &value_names, - const std::vector>> &values) { - for (int i = 0; i < static_cast(ids.size()); i++) { - GetShard(ids[i])->Set(ids[i], value_names, values[i]); - } - } - - void Dims(std::vector value_names, std::vector *dims) { - for (auto &name : value_names) { - dims->push_back(values_dims_.at(name)); - } - } - - std::vector CachedVarnames() const { - return meta_.cached_varnames; - } - - void Load(const std::string &dirname) { - rwlock_->WRLock(); - VLOG(1) << "load " << meta_.name << " from dir: " << dirname << " begin"; - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - LoadFromSelectedRows(filenames, meta_.value_names); - VLOG(1) << "load " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void LoadFromSelectedRows(const std::vector &filenames, - const std::vector &valuenames) { - std::vector> variables; - auto place = platform::CPUPlace(); - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto var = std::make_shared(); - variables.push_back(var); - auto &filename = filenames[i]; - std::ifstream fin(filename, std::ios::binary); - auto *selectedRows = var->GetMutable(); - - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - 
framework::DeserializeFromStream(fin, selectedRows, dev_ctx); - selectedRows->SyncIndex(); - } - - std::vector tensors; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &slr = variables[i]->Get(); - auto src_t = slr.value(); - const auto *value = src_t.data(); - tensors.push_back(value); - } - - for (int i = 1; i < static_cast(filenames.size()); i++) { - auto rows_0 = variables[0]->Get().rows(); - auto rows_i = variables[i]->Get().rows(); - - bool is_equal = std::equal(rows_0.begin(), rows_0.end(), rows_i.begin()); - - if (!is_equal) { - PADDLE_THROW(platform::errors::InvalidArgument( - "%s and %s are not equal, can not be load rightly", filenames[0], - filenames[i])); - } - } - - auto rows = variables[0]->Get().rows(); - - for (auto i = 0; i < static_cast(rows.size()); i++) { - auto id = rows[i]; - std::vector> values; - values.resize(filenames.size()); - - for (int j = 0; j < static_cast(filenames.size()); ++j) { - values[j].resize(meta_.value_dims[j]); - std::memcpy(values[j].data(), tensors[j] + i * meta_.value_dims[j], - sizeof(float) * meta_.value_dims[j]); - } - - auto *block = GetShard(id); - block->Init(id, &values, 0); - block->Update(id); - } - } - - void Save(const std::string &dirname, const int mode = 0) { - rwlock_->WRLock(); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " begin"; - - MkDirRecursively(dirname.c_str()); - - std::vector filenames; - for (auto &value_name : meta_.value_names) { - auto filename = string::Sprintf("%s/%s", dirname, value_name); - filenames.push_back(filename); - } - - SaveToSelectedRows(filenames, meta_.value_names, mode); - VLOG(3) << "save " << meta_.name << " in dir: " << dirname << " done"; - rwlock_->UNLock(); - } - - void SaveToSelectedRows(const std::vector &filenames, - const std::vector &valuenames, - const int mode) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - auto place = platform::CPUPlace(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - std::vector ids; - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - if (mode == 0) { - ids.push_back(value.first); - } else { - bool id_need_save = false; - // save all params - if (mode == 1) { - id_need_save = true; - } else { - id_need_save = value.second->seen_after_last_save_; - } - - if (id_need_save) { - ids.push_back(value.first); - } - value.second->seen_after_last_save_ = false; - } - } - } - - VLOG(3) << "save " << ids.size() << " feasigns for " << meta_.name - << " with mode: " << mode; - - std::vector> variables; - std::vector tensors; - std::vector dims; - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto dim = values_dims_.at(valuenames[i]); - auto var = std::make_shared(); - auto *slr = var->GetMutable(); - auto *src_t = slr->mutable_value(); - - src_t->Resize({static_cast(ids.size()), dim}); - auto *value = src_t->mutable_data(place); - - dims.push_back(dim); - variables.push_back(var); - tensors.push_back(value); - } - - std::vector *>> values; - Get(ids, valuenames, &values); - - int64_t offset = 0; - for (auto &vss : values) { - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::memcpy(tensors[i] + offset * dims[i], vs->data(), - sizeof(float) * dims[i]); 
- } - offset += 1; - } - - for (auto &var : variables) { - auto *slr = var->GetMutable(); - slr->set_rows(ids); - slr->set_height(ids.size()); - } - - for (int i = 0; i < static_cast(filenames.size()); i++) { - auto &filename = filenames[i]; - auto &selectedRows = variables[i]->Get(); - - std::ofstream fout(filename, std::ios::binary); - PADDLE_ENFORCE_EQ(static_cast(fout), true, - platform::errors::Unavailable( - "Cannot open %s to save variables.", filename)); - - framework::SerializeToStream(fout, selectedRows, dev_ctx); - fout.close(); - } - } - - void SaveToText(const std::vector &filenames, - const std::vector &valuenames) { - for (auto &value_name : valuenames) { - auto it = std::find(meta_.value_names.begin(), meta_.value_names.end(), - value_name); - if (it == meta_.value_names.end()) { - PADDLE_THROW(platform::errors::InvalidArgument( - "[%s] is invalid param for [%s]", value_name, meta_.name)); - } - } - - std::vector> fouts; - - for (auto filename : filenames) { - std::unique_ptr fout(new std::ofstream(filename)); - fouts.push_back(std::move(fout)); - } - - for (auto &block : shard_blocks_) { - for (auto value : block->values_) { - std::vector *> vss = value.second->get(valuenames); - - auto id = value.first; - - for (int i = 0; i < static_cast(vss.size()); i++) { - auto &vs = vss[i]; - std::stringstream ss; - ss << id << "\t"; - ss << vs->size() << "\t"; - for (auto v : (*vs)) { - ss << v << " "; - } - ss << "\n"; - - fouts[i]->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - } - } - } - - for (int i = 0; i < static_cast(fouts.size()); i++) { - fouts[i]->close(); - } - } - - int64_t Size() { - int64_t cnt = 0; - - for (auto &block : shard_blocks_) { - cnt += block->values_.size(); - } - return cnt; - } - - ValueBlock *GetShard(const int64_t id) { - return shard_blocks_[id & shard_mask_].get(); - } - - SparseMeta *GetMeta() { return &meta_; } - - private: - std::unique_ptr rwlock_{nullptr}; - - SparseMeta meta_; - std::unordered_map values_dims_; - const size_t shard_mask_ = 127; - const size_t shard_num_ = 128; - std::vector> shard_blocks_; -}; - -class LargeScaleKV { - public: - LargeScaleKV() {} - - explicit LargeScaleKV(const std::vector &table_metas) { - for (auto &sparse_meta : table_metas) { - auto table_name = sparse_meta.name; - auto meta = std::shared_ptr( - new SparseVariable(std::move(sparse_meta))); - sparse_variables[table_name] = meta; - grad_to_variables[sparse_meta.grad_name] = table_name; - grad_names_.push_back(sparse_meta.grad_name); - } - } - - ~LargeScaleKV() {} - - static std::shared_ptr GetInstantcePtr() { return scale_kv_; } - - static LargeScaleKV *GetInstance() { return scale_kv_.get(); } - - static LargeScaleKV *InitInstance( - const std::vector &table_metas) { - std::call_once(init_flag_, &LargeScaleKV::Init, table_metas); - return scale_kv_.get(); - } - - static void Init(const std::vector &table_metas) { - if (scale_kv_.get() == nullptr) { - scale_kv_.reset(new LargeScaleKV(table_metas)); - } - } - - SparseVariable *Get(const std::string &name) { - auto variable = sparse_variables.at(name); - return variable.get(); - } - - bool ParamInLargeScale(const std::string &name) { - auto got = sparse_variables.find(name); - - if (got == sparse_variables.end()) { - return false; - } - - return true; - } - - bool GradInLargeScale(const std::string &name) { - auto got = grad_to_variables.find(name); - - if (got == grad_to_variables.end()) { - return false; - } - - return true; - } - - SparseVariable *GetByGrad(const std::string &name) { - return 
Get(grad_to_variables[name]); - } - - const std::vector &GetAllGrads() { return grad_names_; } - - private: - std::unordered_map> - sparse_variables; - std::unordered_map grad_to_variables; - std::vector grad_names_; - static std::shared_ptr scale_kv_; - static std::once_flag init_flag_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc deleted file mode 100644 index 558d70e5c33..00000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_prefetch.h" -#include -#include -#include -#include -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/distributed.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -static void SplitIdsIntoMultipleVarsBySection( - const std::vector &in_ids, - const std::vector &in_varnames, const int tables, - const int pservers, const bool is_distibuted, framework::Scope *scope, - std::vector> *splited_ids, - std::vector> *origin_ids) { - PADDLE_ENFORCE_EQ( - in_varnames.size(), tables, - platform::errors::OutOfRange( - "send varnames size: %d not equal table number: %d, internal error", - in_varnames.size(), tables)); - - PADDLE_ENFORCE_LE( - tables, pservers, - platform::errors::OutOfRange("table number %d not equal or less than " - "pserver number: %d, internal error", - tables, pservers)); - - auto place = platform::CPUPlace(); - - std::set st(in_ids.begin(), in_ids.end()); - std::vector all_ids; - all_ids.assign(st.begin(), st.end()); - - splited_ids->resize(tables); - origin_ids->resize(tables); - - if (is_distibuted) { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*splited_ids)[pserver_id].push_back(id); - (*origin_ids)[pserver_id].push_back(id); - } - } else { - for (auto &id : all_ids) { - auto pserver_id = id % pservers; - (*origin_ids)[pserver_id].push_back(id); - id = id / pservers; - (*splited_ids)[pserver_id].push_back(id); - } - } - - for (size_t i = 0; i < in_varnames.size(); ++i) { - auto *id_tensor = - scope->Var(in_varnames[i])->GetMutable(); - - auto &ids = (*splited_ids)[i]; - if (!ids.empty()) { - auto *id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({static_cast(ids.size()), 1}), place); - memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); - } - } -} - -typedef std::vector> 
TableAndEndpoints; - -void prefetch_core( - const std::vector &ids, const TableAndEndpoints &tables, - const framework::ExecutionContext &context, const framework::Scope &scope, - const bool is_distributed, - std::unordered_map> *recved_vec_map) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - context.Attr("trainer_id")); - - int pservers = context.Attr("pserver_num"); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &actual_ctx = *pool.Get(platform::CPUPlace()); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < tables.size(); ++i) { - in_var_names.push_back("prefetch_send@" + tables[i].second); - out_var_names.push_back("prefetch_recv@" + tables[i].second); - } - - std::vector> split_ids; - std::vector> origin_ids; - SplitIdsIntoMultipleVarsBySection(ids, in_var_names, tables.size(), pservers, - is_distributed, local_scope.get(), - &split_ids, &origin_ids); - - // create output var in local scope - for (auto &name : out_var_names) { - local_scope->Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(*local_scope.get(), in_var_names[i])) { - VLOG(3) << "sending " << in_var_names[i] << " to " << tables[i].second - << " to get " << out_var_names[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - tables[i].second, actual_ctx, *local_scope.get(), in_var_names[i], - out_var_names[i], tables[i].first)); - } else { - VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - for (size_t o_idx = 0; o_idx < out_var_names.size(); ++o_idx) { - auto &ids_in_this_section = origin_ids[o_idx]; - - if (!ids_in_this_section.empty()) { - auto &prefetch_out_var = - local_scope->Var(out_var_names[o_idx])->Get(); - const auto *out_var_data = prefetch_out_var.data(); - auto &dims = prefetch_out_var.dims(); - - PADDLE_ENFORCE_EQ(dims.size(), 2, - platform::errors::InvalidArgument( - "The size of Tensor dims must be 2.")); - PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0], - platform::errors::InvalidArgument( - "The size of ids in this section must equal to " - "dims[0]: %s, but got %s", - dims[0], ids_in_this_section.size())); - - auto row_numel = dims[1]; - - for (int64_t i = 0; i < dims[0]; ++i) { - auto origin_id = ids_in_this_section[i]; - std::vector vecs(row_numel); - - std::copy_n(out_var_data + i * row_numel, row_numel, vecs.begin()); - (*recved_vec_map)[origin_id] = vecs; - } - } else { - VLOG(3) << "ids in this section is empty"; - } - } -} - -void prefetch(const std::string &id_name, const std::string &out_name, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - prefetchs({id_name}, {out_name}, persistable_var_name, is_distributed, - table_names, endpoints, context, scope); -} - -void prefetchs(const std::vector &id_var_names, - const std::vector &out_var_names, - const std::string &persistable_var_name, - const bool is_distributed, - const std::vector &table_names, - const std::vector &endpoints, - const framework::ExecutionContext &context, - const framework::Scope &scope) { - auto vec_dim_1 = 0; - auto 
vec_dim_0 = 0; - framework::Variable *var = scope.FindVar(persistable_var_name); - - if (var->IsType()) { - vec_dim_1 = var->Get().value().dims()[1]; - } else { - vec_dim_0 = var->Get().dims()[0]; - vec_dim_1 = var->Get().dims()[1]; - } - - PADDLE_ENFORCE_GT(vec_dim_1, 0, - platform::errors::InvalidArgument( - "lookup table var's dim must gather than 0")); - - const auto place = - scope.FindVar(id_var_names[0])->Get().place(); - - std::vector> ids_group; - std::vector ids_union; - std::vector ids_lods; - TableAndEndpoints tables; - - for (auto &id_name : id_var_names) { - auto &id_tensor = scope.FindVar(id_name)->Get(); - std::vector ids; - TensorToVector(id_tensor, context.device_context(), &ids); - ids_union.insert(ids_union.end(), ids.begin(), ids.end()); - ids_group.push_back(ids); - ids_lods.push_back(id_tensor.lod()); - } - - std::unordered_set s(ids_union.begin(), ids_union.end()); - ids_union.assign(s.begin(), s.end()); - - for (auto &i : ids_union) { - PADDLE_ENFORCE_GE( - i, 0, platform::errors::OutOfRange( - "each element in embedding should be larger or equal 0")); - if (!is_distributed) { - PADDLE_ENFORCE_LT( - i, vec_dim_0, - platform::errors::OutOfRange( - "embedding id must in [0, %d) when is_distributed False", - vec_dim_0)); - } - } - - for (size_t i = 0; i < table_names.size(); i++) { - tables.push_back(std::make_pair(table_names[i], endpoints[i])); - } - std::unordered_map> recved_vec_map; - prefetch_core(ids_union, tables, context, scope, is_distributed, - &recved_vec_map); - - auto padding_idx = distributed::kNoPadding; - - if (context.HasAttr("padding_idx")) { - padding_idx = context.Attr("padding_idx"); - } - - for (size_t i = 0; i < out_var_names.size(); i++) { - std::vector ids = ids_group[i]; - auto ids_size = ids.size(); - auto *out_t = - scope.FindVar(out_var_names[i])->GetMutable(); - out_t->set_lod(ids_lods[i]); - out_t->Resize( - framework::make_ddim({static_cast(ids_size), vec_dim_1})); - auto *out_d = out_t->mutable_data(place); - - if (platform::is_cpu_place(out_t->place())) { - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(out_d + idx * vec_dim_1, 0, sizeof(float) * vec_dim_1); - } else { - std::copy_n(recved_vec_map[id].begin(), vec_dim_1, - out_d + idx * vec_dim_1); - } - } - } else { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - std::vector ids_value_vec(ids_size * vec_dim_1); - for (auto idx = 0; idx < static_cast(ids_size); idx++) { - const auto &id = ids[idx]; - if (padding_idx != distributed::kNoPadding && id == padding_idx) { - memset(&ids_value_vec[idx * vec_dim_1], 0, sizeof(float) * vec_dim_1); - } else { - memcpy(&ids_value_vec[idx * vec_dim_1], &recved_vec_map[id][0], - sizeof(float) * vec_dim_1); - } - } - auto &gpu_place = BOOST_GET_CONST(platform::CUDAPlace, out_t->place()); - auto &cpu_place = BOOST_GET_CONST( - platform::CPUPlace, paddle::platform::CPUDeviceContext().GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(gpu_place, out_d, cpu_place, &ids_value_vec[0], - sizeof(float) * ids_size * vec_dim_1, stream); -#else - PADDLE_ENFORCE(true, platform::errors::PermissionDenied( - "Paddle is not compiled with GPU!")); -#endif - } - } -} - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h deleted file mode 100644 index 
6fd3a998813..00000000000 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -class ExecutionContext; -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr int64_t kNoPadding = -1; - -void prefetchs(const std::vector& id_var_names, - const std::vector& out_var_names, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -void prefetch(const std::string& id_name, const std::string& out_name, - const std::string& persistable_var_name, const bool backfill, - const std::vector& table_names, - const std::vector& endpoints, - const framework::ExecutionContext& context, - const framework::Scope& scope); - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc deleted file mode 100644 index d5d3c9c3c7c..00000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include - -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/parameter_recv.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -template -void RecvSparseLodTensor(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - std::vector tensors; - std::vector rets; - std::vector recv_varnames; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - local_scope->Var(recv_var_name); - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVarNoBarrier( - rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, - recv_var_name)); - recv_varnames.push_back(recv_var_name); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - auto &recv_var_name = recv_varnames[i]; - auto *local_var = local_scope->FindVar(recv_var_name); - const auto *value = local_var->Get().data(); - tensors.push_back(value); - } - - auto *merged_var = scope.FindVar(rpc_ctx.var_name); - - if (merged_var == nullptr || !merged_var->IsInitialized()) { - PADDLE_THROW( - platform::errors::InvalidArgument("%s must initialized at first.")); - } - auto dims1 = merged_var->Get().dims()[1]; - int64_t height = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto *splited_var = local_scope->FindVar(rpc_ctx.splited_varnames[i]); - height += splited_var->Get().dims()[0]; - } - - PADDLE_ENFORCE_EQ( - merged_var->Get().dims()[0], height, - platform::errors::InvalidArgument( - "Received variable must has same dimension with local variable.")); - - auto *merged_t = merged_var->GetMutable(); - auto *merged_d = merged_t->mutable_data(cpu_place); - - auto pserver_num = rpc_ctx.splited_varnames.size(); - for (int x = 0; x < height; ++x) { - auto id = x % pserver_num; - auto idx = x / pserver_num; - std::memcpy(merged_d + x * dims1, tensors[id] + idx * dims1, - sizeof(float) * dims1); - } -} - -template -void RecvGeoSparseRecords(const CommContext &rpc_ctx, - const framework::Scope &scope) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto cpu_place = platform::CPUPlace(); - auto &cpu_ctx = *pool.Get(cpu_place); - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::unique_ptr local_scope = scope.NewTmpScope(); - - std::vector rets; - for (size_t i = 0; i < 
rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - local_scope->Var(recv_var_name); - VLOG(4) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; - // sparse param in recv_scope is LoDTensor - rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, - *local_scope.get(), recv_var_name, - recv_var_name, recv_var_name)); - } - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE(rets[i]->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - - int64_t height = 0; - int64_t ids_num = 0; - int64_t width = 0; - - std::vector all_ids; - auto pserver_num = rpc_ctx.splited_varnames.size(); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - height += recv_t.height(); - ids_num += recv_t.rows().size(); - width = recv_t.value().dims()[1]; - - if (rpc_ctx.is_distributed) { - std::copy(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids)); - } else { - std::transform(recv_t.rows().begin(), recv_t.rows().end(), - std::back_inserter(all_ids), - [&](int64_t id) { return id * pserver_num + i; }); - } - } - - auto *var = scope.FindVar(rpc_ctx.var_name); - auto *t_ = var->GetMutable(); - T *out_data = - t_->mutable_value()->mutable_data({ids_num, width}, cpu_place); - t_->set_height(height); - t_->set_rows(all_ids); - - int64_t cnt = 0; - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &recv_var_name = rpc_ctx.splited_varnames[i]; - auto *recv_var = local_scope->FindVar(recv_var_name); - auto &recv_t = recv_var->Get(); - - auto rows = recv_t.rows().size(); - const T *in_data = recv_t.value().data(); - std::copy_n(in_data, rows * width, out_data + cnt); - cnt += rows * width; - } - t_->SyncIndex(); -} - -template -void RecvLodTensor(const CommContext &rpc_ctx, const framework::Scope &scope) { - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - - // variable do not spilt - if (rpc_ctx.origin_varnames.size() == 1 && - rpc_ctx.splited_varnames.size() == 1) { - auto varname = rpc_ctx.origin_varnames[0]; - const auto place = - scope.FindVar(varname)->Get().place(); - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &ctx = *pool.Get(place); - VLOG(4) << "recv " << varname << " from " << rpc_ctx.epmap[0] << " in gpu? 
" - << platform::is_gpu_place(place); - rets.push_back(rpc_client->AsyncGetVarNoBarrier(rpc_ctx.epmap[0], ctx, - scope, varname, varname)); - - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE_NE( - rets[i]->Wait(), 0U, - platform::errors::ExecutionTimeout("internal error in RPCClient")); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; - return; - } else { - PADDLE_ENFORCE(false, platform::errors::Unimplemented( - "ParameterRecv can not recv dense with multi " - "parts now, add it soon.")); - } -} - -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, - bool geo_records) { - VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; - - PADDLE_ENFORCE_GE(rpc_ctx.origin_varnames.size(), 1, - platform::errors::InvalidArgument( - "origin_varnames.size() >= 1 is permitted")); - - if (rpc_ctx.is_sparse) { - if (geo_records) { - RecvGeoSparseRecords(rpc_ctx, scope); - } else { - RecvSparseLodTensor(rpc_ctx, scope); - } - } else { - RecvLodTensor(rpc_ctx, scope); - } - - VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; -} -template -void ParameterRecv::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope) { - this->operator()(rpc_ctx, scope, false); -} - -template struct ParameterRecv; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h deleted file mode 100644 index c30d21aa791..00000000000 --- a/paddle/fluid/operators/distributed/parameter_recv.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterRecv { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool barrier); - - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc deleted file mode 100644 index 109514ca254..00000000000 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ /dev/null @@ -1,331 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/parameter_send.h" -#include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" - -namespace paddle { -namespace framework { -class Scope; -class Tensor; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient; - -using LoDTensor = framework::LoDTensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -typedef std::vector> EP_SPLIT_TABLE_PAIRS; - -inline EP_SPLIT_TABLE_PAIRS GetMultiFieldCommContext( - const CommContext &rpc_ctx, const framework::Scope &scope, - int multi_parts) { - EP_SPLIT_TABLE_PAIRS table_pairs; - - auto *send_var = scope.FindVar(rpc_ctx.var_name); - if (send_var->IsType()) { - PADDLE_ENFORCE_GE(multi_parts, 1, - platform::errors::InvalidArgument( - "multi_parts must == 1 in parameter send, now is: %d", - multi_parts)); - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - table_pairs.push_back( - std::make_pair(rpc_ctx.epmap[i], rpc_ctx.splited_varnames[i])); - } - - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetMultiFieldCommContext unsupported LoDTensor current!")); - } - - return table_pairs; -} // namespace distributed - -void SendByNotifyRPC(const CommContext &rpc_ctx, - const framework::Scope &scope) { - auto cpu_ctx = paddle::platform::CPUDeviceContext(); - auto &send_var_name = rpc_ctx.var_name; - std::vector rets; - - distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - if (NeedSend(scope, send_var_name)) { - for (size_t j = 0; j < rpc_ctx.epmap.size(); j++) { - auto &endpoint = rpc_ctx.epmap[j]; - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncDistributeNotify(endpoint, cpu_ctx, scope, - send_var_name)); - VLOG(4) << "send var " << send_var_name << " by notify RPC done"; - } - } else { - VLOG(3) << "don't send non-initialized variable: " << rpc_ctx.var_name; - } - - for (auto &handle : rets) { - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } -} - -template -void ParameterSend::operator()(const CommContext &rpc_ctx, - const framework::Scope &scope, bool sync, - int multi_parts) { - if (rpc_ctx.var_name == STEP_COUNTER) { - SendByNotifyRPC(rpc_ctx, scope); - return; - } - - std::unique_ptr local_scope = scope.NewTmpScope(); - - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &cpu_ctx = *pool.Get(platform::CPUPlace()); - - distributed::RPCClient *rpc_client = - 
distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); - - std::vector rets; - auto *send_var = scope.FindVar(rpc_ctx.var_name); - - if (send_var->IsType()) { - size_t out_num = rpc_ctx.splited_varnames.size(); - if (out_num > 1) { - auto &send_tensor = send_var->Get(); - auto &send_tensor_dims = send_tensor.dims(); - std::vector outs_dims; - outs_dims.reserve(out_num); - - // infer output shape - PADDLE_ENFORCE_EQ( - rpc_ctx.height_sections.size(), out_num, - platform::errors::InvalidArgument("tensor split sections size" - "should be equal to output size.")); - for (size_t i = 0; i < out_num; ++i) { - auto dim = send_tensor_dims; - dim[0] = rpc_ctx.height_sections[i]; - outs_dims.push_back(dim); - } - - // create output var in local scope - size_t row_offset = 0; - for (size_t i = 0; i < out_num; ++i) { - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[i]) - ->GetMutable(); - *out = send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); - row_offset += outs_dims[i][0]; - } - } else { - auto &send_tensor = send_var->Get(); - framework::Tensor *out = local_scope->Var(rpc_ctx.splited_varnames[0]) - ->GetMutable(); - out->ShareDataWith(send_tensor); - } - - for (size_t i = 0; i < rpc_ctx.splited_varnames.size(); i++) { - auto &send_var_name = rpc_ctx.splited_varnames[i]; - auto &endpoint = rpc_ctx.epmap[i]; - VLOG(4) << " send var name: " << send_var_name - << "endpoint: " << endpoint; - if (NeedSend(*local_scope.get(), send_var_name)) { - VLOG(3) << "sending " << send_var_name << " to " << endpoint; - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(3) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else if (send_var->IsType()) { - auto &send_slr = send_var->Get(); - - auto &send_rows = send_slr.rows(); - if (send_rows.size() == 0) { - LOG(WARNING) - << "WARNING: The variable sent to pserver is empty, which " - "may cause an unknown error. 
Please check the state of " - "use_double_buffer in pyreader/dataloader async mode, you need to " - "turn it false."; - } - - std::vector> outs_rows_idx; - std::vector> outs_dense_idx; - - auto table_pairs = GetMultiFieldCommContext(rpc_ctx, scope, 1); - outs_rows_idx.resize(table_pairs.size()); - outs_dense_idx.resize(table_pairs.size()); - - auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; - auto *src = send_slr.value().data(); - - // create output var in local scope - std::vector outs; - for (auto &table : table_pairs) { - auto *out = - local_scope->Var(table.second)->GetMutable(); - outs.push_back(out); - } - - if (!rpc_ctx.is_distributed) { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto ep_idx = send_rows[i] % pserver_num; - auto id = send_rows[i] / pserver_num; - outs_rows_idx[ep_idx].push_back(id); - outs_dense_idx[ep_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } else { - auto pserver_num = rpc_ctx.epmap.size(); - - // split rows index into output sparse vars - for (size_t i = 0; i < send_rows.size(); ++i) { - auto out_idx = send_rows[i] % pserver_num; - outs_rows_idx[out_idx].push_back(send_rows[i]); - outs_dense_idx[out_idx].push_back(i); - } - - auto place = platform::CPUPlace(); - - for (size_t out_idx = 0; out_idx < rpc_ctx.splited_varnames.size(); - out_idx++) { - auto rows_idx = outs_rows_idx[out_idx]; - - auto dims = send_slr.GetCompleteDims(); - dims[0] = rows_idx.size(); - - outs[out_idx]->set_height(rpc_ctx.height_sections[out_idx]); - outs[out_idx]->mutable_rows()->clear(); - outs[out_idx]->mutable_value()->mutable_data(dims, send_slr.place()); - - if (rows_idx.size() > 0) { - for (auto idx : rows_idx) { - outs[out_idx]->mutable_rows()->push_back(idx); - } - auto dst = outs[out_idx]->mutable_value()->mutable_data(place); - for (size_t j = 0; j < rows_idx.size(); j++) { - if (platform::is_cpu_place(place)) { - memory::Copy(platform::CPUPlace(), dst + j * row_numel, - platform::CPUPlace(), - src + outs_dense_idx[out_idx][j] * row_numel, - sizeof(T) * row_numel); - } else { - PADDLE_THROW( - platform::errors::Unimplemented("do not support GPU now")); - } - } - } - PADDLE_ENFORCE_EQ( - rows_idx.size(), outs[out_idx]->rows().size(), - platform::errors::InvalidArgument( - "rows should has the same size with tensor dim 0")); - } - } - - for (size_t i = 0; i < table_pairs.size(); i++) { - 
auto &send_var_name = table_pairs[i].second; - auto &endpoint = table_pairs[i].first; - auto need_send = NeedSend(*local_scope.get(), send_var_name); - - VLOG(4) << "send var name: " << send_var_name - << " send var endpoint: " << endpoint - << " need send: " << need_send; - - if (need_send) { - VLOG(4) << "sending " << send_var_name << " to " << endpoint; - - rets.push_back(rpc_client->AsyncSendVar( - endpoint, cpu_ctx, *local_scope.get(), send_var_name)); - VLOG(4) << "send var " << send_var_name << " async handle done"; - } else { - VLOG(4) << "don't send non-initialized variable: " - << rpc_ctx.splited_varnames[i]; - } - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "unsupported var type: %s to send!", send_var->Type())); - } - - VLOG(4) << "Prepare to send var " << rpc_ctx.var_name; - if (sync) { - for (auto &handle : rets) { - VLOG(4) << "Wait send var to pserver handle: " << handle; - PADDLE_ENFORCE_NE(handle->Wait(), 0U, platform::errors::ExecutionTimeout( - "internal error in RPCClient")); - } - } -} - -template struct ParameterSend; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h deleted file mode 100644 index 4335ef8c73c..00000000000 --- a/paddle/fluid/operators/distributed/parameter_send.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/distributed/communicator_common.h" - -namespace paddle { -namespace operators { -namespace distributed { - -template -struct ParameterSend { - void operator()(const CommContext &rpc_ctx, const framework::Scope &scope, - bool sync, int multi_parts); -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/proto_encoder_helper.h b/paddle/fluid/operators/distributed/proto_encoder_helper.h deleted file mode 100644 index cedc98b1fca..00000000000 --- a/paddle/fluid/operators/distributed/proto_encoder_helper.h +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -// NOTE: This file was originally created by tensorflow -// (https://github.com/tensorflow/tensorflow/) we borrow this -// file and did some modifications so that we can send gRPC -// requests without too much copying of the tensor data. - -#pragma once - -#include - -#include "grpc++/grpc++.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace operators { -namespace distributed { - -char* EncodeVarint32(char* dst, uint32_t v) { - // Operate on characters as unsigneds - unsigned char* ptr = reinterpret_cast(dst); - static const int B = 128; - if (v < (1 << 7)) { - *(ptr++) = v; - } else if (v < (1 << 14)) { - *(ptr++) = v | B; - *(ptr++) = v >> 7; - } else if (v < (1 << 21)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = v >> 14; - } else if (v < (1 << 28)) { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = v >> 21; - } else { - *(ptr++) = v | B; - *(ptr++) = (v >> 7) | B; - *(ptr++) = (v >> 14) | B; - *(ptr++) = (v >> 21) | B; - *(ptr++) = v >> 28; - } - return reinterpret_cast(ptr); -} - -char* EncodeVarint64(char* dst, uint64_t v) { - static const int B = 128; - unsigned char* ptr = reinterpret_cast(dst); - while (v >= B) { - *(ptr++) = (v & (B - 1)) | B; - v >>= 7; - } - *(ptr++) = static_cast(v); - return reinterpret_cast(ptr); -} - -int VarintLength(uint64_t v) { - int len = 1; - while (v >= 128) { - v >>= 7; - len++; - } - return len; -} - -class ProtoEncodeHelper { - public: - ProtoEncodeHelper(char* buf, int max_size) - : base_(buf), p_(buf), limit_(base_ + max_size) {} - - ~ProtoEncodeHelper() {} - - const char* data() const { return base_; } - size_t size() const { return p_ - base_; } - - void WriteUint64(int tag, uint64_t v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - Encode64(v); - } - void WriteBool(int tag, bool v) { - Encode32(combine(tag, WIRETYPE_VARINT)); - EncodeBool(v); - } - void WriteString(int tag, const std::string& v) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(v.size()); - EncodeBytes(v.data(), v.size()); - } - void WriteVarlengthBeginning(int tag, uint32_t len) { - Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); - Encode32(len); - } - void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } - - private: - // Note: this module's behavior must match the protocol buffer wire encoding - // format. - enum { - WIRETYPE_VARINT = 0, - WIRETYPE_LENGTH_DELIMITED = 2, - }; - static uint32_t combine(uint32_t tag, uint32_t type) { - return ((tag << 3) | type); - } - inline void Encode32(uint32_t v) { - if (v < 128) { - // Fast path for single-byte values. Many of the calls will use a - // constant value for v, so the comparison will get optimized away - // when Encode32 is inlined into the caller. - *p_ = v; - p_++; - } else { - p_ = EncodeVarint32(p_, v); - } - } - void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } - void EncodeBool(bool v) { - *p_ = (v ? 
1 : 0); // Equal to varint32 encoding of 0 or 1 - p_++; - } - void EncodeBytes(const char* bytes, int N) { - memcpy(p_, bytes, N); - p_ += N; - } - - char* base_; - char* p_; - char* limit_; // Just for CHECKs -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h deleted file mode 100644 index 44359af1b1b..00000000000 --- a/paddle/fluid/operators/distributed/request_handler.h +++ /dev/null @@ -1,261 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include // NOLINT - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/macros.h" - -namespace paddle { -namespace operators { -namespace distributed { - -constexpr char kRequestSend[] = "RequestSend"; -constexpr char kRequestGet[] = "RequestGet"; -constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; -constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; -constexpr char kRequestPrefetch[] = "RequestPrefetch"; -constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; -constexpr char kRequestPassBarrier[] = "RequestPassBarrier"; -constexpr char kRequestGetNoBarrier[] = "GetVariableNoBarrier"; -constexpr char kRequestNotify[] = "RequestNotify"; -constexpr char kRequestSendAndRecv[] = "RequestSendAndRecv"; - -constexpr char kSendRPC[] = "SendRPC"; -constexpr char kGetRPC[] = "GetRPC"; -constexpr char kGetNoBarrierRPC[] = "GetNoBarrierRPC"; -constexpr char kGetMonomerRPC[] = "GetMonomerRPC"; -constexpr char kPrefetchRPC[] = "PrefetchRPC"; -constexpr char kBatchBarrierRPC[] = "BatchBarrierRPC"; -constexpr char kFetchBarrierRPC[] = "FetchBarrierRPC"; -constexpr char kSendMonomerFetchBarrierRPC[] = "SendMonomerFetchBarrierRPC"; -constexpr char kSendCompleteRPC[] = "SendCompleteRPC"; -constexpr char kCheckPointNotifyRPC[] = "CheckPointNotifyRPC"; -constexpr char kSendAndRecvRPC[] = "SendAndRecvRPC"; -constexpr int64_t kPrefetchTimeout = 60000; - -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" -#define COMPLETE_MESSAGE "COMPLETE@RECV" -#define WITHOUT_BARRIER_MESSAGE "@WITHOUT_BARRIER@RECV" -#define LEARNING_RATE_DECAY_COUNTER "@LR_DECAY_COUNTER@" -#define STEP_COUNTER "@PS_STEP_COUNTER@" - -#define CHECKPOINT_SAVE_MESSAGE "SAVE@CHECKPOINTNOTIFY" -#define CHECKPOINT_LOAD_MESSAGE "LOAD@CHECKPOINTNOTIFY" - -enum DistributedMode { kSync = 0, kAsync = 1, kHalfAsync = 2, kGeo = 3 }; - -class 
RPCServer; - -class VarHandle { - public: - VarHandle(const std::string ep, const std::string& method, - const std::string& name, - const platform::DeviceContext* p_ctx = nullptr, - const framework::Scope* p_scope = nullptr) - : status_(kDefaultState) { - ep_ = ep; - ctx_ = p_ctx; - scope_ = p_scope; - name_ = name; - method_ = method; - } - - virtual ~VarHandle() {} - - public: - bool should_retry = false; - - bool Wait() { - int ret = kDefaultState; - { - std::unique_lock lk(sync_mutex_); - wait_cond_.wait(lk, [this] { return status_ != kDefaultState; }); - ret = status_; - } - VLOG(7) << "VarHandle wait:" << ret; - return ret != kErrorState; - } - - void Finish(bool ok) { - { - std::unique_lock lk(sync_mutex_); - status_ = ok ? kFinishState : kErrorState; - } - VLOG(7) << "VarHandle finish:" << ok; - wait_cond_.notify_all(); - } - - std::string String() const { - std::ostringstream s; - s << method_ << " name:[" << name_ << "], ep:[" << ep_ << "], status:[" - << status_ << "]"; - return s.str(); - } - - std::string ep() const { return ep_; } - const platform::DeviceContext* ctx() const { return ctx_; } - const framework::Scope* scope() const { return scope_; } - std::string name() const { return name_; } - std::string method() const { return method_; } - - protected: - // RPC endpoint. - std::string ep_; - const platform::DeviceContext* ctx_; - const framework::Scope* scope_; - // Variable name. - std::string name_; - // RPC method name. - std::string method_; - - protected: - std::mutex sync_mutex_; - std::condition_variable wait_cond_; - - enum VarHandleStatus { - kDefaultState = -1, - kErrorState = 0, - kFinishState = 1, - }; - VarHandleStatus status_; - - private: - DISABLE_COPY_AND_ASSIGN(VarHandle); -}; - -typedef std::shared_ptr VarHandlePtr; - -class RequestHandler { - public: - explicit RequestHandler(int distributed_mode) - : distributed_mode_(distributed_mode), - dev_ctx_(nullptr), - executor_(nullptr), - scope_(nullptr), - program_(nullptr), - rpc_server_(nullptr) {} - - virtual ~RequestHandler() {} - - // Set attributes. - void SetScope(framework::Scope* scope) { scope_ = scope; } - void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; } - void SetProgram(framework::ProgramDesc* program) { program_ = program; } - void SetExecutor(framework::Executor* executor) { executor_ = executor; } - - // Used for dist lookup table prefetch - void SetPrefetchPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - prefetch_var_name_to_prepared_ctx_ = g; - } - - void SetCheckpointNotifyPreparedCtx( - std::shared_ptr g) { - checkpoint_prepared_ctx_ = g; - } - - // Used for async. - void SetGradToPreparedCtx( - std::unordered_map< - std::string, std::shared_ptr>* g) { - grad_to_prepared_ctx_ = g; - } - - void SetSparseGradToParam(std::unordered_map* g) { - sparse_grad_to_param_ = g; - } - - void SetLrDecayPreparedCtx( - std::shared_ptr g) { - lr_decay_prepared_ctx_ = g; - } - - void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } - - // Get attributes. - int distributed_mode() { return distributed_mode_; } - framework::Scope* scope() { return scope_; } - const platform::DeviceContext* dev_ctx() { return dev_ctx_; } - framework::ProgramDesc* program() { return program_; } - framework::Executor* executor() { return executor_; } - - // This function processes user's rpc request. - // The implemention is in request_handler_impl. 
- // example: - // std::string varname = request_.varname(); - // - // auto scope = request_handler_->scope(); - // auto invar = scope->FindVar(varname); - // framework::Variable* outvar = nullptr; - // - // request_handler_->Handle(varname, scope, invar, &outvar); - // if (outvar) { - // SerializeToByteBuffer(varname, outvar, - // *request_handler_->dev_ctx(), &reply_); - // } - virtual bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "", - const std::string& table_name = "") = 0; - - protected: - const int distributed_mode_; - - const platform::DeviceContext* dev_ctx_; - framework::Executor* executor_; - framework::Scope* scope_; - framework::ProgramDesc* program_; - - // used for distribute lookup table prefetch - std::unordered_map>* - prefetch_var_name_to_prepared_ctx_; - // used for checkpoint notify - std::shared_ptr checkpoint_prepared_ctx_; - - // Used for async. - std::unordered_map>* - grad_to_prepared_ctx_; - std::unordered_map* sparse_grad_to_param_; - - // used for lr decay - std::shared_ptr lr_decay_prepared_ctx_; - RPCServer* rpc_server_; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc deleted file mode 100644 index 8c4f2ef57a3..00000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/variable_helper.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" -#include "paddle/fluid/string/piece.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/string/split.h" - -#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" - -namespace paddle { -namespace operators { -namespace distributed { - -// define LOOKUP_TABLE_PATH for checkpoint notify to save lookup table variables -// to directory specified. 
-constexpr char LOOKUP_TABLE_PATH[] = "kLookupTablePath"; - -bool RequestSendHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestSendHandler:" << varname; - - // Sync - if (varname == BATCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv BATCH_BARRIER_MESSAGE"; - rpc_server_->IncreaseBatchBarrier(kRequestSend); - } else if (varname == COMPLETE_MESSAGE) { - VLOG(3) << "sync: recv complete message"; - - if (HeartBeatMonitor::GetInstance() != nullptr) { - HeartBeatMonitor::GetInstance()->Update(trainer_id, "", COMPLETED); - } - - rpc_server_->Complete(); - } else { - // Async - if (distributed_mode_ != DistributedMode::kSync) { - VLOG(3) << "async process var: " << varname; - if (varname == BATCH_BARRIER_MESSAGE) { - PADDLE_THROW(platform::errors::InvalidArgument( - "async mode should not recv BATCH_BARRIER_MESSAGE or " - "COMPLETE_MESSAGE")); - } - HeartBeatMonitor::GetInstance()->Update(trainer_id, varname, RUNNING); - - std::string run_varname = varname; - - string::Piece part_piece("@PIECE"); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, part_piece)) { - auto varname_splits = paddle::string::Split(varname, '@'); - PADDLE_ENFORCE_EQ( - varname_splits.size(), 3, - platform::errors::InvalidArgument( - "varname: %s should be separated into 3 parts by @", varname)); - run_varname = varname_splits[0]; - scope->Rename(varname, run_varname); - } - - auto *var = scope->FindVar(run_varname); - - // for sparse ids - if (var->IsType()) { - if (distributed_mode_ == DistributedMode::kAsync || - distributed_mode_ == DistributedMode::kHalfAsync) { - auto *ins = distributed::LargeScaleKV::GetInstance(); - if (ins->GradInLargeScale(run_varname)) { - auto *large_scale_var = ins->GetByGrad(run_varname); - - for (auto name : large_scale_var->CachedVarnames()) { - scope->Var(name); - } - } - } - if (distributed_mode_ == DistributedMode::kGeo) { - if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad( - run_varname)) { - auto &grad_slr = - scope->FindVar(run_varname)->Get(); - AsyncSparseParamUpdateRecorder::GetInstance()->Update( - run_varname, grad_slr.rows()); - } - } - } - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[run_varname].get(), - scope); - return true; - } else { // sync - rpc_server_->WaitCond(kRequestSend); - VLOG(3) << "sync: processing received var: " << varname; - PADDLE_ENFORCE_NOT_NULL( - invar, platform::errors::NotFound( - "sync: Can not find server side var %s.", varname)); - } - } - return true; -} - -bool RequestGetHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id - << " table_name: " << table_name; - - if (distributed_mode_ == DistributedMode::kSync) { - if (varname == FETCH_BARRIER_MESSAGE) { - VLOG(3) << "sync: recv fetch barrier message"; - rpc_server_->IncreaseBatchBarrier(kRequestGet); - } else { - rpc_server_->WaitCond(kRequestGet); - *outvar = scope_->FindVar(varname); - } - } else { - if (varname != FETCH_BARRIER_MESSAGE && varname != COMPLETE_MESSAGE) { - if (enable_dc_asgd_) { - // NOTE: the format is determined by 
distribute_transpiler.py - std::string param_bak_name = - string::Sprintf("%s.trainer_%d_bak", varname, trainer_id); - VLOG(3) << "getting " << param_bak_name << " trainer_id " << trainer_id; - auto var = scope_->FindVar(varname); - auto t_orig = var->Get(); - auto param_bak = scope_->Var(param_bak_name); - auto t = param_bak->GetMutable(); - t->mutable_data(dev_ctx_->GetPlace(), t_orig.type()); - VLOG(3) << "copying " << varname << " to " << param_bak_name; - framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); - } - - if (distributed_mode_ == DistributedMode::kGeo && - AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && - !table_name.empty()) { - VLOG(3) << "AsyncSparseParamUpdateRecorder " << varname << " exist "; - - std::vector updated_rows; - AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( - varname, trainer_id, &updated_rows); - - if (VLOG_IS_ON(3)) { - std::ostringstream sstream; - sstream << "["; - for (auto &row_id : updated_rows) { - sstream << row_id << ", "; - } - sstream << "]"; - VLOG(3) << "updated_rows size: " << updated_rows.size() << " " - << sstream.str(); - } - - auto &origin_tensor = - scope_->FindVar(varname)->Get(); - auto *origin_tensor_data = origin_tensor.data(); - auto &dims = origin_tensor.dims(); - *outvar = scope->Var(); - auto *out_slr = (*outvar)->GetMutable(); - out_slr->set_rows(updated_rows); - out_slr->set_height(dims[0]); - auto out_dims = framework::make_ddim( - {static_cast(updated_rows.size()), dims[1]}); - auto *data = out_slr->mutable_value()->mutable_data( - out_dims, origin_tensor.place()); - auto width = dims[1]; - for (size_t i = 0; i < updated_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - updated_rows[i], dims[0], - platform::errors::OutOfRange( - "The value of updated_rows: %s out of Tensor %s dims[0]: %s", - updated_rows[i], varname, dims[0])); - memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, - sizeof(float) * width); - } - } else { - *outvar = scope_->FindVar(varname); - } - } - } - return true; -} - -bool RequestGetNoBarrierHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestGetNoBarrierHandler:" << varname - << " out_var_name: " << out_var_name; - - // get var from pserver immediately without barriers - string::Piece without_barrier_piece(WITHOUT_BARRIER_MESSAGE); - string::Piece var_name_piece = string::Piece(varname); - - if (string::Contains(var_name_piece, without_barrier_piece)) { - var_name_piece = string::TrimSuffix(var_name_piece, without_barrier_piece); - VLOG(4) << "Get var " << var_name_piece << " with " - << WITHOUT_BARRIER_MESSAGE; - *outvar = scope_->FindVar(var_name_piece.ToString()); - return true; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "GetNoBarrier must contain %s", WITHOUT_BARRIER_MESSAGE)); - } - return true; -} - -bool RequestPrefetchHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "RequestPrefetchHandler " << varname; - - (*outvar)->GetMutable(); - - VLOG(1) << "Prefetch " - << "tablename: " << table_name << " ids:" << varname - << " out: " << out_var_name; - paddle::platform::CPUPlace cpu_place; - auto *ins = distributed::LargeScaleKV::GetInstance(); - - if 
(ins->ParamInLargeScale(table_name)) { - auto lookup_table_op = PullLargeScaleOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } else { - auto lookup_table_op = - BuildLookupTableOp(table_name, varname, out_var_name); - lookup_table_op->Run(*scope, cpu_place); - } - - return true; -} - -bool RequestCheckpointHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(4) << "receive save var " << varname << " with path " << out_var_name - << " mode " << table_name; - - int mode = std::stoi(table_name); - - auto *ins = distributed::LargeScaleKV::GetInstance(); - ins->Get(varname)->Save(out_var_name, mode); - return true; -} - -bool RequestNotifyHandler::Handle(const std::string &varname, - framework::Scope *scope, - framework::Variable *invar, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "RequestNotifyHandler: " << varname - << ", trainer_id: " << trainer_id; - - string::Piece decay_piece(STEP_COUNTER); - string::Piece var_name_piece = string::Piece(varname); - if (string::Contains(var_name_piece, decay_piece)) { - VLOG(3) << "LearningRate Decay Counter Update"; - - auto *send_var = scope->FindVar(varname); - auto send_var_tensor = send_var->Get(); - auto *send_value = - send_var_tensor.mutable_data(send_var_tensor.place()); - - auto counter = decay_counters.at(trainer_id); - counter += send_value[0]; - decay_counters.at(trainer_id) = counter; - - auto *global_step_var = this->scope()->FindVar(LEARNING_RATE_DECAY_COUNTER); - if (global_step_var == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find LEARNING_RATE_DECAY_COUNTER ")); - } - - auto *tensor = global_step_var->GetMutable(); - auto *value = tensor->mutable_data(platform::CPUPlace()); - - auto global_counter = 0; - for (auto &trainer_counter : decay_counters) { - global_counter += trainer_counter.second; - } - value[0] = global_counter; - - if (lr_decay_prepared_ctx_.get() == nullptr) { - PADDLE_THROW(platform::errors::InvalidArgument( - "can not find decay block for executor")); - } - - executor_->RunPreparedContext(lr_decay_prepared_ctx_.get(), scope_); - } - return true; -} - -bool RequestSendAndRecvHandler::Handle(const std::string &varname, - framework::Scope *Scope, - framework::Variable *var, - framework::Variable **outvar, - const int trainer_id, - const std::string &out_var_name, - const std::string &table_name) { - VLOG(3) << "SendAndRecvHandle: " << varname - << " out_var_name: " << out_var_name - << " , trainer_id: " << trainer_id; - - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), Scope); - *outvar = Scope->FindVar(out_var_name); - return true; -} - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h deleted file mode 100644 index 6d239673f91..00000000000 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ /dev/null @@ -1,198 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -class Variable; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestSendHandler final : public RequestHandler { - public: - explicit RequestSendHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestSendHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetHandler final : public RequestHandler { - public: - explicit RequestGetHandler(int distributed_mode, bool enable_dc_asgd = false) - : RequestHandler(distributed_mode) { - enable_dc_asgd_ = enable_dc_asgd; - } - virtual ~RequestGetHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - bool enable_dc_asgd_; -}; - -class RequestGetNoBarrierHandler final : public RequestHandler { - public: - RequestGetNoBarrierHandler() : RequestHandler(false) {} - virtual ~RequestGetNoBarrierHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -static inline void BuildVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::proto::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - *var->mutable_arguments()->Add() = arg_name; - } -} - -class RequestPrefetchHandler final : public RequestHandler { - public: - explicit RequestPrefetchHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestPrefetchHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr PullLargeScaleOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - framework::OpDesc desc; - 
desc.SetType("lookup_sparse_table_read"); - desc.SetInput("Ids", {id_name}); - desc.SetOutput("Out", std::vector({out_name})); - desc.SetAttr("tablename", {table_name}); - desc.SetAttr("init", true); - desc.SetAttr("value_names", std::vector({"Param"})); - - auto op = paddle::framework::OpRegistry::CreateOp(desc); - return op; - } - - std::unique_ptr BuildLookupTableOp( - const std::string& table_name, const std::string& id_name, - const std::string& out_name) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("lookup_table"); - BuildVar("W", {table_name.data()}, op_desc.add_inputs()); - BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); - BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestCheckpointHandler final : public RequestHandler { - public: - explicit RequestCheckpointHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - - virtual ~RequestCheckpointHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - std::unique_ptr BuildCheckpointOp( - const std::string& varname, const std::string& file_path) { - paddle::framework::proto::OpDesc op_desc; - op_desc.set_type("save"); - BuildVar("X", {varname.data()}, op_desc.add_inputs()); - - auto attr = op_desc.mutable_attrs()->Add(); - attr->set_name("file_path"); - attr->set_type(paddle::framework::proto::AttrType::STRING); - attr->set_s(file_path); - - auto op = paddle::framework::OpRegistry::CreateOp(op_desc); - return op; - } -}; - -class RequestNotifyHandler final : public RequestHandler { - public: - explicit RequestNotifyHandler(int distributed_mode, int trainers) - : RequestHandler(distributed_mode) { - this->trainers = trainers; - for (int i = 0; i < trainers; i++) { - decay_counters[i] = 0; - } - } - virtual ~RequestNotifyHandler() {} - bool Handle(const std::string& varname, framework::Scope* scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; - - private: - int trainers; - std::unordered_map decay_counters; -}; - -class RequestSendAndRecvHandler final : public RequestHandler { - public: - explicit RequestSendAndRecvHandler(int distributed_mode) - : RequestHandler(distributed_mode) {} - virtual ~RequestSendAndRecvHandler() {} - bool Handle(const std::string& varname, framework::Scope* Scope, - framework::Variable* var, framework::Variable** outvar, - const int trainer_id, const std::string& out_var_name = "", - const std::string& table_name = "") override; -}; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc deleted file mode 100644 index 57ce54870de..00000000000 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "gflags/gflags.h" - -// default to 3min to avoid temprary network failures. -DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); -DEFINE_int32(rpc_retry_times, 3, "retry times for rpc"); - -namespace paddle { -namespace operators { -namespace distributed { - -std::once_flag RPCClient::init_flag_; -std::unique_ptr RPCClient::rpc_client_(nullptr); -int RPCClient::trainer_id_ = 0; - -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h deleted file mode 100644 index 2c756a6f71f..00000000000 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ /dev/null @@ -1,143 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include // NOLINT -#include -#include - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -DECLARE_int32(rpc_deadline); -DECLARE_int32(rpc_retry_times); - -namespace paddle { -namespace operators { -namespace distributed { - -class RPCClient { - public: - RPCClient() {} - virtual ~RPCClient() {} - virtual VarHandlePtr AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_varname, - const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetVarNoBarrier( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - const std::string& out_varname, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerVariable( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncPrefetchVar( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& in_var_name, - const std::string& out_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendBatchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncGetMonomerBarrier( - const std::string& ep, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncCheckpointNotify( - const std::string& ep, const std::string& dirname, - const std::string& varname, const int mode, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncDistributeNotify( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendAndRecv( - const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& send_var_name, - const std::string& recv_var_name, const std::string& table_name = "", - int64_t time_out = FLAGS_rpc_deadline) = 0; - - virtual VarHandlePtr AsyncSendComplete( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; - - // Complete tells all the pserver instances that finishe the training, - // the pserver can reduce it's barrier count, and continue to train - // with other trainers. - virtual void SendComplete() = 0; - - virtual bool Wait() = 0; - - template - static RPCClient* GetInstance(int trainer_id) { - std::call_once(init_flag_, &RPCClient::Init, trainer_id); - return rpc_client_.get(); - } - - // Init is called by GetInstance. 
- template - static void Init(int trainer_id) { - VLOG(1) << "init rpc client with trainer_id " << trainer_id; - trainer_id_ = trainer_id; - if (rpc_client_.get() == nullptr) { - rpc_client_.reset(new T()); - rpc_client_->InitImpl(); - } - } - - virtual void InitImpl() {} - - protected: - // each trainer have exact one trainer id, it should be static - static int trainer_id_; - - private: - static std::once_flag init_flag_; - static std::unique_ptr rpc_client_; -}; -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc deleted file mode 100644 index 37cf0460fb1..00000000000 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ /dev/null @@ -1,242 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/distributed/rpc_server.h" - -#include -#include - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -void RPCServer::ShutDown() { - VLOG(3) << "RPCServer ShutDown "; - ShutDownImpl(); - - exit_flag_ = true; - barrier_cond_.notify_all(); - rpc_cond_.notify_all(); -} - -void RPCServer::SavePort() const { - auto file_path = string::Sprintf("/tmp/paddle.%d.port", ::getpid()); - std::ofstream port_file; - port_file.open(file_path); - port_file << selected_port_; - port_file.close(); - VLOG(3) << "selected port written to " << file_path; -} - -void RPCServer::WaitBarrier(const std::string& rpc_name) { - VLOG(3) << "WaitBarrier in: " << rpc_name; - std::unique_lock lock(this->mutex_); - barrier_cond_.wait(lock, [this, &rpc_name] { - return ((barrier_counter_[rpc_name] == client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitBarrier out: " << rpc_name - << " counter: " << barrier_counter_[rpc_name]; -} - -void RPCServer::IncreaseBatchBarrier(const std::string rpc_name) { - VLOG(3) << "RPCServer begin IncreaseBatchBarrier " << rpc_name; - // barrier msg should make sure that it's in the right cond(send|recv) - WaitCond(rpc_name); - int b = 0; - std::unique_lock lock(mutex_); - b = ++barrier_counter_[rpc_name]; - VLOG(3) << rpc_name << " barrier_counter: " << b; - if (b >= client_num_) { - lock.unlock(); - VLOG(3) << "BatchBarrier counter reach " << client_num_ << " for " - << rpc_name; - barrier_cond_.notify_all(); - lock.lock(); - } -} - -void RPCServer::Complete() { - { - std::unique_lock lock(mutex_); - client_num_--; - need_reset_all_vars_ = true; - - VLOG(3) << "decrease client_num to: " << client_num_; - if (cur_cond_.load() == rpc_cond_map_[kRequestGet]) { - barrier_counter_[kRequestGet]--; - } - } - barrier_cond_.notify_all(); -} - -bool RPCServer::NeedResetAllVars() { - std::unique_lock lock(mutex_); 
- return need_reset_all_vars_; -} - -int RPCServer::GetClientNum() { - std::unique_lock lock(mutex_); - return client_num_; -} - -void RPCServer::ResetBarrierCounter() { - VLOG(3) << "RPCServer ResetBarrierCounter "; - std::unique_lock lock(mutex_); - for (auto& t : barrier_counter_) { - t.second = 0; - } - need_reset_all_vars_ = false; -} - -void RPCServer::RegisterRPC(const std::string& rpc_name, - RequestHandler* handler, int thread_num) { - rpc_call_map_[rpc_name] = handler; - rpc_thread_num_[rpc_name] = thread_num; - - static int cond = -1; - rpc_cond_map_[rpc_name] = ++cond; - VLOG(3) << "RegisterRPC rpc_name: " << rpc_name << ", handler: " << handler - << ", cond: " << rpc_cond_map_[rpc_name]; -} - -void RPCServer::SetCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer SetCond " << rpc_name; - { - std::unique_lock lock(mutex_); - cur_cond_ = rpc_cond_map_[rpc_name]; - } - - rpc_cond_.notify_all(); -} - -void RPCServer::WaitCond(const std::string& rpc_name) { - VLOG(3) << "RPCServer WaitCond in " << rpc_name; - int cond = 0; - { - std::unique_lock lock(mutex_); - cond = rpc_cond_map_[rpc_name]; - } - - std::unique_lock lock(mutex_); - rpc_cond_.wait( - lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); - VLOG(3) << "RPCServer WaitCond out " << rpc_name; -} - -void RPCServer::RegisterVar(const std::string& var_name, - const std::string& rpc_name, - framework::Scope* scope, - platform::DeviceContext* dev_ctx) { - MonomerHandle h; - h.var_name_ = var_name; - h.rpc_name_ = rpc_name; - h.scope_ = scope; - h.dev_ctx_ = dev_ctx; - - { - std::unique_lock lock(mutex_); - PADDLE_ENFORCE_EQ( - var_map_.find(var_name), var_map_.end(), - platform::errors::AlreadyExists("%s already in var_map.", var_name)); - var_map_[var_name] = h; - } - - rpc_cond_.notify_all(); - VLOG(3) << "RegisterVar context:" << h.String(); -} - -void RPCServer::IncreaseVarBarrier(const std::string& var_name) { - int b = 0; - MonomerHandle h; - { - std::unique_lock lock(mutex_); - b = ++var_map_[var_name].barrier_; - h = var_map_[var_name]; - } - - if (b >= client_num_) { - barrier_cond_.notify_all(); - } - - VLOG(3) << "IncreaseVarBarrier context:" << h.String(); -} - -void RPCServer::WaitVarBarrier(const std::string& var_name) { - VLOG(3) << "WaitVarBarrier var_name:" << var_name; - - std::unique_lock lock(mutex_); - barrier_cond_.wait(lock, [&]() { - return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || - exit_flag_.load()); - }); - - VLOG(3) << "WaitVarBarrier context: " << var_map_[var_name].String(); -} - -void RPCServer::SetVarCond(const std::string& var_name) { - VLOG(3) << "SetVarCond var_name:" << var_name; - { - std::unique_lock lock(mutex_); - if (var_map_.find(var_name) != var_map_.end()) { - rpc_cond_.notify_all(); - } - } -} - -void RPCServer::WaitVarCond(const std::string& var_name) { - VLOG(3) << "WaitVarCond var_name:" << var_name; - - std::unique_lock lock(mutex_); - rpc_cond_.wait(lock, [=] { - return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); - }); - - VLOG(3) << "WaitVarCond var_name:" << var_name << " end"; -} - -MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { - MonomerHandle h; - { - std::unique_lock lock(mutex_); - h = var_map_[var_name]; - } - - return h; -} - -void RPCServer::ClearRegisteredVars() { - std::unique_lock lock(mutex_); - var_map_.clear(); -} - -void RPCServer::ClearVar(const std::string& var_name) { - std::unique_lock lock(mutex_); - var_map_.erase(var_name); -} -} // namespace distributed -} 
// namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h deleted file mode 100644 index 2120260515e..00000000000 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include // NOLINT -#include -#include -#include - -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/distributed/request_handler.h" -#include "paddle/fluid/platform/device_context.h" - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -class RequestHandler; - -struct MonomerHandle { - std::string var_name_; - std::string rpc_name_; - framework::Scope* scope_{nullptr}; - platform::DeviceContext* dev_ctx_{nullptr}; - int64_t barrier_{0}; - - std::string String() { - std::stringstream ss; - ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ - << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ - << ", barrier_:" << barrier_; - return ss.str(); - } -}; - -class RPCServer { - public: - explicit RPCServer(const std::string& address, int client_num) - : cur_cond_(0), - bind_address_(address), - exit_flag_(false), - selected_port_(0), - client_num_(client_num), - need_reset_all_vars_(false) {} - - virtual ~RPCServer() {} - virtual void StartServer() = 0; - virtual void WaitServerReady() = 0; - - void ShutDown(); - - bool IsExit() { return exit_flag_.load(); } - - int GetSelectedPort() const { return selected_port_; } - - int GetClientNum(); - - void SavePort() const; - - // RegisterRPC, register the rpc method name to a handler - // class, and auto generate a condition id for this call - // to be used for the barrier. - void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, - int thread_num = 1); - - int GetThreadNum(const std::string& rpc_name) { - return rpc_thread_num_[rpc_name]; - } - - // Wait util all the clients have reached the barrier for one - // rpc method. This function should be called in the - // RequestHandler if you want to run the server/client in a - // synchronous mode. 
- void WaitBarrier(const std::string& rpc_name); - - void SetCond(const std::string& rpc_name); - void WaitCond(const std::string& rpc_name); - void IncreaseBatchBarrier(const std::string rpc_name); - - void RegisterVar(const std::string& var_name, const std::string& rpc_name, - framework::Scope* scope, platform::DeviceContext* dev_ctx); - void IncreaseVarBarrier(const std::string& var_name); - void WaitVarBarrier(const std::string& var_name); - void SetVarCond(const std::string& var_name); - void WaitVarCond(const std::string& var_name); - void ClearRegisteredVars(); - void ClearVar(const std::string& var_name); - MonomerHandle GetMonomer(const std::string& var_name); - - void Complete(); - - void ResetBarrierCounter(); - - bool NeedResetAllVars(); - - protected: - virtual void ShutDownImpl() = 0; - - private: - std::mutex mutex_; - std::unordered_map barrier_counter_; - std::condition_variable barrier_cond_; - - std::unordered_map rpc_cond_map_; - std::atomic cur_cond_; - std::condition_variable rpc_cond_; - - protected: - std::string bind_address_; - std::atomic exit_flag_; - int selected_port_; - int client_num_; - bool need_reset_all_vars_; - - std::unordered_map rpc_call_map_; - std::unordered_map rpc_thread_num_; - friend class RequestHandler; - - // TODO(gongwb): use more cond to notify or wait; - std::unordered_map var_map_; -}; - -}; // namespace distributed -}; // namespace operators -}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc deleted file mode 100644 index f5928540003..00000000000 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ /dev/null @@ -1,344 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include // NOLINT -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -#include "paddle/fluid/operators/distributed/distributed.h" -#include "paddle/fluid/operators/distributed/heart_beat_monitor.h" -#include "paddle/fluid/operators/distributed/large_scale_kv.h" -#include "paddle/fluid/operators/distributed/request_handler_impl.h" -#include "paddle/fluid/operators/distributed/rpc_client.h" -#include "paddle/fluid/operators/distributed/rpc_server.h" - -namespace framework = paddle::framework; -namespace platform = paddle::platform; -namespace distributed = paddle::operators::distributed; - -USE_NO_KERNEL_OP(lookup_sparse_table_read); -USE_NO_KERNEL_OP(checkpoint_notify); -USE_OP(scale); - -std::unique_ptr g_rpc_service; -std::unique_ptr g_req_handler; - -framework::BlockDesc* AppendSendAndRecvBlock(framework::ProgramDesc* program) { - auto root_block = program->MutableBlock(0); - auto* block = program->AppendBlock(*root_block); - - framework::OpDesc* op = block->AppendOp(); - op->SetType("scale"); - op->SetInput("X", {"x"}); - op->SetOutput("Out", {"res"}); - op->SetAttr("scale", 0.5f); - - auto& out = *root_block->Var("res"); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetShape({1, 10}); - - return block; -} - -void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { - auto w_var = scope->Var("w"); - w_var->GetMutable(); - - auto out_var = scope->Var("out"); - out_var->GetMutable(); - - auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); - - auto x_var = scope->Var("x"); - x_var->GetMutable(); - - auto res_var = scope->Var("res"); - res_var->GetMutable(); -} - -void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - int64_t* ids_ptr = - ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); - for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; - - auto x_var = scope->Var("x")->GetMutable(); - float* x_ptr = - x_var->mutable_data(framework::DDim({1, rows_numel}), *place); - for (int64_t i = 0; i < rows_numel; ++i) x_ptr[i] = 1.0; -} - -void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, - int64_t rows_numel) { - CreateVarsOnScope(scope, place); - auto w = scope->Var("w")->GetMutable(); - auto w_value = w->mutable_value(); - w_value->Resize({rows_numel, 10}); - for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true); - - auto ptr = w_value->mutable_data(*place); - - for (int64_t i = 0; i < w_value->numel(); ++i) { - ptr[i] = static_cast(i / 10); - } -} - -void StartServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - // distributed::HeartBeatMonitor::Init(1, true, "w@grad"); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - 
std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -void StartSendAndRecvServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - auto block = AppendSendAndRecvBlock(&program); - std::string in_var_name("x"); - std::vector prefetch_block_ids{block->ID()}; - auto prepared = exe.Prepare(program, prefetch_block_ids); - InitTensorsOnServer(&scope, &place, 10); - - std::unordered_map> - grad_to_prepared_ctx; - grad_to_prepared_ctx[in_var_name] = prepared[0]; - - g_req_handler->SetProgram(&program); - g_req_handler->SetGradToPreparedCtx(&grad_to_prepared_ctx); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(COMPLETE, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset( - new distributed::RequestSendHandler(distributed::DistributedMode::kSync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 2)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartServer, distributed::kRequestSend); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - client->AsyncSendComplete(ep); - client->Wait(); - - EXPECT_EQ(g_rpc_service->GetClientNum(), 1); - - g_rpc_service->ShutDown(); - server_thread.join(); - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -TEST(SENDANDRECV, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - g_req_handler.reset(new distributed::RequestSendAndRecvHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - std::thread server_thread(StartSendAndRecvServer, - distributed::kRequestSendAndRecv); - g_rpc_service->WaitServerReady(); - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - framework::Scope scope; - platform::CPUPlace place; - platform::CPUDeviceContext ctx(place); - - // create var on local scope - int64_t rows_numel = 10; - InitTensorsOnClient(&scope, &place, rows_numel); - std::string in_var_name("x"); - std::string out_var_name("res"); - - client->AsyncSendAndRecv(ep, ctx, scope, in_var_name, out_var_name); - client->Wait(); - auto var = scope.Var(out_var_name); - auto value = var->GetMutable(); - auto ptr = value->mutable_data(place); - - for (int64_t i = 0; i < rows_numel; ++i) { - EXPECT_EQ(ptr[i], 0.5); - } - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} - -void StartCheckpointServer(const std::string& rpc_name) { - framework::ProgramDesc program; - framework::Scope scope; - platform::CPUPlace place; - 
framework::Executor exe(place); - platform::CPUDeviceContext ctx(place); - - std::vector metas; - - auto meta = distributed::SparseMeta(); - meta.name = "embedding.block0"; - meta.value_names = {"Param"}; - meta.value_dims = {64}; - meta.mode = distributed::Mode::training; - meta.grad_name = "embedding@Grad"; - meta.cached_varnames = {"kSparseIds"}; - meta.initializer_attrs = {"fill_constant&1.0"}; - meta.entry = "none"; - - metas.push_back(meta); - distributed::LargeScaleKV::Init(metas); - - auto* ins = distributed::LargeScaleKV::GetInstance(); - ins->Get("embedding.block0")->Init({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); - - std::unordered_map> - prefetch_var_name_to_prepared; - - g_req_handler->SetProgram(&program); - g_req_handler->SetPrefetchPreparedCtx(&prefetch_var_name_to_prepared); - g_req_handler->SetDevCtx(&ctx); - g_req_handler->SetScope(&scope); - g_req_handler->SetExecutor(&exe); - - g_rpc_service->RegisterRPC(rpc_name, g_req_handler.get()); - - g_req_handler->SetRPCServer(g_rpc_service.get()); - - std::thread server_thread( - std::bind(&distributed::RPCServer::StartServer, g_rpc_service.get())); - - server_thread.join(); -} - -TEST(LARGE_SCALE_CHECKPOINT, CPU) { - setenv("http_proxy", "", 1); - setenv("https_proxy", "", 1); - - paddle::framework::Scope scope; - paddle::platform::CPUPlace place; - - g_req_handler.reset(new distributed::RequestCheckpointHandler( - distributed::DistributedMode::kAsync)); - g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); - - distributed::RPCClient* client = - distributed::RPCClient::GetInstance(0); - - PADDLE_ENFORCE_NE(client, nullptr, - platform::errors::InvalidArgument( - "Client Start Fail, Check Your Code & Env")); - - std::thread server_thread(StartCheckpointServer, - distributed::kRequestCheckpoint); - g_rpc_service->WaitServerReady(); - - int port = g_rpc_service->GetSelectedPort(); - std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); - - auto save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/base", - "embedding", "embedding.block0"); - int mode = 0; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - save_path = - paddle::string::Sprintf("%s/%s/%s", "/tmp/large_scale_table/delta", - "embedding", "embedding.block0"); - mode = 1; - client->AsyncCheckpointNotify(ep, save_path, "embedding.block0", mode); - client->Wait(); - - paddle::framework::AttributeMap attrs; - - std::vector eps = {ep}; - attrs["endpoints"] = eps; - attrs["dirname"] = std::string("/tmp/large_scale_table/delta1"); - attrs["varname"] = std::string("embedding"); - attrs["mode"] = 2; - std::vector slices = {"embedding.block0"}; - attrs["slice_varnames"] = slices; - std::vector remotes = {"embedding.block0"}; - attrs["remote_varnames"] = remotes; - - auto ops = - framework::OpRegistry::CreateOp("checkpoint_notify", {}, {}, attrs, true); - ops->Run(scope, place); - - g_rpc_service->ShutDown(); - server_thread.join(); - LOG(INFO) << "begin reset"; - g_rpc_service.reset(nullptr); - g_req_handler.reset(nullptr); -} diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in deleted file mode 100644 index a333642bd16..00000000000 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ /dev/null @@ -1,88 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under -the Apache License, Version 2.0 (the "License"); you may not use this file -except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -syntax = "proto3"; -package sendrecv; - -option cc_generic_services = @cc_generic_services@; - -service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. - // Send and recv only one tensor - // TODO(typhoonzero): add streaming API - rpc SendVariable(VariableMessage) returns (VoidMessage) {} - // Argument VariableMessage for GetVariable should only contain varname. - rpc GetVariable(VariableMessage) returns (VariableMessage) {} - rpc GetVariableNoBarrier(VariableMessage) returns (VariableMessage) {} - // pre-fetch variable by given variable name and Ids - rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} - - rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} - rpc DistributeNotify(VariableMessage) returns (VoidMessage) {} - rpc SendAndRecvVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} - rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} -} - -// It can be: LoDTensor、SelectedRows or NCCL_ID -enum VarType { - LOD_TENSOR = 0; - SELECTED_ROWS = 1; - NCCL_ID = 2; -} - -// VariableMessage is serialized paddle variable message. -// NOTICE(gongwb):don't modify this proto if you are not -// not familar with how we serialize in sendrecvop_utils.h -// and deserilize it in variable_response.h. -message VariableMessage { - enum Type { - // Pod Types - BOOL = 0; - INT16 = 1; - INT32 = 2; - INT64 = 3; - FP16 = 4; - FP32 = 5; - FP64 = 6; - } - - message LodData { repeated int64 lod_data = 1; } - string varname = 1; - // TODO(Yancey1989): reference framework::proto::VarDesc::VarType - VarType type = 2; - // bool persistable is not needed for sending. - // tensor info: - Type data_type = 3; - repeated int64 dims = 4; - - // lod details: - int64 lod_level = 5; - repeated LodData lod = 6; - // selected_rows height, aka. original dim0 - int64 slr_height = 7; - // tensor data - bytes serialized = 8; - // selected_rows data - bytes rows = 9; - // Look up table block execution output variable name. - string out_varname = 10; - // If 1, the ps server will start profiling, the ps - // server stops profiling and generates a profile to /tmp/profile_ps_* - // when profile switches from 1 to 2. - int64 profile = 11; - int64 trainer_id = 12; - string table_name = 13; -} - -message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc deleted file mode 100644 index 107c74eb267..00000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ -#include - -#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" - -namespace paddle { -namespace framework { -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -} // namespace paddle - -DEFINE_bool(rpc_disable_reuse_port, false, "Disable SO_REUSEPORT or not."); -DEFINE_int32(rpc_retry_bind_port, 3, - "Retry to bind the address if address is already used."); - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -static TensorPayload GetCommunicationAllocationFromTensor( - const platform::DeviceContext& ctx, const framework::Tensor& tensor) { - if (is_gpu_place(ctx.GetPlace())) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - PADDLE_ENFORCE_EQ( - is_gpu_place(tensor.place()), true, - platform::errors::PreconditionNotMet("Please run in gpu place.")); - auto& gpu_dev_ctx = - reinterpret_cast(ctx); - auto copy_size = tensor.numel() * framework::SizeOfType(tensor.type()); - platform::CUDAPinnedPlace cuda_pinned; - auto result = memory::AllocShared(cuda_pinned, copy_size); - - memory::Copy(cuda_pinned, result->ptr(), - BOOST_GET_CONST(platform::CUDAPlace, tensor.place()), - tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); - return TensorPayload(result); -#else - PADDLE_THROW( - platform::errors::Unavailable("This situation should not be happened")); -#endif - } else { - return TensorPayload(tensor); - } -} -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto tensor = var->Get(); - // FIXME(wuyi): data types in send_recv.proto is copied from - // framework.proto - request->set_data_type(static_cast(tensor.type())); - for (auto& dim : framework::vectorize(tensor.dims())) { - request->add_dims(dim); - } - const framework::LoD lod = tensor.lod(); - if (lod.size() > 0) { - request->set_lod_level(lod.size()); - for (auto& each : lod) { - VarMsg::LodData* lod_inner = request->add_lod(); - for (auto& d : each) { - lod_inner->add_lod_data(d); - } - } - } - return GetCommunicationAllocationFromTensor(ctx, tensor); -} - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request) { - auto* slr = var->GetMutable(); - request->set_data_type(static_cast(slr->value().type())); - request->set_lod_level(0); - request->set_slr_height(slr->height()); - - for (auto& dim : framework::vectorize(slr->value().dims())) { - request->add_dims(dim); - } - - auto* tensor = slr->mutable_value(); - return GetCommunicationAllocationFromTensor(ctx, *tensor); -} - -TensorPayload::TensorPayload(std::shared_ptr allocation) - : allocation_(allocation), offset_(0), memory_size_(allocation->size()) {} -TensorPayload::TensorPayload(const framework::Tensor& tensor) - : allocation_(tensor.Holder()), - offset_(tensor.offset()), - memory_size_(tensor.numel() * framework::SizeOfType(tensor.type())) {} -void* TensorPayload::ptr() const { - return reinterpret_cast( - reinterpret_cast(allocation_->ptr()) + offset_); -} -size_t TensorPayload::memory_size() const { return memory_size_; } -} // namespace distributed -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h deleted file mode 100644 
index 84ed1ab0247..00000000000 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/distributed/distributed_pb.h" -#include "paddle/fluid/platform/port.h" - -namespace paddle { -namespace framework { -class Tensor; -class Variable; -} // namespace framework -namespace memory { -namespace allocation { -class Allocation; -} // namespace allocation -} // namespace memory -namespace platform { -class DeviceContext; -} // namespace platform -} // namespace paddle - -namespace paddle { -namespace operators { -namespace distributed { - -using VarMsg = sendrecv::VariableMessage; - -class TensorPayload final { - public: - explicit TensorPayload(const framework::Tensor& tensor); - explicit TensorPayload(std::shared_ptr allocation); - - TensorPayload(const TensorPayload& o) = default; - TensorPayload& operator=(const TensorPayload& o) = default; - - void* ptr() const; - size_t memory_size() const; - - private: - std::shared_ptr allocation_; - size_t offset_; - size_t memory_size_; -}; - -inline void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - -TensorPayload GetTensorPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -TensorPayload GetSelectedRowsPayload(framework::Variable* var, - const platform::DeviceContext& ctx, - VarMsg* request); - -inline framework::proto::VarType::Type ToVarType( - sendrecv::VariableMessage::Type type) { - switch (type) { - case sendrecv::VariableMessage::FP32: - return framework::proto::VarType::FP32; // NOLINT - case sendrecv::VariableMessage::FP64: - return framework::proto::VarType::FP64; // NOLINT - case sendrecv::VariableMessage::INT32: - return framework::proto::VarType::INT32; // NOLINT - case sendrecv::VariableMessage::INT64: - return framework::proto::VarType::INT64; // NOLINT - case sendrecv::VariableMessage::BOOL: - return framework::proto::VarType::BOOL; // NOLINT - default: - PADDLE_THROW( - platform::errors::InvalidArgument("Not support type id: %d.", type)); - } -} - -template