未验证 提交 431eab64 编写于 作者: X xiaoting 提交者: GitHub

Merge branch 'develop' into yolov3_loss

...@@ -47,33 +47,34 @@ find_package(Threads REQUIRED) ...@@ -47,33 +47,34 @@ find_package(Threads REQUIRED)
include(simd) include(simd)
################################ Configurations ####################################### ################################ Exposed Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
################################ Internal Configurations #######################################
option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter. # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
option(WITH_ANAKIN "Compile with Anakin library" OFF)
option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF) option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON) option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
option(ON_INFER "Turn on inference optimization." OFF)
option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF) option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
......
...@@ -241,6 +241,7 @@ paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output ...@@ -241,6 +241,7 @@ paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output
paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c')) paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c'))
paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'a07a44c2bacdcd09c1f5f35a96a0514e'))
paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e')) paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
...@@ -276,6 +277,7 @@ paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, de ...@@ -276,6 +277,7 @@ paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, de
paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756')) paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c7e4cfffc93ae89c8f6f53b6d650f923'))
paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...@@ -285,7 +287,11 @@ paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs= ...@@ -285,7 +287,11 @@ paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=
paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713')) paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a')) paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f')) paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd6b173ae1a149e0bdfe7b8bf69285957'))
paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c9bd414caa6c615539018d27001b44c'))
paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '62c667d24e7b07e166b47a53b61b2ff4'))
paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77')) paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '56148fb1024687a08e96af79bdc5c929'))
paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385')) paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385'))
paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2')) paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
...@@ -301,12 +307,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs ...@@ -301,12 +307,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs
paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff'))
paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
...@@ -318,6 +324,7 @@ paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non ...@@ -318,6 +324,7 @@ paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb')) paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb'))
paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c445467ebe58b3c0d7f0bba7795b6f56'))
paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd')) paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
...@@ -330,13 +337,13 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor ...@@ -330,13 +337,13 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor
paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee')) paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee'))
paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a')) paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a'))
paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148')) paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148'))
paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641')) paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', 'a8c4e972b7d6742c838a37abf407ed9a'))
paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e')) paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e'))
paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926')) paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926'))
paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a')) paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a'))
paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8')) paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8'))
paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9')) paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9'))
paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd')) paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fe9afaee481dd09f28866df22756466f'))
paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1')) paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1'))
paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c')) paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c'))
paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1')) paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1'))
...@@ -351,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', ...@@ -351,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', 'bbed7a8e63324cb76873ddd32b2f84ef')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '4d170807a13d33925d1049d2892832bf'))
paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
......
...@@ -64,9 +64,12 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) { ...@@ -64,9 +64,12 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
node->Op()->GetNullableAttr("epmap")); node->Op()->GetNullableAttr("epmap"));
auto height_section = boost::get<std::vector<int64_t>>( auto height_section = boost::get<std::vector<int64_t>>(
node->Op()->GetNullableAttr("sections")); node->Op()->GetNullableAttr("sections"));
auto trainer_id =
boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
send_varname_to_ctx[send_var_name] = send_varname_to_ctx[send_var_name] =
operators::distributed::RpcContext(send_var_name, send_varnames, operators::distributed::RpcContext(send_var_name, send_varnames,
epmap, height_section); epmap, height_section,
trainer_id);
VLOG(3) << "find and init an send op: " VLOG(3) << "find and init an send op: "
<< send_varname_to_ctx[send_var_name]; << send_varname_to_ctx[send_var_name];
} else if (node->Name() == "recv") { } else if (node->Name() == "recv") {
...@@ -75,9 +78,11 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) { ...@@ -75,9 +78,11 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
node->Op()->GetNullableAttr("recv_varnames")); node->Op()->GetNullableAttr("recv_varnames"));
auto epmap = boost::get<std::vector<std::string>>( auto epmap = boost::get<std::vector<std::string>>(
node->Op()->GetNullableAttr("epmap")); node->Op()->GetNullableAttr("epmap"));
auto trainer_id =
boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
recv_varname_to_ctx[recv_var_name] = recv_varname_to_ctx[recv_var_name] =
operators::distributed::RpcContext(recv_var_name, recv_varnames, operators::distributed::RpcContext(recv_var_name, recv_varnames,
epmap, {}); epmap, {}, trainer_id);
nodes_to_delete.push_back(node); nodes_to_delete.push_back(node);
VLOG(3) << "find and remove an recv op: " VLOG(3) << "find and remove an recv op: "
<< recv_varname_to_ctx[recv_var_name]; << recv_varname_to_ctx[recv_var_name];
......
...@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ...@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
"mode."; "mode.";
strategy_.fuse_all_optimizer_ops_ = false; strategy_.fuse_all_optimizer_ops_ = false;
} else { } else {
VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
AppendPass("alloc_continuous_space_for_grad_pass");
// NOTE: fuse_all_xx_ops will count the number of xx operator first, // NOTE: fuse_all_xx_ops will count the number of xx operator first,
// if the number is zero, fuse_all_reduce_ops will do nothing. // if the number is zero, fuse_all_reduce_ops will do nothing.
// Currently, only one type of optimization algorithm can be fused. // Currently, only one type of optimization algorithm can be fused.
......
...@@ -24,7 +24,7 @@ namespace details { ...@@ -24,7 +24,7 @@ namespace details {
const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const { const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
} }
void FuseAdamOpPass::FuseOptimizerOps( void FuseAdamOpPass::FuseOptimizerOps(
...@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps( ...@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps(
VLOG(10) << "Insert adam to graph "; VLOG(10) << "Insert adam to graph ";
OpDesc adam_desc(adam_ops[0]->Op()->Block()); OpDesc adam_desc(adam_ops[0]->Op()->Block());
adam_desc.SetType("adam"); adam_desc.SetType("adam");
adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
adam_desc.SetAttr("beta1", beta1); adam_desc.SetAttr("beta1", beta1);
......
...@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes); auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
const std::string fuse_op_type = GetOpType(); const std::string fuse_op_type = GetOpType();
const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames(); std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
aux_var_names.emplace_back(kParam);
aux_var_names.emplace_back(kGrad);
// Step 1: Get the specified op and auxiliary variables. // Step 1: Get the specified op and auxiliary variables.
std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result); std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
...@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
result.Set(kFusedVars, new FusedVars); result.Set(kFusedVars, new FusedVars);
} }
std::unordered_map<std::string, std::string> fused_vars_name; std::unordered_map<std::string, std::string> fused_vars_name;
fused_vars_name.reserve(aux_var_names.size() + 1); fused_vars_name.reserve(aux_var_names.size());
auto &fused_var_set = result.Get<FusedVars>(kFusedVars); auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
const std::string prefix(kFusedVarNamePrefix); const std::string prefix(kFusedVarNamePrefix);
// NOTE: the fused_var_name should be unique. // NOTE: the fused_var_name should be unique.
...@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { ...@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
} }
// Step 3: Get the fused Gradient's name // Step 3: Get the fused Gradient's name
auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads); bool grad_fused = false;
if (!result.Has(kFusedGrads)) { if (result.Has(kParamsAndGrads)) {
PADDLE_THROW( auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
"The alloc_continuous_space_for_grad_pass should be called before this " PADDLE_ENFORCE_EQ(
"pass."); params_grads.size(), aux_var_set.at(kGrad).size(),
} "The number of gradients and optimizer ops is not equal.");
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads); std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
auto &fused_vars = result.Get<FusedVars>(kFusedVars); aux_var_set.at(kGrad).end());
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); size_t same_grad_num = 0;
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); for (auto &p_g : params_grads) {
fused_vars_name.emplace("Grad", fused_grad); if (opt_grad_set.count(p_g.second)) {
++same_grad_num;
// Step 4: Sort the parameters and auxiliary variables according }
// to parameters' name to make variables' name correspond correctly. }
PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), // NOTE(zcd): the gradient of kParamsAndGrads may be different with the
"The size of params_grads and aux_var_set are not equal."); // kGrad.
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); if (same_grad_num == aux_var_set.at(kGrad).size()) {
if (!result.Has(kFusedGrads)) {
// Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. PADDLE_THROW(
"The alloc_continuous_space_for_grad_pass should be called before "
"this pass.");
}
auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
auto &fused_vars = result.Get<FusedVars>(kFusedVars);
auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
fused_vars_name[kGrad] = fused_grad;
// Sort the parameters and auxiliary variables according
// to parameters' name to make variables' name correspond correctly.
SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
grad_fused = true;
}
}
// Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
// Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
aux_var_names.pop_back();
if (!grad_fused) {
InitFusedGradsAndAllocSpaceForGrads(
places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
fused_vars_name.at(kGrad), &result);
}
InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
aux_var_set, fused_vars_name); aux_var_set, fused_vars_name);
// Step 6: Fuse optimizer Ops and Scale Ops // Step 5: Fuse optimizer Ops and Scale Ops
FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
// Step 7: Remove optimizer Ops // Step 6: Remove optimizer Ops
for (auto &opt_op : opt_ops) { for (auto &opt_op : opt_ops) {
graph->RemoveNode(opt_op); graph->RemoveNode(opt_op);
} }
} }
void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const {
// Get Var Nodes
std::unordered_map<std::string, ir::Node *> vars;
for (ir::Node *node : result->Nodes()) {
if (node->IsVar() && node->Var()) {
// Note: The graph may have the same name node. For example, parameter
// is the input of operator and it also is the output of optimizer;
vars.emplace(node->Var()->Name(), node);
}
}
// Init Grads
for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
auto &scope = *it;
VLOG(10) << "Init " << fused_grad_name;
PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
"%s has existed in scope.", fused_grad_name);
scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
for (auto &grad_var_name : grads) {
auto iter = vars.find(grad_var_name);
PADDLE_ENFORCE(iter != vars.end());
PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
proto::VarType::LOD_TENSOR);
scope->Var(grad_var_name)->GetMutable<LoDTensor>();
}
}
// Define Ops
ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0);
AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
false, false);
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}
void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
...@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( ...@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
const std::unordered_map<std::string, std::vector<std::string>> const std::unordered_map<std::string, std::vector<std::string>>
&aux_var_set, &aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const { const std::unordered_map<std::string, std::string> &fused_vars_name) const {
VLOG(10) << "Init FusedVars."; // Init Vars
// Alloc parameters and auxiliary vars in the respective scope. for (auto &var_name : aux_var_names) {
size_t idx = local_scopes.size(); auto &fused_var_name = fused_vars_name.at(var_name);
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); InitVars(local_scopes, fused_var_name);
++iter, --idx) {
auto &scope = *iter;
for (auto &var_name : aux_var_names) {
auto fused_var_name = fused_vars_name.at(var_name);
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
} }
// Define Ops
ProgramDesc program_desc; ProgramDesc program_desc;
auto *global_block = program_desc.MutableBlock(0); auto *global_block = program_desc.MutableBlock(0);
for (auto &var_name : aux_var_names) { for (auto &var_name : aux_var_names) {
AppendAllocContinuousSpace(aux_var_set.at(var_name), AppendAllocContinuousSpace(
fused_vars_name.at(var_name), true, aux_var_set.at(var_name), aux_var_set.at(var_name),
global_block); fused_vars_name.at(var_name), global_block, true);
} }
// Run Ops
RunInitOps(places, local_scopes, *global_block);
}
void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const {
for (size_t i = 0; i < local_scopes.size(); ++i) { for (size_t i = 0; i < local_scopes.size(); ++i) {
for (auto &op_desc : global_block->AllOps()) { for (auto &op_desc : global_block.AllOps()) {
auto op = OpRegistry::CreateOp(*op_desc); auto op = OpRegistry::CreateOp(*op_desc);
op->Run(*local_scopes[i], places[i]); op->Run(*local_scopes[i], places[i]);
} }
} }
} }
void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const {
VLOG(10) << "Init FusedVars.";
// Alloc parameters and auxiliary vars in the respective scope.
size_t idx = local_scopes.size();
for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
++iter, --idx) {
auto &scope = *iter;
VLOG(10) << "Init " << fused_var_name;
PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
"%s has exist in scope[%d]", fused_var_name, idx);
scope->Var(fused_var_name)->GetMutable<LoDTensor>();
}
}
void FuseOptimizerOpPass::SortParametersAndAuxVars( void FuseOptimizerOpPass::SortParametersAndAuxVars(
const std::vector<std::pair<std::string, std::string>> &params_grads, const std::vector<std::pair<std::string, std::string>> &params_grads,
std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set, std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
...@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( ...@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
} }
void FuseOptimizerOpPass::AppendAllocContinuousSpace( void FuseOptimizerOpPass::AppendAllocContinuousSpace(
const std::vector<std::string> &args, const std::string &out_arg, const std::vector<std::string> &in_args,
bool copy_data, BlockDesc *global_block) const { const std::vector<std::string> &out_args, const std::string &fused_out_arg,
BlockDesc *global_block, bool copy_data, bool check_name) const {
auto op_desc = global_block->AppendOp(); auto op_desc = global_block->AppendOp();
op_desc->SetType("alloc_continuous_space"); op_desc->SetType("alloc_continuous_space");
op_desc->SetInput("Input", args); op_desc->SetInput("Input", in_args);
op_desc->SetOutput("Output", args); op_desc->SetOutput("Output", out_args);
op_desc->SetOutput("FusedOutput", {out_arg}); op_desc->SetOutput("FusedOutput", {fused_out_arg});
op_desc->SetAttr("copy_data", copy_data); op_desc->SetAttr("copy_data", copy_data);
op_desc->SetAttr("check_name", true); op_desc->SetAttr("check_name", check_name);
} }
void FuseOptimizerOpPass::InserInputAndOutputForOptOps( void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
......
...@@ -27,6 +27,10 @@ namespace paddle { ...@@ -27,6 +27,10 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kGrad[] = "Grad";
constexpr char kParam[] = "Param";
constexpr char kLearningRate[] = "LearningRate";
class FuseOptimizerOpPass : public ir::Pass { class FuseOptimizerOpPass : public ir::Pass {
protected: protected:
void ApplyImpl(ir::Graph *graph) const override; void ApplyImpl(ir::Graph *graph) const override;
...@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass {
std::unordered_map<std::string, std::vector<std::string>> *aux_args_name) std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
const; const;
void AppendAllocContinuousSpace(const std::vector<std::string> &args, void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
const std::string &out_arg, bool copy_data, const std::vector<std::string> &out_args,
BlockDesc *global_block) const; const std::string &fused_out_arg,
BlockDesc *global_block, bool copy_data,
bool check_name = true) const;
void InitFusedGradsAndAllocSpaceForGrads(
const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const std::vector<std::string> &params,
const std::vector<std::string> &grads, const std::string &fused_grad_name,
ir::Graph *result) const;
void InitFusedVarsAndAllocSpaceForVars( void InitFusedVarsAndAllocSpaceForVars(
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
...@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass { ...@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass {
&aux_var_set, &aux_var_set,
const std::unordered_map<std::string, std::string> &fused_vars_name) const std::unordered_map<std::string, std::string> &fused_vars_name)
const; const;
void RunInitOps(const std::vector<platform::Place> &places,
const std::vector<Scope *> &local_scopes,
const BlockDesc &global_block) const;
void InitVars(const std::vector<Scope *> &local_scopes,
const std::string &fused_var_name) const;
}; };
} // namespace details } // namespace details
......
...@@ -24,7 +24,7 @@ namespace details { ...@@ -24,7 +24,7 @@ namespace details {
const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const { const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
return {"Param"}; return {};
} }
void FuseSgdOpPass::FuseOptimizerOps( void FuseSgdOpPass::FuseOptimizerOps(
...@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps( ...@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps(
// Add fused scale // Add fused scale
OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
Sgd_desc.SetType("sgd"); Sgd_desc.SetType("sgd");
Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
// TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));
// NOTE: multi_devices_pass requires that every op should have a role. // NOTE: multi_devices_pass requires that every op should have a role.
Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
......
...@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
VLOG(1) << "set num_threads: " << strategy_.num_threads_ VLOG(1) << "set num_threads: " << strategy_.num_threads_
<< " to run the operators of the graph on each device."; << " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) { for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor( executors_.emplace_back(new details::FastThreadedSSAGraphExecutor(
strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
} }
} }
......
...@@ -14,12 +14,12 @@ ...@@ -14,12 +14,12 @@
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "ThreadPool.h" #include "ThreadPool.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
namespace paddle { namespace paddle {
...@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ...@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
std::vector<std::unique_ptr<ir::Graph>> graphs_; std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_; std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>>
executors_;
ExceptionHolder exception_holder_; ExceptionHolder exception_holder_;
}; };
......
...@@ -45,12 +45,16 @@ class InferVarTypeContext { ...@@ -45,12 +45,16 @@ class InferVarTypeContext {
virtual bool HasInput(const std::string& name) const { virtual bool HasInput(const std::string& name) const {
PADDLE_ENFORCE_NOT_NULL(op_); PADDLE_ENFORCE_NOT_NULL(op_);
return op_->Inputs().count(name) > 0; auto& inputs = op_->Inputs();
auto input = inputs.find(name);
return input != inputs.end() && !input->second.empty();
} }
virtual bool HasOutput(const std::string& name) const { virtual bool HasOutput(const std::string& name) const {
PADDLE_ENFORCE_NOT_NULL(op_); PADDLE_ENFORCE_NOT_NULL(op_);
return op_->Outputs().count(name) > 0; auto& outputs = op_->Outputs();
auto output = outputs.find(name);
return output != outputs.end() && !output->second.empty();
} }
virtual const std::vector<std::string>& Input(const std::string& name) const { virtual const std::vector<std::string>& Input(const std::string& name) const {
......
...@@ -832,6 +832,45 @@ std::string AnalysisPredictor::GetSerializedProgram() const { ...@@ -832,6 +832,45 @@ std::string AnalysisPredictor::GetSerializedProgram() const {
return inference_program_->Proto()->SerializeAsString(); return inference_program_->Proto()->SerializeAsString();
} }
// Add SaveOptimModel
void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
// save model
std::string model_name = dir + "/model";
std::ofstream outfile;
outfile.open(model_name, std::ios::out | std::ios::binary);
std::string inference_prog_desc = GetSerializedProgram();
outfile << inference_prog_desc;
// save params
framework::ProgramDesc save_program;
auto *save_block = save_program.MutableBlock(0);
const framework::ProgramDesc &main_program = program();
const framework::BlockDesc &global_block = main_program.Block(0);
std::vector<std::string> save_var_list;
for (framework::VarDesc *var : global_block.AllVars()) {
if (IsPersistable(var)) {
framework::VarDesc *new_var = save_block->Var(var->Name());
new_var->SetShape(var->GetShape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
save_var_list.push_back(new_var->Name());
}
}
std::sort(save_var_list.begin(), save_var_list.end());
auto *op = save_block->AppendOp();
op->SetType("save_combine");
op->SetInput("X", save_var_list);
op->SetAttr("file_path", dir + "/params");
op->CheckAttrs();
platform::CPUPlace place;
framework::Executor exe(place);
exe.Run(save_program, scope(), 0, true, true);
}
template <> template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>( std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
const AnalysisConfig &config) { const AnalysisConfig &config) {
......
...@@ -86,6 +86,10 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -86,6 +86,10 @@ class AnalysisPredictor : public PaddlePredictor {
bool MkldnnQuantize(); bool MkldnnQuantize();
// save program to model
// save parameters to params
void SaveOptimModel(const std::string &dir);
protected: protected:
// For memory optimization. // For memory optimization.
bool need_collect_var_shapes_for_memory_optim(); bool need_collect_var_shapes_for_memory_optim();
......
...@@ -196,6 +196,9 @@ TEST(AnalysisPredictor, Clone) { ...@@ -196,6 +196,9 @@ TEST(AnalysisPredictor, Clone) {
} }
} }
// This function is not released yet, will fail on some machine.
// TODO(Superjomn) Turn on it latter.
/*
TEST(AnalysisPredictor, memory_optim) { TEST(AnalysisPredictor, memory_optim) {
AnalysisConfig config(FLAGS_dirname); AnalysisConfig config(FLAGS_dirname);
config.DisableGpu(); config.DisableGpu();
...@@ -246,6 +249,7 @@ TEST(AnalysisPredictor, memory_optim) { ...@@ -246,6 +249,7 @@ TEST(AnalysisPredictor, memory_optim) {
inference::CompareResult(output, output1); inference::CompareResult(output, output1);
} }
*/
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
class MkldnnQuantizerTest : public testing::Test { class MkldnnQuantizerTest : public testing::Test {
......
...@@ -170,6 +170,15 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -170,6 +170,15 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->SwitchIrOptim(true); cfg->SwitchIrOptim(true);
} }
void SetOptimConfig(AnalysisConfig *cfg) {
std::string optimModelPath =
FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
"/saved_optim_model";
cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
cfg->SwitchIrOptim(true);
cfg->SwitchSpecifyInputNames();
}
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
DataRecord data(FLAGS_infer_data, FLAGS_batch_size); DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<PaddleTensor> input_slots; std::vector<PaddleTensor> input_slots;
...@@ -315,5 +324,44 @@ TEST(Analyzer_dam, compare_determine) { ...@@ -315,5 +324,44 @@ TEST(Analyzer_dam, compare_determine) {
input_slots_all); input_slots_all);
} }
// Save optim model
TEST(Analyzer_dam, save_optim_model) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::string optimModelPath =
FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
"/saved_optim_model";
mkdir(optimModelPath.c_str(), 0777);
auto predictor = CreateTestPredictor(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
FLAGS_use_analysis);
(static_cast<AnalysisPredictor *>(predictor.get()))
->SaveOptimModel(optimModelPath);
}
void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
const PaddlePredictor::Config *optim_config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(orig_config, true);
PrintConfig(optim_config, true);
std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
CompareResult(orig_outputs.back(), optim_outputs.back());
}
TEST(Analyzer_dam, compare_optim_orig) {
AnalysisConfig orig_cfg;
AnalysisConfig optim_cfg;
SetConfig(&orig_cfg);
SetOptimConfig(&optim_cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareOptimAndOrig(
reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
input_slots_all);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -32,6 +32,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -32,6 +32,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
SetFakeImageInput(inputs, FLAGS_infer_model); SetFakeImageInput(inputs, FLAGS_infer_model);
} }
void SetOptimConfig(AnalysisConfig *cfg) {
std::string optimModelPath =
FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
"/saved_optim_model";
cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
cfg->DisableGpu();
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
}
// Easy for profiling independently. // Easy for profiling independently.
void profile(bool use_mkldnn = false) { void profile(bool use_mkldnn = false) {
AnalysisConfig cfg; AnalysisConfig cfg;
...@@ -87,13 +98,51 @@ TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); } ...@@ -87,13 +98,51 @@ TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
TEST(Analyzer_resnet50, compare_determine) { TEST(Analyzer_resnet50, compare_determine) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all); input_slots_all);
} }
// Save optim model
TEST(Analyzer_resnet50, save_optim_model) {
AnalysisConfig cfg;
SetConfig(&cfg);
std::string optimModelPath =
FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
"/saved_optim_model";
mkdir(optimModelPath.c_str(), 0777);
auto predictor = CreateTestPredictor(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
FLAGS_use_analysis);
(static_cast<AnalysisPredictor *>(predictor.get()))
->SaveOptimModel(optimModelPath);
}
void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
const PaddlePredictor::Config *optim_config,
const std::vector<std::vector<PaddleTensor>> &inputs) {
PrintConfig(orig_config, true);
PrintConfig(optim_config, true);
std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
CompareResult(orig_outputs.back(), optim_outputs.back());
}
TEST(Analyzer_resnet50, compare_optim_orig) {
AnalysisConfig orig_cfg;
AnalysisConfig optim_cfg;
SetConfig(&orig_cfg);
SetOptimConfig(&optim_cfg);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareOptimAndOrig(
reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
input_slots_all);
}
} // namespace analysis } // namespace analysis
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -116,7 +116,7 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) { ...@@ -116,7 +116,7 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config); reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config);
auto native_pred = CreateTestPredictor(config, false); auto native_pred = CreateTestPredictor(config, false);
auto analysis_pred = CreateTestPredictor(config, true); auto analysis_pred = CreateTestPredictor(config, true);
for (int i = 0; i < 100; i++) { for (int i = 0; i < 20; i++) {
std::vector<std::vector<PaddleTensor>> inputs_all; std::vector<std::vector<PaddleTensor>> inputs_all;
if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename, SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
...@@ -133,11 +133,13 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) { ...@@ -133,11 +133,13 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
TEST(TensorRT_mobilenet, compare) { TEST(TensorRT_mobilenet, compare) {
std::string model_dir = FLAGS_infer_model + "/mobilenet"; std::string model_dir = FLAGS_infer_model + "/mobilenet";
compare(model_dir, /* use_tensorrt */ true); compare(model_dir, /* use_tensorrt */ true);
// Open it when need.
// profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
} }
TEST(TensorRT_resnet50, compare) { TEST(resnet50, compare_continuous_input) {
std::string model_dir = FLAGS_infer_model + "/resnet50"; std::string model_dir = FLAGS_infer_model + "/resnet50";
compare(model_dir, /* use_tensorrt */ true); compare_continuous_input(model_dir, true);
} }
TEST(TensorRT_resnext50, compare) { TEST(TensorRT_resnext50, compare) {
...@@ -145,24 +147,6 @@ TEST(TensorRT_resnext50, compare) { ...@@ -145,24 +147,6 @@ TEST(TensorRT_resnext50, compare) {
compare(model_dir, /* use_tensorrt */ true); compare(model_dir, /* use_tensorrt */ true);
} }
TEST(TensorRT_resnext50, profile) {
std::string model_dir = FLAGS_infer_model + "/resnext50";
// Set FLAGS_record_benchmark to true to record benchmark to file.
// FLAGS_record_benchmark=true;
FLAGS_model_name = "resnext50";
profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
}
TEST(resnext50, compare_analysis_native) {
std::string model_dir = FLAGS_infer_model + "/resnext50";
compare(model_dir, false /*use tensorrt*/);
}
TEST(TensorRT_mobilenet, analysis) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
compare(model_dir, false /* use_tensorrt */);
}
TEST(AnalysisPredictor, use_gpu) { TEST(AnalysisPredictor, use_gpu) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
AnalysisConfig config; AnalysisConfig config;
...@@ -180,20 +164,5 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -180,20 +164,5 @@ TEST(AnalysisPredictor, use_gpu) {
} }
} }
TEST(TensorRT_mobilenet, profile) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
profile(model_dir, true, false);
}
TEST(resnet50, compare_continuous_input) {
std::string model_dir = FLAGS_infer_model + "/resnet50";
compare_continuous_input(model_dir, true);
}
TEST(resnet50, compare_continuous_input_native) {
std::string model_dir = FLAGS_infer_model + "/resnet50";
compare_continuous_input(model_dir, false);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -18,7 +18,6 @@ gru ...@@ -18,7 +18,6 @@ gru
hierarchical_sigmoid hierarchical_sigmoid
lrn lrn
lstm_unit lstm_unit
lstmp
max_pool2d_with_index max_pool2d_with_index
max_pool3d_with_index max_pool3d_with_index
maxout maxout
...@@ -29,8 +28,6 @@ pool3d ...@@ -29,8 +28,6 @@ pool3d
prelu prelu
quantize quantize
rank_loss rank_loss
reduce_all
reduce_any
reduce_max reduce_max
reduce_mean reduce_mean
reduce_min reduce_min
......
...@@ -227,6 +227,15 @@ $out = \sqrt{x}$ ...@@ -227,6 +227,15 @@ $out = \sqrt{x}$
)DOC"; )DOC";
UNUSED constexpr char RsqrtDoc[] = R"DOC(
Rsqrt Activation Operator.
Please make sure input is legal in case of numeric errors.
$out = \frac{1}{\sqrt{x}}$
)DOC";
UNUSED constexpr char AbsDoc[] = R"DOC( UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator. Abs Activation Operator.
...@@ -575,6 +584,7 @@ REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc); ...@@ -575,6 +584,7 @@ REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc); REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc); REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc); REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc);
REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc); REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc);
REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc); REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc);
REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc); REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc);
...@@ -586,6 +596,7 @@ REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); ...@@ -586,6 +596,7 @@ REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc); REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -511,6 +511,26 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> { ...@@ -511,6 +511,26 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
}; };
// rsqrt(x) = x^(-1/2)
template <typename T>
struct RsqrtFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.rsqrt();
}
};
template <typename T>
struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
};
// ceil(x) = ceiling(x) // ceil(x) = ceiling(x)
template <typename T> template <typename T>
struct CeilFunctor : public BaseActivationFunctor<T> { struct CeilFunctor : public BaseActivationFunctor<T> {
...@@ -1191,6 +1211,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -1191,6 +1211,7 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor); \
__macro(abs, Abs, AbsFunctor, AbsGradFunctor); \ __macro(abs, Abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \ __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \ __macro(floor, Floor, FloorFunctor, ZeroGradFunctor); \
......
...@@ -79,9 +79,13 @@ class AffineChannelOp : public framework::OperatorWithKernel { ...@@ -79,9 +79,13 @@ class AffineChannelOp : public framework::OperatorWithKernel {
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL); PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
PADDLE_ENFORCE_EQ(scale_dims[0], C);
PADDLE_ENFORCE_EQ(b_dims.size(), 1UL); PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
PADDLE_ENFORCE_EQ(b_dims[0], C); if (ctx->IsRuntime() || scale_dims[0] > 0) {
PADDLE_ENFORCE_EQ(scale_dims[0], C);
}
if (ctx->IsRuntime() || b_dims[0] > 0) {
PADDLE_ENFORCE_EQ(b_dims[0], C);
}
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
......
...@@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); Tensor sliced_out = output->Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
T(0)); T(0));
} }
...@@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize(
{static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
&sliced_theta_grad, T(0)); &sliced_theta_grad, T(0));
......
...@@ -65,11 +65,22 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { ...@@ -65,11 +65,22 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
(data_layout == DataLayout::kNCHW ? x_dims[1] (data_layout == DataLayout::kNCHW ? x_dims[1]
: x_dims[x_dims.size() - 1]); : x_dims[x_dims.size() - 1]);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); auto scale_dim = ctx->GetInputDim("Scale");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); auto bias_dim = ctx->GetInputDim("Bias");
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
framework::product(bias_dim) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(scale_dim[0], C);
PADDLE_ENFORCE_EQ(scale_dim[0], C);
}
ctx->SetOutputDim("Y", x_dims); ctx->SetOutputDim("Y", x_dims);
ctx->SetOutputDim("MeanOut", {C}); ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C}); ctx->SetOutputDim("VarianceOut", {C});
......
...@@ -49,7 +49,15 @@ class ConcatOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,15 @@ class ConcatOp : public framework::OperatorWithKernel {
for (size_t i = 1; i < n; i++) { for (size_t i = 1; i < n; i++) {
for (size_t j = 0; j < in_zero_dims_size; j++) { for (size_t j = 0; j < in_zero_dims_size; j++) {
if (j == axis) { if (j == axis) {
out_dims[axis] += ins[i][j]; if (ctx->IsRuntime()) {
out_dims[axis] += ins[i][j];
} else {
if (ins[i][j] == -1) {
out_dims[axis] = -1;
} else {
out_dims[axis] += ins[i][j];
}
}
} else { } else {
if (ctx->IsRuntime()) { if (ctx->IsRuntime()) {
// check all shape in run time // check all shape in run time
......
...@@ -68,9 +68,14 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -68,9 +68,14 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]}); std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) { for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], if ((!ctx->IsRuntime()) &&
dilations[i], paddings[i], (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
strides[i])); output_shape.push_back(-1);
} else {
output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
dilations[i], paddings[i],
strides[i]));
}
} }
ctx->SetOutputDim("Output", framework::make_ddim(output_shape)); ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
ctx->ShareLoD("Input", "Output"); ctx->ShareLoD("Input", "Output");
......
...@@ -36,14 +36,17 @@ class ConvShiftOp : public framework::OperatorWithKernel { ...@@ -36,14 +36,17 @@ class ConvShiftOp : public framework::OperatorWithKernel {
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2."); PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0], if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0))
"The 1st dimension of Input(X) and Input(Y) should " PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
"be equal."); "The 1st dimension of Input(X) and Input(Y) should "
PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1, "be equal.");
"The 2nd dimension of Input(Y) should be odd."); if (ctx->IsRuntime() || y_dims[1] > 0)
PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1,
"The 2nd dimension of Input(Y) should be less than or " "The 2nd dimension of Input(Y) should be odd.");
"equal to the 2nd dimension of Input(X)."); if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0))
PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
"The 2nd dimension of Input(Y) should be less than or "
"equal to the 2nd dimension of Input(X).");
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
......
...@@ -40,17 +40,27 @@ class CosSimOp : public framework::OperatorWithKernel { ...@@ -40,17 +40,27 @@ class CosSimOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), bool check = true;
"Ranks of Input(X) and Input(Y) must be equal."); if ((!ctx->IsRuntime()) &&
PADDLE_ENFORCE_GE(x_dims.size(), 2, (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
"Rank of Input(X) must not be less than 2."); check = false;
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()), }
framework::slice_ddim(y_dims, 1, y_dims.size()),
"All dimensions except the 1st of Input(X) and Input(Y) " if (check) {
"must be equal."); PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1, "Ranks of Input(X) and Input(Y) must be equal.");
"The 1st dimension of Input(Y) must be equal to Input(X) or" PADDLE_ENFORCE_GE(x_dims.size(), 2,
" just 1 (which will be broadcasted to match Input(X))."); "Rank of Input(X) must not be less than 2.");
PADDLE_ENFORCE_EQ(
framework::slice_ddim(x_dims, 1, x_dims.size()),
framework::slice_ddim(y_dims, 1, y_dims.size()),
"All dimensions except the 1st of Input(X) and Input(Y) "
"must be equal.");
PADDLE_ENFORCE(
x_dims[0] == y_dims[0] || y_dims[0] == 1,
"The 1st dimension of Input(Y) must be equal to Input(X) or"
" just 1 (which will be broadcasted to match Input(X)).");
}
// resize tensor // resize tensor
ctx->SetOutputDim("Out", {x_dims[0], 1}); ctx->SetOutputDim("Out", {x_dims[0], 1});
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cvm_op.h"
#include <memory>
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class CVMOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto cvm_dims = ctx->GetInputDim("CVM");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(cvm_dims.size(), 2UL, "Input(CVM)'s rank should be 2.");
PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL,
"The 2nd dimension of "
"Input(CVM) should be 2.");
if (ctx->Attrs().Get<bool>("use_cvm")) {
ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]});
} else {
ctx->SetOutputDim("Y", {x_dims[0], x_dims[1] - 2});
}
ctx->ShareLoD("X", /*->*/ "Y");
}
protected:
// Explicitly set that the data type of computation kernel of
// cvm
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
platform::CPUPlace());
}
};
class CVMGradientOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
"Input(Y@GRAD) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
"Output(X@GRAD) should be not null.");
auto x_dims = ctx->GetInputDim("X");
auto cvm_dims = ctx->GetInputDim("CVM");
auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(cvm_dims.size(), 2, "Input(CVM)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
"The 1st dimension of Input(X) and Input(Y@Grad) should "
"be equal.");
PADDLE_ENFORCE_EQ(cvm_dims[1], 2,
"When Attr(soft_label) == false, the 2nd dimension of "
"Input(CVM) should be 2.");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X"));
}
protected:
// Explicitly set that the data type of computation kernel of
// cvm
// is determined by its input "X".
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
platform::CPUPlace());
}
};
class CVMOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[N x D],"
" where N is the batch size and D is the emebdding dim. ");
AddInput("CVM",
"(Tensor), a 2-D Tensor with shape [N x 2], where N is the batch "
"size, 2 is show and click.");
AddOutput("Y",
"(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
"[N x K].");
AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
AddComment(R"DOC(
CVM Operator.
We assume that input X is a embedding vector with cvm_feature(show and click), which shape is [N * D] (D is 2(cvm_feature) + embedding dim, N is batch_size)
if use_cvm is True, we will log(cvm_feature), and output shape is [N * D].
if use_cvm is False, we will remove cvm_feature from input, and output shape is [N * (D - 2)].
)DOC");
}
};
class CVMGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
op->SetType("cvm_grad");
op->SetInput("X", Input("X"));
op->SetInput("CVM", Input("CVM"));
op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
op->SetAttrMap(Attrs());
return op;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(cvm, ops::CVMOp, ops::CVMOpMaker, ops::CVMGradOpDescMaker);
REGISTER_OPERATOR(cvm_grad, ops::CVMGradientOp);
REGISTER_OP_CPU_KERNEL(cvm, ops::CVMOpKernel<float>, ops::CVMOpKernel<double>);
REGISTER_OP_CPU_KERNEL(cvm_grad, ops::CVMGradOpKernel<float>,
ops::CVMGradOpKernel<double>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename T>
class CVMOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const LoDTensor* x = context.Input<LoDTensor>("X");
const T* x_data = x->data<T>();
auto lod = x->lod()[0];
int64_t item_size = x->numel() / x->dims()[0];
int offset = 2;
if (!context.Attr<bool>("use_cvm")) {
item_size -= offset;
}
LoDTensor* y = context.Output<LoDTensor>("Y");
T* y_data = y->mutable_data<T>(context.GetPlace());
int seq_num = static_cast<int>(lod.size()) - 1;
for (int i = 0; i < seq_num; ++i) {
int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
for (int j = 0; j < seq_len; ++j) {
if (context.Attr<bool>("use_cvm")) {
std::memcpy(y_data, x_data, item_size * sizeof(T));
y_data[0] = log(y_data[0] + 1);
y_data[1] = log(y_data[1] + 1) - y_data[0];
x_data += item_size;
y_data += item_size;
} else {
std::memcpy(y_data, x_data + offset, item_size * sizeof(T));
x_data += item_size + offset;
y_data += item_size;
}
}
}
}
};
template <typename T>
class CVMGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
LoDTensor* dx = context.Output<LoDTensor>(framework::GradVarName("X"));
T* dx_data = dx->mutable_data<T>(context.GetPlace());
const Tensor* cvm = context.Input<Tensor>("CVM");
const T* cvm_data = cvm->data<T>();
int offset = 2;
const framework::LoDTensor* dOut =
context.Input<framework::LoDTensor>(framework::GradVarName("Y"));
const T* dout_data = dOut->data<T>();
auto lod = dx->lod()[0];
int64_t item_size = dx->numel() / dx->dims()[0];
if (!context.Attr<bool>("use_cvm")) {
item_size -= offset;
}
int seq_num = static_cast<int>(lod.size()) - 1;
for (int i = 0; i < seq_num; ++i) {
int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
for (int j = 0; j < seq_len; ++j) {
if (context.Attr<bool>("use_cvm")) {
std::memcpy(dx_data, dout_data, item_size * sizeof(T));
dx_data[0] = cvm_data[0];
dx_data[1] = cvm_data[1];
dx_data += item_size;
dout_data += item_size;
} else {
std::memcpy(dx_data + offset, dout_data, item_size * sizeof(T));
dx_data[0] = cvm_data[0];
dx_data[1] = cvm_data[1];
dx_data += item_size + offset;
dout_data += item_size;
}
}
cvm_data += offset;
}
}
};
} // namespace operators
} // namespace paddle
...@@ -51,8 +51,10 @@ class DetectionMAPOp : public framework::OperatorWithKernel { ...@@ -51,8 +51,10 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(label_dims.size(), 2, PADDLE_ENFORCE_EQ(label_dims.size(), 2,
"The rank of Input(Label) must be 2, " "The rank of Input(Label) must be 2, "
"the shape is [N, 6]."); "the shape is [N, 6].");
PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5, if (ctx->IsRuntime() || label_dims[1] > 0) {
"The shape of Input(Label) is [N, 6] or [N, 5]."); PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
"The shape of Input(Label) is [N, 6] or [N, 5].");
}
if (ctx->HasInput("PosCount")) { if (ctx->HasInput("PosCount")) {
PADDLE_ENFORCE(ctx->HasInput("TruePos"), PADDLE_ENFORCE(ctx->HasInput("TruePos"),
......
...@@ -9,6 +9,9 @@ else() ...@@ -9,6 +9,9 @@ else()
endif() endif()
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool)
cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder)
# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if(WITH_GRPC) if(WITH_GRPC)
...@@ -20,7 +23,7 @@ if(WITH_GRPC) ...@@ -20,7 +23,7 @@ if(WITH_GRPC)
collective_client.cc collective_server.cc collective_client.cc collective_server.cc
${GRPC_SRCS} ${GRPC_SRCS}
PROTO send_recv.proto PROTO send_recv.proto
DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder)
set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag AsyncSparseParamUpdateRecorder::init_flag_;
std::unique_ptr<AsyncSparseParamUpdateRecorder>
AsyncSparseParamUpdateRecorder::recorder_(nullptr);
} // namespace distributed
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <ThreadPool.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace distributed {
class ConcurrentSet {
public:
ConcurrentSet() : pool_(new ::ThreadPool(1)) {}
~ConcurrentSet() {}
std::future<void> Update(const std::vector<int64_t>& rows) {
auto task = [this, rows] {
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& id : rows) {
sstream << id << ", ";
}
sstream << "]";
VLOG(3) << "update ids -> " << sstream.str();
}
for (auto row : rows) {
set_.insert(row);
}
};
return pool_->enqueue(std::move(task));
}
std::future<void> GetAndClear(std::vector<int64_t>* result) {
auto task = [this, &result] {
result->clear();
for (auto& id : set_) {
result->push_back(id);
}
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& id : *result) {
sstream << id << ", ";
}
sstream << "]";
VLOG(3) << "result ids size: " << result->size() << " "
<< sstream.str();
}
set_.clear();
};
return pool_->enqueue(std::move(task));
}
private:
std::unordered_set<int64_t> set_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
};
class AsyncSparseParamUpdateRecorder {
using TrainerToRows = std::vector<std::unique_ptr<ConcurrentSet>>;
public:
AsyncSparseParamUpdateRecorder(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param)
: trainer_num_(trainer_num), grad_to_param_(grad_to_param) {
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& item : grad_to_param) {
sstream << item.first << ":" << item.second << ", ";
}
sstream << "]";
VLOG(3) << "trainer_num: " << trainer_num
<< " grad_to_param_: " << sstream.str();
}
for (auto& iter : grad_to_param) {
param_to_grad_[iter.second] = iter.first;
auto& param_name = iter.second;
param_to_updated_rows_[param_name] = TrainerToRows();
auto& trainer_to_rows = param_to_updated_rows_[param_name];
for (auto i = 0; i < trainer_num; ++i) {
trainer_to_rows.emplace_back(new ConcurrentSet());
}
}
}
~AsyncSparseParamUpdateRecorder() = default;
void Update(const std::string& grad_name,
const std::vector<int64_t>& update_rows) {
VLOG(3) << "update grad: " << grad_name
<< " row size: " << update_rows.size();
auto& param_name = grad_to_param_.at(grad_name);
auto& trainer_to_rows = param_to_updated_rows_.at(param_name);
std::vector<std::future<void>> fs;
for (auto& set : trainer_to_rows) {
fs.push_back(set->Update(update_rows));
}
for (auto& f : fs) {
f.wait();
}
}
void GetAndClear(const std::string& param_name, int trainer_id,
std::vector<int64_t>* result) {
VLOG(3) << "GetAndClear param: " << param_name
<< " for trainer: " << trainer_id;
PADDLE_ENFORCE_LT(trainer_id, trainer_num_);
param_to_updated_rows_.at(param_name)[trainer_id]
->GetAndClear(result)
.wait();
}
bool HasParam(const std::string& param_name) {
return param_to_grad_.find(param_name) != param_to_grad_.end();
}
bool HasGrad(const std::string& grad_name) {
return grad_to_param_.find(grad_name) != grad_to_param_.end();
}
private:
const int trainer_num_;
std::unordered_map<std::string, std::string> grad_to_param_;
std::unordered_map<std::string, std::string> param_to_grad_;
std::unordered_map<std::string, TrainerToRows> param_to_updated_rows_;
// init recorder
public:
static void Init(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param) {
InitImpl(trainer_num, grad_to_param);
}
static AsyncSparseParamUpdateRecorder* GetInstance() {
return recorder_.get();
}
private:
// Init is called by GetInstance.
static void InitImpl(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param) {
if (recorder_ == nullptr) {
recorder_.reset(
new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param));
}
}
static std::once_flag init_flag_;
static std::unique_ptr<AsyncSparseParamUpdateRecorder> recorder_;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include <algorithm>
#include "gtest/gtest.h"
namespace paddle {
namespace operators {
namespace distributed {
TEST(ConcurrentSet, All) {
ConcurrentSet concurrent_set;
std::vector<int64_t> in1 = {1, 2, 3, 4};
std::vector<int64_t> in2 = {2, 3, 5, 6};
std::vector<std::future<void>> futures;
futures.push_back(concurrent_set.Update(in1));
futures.push_back(concurrent_set.Update(in2));
for (auto &f : futures) {
f.wait();
}
std::unordered_set<int64_t> in;
std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
std::vector<int64_t> ret;
concurrent_set.GetAndClear(&ret).wait();
std::unordered_set<int64_t> out;
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
EXPECT_EQ(in, out);
concurrent_set.GetAndClear(&ret).wait();
EXPECT_EQ(ret.size(), 0);
}
TEST(AsyncSparseParamUpdateRecorder, All) {
std::unordered_map<std::string, std::string> grad_to_param;
grad_to_param["grad1"] = "param1";
grad_to_param["grad2"] = "param2";
int trainer_num = 10;
AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param);
std::vector<int64_t> in1 = {1, 2, 3, 4};
std::vector<int64_t> in2 = {2, 3, 5, 6};
std::unordered_set<int64_t> in;
std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
recorder.Update("grad1", in1);
recorder.Update("grad1", in2);
EXPECT_TRUE(recorder.HasParam("param1"));
EXPECT_TRUE(recorder.HasParam("param2"));
EXPECT_FALSE(recorder.HasParam("param3"));
EXPECT_TRUE(recorder.HasGrad("grad1"));
EXPECT_TRUE(recorder.HasGrad("grad2"));
EXPECT_FALSE(recorder.HasGrad("grad3"));
std::vector<int64_t> ret;
EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret));
for (int i = 0; i < trainer_num; ++i) {
std::vector<int64_t> ret;
std::unordered_set<int64_t> out;
recorder.GetAndClear("param1", i, &ret);
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
EXPECT_EQ(in, out);
recorder.GetAndClear("param1", i, &ret);
EXPECT_EQ(ret.size(), 0);
}
}
} // namespace distributed
} // namespace operators
} // namespace paddle
...@@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, ...@@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name, const std::string& out_var_name,
const std::string& table_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
time_out); time_out);
......
...@@ -21,8 +21,10 @@ limitations under the License. */ ...@@ -21,8 +21,10 @@ limitations under the License. */
#include <functional> #include <functional>
#include <iostream> #include <iostream>
#include <map> #include <map>
#include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "brpc/channel.h" #include "brpc/channel.h"
...@@ -66,6 +68,7 @@ class BRPCClient : public RPCClient { ...@@ -66,6 +68,7 @@ class BRPCClient : public RPCClient {
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_var_name, const std::string& out_var_name,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetMonomerBarrier( VarHandlePtr AsyncGetMonomerBarrier(
...@@ -107,13 +110,11 @@ class BRPCClient : public RPCClient { ...@@ -107,13 +110,11 @@ class BRPCClient : public RPCClient {
void SendComplete() override; void SendComplete() override;
private: private:
VarHandlePtr _AsyncGetVar(const std::string& ep, VarHandlePtr _AsyncGetVar(
const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const framework::Scope& scope, const std::string& var_name,
const std::string& var_name, const std::string& out_var_name, const std::string& method_name,
const std::string& out_var_name, const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline);
const std::string& method_name,
int64_t time_out = FLAGS_rpc_deadline);
void Proceed(); void Proceed();
ChannelQueuePtr GetChannel(const std::string& ep); ChannelQueuePtr GetChannel(const std::string& ep);
......
...@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20, ...@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
"max grad num to send before recv parameters"); "max grad num to send before recv parameters");
DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
DEFINE_int32(communicator_send_wait_times, 5,
"times that send thread will wait if merge num does not reach "
"max_merge_var_num");
DEFINE_int32(communicator_max_merge_var_num, 20, DEFINE_int32(communicator_max_merge_var_num, 20,
"max var num to merge and send"); "max var num to merge and send");
DEFINE_bool(communicator_fake_rpc, false, DEFINE_bool(communicator_fake_rpc, false,
...@@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, ...@@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
<< FLAGS_communicator_max_send_grad_num_before_recv; << FLAGS_communicator_max_send_grad_num_before_recv;
VLOG(0) << "communicator_thread_pool_size: " VLOG(0) << "communicator_thread_pool_size: "
<< FLAGS_communicator_thread_pool_size; << FLAGS_communicator_thread_pool_size;
VLOG(0) << "communicator_send_wait_times: "
<< FLAGS_communicator_send_wait_times;
VLOG(0) << "communicator_max_merge_var_num: " VLOG(0) << "communicator_max_merge_var_num: "
<< FLAGS_communicator_max_merge_var_num; << FLAGS_communicator_max_merge_var_num;
VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
...@@ -101,20 +106,32 @@ void Communicator::SendThread() { ...@@ -101,20 +106,32 @@ void Communicator::SendThread() {
VLOG(3) << var_name << " merge and send"; VLOG(3) << var_name << " merge and send";
std::vector<std::shared_ptr<Variable>> vars; std::vector<std::shared_ptr<Variable>> vars;
size_t merged_var_num = 0; size_t merged_var_num = 0;
while (var_queue->Size() > 0 && size_t wait_times = 0;
merged_var_num < FLAGS_communicator_max_merge_var_num) { while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
vars.push_back(var_queue->Pop()); if (var_queue->Size() == 0) {
// only count the send number of the first var VLOG(3) << "wait_times -> " << wait_times;
if (var_name == send_varname_to_queue_.begin()->first) { if (wait_times >= FLAGS_communicator_send_wait_times) {
grad_num_.fetch_add(1, std::memory_order_relaxed); break;
}
std::this_thread::sleep_for(std::chrono::milliseconds(10));
wait_times++;
continue;
} else {
wait_times = 0;
vars.push_back(var_queue->Pop());
// only count the send number of the first var
if (var_name == send_varname_to_queue_.begin()->first) {
grad_num_.fetch_add(1, std::memory_order_relaxed);
}
merged_var_num++;
} }
merged_var_num++;
} }
auto before_merge = GetCurrentUS(); auto before_merge = GetCurrentUS();
MergeVars(var_name, vars, send_scope_.get()); MergeVars(var_name, vars, send_scope_.get());
auto after_merge = GetCurrentUS(); auto after_merge = GetCurrentUS();
VLOG(3) << "merge " << var_name << " use time " VLOG(3) << "merge " << merged_var_num << " " << var_name
<< after_merge - before_merge; << " use time " << after_merge - before_merge;
auto send_functor = distributed::ParameterSend<float>(); auto send_functor = distributed::ParameterSend<float>();
auto &ctx = send_varname_to_ctx_.at(var_name); auto &ctx = send_varname_to_ctx_.at(var_name);
if (!FLAGS_communicator_fake_rpc) { if (!FLAGS_communicator_fake_rpc) {
......
...@@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name, ...@@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name,
auto* out_var = scope->Var(var_name); auto* out_var = scope->Var(var_name);
if (var0->IsType<framework::LoDTensor>()) { if (var0->IsType<framework::LoDTensor>()) {
auto dims = var0->Get<framework::LoDTensor>().dims(); auto dims = var0->Get<framework::LoDTensor>().dims();
VLOG(3) << "merge " << var_name << " LoDTensor " << dims; VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
// init output tensor // init output tensor
auto* out_t = out_var->GetMutable<framework::LoDTensor>(); auto* out_t = out_var->GetMutable<framework::LoDTensor>();
......
...@@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, ...@@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_varname, const std::string& out_varname,
const std::string& table_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname,
"/sendrecv.SendRecvService/GetVariable", time_out); "/sendrecv.SendRecvService/GetVariable", table_name,
time_out);
} }
VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
...@@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( ...@@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
return _AsyncGetVar( return _AsyncGetVar(
ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname,
"/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out);
} }
VarHandlePtr GRPCClient::AsyncGetMonomerVariable( VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
...@@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable( ...@@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
const framework::Scope& scope, const std::string& var_name, const framework::Scope& scope, const std::string& var_name,
int64_t time_out) { int64_t time_out) {
return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name,
"/sendrecv.SendRecvService/GetMonomerVariable", time_out); "/sendrecv.SendRecvService/GetMonomerVariable", "",
time_out);
} }
VarHandlePtr GRPCClient::_AsyncGetVar( VarHandlePtr GRPCClient::_AsyncGetVar(
const std::string& ep, const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& method, const framework::Scope& scope, const std::string& method,
const std::string& var_name, const std::string& out_varname, const std::string& var_name, const std::string& out_varname,
const std::string& rpc_path, int64_t time_out) { const std::string& rpc_path, const std::string& table_name,
int64_t time_out) {
const platform::DeviceContext* p_ctx = &ctx; const platform::DeviceContext* p_ctx = &ctx;
const std::string ep_val = ep; const std::string ep_val = ep;
const std::string var_name_val = var_name; const std::string var_name_val = var_name;
const std::string out_varname_val = out_varname; const std::string out_varname_val = out_varname;
const std::string table_name_val = table_name;
const framework::Scope* p_scope = &scope; const framework::Scope* p_scope = &scope;
const auto ch = GetChannel(ep_val); const auto ch = GetChannel(ep_val);
GetProcessor* s = new GetProcessor(ch); GetProcessor* s = new GetProcessor(ch);
...@@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar( ...@@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
s->Prepare(h, time_out); s->Prepare(h, time_out);
framework::AsyncIO( framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method,
[var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { p_ctx, h, rpc_path, this] {
// prepare input // prepare input
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name_val); req.set_varname(var_name_val);
req.set_out_varname(out_varname_val); req.set_out_varname(out_varname_val);
req.set_trainer_id(trainer_id_); req.set_trainer_id(trainer_id_);
::grpc::ByteBuffer buf; req.set_table_name(table_name_val);
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); ::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method); platform::RecordRPCEvent record_event(method);
auto call = auto call =
s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
if (UNLIKELY(platform::IsProfileEnabled())) { if (UNLIKELY(platform::IsProfileEnabled())) {
h->Wait(); h->Wait();
} }
}); });
req_count_++; req_count_++;
......
...@@ -23,9 +23,11 @@ limitations under the License. */ ...@@ -23,9 +23,11 @@ limitations under the License. */
#include <functional> #include <functional>
#include <iostream> #include <iostream>
#include <map> #include <map>
#include <memory>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <unordered_map>
#include <vector> #include <vector>
#include "grpc++/channel.h" #include "grpc++/channel.h"
...@@ -187,6 +189,7 @@ class GRPCClient : public RPCClient { ...@@ -187,6 +189,7 @@ class GRPCClient : public RPCClient {
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_varname, const std::string& out_varname,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) override; int64_t time_out = FLAGS_rpc_deadline) override;
VarHandlePtr AsyncGetVarNoBarrier( VarHandlePtr AsyncGetVarNoBarrier(
...@@ -239,7 +242,8 @@ class GRPCClient : public RPCClient { ...@@ -239,7 +242,8 @@ class GRPCClient : public RPCClient {
const std::string& ep, const platform::DeviceContext& ctx, const std::string& ep, const platform::DeviceContext& ctx,
const framework::Scope& scope, const std::string& method, const framework::Scope& scope, const std::string& method,
const std::string& var_name, const std::string& out_varname, const std::string& var_name, const std::string& out_varname,
const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); const std::string& rpc_path, const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline);
private: private:
grpc::CompletionQueue cq_; grpc::CompletionQueue cq_;
......
...@@ -137,6 +137,7 @@ class RequestGet final : public RequestBase { ...@@ -137,6 +137,7 @@ class RequestGet final : public RequestBase {
// proc request. // proc request.
std::string varname = request_.varname(); std::string varname = request_.varname();
std::string out_varname = request_.out_varname(); std::string out_varname = request_.out_varname();
std::string table_name = request_.table_name();
int trainer_id = request_.trainer_id(); int trainer_id = request_.trainer_id();
VLOG(4) << "RequestGet " << out_varname << " from " << varname; VLOG(4) << "RequestGet " << out_varname << " from " << varname;
...@@ -145,19 +146,23 @@ class RequestGet final : public RequestBase { ...@@ -145,19 +146,23 @@ class RequestGet final : public RequestBase {
framework::Variable* invar = nullptr; framework::Variable* invar = nullptr;
framework::Variable* outvar = nullptr; framework::Variable* outvar = nullptr;
request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, tmp_scope_ = std::move(scope->NewTmpScope());
out_varname); request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar,
trainer_id, out_varname, table_name);
VLOG(1) << "before SerializeToByteBuffer";
if (outvar) { if (outvar) {
SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
&reply_); &reply_);
} }
VLOG(1) << "after SerializeToByteBuffer";
Finish(reply_, &responder_); Finish(reply_, &responder_);
} }
protected: protected:
sendrecv::VariableMessage request_; sendrecv::VariableMessage request_;
::grpc::ByteBuffer reply_; ::grpc::ByteBuffer reply_;
std::unique_ptr<framework::Scope> tmp_scope_;
ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
}; };
......
...@@ -42,27 +42,23 @@ using DDim = framework::DDim; ...@@ -42,27 +42,23 @@ using DDim = framework::DDim;
template <typename T> template <typename T>
void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx, void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
const framework::Scope &scope) { const framework::Scope &scope) {
VLOG(3) << "ParameterRecv in"; VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope(); std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &cpu_ctx = *pool.Get(platform::CPUPlace()); auto &cpu_ctx = *pool.Get(platform::CPUPlace());
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0); distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
auto *recv_var = scope.FindVar(rpc_ctx.var_name); auto *recv_var = scope.FindVar(rpc_ctx.var_name);
std::vector<framework::Tensor *> recved_tensors;
// recv all vars to local scope // recv all vars to local scope
if (recv_var->IsType<framework::LoDTensor>()) { if (recv_var->IsType<framework::LoDTensor>()) {
std::vector<distributed::VarHandlePtr> rets; std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
auto &recv_var_name = rpc_ctx.splited_var_names[i]; auto &recv_var_name = rpc_ctx.splited_var_names[i];
framework::Tensor *t = local_scope->Var(recv_var_name);
local_scope->Var(recv_var_name)->GetMutable<framework::LoDTensor>();
recved_tensors.push_back(t);
VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
*local_scope.get(), recv_var_name, *local_scope.get(), recv_var_name,
...@@ -78,23 +74,61 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx, ...@@ -78,23 +74,61 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
// concat recved tensor into one var // concat recved tensor into one var
{ {
size_t output_offset = 0; size_t output_offset = 0;
size_t row_offset = 0;
framework::Tensor *recv_tensor = framework::Tensor *recv_tensor =
recv_var->GetMutable<framework::LoDTensor>(); recv_var->GetMutable<framework::LoDTensor>();
auto dev_ctx = paddle::platform::CPUDeviceContext(); auto dev_ctx = paddle::platform::CPUDeviceContext();
int64_t recv_numel = 0; int64_t recv_numel = 0;
for (auto *in : recved_tensors) { for (auto &recv_var_name : rpc_ctx.splited_var_names) {
recv_numel += in->numel(); auto *recv_var = local_scope->FindVar(recv_var_name);
auto in_stride = framework::stride_numel(in->dims()); if (recv_var->IsType<framework::LoDTensor>()) {
auto out_stride = framework::stride_numel(recv_tensor->dims()); auto &in = recv_var->Get<framework::LoDTensor>();
StridedNumelCopyWithAxis<T>( recv_numel += in.numel();
dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride, auto in_stride = framework::stride_numel(in.dims());
in->data<T>(), in_stride, in_stride[0]); auto out_stride = framework::stride_numel(recv_tensor->dims());
output_offset += in_stride[0]; StridedNumelCopyWithAxis<T>(
dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride,
in.data<T>(), in_stride, in_stride[0]);
output_offset += in_stride[0];
} else if (recv_var->IsType<framework::SelectedRows>()) {
auto &recv_slr = recv_var->Get<framework::SelectedRows>();
auto &recv_dims = recv_tensor->dims();
int64_t width = recv_dims[1];
recv_numel += recv_slr.height() * width;
PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width);
PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size());
VLOG(3) << "recv slr " << recv_var_name << " dims "
<< recv_slr.value().dims();
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto &row_id : recv_slr.rows()) {
sstream << row_id << ", ";
}
sstream << "]";
VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " "
<< sstream.str();
}
for (auto i = 0; i < recv_slr.rows().size(); ++i) {
auto row_id = recv_slr.rows()[i] + row_offset;
PADDLE_ENFORCE_LT(row_id, recv_dims[0]);
memcpy(recv_tensor->data<T>() + row_id * width,
recv_slr.value().data<T>() + i * width, sizeof(T) * width);
}
row_offset += recv_slr.height();
} else {
PADDLE_THROW("unsupported recieved var type");
}
}
auto numel = recv_tensor->numel();
if (recv_numel != numel) {
LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel;
} }
PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); PADDLE_ENFORCE_EQ(recv_numel, numel);
} }
VLOG(3) << "ParameterRecv out"; VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
} }
template struct ParameterRecv<float>; template struct ParameterRecv<float>;
......
...@@ -47,7 +47,7 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx, ...@@ -47,7 +47,7 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
auto &cpu_ctx = *pool.Get(platform::CPUPlace()); auto &cpu_ctx = *pool.Get(platform::CPUPlace());
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>(0); distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
auto *send_var = scope.FindVar(rpc_ctx.var_name); auto *send_var = scope.FindVar(rpc_ctx.var_name);
size_t out_num = rpc_ctx.splited_var_names.size(); size_t out_num = rpc_ctx.splited_var_names.size();
......
...@@ -18,7 +18,9 @@ ...@@ -18,7 +18,9 @@
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
#include <functional> #include <functional>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -180,6 +182,10 @@ class RequestHandler { ...@@ -180,6 +182,10 @@ class RequestHandler {
grad_to_prepared_ctx_ = g; grad_to_prepared_ctx_ = g;
} }
void SetSparseGradToParam(std::unordered_map<std::string, std::string>* g) {
sparse_grad_to_param_ = g;
}
void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
// Get attributes. // Get attributes.
...@@ -228,6 +234,7 @@ class RequestHandler { ...@@ -228,6 +234,7 @@ class RequestHandler {
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>>* std::shared_ptr<framework::ExecutorPrepareContext>>*
grad_to_prepared_ctx_; grad_to_prepared_ctx_;
std::unordered_map<std::string, std::string>* sparse_grad_to_param_;
RPCServer* rpc_server_; RPCServer* rpc_server_;
}; };
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/piece.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
...@@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname, ...@@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname,
"async mode should not recv BATCH_BARRIER_MESSAGE or " "async mode should not recv BATCH_BARRIER_MESSAGE or "
"COMPLETE_MESSAGE"); "COMPLETE_MESSAGE");
} }
if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) {
auto& grad_slr =
scope->FindVar(varname)->Get<framework::SelectedRows>();
AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname,
grad_slr.rows());
}
executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
scope); scope);
return true; return true;
...@@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname, ...@@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname,
const int trainer_id, const int trainer_id,
const std::string& out_var_name, const std::string& out_var_name,
const std::string& table_name) { const std::string& table_name) {
VLOG(4) << "RequestGetHandler:" << varname VLOG(3) << "RequestGetHandler:" << varname
<< " out_var_name: " << out_var_name; << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id
<< " table_name: " << table_name;
if (sync_mode_) { if (sync_mode_) {
if (varname == FETCH_BARRIER_MESSAGE) { if (varname == FETCH_BARRIER_MESSAGE) {
...@@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname, ...@@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname,
VLOG(3) << "copying " << varname << " to " << param_bak_name; VLOG(3) << "copying " << varname << " to " << param_bak_name;
framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
} }
*outvar = scope_->FindVar(varname); if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
!table_name.empty()) {
std::vector<int64_t> updated_rows;
AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
varname, trainer_id, &updated_rows);
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& row_id : updated_rows) {
sstream << row_id << ", ";
}
sstream << "]";
VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
<< sstream.str();
}
auto& origin_tensor =
scope_->FindVar(varname)->Get<framework::LoDTensor>();
auto* origin_tensor_data = origin_tensor.data<float>();
auto& dims = origin_tensor.dims();
*outvar = scope->Var();
auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
out_slr->set_rows(updated_rows);
out_slr->set_height(dims[0]);
auto out_dims = framework::make_ddim(
{static_cast<int64_t>(updated_rows.size()), dims[1]});
auto* data = out_slr->mutable_value()->mutable_data<float>(
out_dims, origin_tensor.place());
auto width = dims[1];
for (auto i = 0; i < updated_rows.size(); ++i) {
PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
sizeof(float) * width);
}
} else {
*outvar = scope_->FindVar(varname);
}
} }
} }
return true; return true;
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
#include <memory>
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
...@@ -44,6 +45,7 @@ class RPCClient { ...@@ -44,6 +45,7 @@ class RPCClient {
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_varname, const std::string& out_varname,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncGetVarNoBarrier( virtual VarHandlePtr AsyncGetVarNoBarrier(
...@@ -96,6 +98,7 @@ class RPCClient { ...@@ -96,6 +98,7 @@ class RPCClient {
// Init is called by GetInstance. // Init is called by GetInstance.
template <typename T> template <typename T>
static void Init(int trainer_id) { static void Init(int trainer_id) {
VLOG(0) << "init rpc client with trainer_id " << trainer_id;
trainer_id_ = trainer_id; trainer_id_ = trainer_id;
if (rpc_client_.get() == nullptr) { if (rpc_client_.get() == nullptr) {
rpc_client_.reset(new T()); rpc_client_.reset(new T());
......
...@@ -27,23 +27,26 @@ struct RpcContext { ...@@ -27,23 +27,26 @@ struct RpcContext {
RpcContext(const std::string &name, const std::vector<std::string> &names, RpcContext(const std::string &name, const std::vector<std::string> &names,
const std::vector<std::string> &emap, const std::vector<std::string> &emap,
const std::vector<int64_t> &sections) const std::vector<int64_t> &sections, int id)
: var_name(name), : var_name(name),
splited_var_names(names), splited_var_names(names),
epmap(emap), epmap(emap),
height_sections(sections) {} height_sections(sections),
trainer_id(id) {}
RpcContext(const RpcContext &ctx) { RpcContext(const RpcContext &ctx) {
var_name = ctx.var_name; var_name = ctx.var_name;
splited_var_names = ctx.splited_var_names; splited_var_names = ctx.splited_var_names;
epmap = ctx.epmap; epmap = ctx.epmap;
height_sections = ctx.height_sections; height_sections = ctx.height_sections;
trainer_id = ctx.trainer_id;
} }
std::string var_name; std::string var_name;
std::vector<std::string> splited_var_names; std::vector<std::string> splited_var_names;
std::vector<std::string> epmap; std::vector<std::string> epmap;
std::vector<int64_t> height_sections; std::vector<int64_t> height_sections;
int trainer_id;
}; };
inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) {
......
...@@ -2,9 +2,9 @@ include(operators) ...@@ -2,9 +2,9 @@ include(operators)
set(DISTRIBUTE_DEPS "") set(DISTRIBUTE_DEPS "")
if(WITH_GRPC) if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else() else()
set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA) if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs) find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
......
...@@ -24,8 +24,10 @@ limitations under the License. */ ...@@ -24,8 +24,10 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
...@@ -292,6 +294,8 @@ static void FillRequestCtx( ...@@ -292,6 +294,8 @@ static void FillRequestCtx(
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
std::unordered_map<std::string, std::string>
*sparse_grad_name_to_param_name,
std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx, std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
distributed::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
...@@ -299,6 +303,7 @@ static void FillRequestCtx( ...@@ -299,6 +303,7 @@ static void FillRequestCtx(
h->SetExecutor(executor); h->SetExecutor(executor);
h->SetProgram(program); h->SetProgram(program);
h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetPrefetchPreparedCtx(prefetch_ctx);
h->SetSparseGradToParam(sparse_grad_name_to_param_name);
h->SetRPCServer(rpc_server); h->SetRPCServer(rpc_server);
h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
} }
...@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
} }
auto f = // parse attr of kSparseGradToParam sparse_grad_name -> param_name
std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
&executor, program, &prefetch_var_name_to_prepared_ctx, auto sparse_grad_name_to_param_name_str =
ckpt_pre_context, rpc_service_.get()); Attr<std::vector<std::string>>(kSparseGradToParam);
for (const auto &sparse_grad_name_and_param_name :
sparse_grad_name_to_param_name_str) {
std::vector<std::string> pieces;
split(sparse_grad_name_and_param_name, ':', &pieces);
PADDLE_ENFORCE_EQ(pieces.size(), 2);
VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
<< ", param_name = " << pieces[1];
sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
}
auto f = std::bind(
FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor,
program, &prefetch_var_name_to_prepared_ctx,
&sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get());
f(request_send_handler_.get()); f(request_send_handler_.get());
f(request_get_handler_.get()); f(request_get_handler_.get());
...@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
prefetch_block_id_list, checkpoint_block_id); prefetch_block_id_list, checkpoint_block_id);
} else { } else {
distributed::AsyncSparseParamUpdateRecorder::Init(
fan_in, sparse_grad_name_to_param_name);
RunAsyncLoop(&executor, program, &recv_scope); RunAsyncLoop(&executor, program, &recv_scope);
} }
} }
...@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId, AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
"prefetch blocks to run on server side.") "prefetch blocks to run on server side.")
.SetDefault({}); .SetDefault({});
AddAttr<std::vector<std::string>>(
kSparseGradToParam,
"sparse grad name to param name. like: 'emb@Grad:emb'")
.SetDefault({});
AddAttr<int>("Fanin", "How many clients send to this server.") AddAttr<int>("Fanin", "How many clients send to this server.")
.SetDefault(1); .SetDefault(1);
AddAttr<int>(kCheckpointBlockId, AddAttr<int>(kCheckpointBlockId,
......
...@@ -16,8 +16,10 @@ limitations under the License. */ ...@@ -16,8 +16,10 @@ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include <atomic> #include <atomic>
#include <memory>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -35,6 +37,7 @@ namespace operators { ...@@ -35,6 +37,7 @@ namespace operators {
constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
constexpr char kCheckpointBlockId[] = "checkpint_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id";
constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
void RunServer(std::shared_ptr<distributed::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
......
...@@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase { ...@@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place); auto &ctx = *pool.Get(place);
auto trainer_id = Attr<int>("trainer_id");
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>( distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
Attr<int>("trainer_id"));
std::vector<std::string> recv_varnames = std::vector<std::string> recv_varnames =
Attr<std::vector<std::string>>("recv_varnames"); Attr<std::vector<std::string>>("recv_varnames");
if (recv_varnames.size() > 0) { if (recv_varnames.size() > 0) {
auto recv_functor = distributed::ParameterRecv<float>(); auto recv_functor = distributed::ParameterRecv<float>();
auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {},
trainer_id);
recv_functor(rpc_ctx, scope); recv_functor(rpc_ctx, scope);
} else { } else {
if (with_barrier) { if (with_barrier) {
......
...@@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase { ...@@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase {
auto epmap = Attr<std::vector<std::string>>("epmap"); auto epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_mode"); int sync_send = Attr<int>("sync_mode");
auto trainer_id = Attr<int>("trainer_id");
auto send_varnames = Attr<std::vector<std::string>>("send_varnames"); auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
auto height_sections = Attr<std::vector<int64_t>>("sections"); auto height_sections = Attr<std::vector<int64_t>>("sections");
...@@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase { ...@@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase {
if (distributed::Communicator::GetInstance() == nullptr) { if (distributed::Communicator::GetInstance() == nullptr) {
auto send_functor = distributed::ParameterSend<float>(); auto send_functor = distributed::ParameterSend<float>();
auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap,
height_sections); height_sections, trainer_id);
send_functor(rpc_ctx, scope, true); send_functor(rpc_ctx, scope, true);
} else { } else {
distributed::Communicator::GetInstance()->Send(ins[0], scope); distributed::Communicator::GetInstance()->Send(ins[0], scope);
...@@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase { ...@@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase {
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>( distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
Attr<int>("trainer_id"));
std::vector<distributed::VarHandlePtr> rets; std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
......
...@@ -31,14 +31,16 @@ class SplitByrefOp : public framework::OperatorWithKernel { ...@@ -31,14 +31,16 @@ class SplitByrefOp : public framework::OperatorWithKernel {
auto in_dims = ctx->GetInputDim("X"); auto in_dims = ctx->GetInputDim("X");
auto outs_names = ctx->Outputs("Out"); auto outs_names = ctx->Outputs("Out");
size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num")); size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
std::vector<int> sections = static_cast<std::vector<int>>( auto sections = ctx->Attrs().Get<std::vector<int>>("sections");
ctx->Attrs().Get<std::vector<int>>("sections"));
const size_t outs_number = outs_names.size(); const size_t outs_number = outs_names.size();
std::vector<framework::DDim> outs_dims; std::vector<framework::DDim> outs_dims;
outs_dims.reserve(outs_number); outs_dims.reserve(outs_number);
if (num > 0) { if (num > 0) {
int64_t in_axis_dim = in_dims[0]; int64_t in_axis_dim = 0;
if (ctx->IsRuntime()) {
in_axis_dim = in_dims[0];
}
PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
"tensor split does not result" "tensor split does not result"
" in an equal division"); " in an equal division");
......
...@@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
int g_find_max; int g_find_max;
memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max, memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
sizeof(int), 0); sizeof(int), ctx.stream());
ctx.Wait();
if (g_find_max) { if (g_find_max) {
int len; int len;
memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data, memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
sizeof(int), 0); sizeof(int), ctx.stream());
ctx.Wait();
FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len, FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
out_scale_data); out_scale_data);
} }
...@@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> { ...@@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace()); const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
T accum; T accum;
memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
sizeof(T), 0);
T state; T state;
memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
sizeof(T), 0);
T scale; T scale;
memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
sizeof(T), ctx.stream());
memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
0); ctx.stream());
ctx.Wait();
state = rate * state + 1; state = rate * state + 1;
accum = rate * accum + scale; accum = rate * accum + scale;
scale = accum / state; scale = accum / state;
memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place), memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
platform::CPUPlace(), &accum, sizeof(T), 0); platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place), memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
platform::CPUPlace(), &state, sizeof(T), 0); platform::CPUPlace(), &state, sizeof(T), ctx.stream());
memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place), memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
platform::CPUPlace(), &scale, sizeof(T), 0); platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
ctx.Wait();
} }
}; };
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/grid_sampler_op.h" #include "paddle/fluid/operators/grid_sampler_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h" #include "paddle/fluid/platform/cudnn_helper.h"
...@@ -40,10 +41,12 @@ class GridSampleOp : public framework::OperatorWithKernel { ...@@ -40,10 +41,12 @@ class GridSampleOp : public framework::OperatorWithKernel {
"Input(X) of GridSampleOp should be 4-D Tensor."); "Input(X) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims.size() == 4, PADDLE_ENFORCE(grid_dims.size() == 4,
"Input(Grid) of GridSampleOp should be 4-D Tensor."); "Input(Grid) of GridSampleOp should be 4-D Tensor.");
PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2."); if (ctx->IsRuntime() || grid_dims[3] > 0) {
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0], PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
"Input(X) and Input(Grid) dims[0] should be equal."); }
if (ctx->IsRuntime()) { if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
"Input(X) and Input(Grid) dims[0] should be equal.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
grid_dims[1], x_dims[2], grid_dims[1], x_dims[2],
"Input(X) dims[2] and Input(Grid) dims[1] should be equal."); "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
......
...@@ -238,6 +238,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -238,6 +238,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
zero(dev_ctx, w_grad, static_cast<T>(0.0)); zero(dev_ctx, w_grad, static_cast<T>(0.0));
bit_code->MulGradWeight(pre_out_grad, w_grad, in); bit_code->MulGradWeight(pre_out_grad, w_grad, in);
} else { } else {
PADDLE_ENFORCE(path != nullptr,
"Sparse mode should not be used without custom tree!");
framework::Vector<int64_t> real_rows = PathToRows(*path); framework::Vector<int64_t> real_rows = PathToRows(*path);
auto* w_grad = auto* w_grad =
ctx.Output<framework::SelectedRows>(framework::GradVarName("W")); ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
......
...@@ -45,9 +45,14 @@ class InterpolateOp : public framework::OperatorWithKernel { ...@@ -45,9 +45,14 @@ class InterpolateOp : public framework::OperatorWithKernel {
// round down // round down
out_h = static_cast<int>(dim_x[2] * scale); out_h = static_cast<int>(dim_x[2] * scale);
out_w = static_cast<int>(dim_x[3] * scale); out_w = static_cast<int>(dim_x[3] * scale);
// protect when input shape is -1
out_h = out_h > 0 ? out_h : -1;
out_w = out_w > 0 ? out_w : -1;
} else { } else {
out_h = ctx->Attrs().Get<int>("out_h"); out_h = ctx->Attrs().Get<int>("out_h");
out_w = ctx->Attrs().Get<int>("out_w"); out_w = ctx->Attrs().Get<int>("out_w");
PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0.");
PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0.");
} }
if (ctx->HasInput("OutSize") && ctx->IsRuntime()) { if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
...@@ -58,6 +63,7 @@ class InterpolateOp : public framework::OperatorWithKernel { ...@@ -58,6 +63,7 @@ class InterpolateOp : public framework::OperatorWithKernel {
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
return; return;
} }
std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w}); std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
} }
......
...@@ -35,8 +35,10 @@ class KLDivLossOp : public framework::OperatorWithKernel { ...@@ -35,8 +35,10 @@ class KLDivLossOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(), PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
"Input(X) rank and Input(Target) rank should be same."); "Input(X) rank and Input(Target) rank should be same.");
for (int i = 0; i < dim_x.size(); i++) { for (int i = 0; i < dim_x.size(); i++) {
PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i], if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
"Input(X) and Input(Target) should in same shape."); PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
"Input(X) and Input(Target) should in same shape.");
}
} }
auto reduction = ctx->Attrs().Get<std::string>("reduction"); auto reduction = ctx->Attrs().Get<std::string>("reduction");
......
...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
if (!ctx->HasInput("Y")) { if (!ctx->HasInput("Y")) {
auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod"); auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
PADDLE_ENFORCE_GT(level0.size(), 1, PADDLE_ENFORCE_GT(level0.size(), 0,
"If Input(Y) not provided, the target lod should be " "If Input(Y) not provided, the target lod should be "
"specified by attribute `target_lod`."); "specified by attribute `target_lod`.");
} else { } else if (ctx->IsRuntime()) {
ctx->ShareLoD("Y", "Out"); ctx->ShareLoD("Y", "Out");
} }
...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel {
} }
}; };
class LoDResetOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
auto x_var_name = ctx->Input("X").front();
auto out_var_name = ctx->Output("Out").front();
if (ctx->HasInput("Y")) {
auto y_var_name = ctx->Input("Y").front();
auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1);
ctx->SetLoDLevel(out_var_name, y_lod_level);
} else {
ctx->SetLoDLevel(out_var_name, 1);
}
ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name));
ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR);
}
};
class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, ...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
ops::LoDResetGradDescMaker); ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference);
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
ops::LoDResetGradNoNeedBufferVarInference); ops::LoDResetGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>, lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>, ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
"Target LoD should be a vector end with the " "Target LoD should be a vector end with the "
"first dimension of Input(X)."); "first dimension of Input(X).");
for (size_t i = 0; i < level0.size() - 1; ++i) { for (size_t i = 0; i < level0.size() - 1; ++i) {
PADDLE_ENFORCE(level0[i + 1] > level0[i], PADDLE_ENFORCE(level0[i + 1] >= level0[i],
"Target LoD should be an ascending vector."); "Target LoD should be an ascending vector.");
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lstmp_op.h" #include "paddle/fluid/operators/lstmp_op.h"
#include <memory>
#include <string> #include <string>
namespace paddle { namespace paddle {
...@@ -45,6 +46,7 @@ class LSTMPOp : public framework::OperatorWithKernel { ...@@ -45,6 +46,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
"Output(BatchHidden) of LSTMP operator should not be null."); "Output(BatchHidden) of LSTMP operator should not be null.");
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dims.size(), 2, PADDLE_ENFORCE_EQ(in_dims.size(), 2,
"Input(X)'s rank of LSTMP operator must be 2."); "Input(X)'s rank of LSTMP operator must be 2.");
...@@ -269,13 +271,47 @@ Users can choose to use fully-connected operator before LSTMP operator. ...@@ -269,13 +271,47 @@ Users can choose to use fully-connected operator before LSTMP operator.
} }
}; };
class LSTMPGradMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDesc();
grad_op->SetType("lstmp_grad");
grad_op->SetInput("Weight", Input("Weight"));
grad_op->SetInput("ProjWeight", Input("ProjWeight"));
grad_op->SetInput("Bias", Input("Bias"));
grad_op->SetInput("Projection", Output("Projection"));
grad_op->SetInput("Cell", Output("Cell"));
grad_op->SetInput("BatchGate", Output("BatchGate"));
grad_op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
grad_op->SetInput("BatchHidden", Output("BatchHidden"));
grad_op->SetInput("H0", Input("H0"));
grad_op->SetInput("C0", Input("C0"));
grad_op->SetInput(framework::GradVarName("Projection"),
OutputGrad("Projection"));
grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
grad_op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
grad_op->SetOutput(framework::GradVarName("ProjWeight"),
InputGrad("ProjWeight"));
grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
grad_op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
grad_op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
grad_op->SetAttrMap(Attrs());
return std::unique_ptr<framework::OpDesc>(grad_op);
}
};
class LSTMPGradOp : public framework::OperatorWithKernel { class LSTMPGradOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"),
"Input(Input) of LSTMP operator should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Projection"), PADDLE_ENFORCE(ctx->HasInput("Projection"),
"Input(Projection) of LSTMP operator should not be null."); "Input(Projection) of LSTMP operator should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Cell"), PADDLE_ENFORCE(ctx->HasInput("Cell"),
...@@ -298,7 +334,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel { ...@@ -298,7 +334,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
}; };
SetOutGradDim("Input"); ctx->SetOutputDim(framework::GradVarName("Input"),
ctx->GetInputDim("BatchGate"));
SetOutGradDim("Weight"); SetOutGradDim("Weight");
SetOutGradDim("ProjWeight"); SetOutGradDim("ProjWeight");
SetOutGradDim("Bias"); SetOutGradDim("Bias");
...@@ -310,7 +347,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel { ...@@ -310,7 +347,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
framework::OpKernelType GetExpectedKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context()); ctx.Input<framework::LoDTensor>("BatchGate")->type(),
ctx.device_context());
} }
}; };
...@@ -318,8 +356,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel { ...@@ -318,8 +356,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, ops::LSTMPGradMaker);
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp); REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>, lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
......
...@@ -267,7 +267,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -267,7 +267,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
} }
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<LoDTensor>("Input");
auto* weight = ctx.Input<Tensor>("Weight"); auto* weight = ctx.Input<Tensor>("Weight");
auto* proj_weight = ctx.Input<Tensor>("ProjWeight"); auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
auto* bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
...@@ -323,7 +322,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -323,7 +322,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace()); ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
} }
auto in_dims = input->dims(); // batch_gate dims equal to input dims
auto in_dims = batch_gate->dims();
auto out_dims = cell_out->dims(); auto out_dims = cell_out->dims();
framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]}); framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
int frame_size = static_cast<int>(in_dims[1] / 4); int frame_size = static_cast<int>(in_dims[1] / 4);
......
...@@ -296,6 +296,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -296,6 +296,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
auto input_height = has_value_input->height(); auto input_height = has_value_input->height();
framework::SelectedRows& out = *output; framework::SelectedRows& out = *output;
std::set<int64_t> merged_row_set; std::set<int64_t> merged_row_set;
size_t row_num = 0;
for (auto* input : inputs) { for (auto* input : inputs) {
if (input->rows().size() == 0) { if (input->rows().size() == 0) {
continue; continue;
...@@ -305,42 +306,71 @@ struct MergeAdd<platform::CPUDeviceContext, T> { ...@@ -305,42 +306,71 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
"dimension except for the first one"); "dimension except for the first one");
PADDLE_ENFORCE_EQ(input_height, input->height(), PADDLE_ENFORCE_EQ(input_height, input->height(),
"all input should have same height"); "all input should have same height");
row_num += input->rows().size();
merged_row_set.insert(input->rows().begin(), input->rows().end()); merged_row_set.insert(input->rows().begin(), input->rows().end());
} }
std::vector<int64_t> merge_rows(merged_row_set.begin(),
merged_row_set.end());
if (sorted_result) {
std::sort(merge_rows.begin(), merge_rows.end());
}
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
}
out.set_rows(merge_rows);
out.set_height(input_height); out.set_height(input_height);
out.mutable_value()->mutable_data<T>( out.mutable_value()->mutable_data<T>(
framework::make_ddim( framework::make_ddim(
{static_cast<int64_t>(merge_rows.size()), input_width}), {static_cast<int64_t>(merged_row_set.size()), input_width}),
context.GetPlace()); context.GetPlace());
auto* out_data = out.mutable_value()->data<T>();
math::SetConstant<platform::CPUDeviceContext, T> constant_functor; if (merged_row_set.size() == row_num && !sorted_result) {
constant_functor(context, out.mutable_value(), 0.0); // no duplicated ids, just concat the result together
std::vector<int64_t> merge_rows;
merge_rows.reserve(row_num);
// concat rows
for (auto* in : inputs) {
merge_rows.insert(merge_rows.end(), in->rows().begin(),
in->rows().end());
}
out.set_rows(merge_rows);
auto in_place = inputs[0]->place();
auto out_place = out.place();
int64_t copied_numel = 0;
for (auto* in : inputs) {
auto* in_data = in->value().data<T>();
auto in_numel = in->value().numel();
memory::Copy(boost::get<platform::CPUPlace>(out_place),
out_data + copied_numel,
boost::get<platform::CPUPlace>(in_place), in_data,
in_numel * sizeof(T));
copied_numel += in_numel;
}
} else {
std::vector<int64_t> merge_rows(merged_row_set.begin(),
merged_row_set.end());
auto* out_data = out.mutable_value()->data<T>(); if (sorted_result) {
std::sort(merge_rows.begin(), merge_rows.end());
}
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); out.set_rows(merge_rows);
for (auto* input : inputs) {
if (input->rows().size() == 0) { math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
continue; constant_functor(context, out.mutable_value(), 0.0);
std::unordered_map<int64_t, size_t> rows_to_id;
for (size_t i = 0; i < merge_rows.size(); ++i) {
rows_to_id[merge_rows[i]] = i;
} }
auto* input_data = input->value().data<T>();
auto& input_rows = input->rows(); auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
for (auto* input : inputs) {
for (size_t i = 0; i < input_rows.size(); i++) { if (input->rows().size() == 0) {
size_t out_i = rows_to_id[input_rows[i]]; continue;
elementwise_add_to<platform::CPUDeviceContext, T>( }
context, &blas, static_cast<size_t>(input_width), auto* input_data = input->value().data<T>();
&input_data[i * input_width], &out_data[out_i * input_width]); auto& input_rows = input->rows();
for (size_t i = 0; i < input_rows.size(); i++) {
size_t out_i = rows_to_id[input_rows[i]];
elementwise_add_to<platform::CPUDeviceContext, T>(
context, &blas, static_cast<size_t>(input_width),
&input_data[i * input_width], &out_data[out_i * input_width]);
}
} }
} }
} }
......
...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/math/selected_rows_functor.h"
#include <memory>
#include <vector> #include <vector>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add) {
...@@ -360,6 +363,69 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { ...@@ -360,6 +363,69 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
} }
} }
TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place);
paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
float>
set_const;
int64_t height = 10;
int64_t row_numel = 8;
std::vector<int64_t> rows1{1, 3, 5, 7, 9};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
new paddle::framework::SelectedRows(rows1, height)};
auto* in1_value = selected_rows1->mutable_value();
in1_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows1.size()), row_numel}),
cpu_place);
set_const(ctx, in1_value, 1.0);
std::vector<int64_t> rows2{0, 2, 4, 6, 8};
std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
new paddle::framework::SelectedRows(rows2, height)};
auto* in2_value = selected_rows2->mutable_value();
in2_value->mutable_data<float>(
paddle::framework::make_ddim(
{static_cast<int64_t>(rows2.size()), row_numel}),
cpu_place);
set_const(ctx, in2_value, 2.0);
std::unique_ptr<paddle::framework::SelectedRows> output{
new paddle::framework::SelectedRows()};
output->set_height(height);
paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
float>
merge_add_functor;
std::vector<const paddle::framework::SelectedRows*> inputs;
inputs.push_back(selected_rows1.get());
inputs.push_back(selected_rows2.get());
merge_add_functor(ctx, inputs, output.get());
EXPECT_EQ(output->height(), height);
EXPECT_EQ(output->value().dims(),
paddle::framework::make_ddim({10, row_numel}));
std::vector<int64_t> ret_rows{1, 3, 5, 7, 9, 0, 2, 4, 6, 8};
EXPECT_EQ(output->rows(), ret_rows);
auto* out_data = output->value().data<float>();
for (size_t i = 0; i < ret_rows.size(); ++i) {
float data_value = 0;
if (i < 5) {
data_value = 1.0;
} else {
data_value = 2.0;
}
for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
EXPECT_EQ(out_data[i * row_numel + j], data_value);
}
}
}
TEST(selected_rows_functor, cpu_sum_to) { TEST(selected_rows_functor, cpu_sum_to) {
paddle::platform::CPUPlace cpu_place; paddle::platform::CPUPlace cpu_place;
paddle::platform::CPUDeviceContext ctx(cpu_place); paddle::platform::CPUDeviceContext ctx(cpu_place);
......
...@@ -164,7 +164,9 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase { ...@@ -164,7 +164,9 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
auto mask_dim = context->GetInputDim("Mask"); auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2); PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
PADDLE_ENFORCE_EQ(mask_dim[1], 1); if (context->IsRuntime() || mask_dim[1] > 0) {
PADDLE_ENFORCE_EQ(mask_dim[1], 1);
}
context->SetOutputDim("Out", context->GetInputDim("InTrue")); context->SetOutputDim("Out", context->GetInputDim("InTrue"));
} }
......
...@@ -39,13 +39,9 @@ struct bn_type_traits { ...@@ -39,13 +39,9 @@ struct bn_type_traits {
class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
public: public:
BatchNormMKLDNNHandler( BatchNormMKLDNNHandler(const platform::MKLDNNDeviceContext &dev_ctx,
std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd, mkldnn::engine engine, const std::string &base_key)
const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine, : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
const std::string &base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {
batch_norm_pd_ = batch_norm_pd;
}
std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) { std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) {
return this->AcquireMemoryFromPrimitive( return this->AcquireMemoryFromPrimitive(
...@@ -62,6 +58,26 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -62,6 +58,26 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p"); batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
} }
std::shared_ptr<batch_norm_fwd::primitive_desc>
AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc,
const mkldnn::engine &engine) {
const std::string key_batch_norm_fwd_pd = key_ + "@bn_fwd_pd";
auto batch_norm_pd =
std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
dev_ctx_.GetBlob(key_batch_norm_fwd_pd));
if (batch_norm_pd == nullptr) {
batch_norm_pd_.reset(
new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine));
dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_);
} else {
batch_norm_pd_ = batch_norm_pd;
is_reusing_ = true;
}
return batch_norm_pd_;
}
std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd( std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd(
std::shared_ptr<memory> src_memory, std::shared_ptr<memory> src_memory,
std::shared_ptr<memory> scaleshift_memory, std::shared_ptr<memory> scaleshift_memory,
...@@ -213,7 +229,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -213,7 +229,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::string key = BatchNormMKLDNNHandler::GetHash( const std::string key = BatchNormMKLDNNHandler::GetHash(
src_tz, epsilon, flags, global_stats, input_format, src_tz, epsilon, flags, global_stats, input_format,
ctx.op().Output("SavedMean")); ctx.op().Output("SavedMean"));
const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; BatchNormMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
auto user_src_md = platform::MKLDNNMemDesc( auto user_src_md = platform::MKLDNNMemDesc(
{src_tz}, platform::MKLDNNGetDataType<T>(), input_format); {src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
...@@ -222,13 +238,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -222,13 +238,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>; using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
auto batch_norm_fwd_desc = auto batch_norm_fwd_desc =
bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags}; bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags};
auto batch_norm_fwd_pd = std::make_shared<batch_norm_fwd::primitive_desc>(
batch_norm_fwd_desc, mkldnn_engine);
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, auto batch_norm_fwd_pd = handler.AcquireBatchNormPrimitiveDescriptor(
key); batch_norm_fwd_desc, mkldnn_engine);
auto src_memory = auto src_memory =
handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
......
...@@ -144,7 +144,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -144,7 +144,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::string key = platform::ConvMKLDNNHandler::GetHash( const std::string key = platform::ConvMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups, src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Input("Input") + ctx.op().Input("Filter")); ctx.op().Input("Input") + ctx.op().Input("Filter"));
const std::string key_conv_pd = key + "@conv_pd";
std::vector<primitive> pipeline; std::vector<primitive> pipeline;
...@@ -183,6 +182,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -183,6 +182,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
// create a conv primitive descriptor and save it for usage in backward // create a conv primitive descriptor and save it for usage in backward
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd; std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
...@@ -191,18 +192,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -191,18 +192,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
conv_pd = ConvFwdPrimitiveDesc( conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fuse_residual_conn, fwd_prop_kind); fuse_relu, fuse_residual_conn, fwd_prop_kind);
} else { } else {
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
paddings, mkldnn_engine, fuse_relu, src_md, weights_md, boost::none, dst_md, strides, paddings,
fuse_residual_conn, fwd_prop_kind); mkldnn_engine, fuse_relu, fuse_residual_conn, fwd_prop_kind);
} }
// Save conv_pd/src_memory/weights_memory for backward pass
if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights) // create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p = auto user_src_memory_p =
...@@ -633,31 +630,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -633,31 +630,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
} }
private: private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
bool fuse_residual_conn) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
// Fusion with Elementwise layer relies on adding a sum post-operation with
// the scale parameter. It is assumed that when fuse_residual_connection is
// true, the output tensor contains the data coming from residual
// connection. The result of this post_op is:
// Output = scale * Output + Conv_Out.
if (fuse_residual_conn) {
post_operations.append_sum(1.0f);
}
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
mkldnn::primitive_attr CreatePostOps( mkldnn::primitive_attr CreatePostOps(
bool fuse_relu, bool fuse_residual_conn, bool fuse_relu, bool fuse_residual_conn,
const std::vector<float> output_shift_scale, float sum_scale) const { const std::vector<float> output_shift_scale, float sum_scale) const {
...@@ -679,30 +651,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -679,30 +651,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return conv_attr; return conv_attr;
} }
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = strides;
memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides, const memory::desc& dst, const std::vector<int>& strides,
...@@ -731,31 +679,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -731,31 +679,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
p_conv_pd); p_conv_pd);
} }
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) const {
memory::dims stride_dims = strides;
memory::dims padding_dims = paddings;
auto conv_desc = mkldnn::convolution_forward::desc(
fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc> std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst, const memory::desc& bias, const memory::desc& dst,
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "boost/optional.hpp"
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
...@@ -124,7 +125,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -124,7 +125,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash( const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash(
src_tz, weights_tz, strides, paddings, dilations, groups, src_tz, weights_tz, strides, paddings, dilations, groups,
ctx.op().Output("Output")); ctx.op().Output("Output"));
const std::string key_conv_transpose_pd = key + "@conv_transpose_pd";
std::vector<mkldnn::primitive> pipeline; std::vector<mkldnn::primitive> pipeline;
...@@ -153,6 +153,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -153,6 +153,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dst_md = platform::MKLDNNMemDesc( auto dst_md = platform::MKLDNNMemDesc(
dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format); dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
platform::ConvTransposeMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
// create a deconv(conv transpose) primitive descriptor and save it for // create a deconv(conv transpose) primitive descriptor and save it for
// usage in backward // usage in backward
std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc> std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
...@@ -163,19 +164,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -163,19 +164,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
bias_tz = paddle::framework::vectorize2int(bias->dims()); bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc( auto bias_md = platform::MKLDNNMemDesc(
bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x); bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine, src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
fuse_relu, fwd_prop_kind); fuse_relu, false, fwd_prop_kind);
} else { } else {
conv_transpose_pd = ConvTransposeFwdPrimitiveDesc( conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
src_md, weights_md, dst_md, strides, paddings, mkldnn_engine, src_md, weights_md, boost::none, dst_md, strides, paddings,
fuse_relu, fwd_prop_kind); mkldnn_engine, fuse_relu, false, fwd_prop_kind);
} }
// Save conv_pd/src_memory/weights_memory for backward pass
if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd);
platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx,
mkldnn_engine, key);
// create mkldnn memory from input tensors (data/weights) // create mkldnn memory from input tensors (data/weights)
auto user_src_memory_p = handler.AcquireSrcMemory( auto user_src_memory_p = handler.AcquireSrcMemory(
...@@ -224,70 +220,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> { ...@@ -224,70 +220,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
output->set_layout(DataLayout::kMKLDNN); output->set_layout(DataLayout::kMKLDNN);
output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
} }
private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
ConvTransposeFwdPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine,
const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto deconv_desc = mkldnn::deconvolution_forward::desc(
fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
auto p_conv_transpose_pd =
new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
deconv_attr, engine);
return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
p_conv_transpose_pd);
}
std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
ConvTransposeFwdPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst,
const std::vector<int>& strides, const std::vector<int>& paddings,
const mkldnn::engine& engine, const bool fuse_relu,
mkldnn::prop_kind fwd_prop_kind) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto deconv_desc = mkldnn::deconvolution_forward::desc(
fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
auto p_conv_transpose_pd =
new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
deconv_attr, engine);
return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
p_conv_transpose_pd);
}
}; };
} // namespace operators } // namespace operators
......
...@@ -34,12 +34,9 @@ using platform::to_void_cast; ...@@ -34,12 +34,9 @@ using platform::to_void_cast;
class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
public: public:
SoftmaxMKLDNNHandler( SoftmaxMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd, mkldnn::engine engine, const std::string& base_key)
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
softmax_pd_(softmax_pd) {}
SoftmaxMKLDNNHandler( SoftmaxMKLDNNHandler(
std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd, std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
...@@ -54,6 +51,26 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler { ...@@ -54,6 +51,26 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
key_ += "-BWD"; key_ += "-BWD";
} }
std::shared_ptr<softmax_forward::primitive_desc>
AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc,
const mkldnn::engine& engine) {
const std::string key_softmax_pd = key_ + "@softmax_pd";
auto softmax_pd = std::static_pointer_cast<softmax_forward::primitive_desc>(
dev_ctx_.GetBlob(key_softmax_pd));
if (softmax_pd == nullptr) {
softmax_pd_.reset(
new softmax_forward::primitive_desc(softmax_desc, engine));
dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_);
} else {
softmax_pd_ = softmax_pd;
is_reusing_ = true;
}
return softmax_pd_;
}
std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax( std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
std::shared_ptr<mkldnn::memory> dst_memory_p, std::shared_ptr<mkldnn::memory> dst_memory_p,
std::shared_ptr<mkldnn::memory> src_memory_p) { std::shared_ptr<mkldnn::memory> src_memory_p) {
...@@ -138,19 +155,18 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> { ...@@ -138,19 +155,18 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
// Generate keys for storing/retriving primitives for this operator // Generate keys for storing/retriving primitives for this operator
const std::string key = const std::string key =
platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out")); platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
const std::string key_softmax_pd = key + "@softmax_pd";
SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
// Currently only NC data format is supported // Currently only NC data format is supported
auto softmax_md = MKLDNNMemDesc( auto softmax_md = MKLDNNMemDesc(
{softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc); {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
// Normalization is made after innermost dimension eg. C out of NC // Normalization is made after innermost dimension eg. C out of NC
auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring, auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
softmax_md, 1 /*dim: C*/); softmax_md, 1 /*dim: C*/);
auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
softmax_desc, mkldnn_engine);
dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key); auto softmax_pd =
handler.AcquireSoftmaxPrimitiveDescriptor(softmax_desc, mkldnn_engine);
auto softmax_src_memory_p = auto softmax_src_memory_p =
handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data)); handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
auto softmax_dst_memory_p = auto softmax_dst_memory_p =
......
...@@ -483,8 +483,10 @@ class Pad2dOp : public framework::OperatorWithKernel { ...@@ -483,8 +483,10 @@ class Pad2dOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
paddings_dim.size(), 1, paddings_dim.size(), 1,
"Size of Input(Paddings)'s dimension should be equal to 1."); "Size of Input(Paddings)'s dimension should be equal to 1.");
PADDLE_ENFORCE_EQ(paddings_dim[0], 4, if (ctx->IsRuntime()) {
"Shape of Input(Paddings) should be equal to [4]."); PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
"Shape of Input(Paddings) should be equal to [4].");
}
out_dims[1] = x_dim[1]; out_dims[1] = x_dim[1];
out_dims[2] = x_dim[2]; out_dims[2] = x_dim[2];
out_dims[3] = x_dim[3]; out_dims[3] = x_dim[3];
...@@ -504,11 +506,7 @@ class Pad2dOp : public framework::OperatorWithKernel { ...@@ -504,11 +506,7 @@ class Pad2dOp : public framework::OperatorWithKernel {
} }
ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
if (out_dims[0] == x_dim[0]) { ctx->ShareLoD("X", /*->*/ "Out");
// Only pass LoD when the first dimension is equal between
// output and input.
ctx->ShareLoD("X", /*->*/ "Out");
}
} }
protected: protected:
......
...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states"; ...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states";
constexpr char kParameters[] = "parameters"; constexpr char kParameters[] = "parameters";
constexpr char kOutputs[] = "outputs"; constexpr char kOutputs[] = "outputs";
constexpr char kStepScopes[] = "step_scopes"; constexpr char kStepScopes[] = "step_scopes";
constexpr char kHasStates[] = "has_states";
constexpr char kExStates[] = "ex_states"; constexpr char kExStates[] = "ex_states";
constexpr char kStates[] = "states"; constexpr char kStates[] = "states";
constexpr char kStepBlock[] = "sub_block"; constexpr char kStepBlock[] = "sub_block";
...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase { ...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
bool has_state = Attr<bool>(kHasStates);
auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope)); auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
VLOG(3) << "Static RNN input sequence length = " << seq_len; VLOG(3) << "Static RNN input sequence length = " << seq_len;
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase { ...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase {
inside->Resize(framework::make_ddim(dims)); inside->Resize(framework::make_ddim(dims));
}); });
if (i == 0) { if (has_state) {
// Link initial states --> ex_states if (i == 0) {
LinkTensor(scope, Inputs(kInitialStates), &cur_scope, // Link initial states --> ex_states
Attr<std::vector<std::string>>(kExStates)); LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
} else { Attr<std::vector<std::string>>(kExStates));
auto &ex_scope = scopes.ExScope(); } else {
// Link ex_scope::state --> cur_scope::ex_state auto &ex_scope = scopes.ExScope();
LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates), // Link ex_scope::state --> cur_scope::ex_state
&cur_scope, Attr<std::vector<std::string>>(kExStates)); LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
&cur_scope, Attr<std::vector<std::string>>(kExStates));
}
} }
// Every inputs are linked now, execute! // Every inputs are linked now, execute!
...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase { ...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase {
std::vector<std::string>() /*skip_ref_cnt_vars*/, std::vector<std::string>() /*skip_ref_cnt_vars*/,
true /*force_disable_gc*/); true /*force_disable_gc*/);
// get device context from pool
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
this->LinkTensorWithCallback( this->LinkTensorWithCallback(
...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(GetSequenceLength(scope)); bool has_state = Attr<bool>(kHasStates);
const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
auto &cur_scope = scopes.CurScope(); auto &cur_scope = scopes.CurScope();
// Link outside::output_grads --> inside::output_grads // Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
LinkTensorWithCallback( LinkTensorWithCallback(
...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase {
VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
} }
// Link states if (has_state) {
// if cur_scope::cur_state_grad in out_grads: // Link states
// cur_scope::cur_state_grad += ex_scope::ex_state_grad // if cur_scope::cur_state_grad in out_grads:
// else: // cur_scope::cur_state_grad += ex_scope::ex_state_grad
// ex_scope::ex_state_grad --> cur_scope::cur_state_grad // else:
if (step_id != 0) { // not at beginning // ex_scope::ex_state_grad --> cur_scope::cur_state_grad
auto &ex_scope = scopes.ExScope(); if (step_id != 0) { // not at beginning
auto ex_state_grads = auto &ex_scope = scopes.ExScope();
GradVarLists(Attr<std::vector<std::string>>(kExStates)); auto ex_state_grads =
auto cur_state_grads = GradVarLists(Attr<std::vector<std::string>>(kExStates));
GradVarLists(Attr<std::vector<std::string>>(kStates)); auto cur_state_grads =
GradVarLists(Attr<std::vector<std::string>>(kStates));
PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
for (size_t i = 0; i < ex_state_grads.size(); ++i) { PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
auto &cur_grad = cur_state_grads[i]; for (size_t i = 0; i < ex_state_grads.size(); ++i) {
auto &ex_grad = ex_state_grads[i]; auto &cur_grad = cur_state_grads[i];
auto &ex_tensor = auto &ex_grad = ex_state_grads[i];
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>(); auto &ex_tensor =
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto *cur_grad_var = cur_scope.Var(cur_grad); VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto cur_grad_tensor = auto *cur_grad_var = cur_scope.Var(cur_grad);
cur_grad_var->GetMutable<framework::LoDTensor>(); auto cur_grad_tensor =
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); cur_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
}
} }
} }
...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase {
} }
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
// sum gradient
// sum gradient
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, {{"Out", {pg_names[param_id]}}},
...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase {
true /*is_backward*/); true /*is_backward*/);
VLOG(5) << "Link outside gradient finished "; VLOG(5) << "Link outside gradient finished ";
if (step_id + 1 == seq_len) { // at_end if (has_state) {
// copy initialize states gradient from inside to outside if (step_id + 1 == seq_len) { // at_end
LinkTensorWithCallback( // copy initialize states gradient from inside to outside
cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)), LinkTensorWithCallback(
scope, Outputs(kInitStateGrads), cur_scope,
[&](const framework::LoDTensor &inside, GradVarLists(Attr<std::vector<std::string>>(kExStates)), scope,
framework::LoDTensor *outside) { Outputs(kInitStateGrads),
outside->Resize(inside.dims()); [&](const framework::LoDTensor &inside,
outside->mutable_data(place, inside.type()); framework::LoDTensor *outside) {
framework::TensorCopy(inside, place, dev_ctx, outside); outside->Resize(inside.dims());
}, outside->mutable_data(place, inside.type());
true /*is_backward*/); framework::TensorCopy(inside, place, dev_ctx, outside);
VLOG(5) << "Link initialize state gradient finished "; },
true /*is_backward*/);
VLOG(5) << "Link initialize state gradient finished ";
}
} }
scopes.Next(); scopes.Next();
} }
// Delete the scope of StepScopes
dev_ctx.Wait();
auto *var = scope.FindVar(Input(kStepScopes));
PADDLE_ENFORCE(var != nullptr);
auto step_scopes = var->GetMutable<StepScopeVar>();
for (auto *sub_scope : *step_scopes) {
const_cast<framework::Scope &>(scope).DeleteScope(sub_scope);
}
} }
private: private:
...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddOutput(kStepScopes, AddOutput(kStepScopes,
"StepScopes contain all local variables in each time step."); "StepScopes contain all local variables in each time step.");
AddAttr<bool>(kHasStates, "Whether has states.").SetDefault(false);
AddAttr<std::vector<std::string>>(kExStates, AddAttr<std::vector<std::string>>(kExStates,
string::Sprintf( string::Sprintf(
R"DOC(The ex-state variable names. R"DOC(The ex-state variable names.
...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
class RecurrentGradOpShapeInference : public framework::InferShapeBase { class RecurrentGradOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
std::vector<std::string> input{kInputs, kInitialStates};
std::vector<std::string> output{kOutputs}; std::vector<std::string> output{kOutputs};
for (auto &s : input) {
// NOTE(zcd): In some case, some of kInputs doesn't have gradient. // In some case the kInitialStates is empty.
PADDLE_ENFORCE(ctx->HasInputs(s)); // If the kInitialStates is empty, all the states should be empty.
} if (!ctx->HasInputs(kInitialStates)) {
for (auto &s : output) { PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInputs(s)); ctx->Attrs().Get<std::vector<std::string>>(kExStates).size(), 0,
"The Attr(%s) should be empty.", kExStates);
PADDLE_ENFORCE_EQ(
ctx->Attrs().Get<std::vector<std::string>>(kStates).size(), 0,
"The Attr(%s) should be empty.", kStates);
} }
for (auto &s : input) {
ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); PADDLE_ENFORCE(ctx->HasInputs(kInputs),
"The input(%s) should not be empty.", kInputs);
PADDLE_ENFORCE(ctx->HasInputs(kOutputs),
"The input(%s) should not be empty.", kOutputs);
// In some case the kInitialStates is empty.
if (ctx->HasInputs(kInitialStates)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInitialStates));
ctx->SetOutputsDim(framework::GradVarName(kInitialStates),
ctx->GetInputsDim(kInitialStates));
} }
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInputs));
ctx->SetOutputsDim(framework::GradVarName(kInputs),
ctx->GetInputsDim(kInputs));
// In some case the kParameters is empty.
if (ctx->HasInputs(kParameters)) { if (ctx->HasInputs(kParameters)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)),
"The output of(%s) should not be empty.",
framework::GradVarName(kParameters));
ctx->SetOutputsDim(framework::GradVarName(kParameters), ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters)); ctx->GetInputsDim(kParameters));
} }
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
REGISTER_REDUCE_OP(reduce_all); REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all);
REGISTER_OP_CPU_KERNEL(reduce_all, REGISTER_OP_CPU_KERNEL(reduce_all,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AllFunctor>); bool, ops::AllFunctor>);
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
REGISTER_REDUCE_OP(reduce_any); REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any);
REGISTER_OP_CPU_KERNEL(reduce_any, REGISTER_OP_CPU_KERNEL(reduce_any,
ops::ReduceKernel<paddle::platform::CPUDeviceContext, ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AnyFunctor>); bool, ops::AnyFunctor>);
...@@ -270,3 +270,12 @@ namespace ops = paddle::operators; ...@@ -270,3 +270,12 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \ REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::DefaultGradOpDescMaker<true>); \ paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp) REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name) \
class __##op_name##Maker__ : public ops::ReduceOpMaker { \
protected: \
virtual std::string GetName() const { return #op_name; } \
virtual std::string GetOpType() const { return "Reduce " #op_name; } \
}; \
REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__, \
paddle::framework::EmptyGradOpMaker);
...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase { ...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
"Cannot find out_var in scope, out_var_name is %s", "Cannot find out_var in scope, out_var_name is %s",
out_name); out_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>(); auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto &mem_tensor = mem_var->Get<framework::LoDTensor>(); auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
framework::TensorCopySync(mem_tensor, dev_place, out_tensor); framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor);
out_tensor->set_lod(mem_tensor.lod()); out_tensor->set_lod(mem_tensor.lod());
} }
}; };
...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
"Cannot find in_grad_var in scope, name is %s", "Cannot find in_grad_var in scope, name is %s",
in_grad_var_name); in_grad_var_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
if (out_grad_var == nullptr) { if (out_grad_var == nullptr) {
VLOG(5) << "Using fill constant 0 as starting gradient"; VLOG(5) << "Using fill constant 0 as starting gradient";
auto in_var_name = Input("X"); auto in_var_name = Input("X");
...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
} else { } else {
auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>(); auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>(); auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx,
in_grad_tensor);
in_grad_tensor->set_lod(out_grad_tensor.lod()); in_grad_tensor->set_lod(out_grad_tensor.lod());
} }
} }
......
...@@ -37,9 +37,11 @@ class ROIAlignOp : public framework::OperatorWithKernel { ...@@ -37,9 +37,11 @@ class ROIAlignOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(rois_dims.size() == 2, PADDLE_ENFORCE(rois_dims.size() == 2,
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ...]."); "given as [[x1, y1, x2, y2], ...].");
PADDLE_ENFORCE(rois_dims[1] == 4, if (ctx->IsRuntime()) {
"ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" PADDLE_ENFORCE(rois_dims[1] == 4,
"given as [[x1, y1, x2, y2], ...]."); "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
"given as [[x1, y1, x2, y2], ...].");
}
int pooled_height = ctx->Attrs().Get<int>("pooled_height"); int pooled_height = ctx->Attrs().Get<int>("pooled_height");
int pooled_width = ctx->Attrs().Get<int>("pooled_width"); int pooled_width = ctx->Attrs().Get<int>("pooled_width");
float spatial_scale = ctx->Attrs().Get<float>("spatial_scale"); float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
......
...@@ -45,9 +45,12 @@ class RowConvOp : public framework::OperatorWithKernel { ...@@ -45,9 +45,12 @@ class RowConvOp : public framework::OperatorWithKernel {
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
PADDLE_ENFORCE_EQ( if (ctx->IsRuntime() || (x_dims[1] > 0 && filter_dims[1] > 0)) {
x_dims[1], filter_dims[1], PADDLE_ENFORCE_EQ(
"The 2nd dimension of Input(X) and Input(Filter) should be same."); x_dims[1], filter_dims[1],
"The 2nd dimension of Input(X) and Input(Filter) should be same.");
}
ctx->SetOutputDim("Out", x_dims); ctx->SetOutputDim("Out", x_dims);
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
} }
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/sample_logits_op.h" #include "paddle/fluid/operators/sample_logits_op.h"
#include <memory>
#include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle { namespace paddle {
...@@ -60,6 +61,10 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -60,6 +61,10 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]." "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
"The probabilites of sampled positive and negtive labels.") "The probabilites of sampled positive and negtive labels.")
.AsIntermediate(); .AsIntermediate();
AddOutput("LogitsDim", "Store dim information of Logits for gradient op")
.AsIntermediate();
AddOutput("LabelsDim", "Store dim information of Logits for gradient op")
.AsIntermediate();
AddOutput("SampledLogits", AddOutput("SampledLogits",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape" "(Tensor, default: Tensor<float>), A 2-D tensor with shape"
"[N, NT + S]. The outputs value of sampled logits, which will be" "[N, NT + S]. The outputs value of sampled logits, which will be"
...@@ -121,6 +126,10 @@ class SampleLogitsOp : public framework::OperatorWithKernel { ...@@ -121,6 +126,10 @@ class SampleLogitsOp : public framework::OperatorWithKernel {
"Output(SampledLogits) should be not null."); "Output(SampledLogits) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"), PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
"Output(SampledLabels) should be not null."); "Output(SampledLabels) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("LogitsDim"),
"Output(LogitsDim) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("LabelsDim"),
"Output(LabelsDim) should be not null.");
auto logits_dims = ctx->GetInputDim("Logits"); auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Labels"); auto labels_dims = ctx->GetInputDim("Labels");
...@@ -137,6 +146,15 @@ class SampleLogitsOp : public framework::OperatorWithKernel { ...@@ -137,6 +146,15 @@ class SampleLogitsOp : public framework::OperatorWithKernel {
ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes}); ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes}); ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]}); ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
// append 0 to shape variable to avoid optimized by memory optimize pass
auto logits_dim_vec = framework::vectorize(logits_dims);
logits_dim_vec.push_back(0);
ctx->SetOutputDim("LogitsDim", framework::make_ddim(logits_dim_vec));
auto labels_dim_vec = framework::vectorize(labels_dims);
labels_dim_vec.push_back(0);
ctx->SetOutputDim("LabelsDim", framework::make_ddim(labels_dim_vec));
} }
protected: protected:
...@@ -155,28 +173,27 @@ class SampleLogitsOpGrad : public framework::OperatorWithKernel { ...@@ -155,28 +173,27 @@ class SampleLogitsOpGrad : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"), PADDLE_ENFORCE(ctx->HasInput("LogitsDim"),
"Input(Logits) should not be null."); "Input(LogitsDim) should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Labels"), PADDLE_ENFORCE(ctx->HasInput("LabelsDim"),
"Input(Labels) should be not null."); "Input(LabelsDim) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("Samples"), PADDLE_ENFORCE(ctx->HasInput("Samples"),
"Input(Samples) should be not null."); "Input(Samples) should be not null.");
PADDLE_ENFORCE(ctx->HasInput("SampledLogits"),
"Input(SampledLogits) should be not null.");
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")), PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
"Input(SampledLogits@Grad) should not be null."); "Input(SampledLogits@Grad) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
"Output(Logits@Grad) should be not null."); "Output(Logits@Grad) should be not null.");
auto logit_dims = ctx->GetInputDim("Logits"); auto logits_dims = ctx->GetInputDim("LogitsDim");
auto label_dims = ctx->GetInputDim("Labels"); logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1);
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, auto labels_dims = ctx->GetInputDim("LabelsDim");
labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1);
PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
"The label should be a 2-D tensor."); "The label should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL, PADDLE_ENFORCE_EQ(logits_dims.size(), 2UL,
"The logits should be a 2-D tensor."); "The logits should be a 2-D tensor.");
ctx->SetOutputDim(framework::GradVarName("Logits"), ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims);
ctx->GetInputDim("Logits"));
} }
protected: protected:
...@@ -199,10 +216,9 @@ class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker { ...@@ -199,10 +216,9 @@ class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto* grad_op = new framework::OpDesc(); auto* grad_op = new framework::OpDesc();
grad_op->SetType("sample_logits_grad"); grad_op->SetType("sample_logits_grad");
grad_op->SetInput("Logits", Input("Logits")); grad_op->SetInput("LogitsDim", Output("LogitsDim"));
grad_op->SetInput("Labels", Input("Labels")); grad_op->SetInput("LabelsDim", Output("LabelsDim"));
grad_op->SetInput("Samples", Output("Samples")); grad_op->SetInput("Samples", Output("Samples"));
grad_op->SetInput("SampledLogits", Output("SampledLogits"));
grad_op->SetInput(framework::GradVarName("SampledLogits"), grad_op->SetInput(framework::GradVarName("SampledLogits"),
OutputGrad("SampledLogits")); OutputGrad("SampledLogits"));
grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits")); grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
......
...@@ -42,10 +42,6 @@ class ScatterOp : public framework::OperatorWithKernel { ...@@ -42,10 +42,6 @@ class ScatterOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
ctx->GetInputDim("Ids")[0], ctx->GetInputDim("Ids")[0],
"Updates and Ids should have same batch-size."); "Updates and Ids should have same batch-size.");
framework::DDim data_dim(updates_dims);
for (int i = 1; i < data_dim.size(); ++i) {
PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
}
ctx->SetOutputDim("Out", ref_dims); ctx->SetOutputDim("Out", ref_dims);
} }
......
...@@ -34,15 +34,22 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { ...@@ -34,15 +34,22 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Label"); auto labels_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2, int rank = x_dims.size();
"Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], "Input(X) and Input(Label) shall have the same rank.");
"The 1st dimension of Input(X) and Input(Label) should " bool check = true;
"be equal."); if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], framework::product(labels_dims) <= 0)) {
"The 2nd dimension of Input(X) and Input(Label) should " check = false;
"be equal."); }
if (check) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
framework::slice_ddim(labels_dims, 0, rank),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
}
ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
...@@ -65,23 +72,24 @@ class SigmoidCrossEntropyWithLogitsGradOp ...@@ -65,23 +72,24 @@ class SigmoidCrossEntropyWithLogitsGradOp
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Label"); auto labels_dims = ctx->GetInputDim("Label");
auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(labels_dims.size(), 2, int rank = x_dims.size();
"Input(Label)'s rank should be 2."); bool check = true;
PADDLE_ENFORCE_EQ(dout_dims.size(), 2, if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
"Input(Out@Grad)'s rank should be 2."); framework::product(labels_dims) <= 0)) {
PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], check = false;
"The 1st dimension of Input(X) and Input(Label) should " }
"be equal.");
PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], if (check) {
"The 2nd dimension of Input(X) and Input(Label) should " PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
"be equal."); framework::slice_ddim(labels_dims, 0, rank),
PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], "Input(X) and Input(Label) shall have the same shape.");
"The 1st dimension of Input(X) and Input(Out@Grad) "
"should be equal."); PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1], framework::slice_ddim(x_dims, 0, rank),
"The 2nd dimension of Input(X) and Input(Out@Grad) " framework::slice_ddim(dout_dims, 0, rank),
"should be equal."); "Input(X) and Input(Out@Grad) shall have the same shape.");
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
} }
......
...@@ -56,13 +56,19 @@ class SpectralNormOp : public framework::OperatorWithKernel { ...@@ -56,13 +56,19 @@ class SpectralNormOp : public framework::OperatorWithKernel {
} }
auto dim_u = ctx->GetInputDim("U"); auto dim_u = ctx->GetInputDim("U");
auto dim_v = ctx->GetInputDim("V"); auto dim_v = ctx->GetInputDim("V");
PADDLE_ENFORCE_EQ(dim_u[0], h,
"Input(U) dims[0] should be equal to " if (ctx->IsRuntime() || (dim_u[0] > 0 && h > 0)) {
"Input(Weight) dims[Attr(dim)]"); PADDLE_ENFORCE_EQ(dim_u[0], h,
PADDLE_ENFORCE_EQ( "Input(U) dims[0] should be equal to "
dim_v[0], w, "Input(Weight) dims[Attr(dim)]");
"Input(V) dims[0] should be equal to " }
"the product of Input(Weight) dims except dims[Attr(dim)]");
if (ctx->IsRuntime() || (dim_v[0] > 0 && w > 0)) {
PADDLE_ENFORCE_EQ(
dim_v[0], w,
"Input(V) dims[0] should be equal to "
"the product of Input(Weight) dims except dims[Attr(dim)]");
}
ctx->SetOutputDim("Out", dim_weight); ctx->SetOutputDim("Out", dim_weight);
ctx->ShareLoD("Weight", /*->*/ "Out"); ctx->ShareLoD("Weight", /*->*/ "Out");
......
...@@ -39,14 +39,22 @@ class SplitOp : public framework::OperatorWithKernel { ...@@ -39,14 +39,22 @@ class SplitOp : public framework::OperatorWithKernel {
if (num > 0) { if (num > 0) {
int64_t in_axis_dim = in_dims[axis]; int64_t in_axis_dim = in_dims[axis];
PADDLE_ENFORCE_EQ(in_axis_dim % num, 0, if (ctx->IsRuntime() || in_axis_dim > 0) {
"tensor split does not result" PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
" in an equal division"); "tensor split does not result"
size_t out_axis_dim = in_axis_dim / num; " in an equal division");
for (size_t i = 0; i < outs_number; ++i) { size_t out_axis_dim = in_axis_dim / num;
auto dim = in_dims; for (size_t i = 0; i < outs_number; ++i) {
dim[axis] = out_axis_dim; auto dim = in_dims;
outs_dims.push_back(dim); dim[axis] = out_axis_dim;
outs_dims.push_back(dim);
}
} else {
for (size_t i = 0; i < outs_number; ++i) {
auto dim = in_dims;
dim[axis] = -1;
outs_dims.push_back(dim);
}
} }
} else if (sections.size() > 0) { } else if (sections.size() > 0) {
PADDLE_ENFORCE_EQ(sections.size(), outs_number, PADDLE_ENFORCE_EQ(sections.size(), outs_number,
......
...@@ -65,7 +65,21 @@ class SumOp : public framework::OperatorWithKernel { ...@@ -65,7 +65,21 @@ class SumOp : public framework::OperatorWithKernel {
if (framework::product(in_dim) == 0) { if (framework::product(in_dim) == 0) {
in_dim = x_dim; in_dim = x_dim;
} else { } else {
PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape"); if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(in_dim, x_dim,
"Input tensors must have same shape");
} else {
PADDLE_ENFORCE_EQ(in_dim.size(), x_dim.size(),
"Input tensors must have same shape size");
// if in_dim or x_dim has -1, not check equal
for (int i = 0; i < x_dim.size(); ++i) {
if (x_dim[i] == -1 || in_dim[i] == -1) {
continue;
}
PADDLE_ENFORCE_EQ(in_dim[i], x_dim[i],
"Input tensors must have same shape if not -1");
}
}
} }
} }
ctx->SetOutputDim("Out", in_dim); ctx->SetOutputDim("Out", in_dim);
......
...@@ -99,10 +99,15 @@ class UnpoolOp : public framework::OperatorWithKernel { ...@@ -99,10 +99,15 @@ class UnpoolOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(in_x_dims.size() == 4, PADDLE_ENFORCE(in_x_dims.size() == 4,
"Unpooling intput must be of 4-dimensional."); "Unpooling intput must be of 4-dimensional.");
PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < ksize.size(); ++i) { for (size_t i = 0; i < ksize.size(); ++i) {
output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i], if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) {
paddings[i], strides[i])); output_shape.push_back(-1);
} else {
output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i],
paddings[i], strides[i]));
}
} }
ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
} }
......
...@@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, ...@@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name,
return; return;
} }
#define PrintLoDTensorCallback(cpp_type, proto_type) \ framework::LoDTensor printed_tensor;
do { \ printed_tensor.set_lod(tensor->lod());
if (tensor->type() == proto_type) { \ printed_tensor.Resize(tensor->dims());
print_lod_tensor<cpp_type>(var_name, *tensor, print_info); \ if (platform::is_cpu_place(tensor->place())) {
return; \ printed_tensor.ShareDataWith(*tensor);
} \ } else {
platform::CPUPlace place;
framework::TensorCopy(*tensor, place, &printed_tensor);
}
#define PrintLoDTensorCallback(cpp_type, proto_type) \
do { \
if (tensor->type() == proto_type) { \
print_lod_tensor<cpp_type>(var_name, printed_tensor, print_info); \
return; \
} \
} while (0) } while (0)
_ForEachDataType_(PrintLoDTensorCallback); _ForEachDataType_(PrintLoDTensorCallback);
VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type(); VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type();
} }
} // end namespace platform } // end namespace platform
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "boost/optional.hpp"
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -395,9 +396,28 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { ...@@ -395,9 +396,28 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
std::vector<int> logical_axis_; std::vector<int> logical_axis_;
}; };
template <typename T>
struct convolutional_algorithm;
template <>
struct convolutional_algorithm<mkldnn::convolution_forward> {
static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct;
};
template <>
struct convolutional_algorithm<mkldnn::deconvolution_forward> {
static constexpr mkldnn::algorithm T =
mkldnn::algorithm::deconvolution_direct;
};
template <class forward_t, class backward_data_t, class backward_weights_t> template <class forward_t, class backward_data_t, class backward_weights_t>
class ConvMKLDNNTemplateHandler : public MKLDNNHandler { class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
public: public:
ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx,
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
// TODO(jczaja): remove after conv int8 is adapted
ConvMKLDNNTemplateHandler( ConvMKLDNNTemplateHandler(
std::shared_ptr<typename forward_t::primitive_desc> conv_pd, std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
...@@ -542,6 +562,73 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { ...@@ -542,6 +562,73 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
scale_data, mask); scale_data, mask);
} }
mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
bool fuse_residual_conn = false) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
// Fusion with Elementwise layer relies on adding a sum post-operation with
// the scale parameter. It is assumed that when fuse_residual_connection is
// true, the output tensor contains the data coming from residual
// connection. The result of this post_op is:
// Output = scale * Output + Conv_Out.
if (fuse_residual_conn) {
post_operations.append_sum(1.0f);
}
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if (fuse_relu) {
constexpr float scale = 1.0f;
constexpr float negative_slope = 0.0f;
constexpr float placeholder = 0.0f;
post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
negative_slope, placeholder);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::shared_ptr<typename forward_t::primitive_desc>
AcquireConvolutionPrimitiveDescriptor(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
boost::optional<const mkldnn::memory::desc&> bias,
const mkldnn::memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine,
const bool fuse_relu, const bool fuse_residual_conn,
mkldnn::prop_kind fwd_prop_kind) {
const std::string key_conv_pd = key_ + "@conv_pd";
auto conv_pd = std::static_pointer_cast<typename forward_t::primitive_desc>(
dev_ctx_.GetBlob(key_conv_pd));
if (conv_pd == nullptr) {
mkldnn::memory::dims stride_dims = strides;
mkldnn::memory::dims padding_dims = paddings;
auto conv_desc =
bias ? typename forward_t::desc(
fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
weights, *bias, dst, stride_dims, padding_dims,
padding_dims, mkldnn::padding_kind::zero)
: typename forward_t::desc(
fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
weights, dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr =
CreatePostOps(fuse_relu, fuse_residual_conn);
conv_pd_.reset(
new typename forward_t::primitive_desc(conv_desc, conv_attr, engine));
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx_.SetBlob(key_conv_pd, conv_pd_);
} else {
conv_pd_ = conv_pd;
is_reusing_ = true;
}
return conv_pd_;
}
std::shared_ptr<forward_t> AcquireConvolution( std::shared_ptr<forward_t> AcquireConvolution(
std::shared_ptr<mkldnn::memory> src_memory_p, std::shared_ptr<mkldnn::memory> src_memory_p,
std::shared_ptr<mkldnn::memory> weights_memory_p, std::shared_ptr<mkldnn::memory> weights_memory_p,
......
...@@ -446,7 +446,8 @@ function assert_api_spec_approvals() { ...@@ -446,7 +446,8 @@ function assert_api_spec_approvals() {
BRANCH="develop" BRANCH="develop"
fi fi
API_FILES=("paddle/fluid/API.spec" API_FILES=("CMakeLists.txt"
"paddle/fluid/API.spec"
"paddle/fluid/op_use_default_grad_op_maker.spec" "paddle/fluid/op_use_default_grad_op_maker.spec"
"python/paddle/fluid/parallel_executor.py" "python/paddle/fluid/parallel_executor.py"
"paddle/fluid/framework/operator.h" "paddle/fluid/framework/operator.h"
...@@ -469,24 +470,29 @@ function assert_api_spec_approvals() { ...@@ -469,24 +470,29 @@ function assert_api_spec_approvals() {
echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
# approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. # approval_user_list: velconia 1979255,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 35982308 46782768 30176695`
if [ "${APPROVALS}" == "TRUE" ];then if [ "${APPROVALS}" == "TRUE" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
fi fi
elif [ "$API_FILE" == "CMakeLists.txt" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695`
else else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
fi fi
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}" echo "You must have one RD (chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
elif [ "$API_FILE" == "CMakeLists.txt" ];then
echo "You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for the cmakelist change! ${API_FILE}"
else else
echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}" echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
fi fi
exit 1 exit 1
fi fi
...@@ -496,10 +502,10 @@ function assert_api_spec_approvals() { ...@@ -496,10 +502,10 @@ function assert_api_spec_approvals() {
HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}" echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
exit 1 exit 1
fi fi
fi fi
......
...@@ -175,6 +175,7 @@ def __bootstrap__(): ...@@ -175,6 +175,7 @@ def __bootstrap__():
read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_thread_pool_size')
read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_max_merge_var_num')
read_env_flags.append('communicator_fake_rpc') read_env_flags.append('communicator_fake_rpc')
read_env_flags.append('communicator_send_wait_times')
if core.is_compiled_with_brpc(): if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size') read_env_flags.append('max_body_size')
#set brpc max body size #set brpc max body size
......
...@@ -147,10 +147,11 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -147,10 +147,11 @@ class TestCalibrationForResnet50(unittest.TestCase):
self.data_cache_folder) self.data_cache_folder)
os.system(cmd) os.system(cmd)
self.batch_size = 1 self.batch_size = 1 if os.environ.get('DATASET') == 'full' else 50
self.sample_iterations = 50 self.sample_iterations = 50 if os.environ.get(
'DATASET') == 'full' else 1
self.infer_iterations = 50000 if os.environ.get( self.infer_iterations = 50000 if os.environ.get(
'DATASET') == 'full' else 50 'DATASET') == 'full' else 1
def cache_unzipping(self, target_folder, zip_path): def cache_unzipping(self, target_folder, zip_path):
if not os.path.exists(target_folder): if not os.path.exists(target_folder):
...@@ -279,15 +280,15 @@ class TestCalibrationForResnet50(unittest.TestCase): ...@@ -279,15 +280,15 @@ class TestCalibrationForResnet50(unittest.TestCase):
def test_calibration(self): def test_calibration(self):
self.download_model() self.download_model()
print("Start FP32 inference for {0} on {1} images ...").format( print("Start FP32 inference for {0} on {1} images ...").format(
self.model, self.infer_iterations) self.model, self.infer_iterations * self.batch_size)
(fp32_throughput, fp32_latency, (fp32_throughput, fp32_latency,
fp32_acc1) = self.run_program(self.model_cache_folder + "/model") fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
print("Start INT8 calibration for {0} on {1} images ...").format( print("Start INT8 calibration for {0} on {1} images ...").format(
self.model, self.sample_iterations) self.model, self.sample_iterations * self.batch_size)
self.run_program( self.run_program(
self.model_cache_folder + "/model", True, algo=self.algo) self.model_cache_folder + "/model", True, algo=self.algo)
print("Start INT8 inference for {0} on {1} images ...").format( print("Start INT8 inference for {0} on {1} images ...").format(
self.model, self.infer_iterations) self.model, self.infer_iterations * self.batch_size)
(int8_throughput, int8_latency, (int8_throughput, int8_latency,
int8_acc1) = self.run_program("calibration_out") int8_acc1) = self.run_program("calibration_out")
delta_value = fp32_acc1 - int8_acc1 delta_value = fp32_acc1 - int8_acc1
......
...@@ -26,8 +26,8 @@ class DeviceWorker(object): ...@@ -26,8 +26,8 @@ class DeviceWorker(object):
""" """
Init. Init.
""" """
self.program_ = None self._program = None
self.infer_ = None self._infer = None
def _set_infer(self, infer=False): def _set_infer(self, infer=False):
""" """
...@@ -36,7 +36,7 @@ class DeviceWorker(object): ...@@ -36,7 +36,7 @@ class DeviceWorker(object):
Args: Args:
infer(bool): whether to do inference infer(bool): whether to do inference
""" """
self.infer_ = infer self._infer = infer
def _set_fleet_desc(self, fleet_desc): def _set_fleet_desc(self, fleet_desc):
""" """
...@@ -45,7 +45,7 @@ class DeviceWorker(object): ...@@ -45,7 +45,7 @@ class DeviceWorker(object):
Args: Args:
fleet_desc(PSParameter): pslib.PSParameter object fleet_desc(PSParameter): pslib.PSParameter object
""" """
self.fleet_desc_ = fleet_desc self._fleet_desc = fleet_desc
def _set_program(self, program): def _set_program(self, program):
""" """
...@@ -54,7 +54,7 @@ class DeviceWorker(object): ...@@ -54,7 +54,7 @@ class DeviceWorker(object):
Args: Args:
program(Program): a Program object program(Program): a Program object
""" """
self.program_ = program self._program = program
def _gen_worker_desc(self, trainer_desc): def _gen_worker_desc(self, trainer_desc):
""" """
...@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker): ...@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker):
trainer_desc(TrainerDesc): a TrainerDesc object trainer_desc(TrainerDesc): a TrainerDesc object
""" """
trainer_desc.device_worker_name = "HogwildWorker" trainer_desc.device_worker_name = "HogwildWorker"
if self.infer_: if self._infer:
# just ignore feed op for inference model # just ignore feed op for inference model
trainer_desc.hogwild_param.skip_ops.extend(["feed"]) trainer_desc.hogwild_param.skip_ops.extend(["feed"])
...@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker): ...@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker):
trainer_desc(TrainerDesc): a TrainerDesc object trainer_desc(TrainerDesc): a TrainerDesc object
""" """
dense_table_set = set() dense_table_set = set()
program_id = str(id(self.program_)) program_id = str(id(self._program))
if self.program_ == None: if self._program == None:
print("program of current device worker is not configured") print("program of current device worker is not configured")
exit(-1) exit(-1)
opt_info = self.program_._fleet_opt opt_info = self._program._fleet_opt
program_configs = opt_info["program_configs"] program_configs = opt_info["program_configs"]
downpour = trainer_desc.downpour_param downpour = trainer_desc.downpour_param
...@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker): ...@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker):
trainer_desc.device_worker_name = "DownpourWorker" trainer_desc.device_worker_name = "DownpourWorker"
pull_thread = trainer_desc.pull_dense_param pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num pull_thread.device_num = trainer_desc.thread_num
for i in self.fleet_desc_.trainer_param.dense_table: for i in self._fleet_desc.trainer_param.dense_table:
if i.table_id in dense_table_set: if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add() dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_value_name.extend(i.dense_variable_name)
...@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker): ...@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker):
i.table_id i.table_id
sparse_table = downpour.sparse_table.add() sparse_table = downpour.sparse_table.add()
sparse_table.table_id = \ sparse_table.table_id = \
self.fleet_desc_.trainer_param.sparse_table[0].table_id self._fleet_desc.trainer_param.sparse_table[0].table_id
sparse_table.sparse_key_name.extend( sparse_table.sparse_key_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_key) self._fleet_desc.trainer_param.sparse_table[0].slot_key)
sparse_table.sparse_value_name.extend( sparse_table.sparse_value_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_value) self._fleet_desc.trainer_param.sparse_table[0].slot_value)
sparse_table.sparse_grad_name.extend( sparse_table.sparse_grad_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient) self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
sparse_table.emb_dim = \ sparse_table.emb_dim = \
self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[ self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
0].accessor.fea_dim - 2 0].accessor.fea_dim - 2
sparse_table.fea_dim = sparse_table.emb_dim + 2 sparse_table.fea_dim = sparse_table.emb_dim + 2
# TODO(guru4elephant): hard code here, need to improve # TODO(guru4elephant): hard code here, need to improve
sparse_table.label_var_name = "click" sparse_table.label_var_name = "click"
for i in self.fleet_desc_.trainer_param.dense_table: for i in self._fleet_desc.trainer_param.dense_table:
if i.table_id in dense_table_set: if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add() dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend( dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name) i.dense_gradient_variable_name)
downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op) downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op)
if self.infer_: if self._infer:
downpour.push_dense = False downpour.push_dense = False
downpour.push_sparse = False downpour.push_sparse = False
......
...@@ -48,6 +48,12 @@ class Layer(core.Layer): ...@@ -48,6 +48,12 @@ class Layer(core.Layer):
self._helper = LayerObjectHelper(self._full_name) self._helper = LayerObjectHelper(self._full_name)
def train(self):
framework._dygraph_tracer()._train_mode()
def eval(self):
framework._dygraph_tracer()._eval_mode()
def full_name(self): def full_name(self):
"""Full name for this layers. """Full name for this layers.
...@@ -254,6 +260,12 @@ class PyLayer(core.PyLayer): ...@@ -254,6 +260,12 @@ class PyLayer(core.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer, self).__init__() super(PyLayer, self).__init__()
def train(self):
framework._dygraph_tracer()._train_mode()
def eval(self):
framework._dygraph_tracer()._eval_mode()
@classmethod @classmethod
def _do_forward(cls, inputs): def _do_forward(cls, inputs):
return cls._to_tuple(cls.forward(inputs)) return cls._to_tuple(cls.forward(inputs))
......
...@@ -24,7 +24,9 @@ __all__ = ['Tracer'] ...@@ -24,7 +24,9 @@ __all__ = ['Tracer']
def release_op(op): def release_op(op):
del framework._dygraph_tracer()._ops[op._trace_id] del framework._dygraph_tracer()._ops[op._trace_id].inputs
del framework._dygraph_tracer()._ops[op._trace_id].outputs
del framework._dygraph_tracer()._ops[op._trace_id].backward_refs
class Tracer(core.Tracer): class Tracer(core.Tracer):
...@@ -38,6 +40,7 @@ class Tracer(core.Tracer): ...@@ -38,6 +40,7 @@ class Tracer(core.Tracer):
self._ops = defaultdict() self._ops = defaultdict()
self._vars = defaultdict() self._vars = defaultdict()
self._trace_id = 0 self._trace_id = 0
self._train_mode = True
def trace_var(self, name, var): def trace_var(self, name, var):
self._vars[name] = var self._vars[name] = var
...@@ -46,15 +49,57 @@ class Tracer(core.Tracer): ...@@ -46,15 +49,57 @@ class Tracer(core.Tracer):
return list((item for name, item in six.iteritems(self._vars) return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter))) if isinstance(item, framework.Parameter)))
def trace_op(self, op, stop_gradient=False): def trace_op(self, op, inputs, outputs, stop_gradient=False):
# TODO(minqiyang): remove this line after we take apart all
# backward grads and forward variables
if self._train_mode:
op.inputs = inputs
inps = defaultdict(list)
for k, vars in six.iteritems(inputs):
if isinstance(vars, framework.Variable):
inps[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
inps[k].append(var._ivar)
op.outputs = outputs
outs = defaultdict(list)
for k, vars in six.iteritems(outputs):
if isinstance(vars, framework.Variable):
outs[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
outs[k].append(var._ivar)
else:
inps = defaultdict(list)
for k, vars in six.iteritems(inputs):
if isinstance(vars, framework.Variable):
op.previous_ops.append(vars.op)
inps[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
op.previous_ops.append(var.op)
inps[k].append(var._ivar)
op.outputs = outputs
outs = defaultdict(list)
for k, vars in six.iteritems(outputs):
if isinstance(vars, framework.Variable):
vars.op = op
outs[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
var.op = op
outs[k].append(var._ivar)
# record op's trace id # record op's trace id
op.iop._trace_id = self._trace_id op.iop._trace_id = self._trace_id
backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs, backward_refs = self.trace(op.iop, inps, outs, op.attrs,
framework._current_expected_place(), framework._current_expected_place(),
stop_gradient) stop_gradient)
if not stop_gradient: if not stop_gradient and self._train_mode:
self._trace_id += 1 self._trace_id += 1
self._ops[op.iop._trace_id] = op self._ops[op.iop._trace_id] = op
...@@ -65,10 +110,16 @@ class Tracer(core.Tracer): ...@@ -65,10 +110,16 @@ class Tracer(core.Tracer):
# TODO(minqiyang): remove all inputs and outputs after separate # TODO(minqiyang): remove all inputs and outputs after separate
# var and grad # var and grad
op.backward_refs = defaultdict(list) op.backward_refs = defaultdict(list)
for k, v in six.iteritems(op.inputs): for k, v in six.iteritems(inputs):
if k in backward_refs: if k in backward_refs:
op.backward_refs[k] = op.inputs[k] op.backward_refs[k] = inputs[k]
for k, v in six.iteritems(op.outputs): for k, v in six.iteritems(outputs):
if k in backward_refs: if k in backward_refs:
op.backward_refs[k] = op.outputs[k] op.backward_refs[k] = outputs[k]
def _train_mode(self):
self._train_mode = True
def _eval_mode(self):
self._train_mode = False
...@@ -411,6 +411,7 @@ class Variable(object): ...@@ -411,6 +411,7 @@ class Variable(object):
if persistable else False) if persistable else False)
if persistable: if persistable:
_dygraph_tracer().trace_var(name, self) _dygraph_tracer().trace_var(name, self)
self.op = None
else: else:
self.error_clip = error_clip self.error_clip = error_clip
...@@ -939,24 +940,7 @@ class Operator(object): ...@@ -939,24 +940,7 @@ class Operator(object):
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type) self.iop = core.OpBase(type)
self.previous_ops = []
# TODO(minqiyang): remove these lines after we take apart all
# backward grads and forward variables
self.inputs = defaultdict(list)
if inputs is not None:
for k, v in six.iteritems(inputs):
if isinstance(v, Variable):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for k, v in six.iteritems(outputs):
if isinstance(v, Variable):
self.outputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.outputs[k].extend([var._ivar for var in v])
self.attrs = attrs if attrs else {} self.attrs = attrs if attrs else {}
else: else:
...@@ -1647,15 +1631,18 @@ class Block(object): ...@@ -1647,15 +1631,18 @@ class Block(object):
block=self, block=self,
desc=None, desc=None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=None,
outputs=kwargs.get("outputs", None), outputs=None,
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", {}))
# record ops in tracer rather than blocks # record ops in tracer rather than blocks
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in dygraph mode. # currently, we only support stop_gradient in dygraph mode.
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) _dygraph_tracer().trace_op(op,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator( op = Operator(
...@@ -1719,10 +1706,14 @@ class Block(object): ...@@ -1719,10 +1706,14 @@ class Block(object):
self, self,
None, None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=None,
outputs=kwargs.get("outputs", None), outputs=None,
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", {}))
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
_dygraph_tracer().trace_op(op,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator( op = Operator(
......
...@@ -23,10 +23,10 @@ class RoleMakerBase(object): ...@@ -23,10 +23,10 @@ class RoleMakerBase(object):
""" """
def __init__(self): def __init__(self):
self.role_maker_name_ = "" self._role_maker_name = ""
self.trainer_endpoints_ = [] self._trainer_endpoints = []
self.pserver_endpoints_ = [] self._pserver_endpoints = []
self.role_is_generated_ = False self._role_is_generated = False
def _is_worker(self): def _is_worker(self):
""" """
...@@ -45,20 +45,20 @@ class RoleMakerBase(object): ...@@ -45,20 +45,20 @@ class RoleMakerBase(object):
return get local ip return get local ip
""" """
import socket import socket
self.ip_ = socket.gethostbyname(socket.gethostname()) self._ip = socket.gethostbyname(socket.gethostname())
return self.ip_ return self._ip
def _get_trainer_endpoints(self): def _get_trainer_endpoints(self):
""" """
return trainer endpoints return trainer endpoints
""" """
return self.trainer_endpoints_ return self._trainer_endpoints
def _get_pserver_endpoints(self): def _get_pserver_endpoints(self):
""" """
return pserver endpoints return pserver endpoints
""" """
return self.pserver_endpoints_ return self._pserver_endpoints
def _generate_role(self): def _generate_role(self):
""" """
...@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase): ...@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase):
def __init__(self): def __init__(self):
super(MPIRoleMaker, self).__init__() super(MPIRoleMaker, self).__init__()
from mpi4py import MPI from mpi4py import MPI
self.comm_ = MPI.COMM_WORLD self._comm = MPI.COMM_WORLD
self.MPI = MPI self.MPI = MPI
self.ips_ = None self._ips = None
def _get_rank(self): def _get_rank(self):
""" """
return rank return rank
""" """
self.rank_ = self.comm_.Get_rank() self._rank = self._comm.Get_rank()
return self.rank_ return self._rank
def _get_size(self): def _get_size(self):
""" """
return size return size
""" """
self.size_ = self.comm_.Get_size() self._size = self._comm.Get_size()
return self.size_ return self._size
def _all_gather(self, obj): def _all_gather(self, obj):
""" """
all_gather(obj) will call MPI's allgather function all_gather(obj) will call MPI's allgather function
""" """
self._barrier_all() self._barrier_all()
return self.comm_.allgather(obj) return self._comm.allgather(obj)
def _worker_gather(self, obj): def _worker_gather(self, obj):
""" """
worker_gather(obj) will call MPI's allgather function worker_gather(obj) will call MPI's allgather function
""" """
if self._is_worker(): if self._is_worker():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
return self.node_type_comm_.allgather(obj) return self._node_type_comm.allgather(obj)
return None return None
def _barrier_all(self): def _barrier_all(self):
""" """
barrier_all() will call MPI's barrier_all function barrier_all() will call MPI's barrier_all function
""" """
self.comm_.barrier() self._comm.barrier()
def _get_ips(self): def _get_ips(self):
""" """
collect current distributed job's ip list collect current distributed job's ip list
""" """
if self.ips_ == None: if self._ips == None:
self.ips_ = self.comm_.allgather(self._get_local_ip()) self._ips = self._comm.allgather(self._get_local_ip())
return self.ips_ return self._ips
def _finalize(self): def _finalize(self):
""" """
finalize the current MPI instance. finalize the current MPI instance.
""" """
self.comm_.finalize() self._comm.finalize()
class MPISymetricRoleMaker(MPIRoleMaker): class MPISymetricRoleMaker(MPIRoleMaker):
...@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker):
def __init__(self): def __init__(self):
super(MPISymetricRoleMaker, self).__init__() super(MPISymetricRoleMaker, self).__init__()
self.node_type_ = None self._node_type = None
self.proc_per_node_ = 2 self._proc_per_node = 2
def _check_role_generation(self): def _check_role_generation(self):
if not self.role_is_generated_: if not self._role_is_generated:
sys.stderr.write("generate_role() should be called first") sys.stderr.write("generate_role() should be called first")
sys.exit(-1) sys.exit(-1)
return False return False
...@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return whether current process is worker assigned by role maker return whether current process is worker assigned by role maker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.node_type_ == 1 return self._node_type == 1
return False return False
def _is_server(self): def _is_server(self):
...@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return whether current process is server assigned by role maker return whether current process is server assigned by role maker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.node_type_ == 0 return self._node_type == 0
return False return False
def _worker_num(self): def _worker_num(self):
...@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return the index of worker return the index of worker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.rank_ / self.proc_per_node_ return self._rank / self._proc_per_node
return 0 return 0
def _server_index(self): def _server_index(self):
...@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return the index of server return the index of server
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.rank_ / self.proc_per_node_ return self._rank / self._proc_per_node
return 0 return 0
def _barrier_worker(self): def _barrier_worker(self):
...@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
""" """
if self._check_role_generation(): if self._check_role_generation():
if self._is_worker(): if self._is_worker():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
def _barrier_server(self): def _barrier_server(self):
""" """
...@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker):
""" """
if self._check_role_generation(): if self._check_role_generation():
if self._is_server(): if self._is_server():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
def _generate_role(self): def _generate_role(self):
""" """
generate currently process's role generate currently process's role
""" """
if not self.role_is_generated_: if not self._role_is_generated:
# TODO(guru4elephant): only allow to be called once # TODO(guru4elephant): only allow to be called once
self.trainer_endpoints_ = self._get_ips() self._trainer_endpoints = self._get_ips()
self.pserver_endpoints_ = self._get_ips() self._pserver_endpoints = self._get_ips()
if 0 == self._get_rank() % self.proc_per_node_ % 2: if 0 == self._get_rank() % self._proc_per_node % 2:
self.node_type_ = 0 self._node_type = 0
else: else:
self.node_type_ = 1 self._node_type = 1
self.node_type_comm_ = self.comm_.Split(self.node_type_) self._node_type_comm = self._comm.Split(self._node_type)
self.role_is_generated_ = True self._role_is_generated = True
...@@ -64,9 +64,9 @@ class Fleet(object): ...@@ -64,9 +64,9 @@ class Fleet(object):
def __init__(self): def __init__(self):
self._opt_info = None # for fleet only self._opt_info = None # for fleet only
self.role_maker_ = None self._role_maker = None
self.local_ip_ = 0 self._local_ip = 0
self.is_initialized_ = False self._is_initialized = False
def init(self): def init(self):
# TODO(guru4elephant) # TODO(guru4elephant)
...@@ -78,22 +78,22 @@ class Fleet(object): ...@@ -78,22 +78,22 @@ class Fleet(object):
current node's role, e.g. worker, server, etc. current node's role, e.g. worker, server, etc.
""" """
if not self.is_initialized_: if not self.is_initialized_:
self.role_maker_ = MPISymetricRoleMaker() self._role_maker = MPISymetricRoleMaker()
self.role_maker_._generate_role() self._role_maker._generate_role()
self._fleet_ptr = fluid.core.Fleet() self._fleet_ptr = fluid.core.Fleet()
self.is_initialized_ = True self._is_initialized = True
def stop(self): def stop(self):
""" """
stop(): will be called after a user finishes his/her training task. Fleet instance will be stop(): will be called after a user finishes his/her training task. Fleet instance will be
destroyed when stop() is called. destroyed when stop() is called.
""" """
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
self._fleet_ptr.stop_server() self._fleet_ptr.stop_server()
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.role_maker_._finalize() self._role_maker._finalize()
def init_pserver(self): def init_pserver(self):
""" """
...@@ -110,15 +110,15 @@ class Fleet(object): ...@@ -110,15 +110,15 @@ class Fleet(object):
sys.exit(-1) sys.exit(-1)
self._fleet_ptr.init_server(self._dist_desc_str, self._fleet_ptr.init_server(self._dist_desc_str,
self.role_maker_._get_rank()) self.role_maker_._get_rank())
self.local_ip_ = self._fleet_ptr.run_server() self._local_ip = self._fleet_ptr.run_server()
# barrier_all for init_server # barrier_all for init_server
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) self._all_ips = self._role_maker._all_gather(self.local_ip_)
self._fleet_ptr.gather_servers(self.all_ips_, self._fleet_ptr.gather_servers(self._all_ips,
self.role_maker_._get_size()) self._role_maker._get_size())
# barrier_all for init_worker, wait all workers start # barrier_all for init_worker, wait all workers start
self.role_maker_._barrier_all() self._role_maker._barrier_all()
else: else:
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
...@@ -151,21 +151,21 @@ class Fleet(object): ...@@ -151,21 +151,21 @@ class Fleet(object):
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
# barrier_all for init_server, wait for server starts # barrier_all for init_server, wait for server starts
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) self._all_ips = self._role_maker._all_gather(self.local_ip_)
self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips,
self.role_maker_._get_size(), self._role_maker._get_size(),
self.role_maker_._get_rank()) self._role_maker._get_rank())
# barrier_all for init_worker # barrier_all for init_worker
self.role_maker_._barrier_all() self._role_maker._barrier_all()
# prepare for client to client communication # prepare for client to client communication
info = self._fleet_ptr.get_clients_info() info = self._fleet_ptr.get_clients_info()
all_info = self.role_maker_._worker_gather(info[0]) all_info = self._role_maker._worker_gather(info[0])
self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.gather_clients(all_info)
self._fleet_ptr.create_client2client_connection() self._fleet_ptr.create_client2client_connection()
# barrier for init model # barrier for init model
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
tables = self._dist_desc.trainer_param.dense_table tables = self._dist_desc.trainer_param.dense_table
for prog, scope in zip(programs, scopes): for prog, scope in zip(programs, scopes):
prog_id = str(id(prog)) prog_id = str(id(prog))
...@@ -192,7 +192,7 @@ class Fleet(object): ...@@ -192,7 +192,7 @@ class Fleet(object):
int(table.table_id), int(table.table_id),
var_name_list) var_name_list)
# barrier for init model done # barrier for init model done
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
else: else:
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
...@@ -201,39 +201,39 @@ class Fleet(object): ...@@ -201,39 +201,39 @@ class Fleet(object):
""" """
return the number of current job's worker num return the number of current job's worker num
""" """
return self.role_maker_._worker_num() return self._role_maker._worker_num()
def get_server_num(self): def get_server_num(self):
""" """
return the number of current job's server num return the number of current job's server num
""" """
return self.role_maker_._server_num() return self._role_maker._server_num()
def get_worker_index(self): def get_worker_index(self):
""" """
return the mpi rank of current worker return the mpi rank of current worker
""" """
return self.role_maker_._worker_index() return self._role_maker._worker_index()
def is_worker(self): def is_worker(self):
""" """
return whether current node is a worker return whether current node is a worker
""" """
return self.role_maker_._is_worker() return self._role_maker._is_worker()
def is_server(self): def is_server(self):
""" """
return whether current node is pserver return whether current node is pserver
""" """
return self.role_maker_._is_server() return self._role_maker._is_server()
def init_pserver_model(self): def init_pserver_model(self):
""" """
init pserver model called from pserver init pserver model called from pserver
""" """
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
self._fleet_ptr.init_model() self._fleet_ptr.init_model()
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
def save_pserver_model(self, save_path): def save_pserver_model(self, save_path):
""" """
......
...@@ -42,13 +42,13 @@ class DownpourServer(Server): ...@@ -42,13 +42,13 @@ class DownpourServer(Server):
""" """
def __init__(self): def __init__(self):
self.server_ = pslib.ServerParameter() self._server = pslib.ServerParameter()
self.server_.downpour_server_param.service_param.start_server_port = 0 self._server.downpour_server_param.service_param.start_server_port = 0
self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
self.server_.downpour_server_param.service_param.start_server_port = 0 self._server.downpour_server_param.service_param.start_server_port = 0
self.server_.downpour_server_param.service_param.server_thread_num = 12 self._server.downpour_server_param.service_param.server_thread_num = 12
def add_sparse_table(self, table_id, learning_rate, slot_key_vars, def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_var): slot_value_var):
...@@ -62,7 +62,7 @@ class DownpourServer(Server): ...@@ -62,7 +62,7 @@ class DownpourServer(Server):
Returns: Returns:
return None return None
""" """
table = self.server_.downpour_server_param.downpour_table_param.add() table = self._server.downpour_server_param.downpour_table_param.add()
table.table_id = table_id table.table_id = table_id
table.table_class = "DownpourSparseTable" table.table_class = "DownpourSparseTable"
table.type = pslib.PS_SPARSE_TABLE table.type = pslib.PS_SPARSE_TABLE
...@@ -123,7 +123,7 @@ class DownpourServer(Server): ...@@ -123,7 +123,7 @@ class DownpourServer(Server):
Returns: Returns:
return None return None
""" """
table = self.server_.downpour_server_param.downpour_table_param.add() table = self._server.downpour_server_param.downpour_table_param.add()
table.table_id = table_id table.table_id = table_id
table.table_class = "DownpourDenseTable" table.table_class = "DownpourDenseTable"
table.type = pslib.PS_DENSE_TABLE table.type = pslib.PS_DENSE_TABLE
...@@ -140,7 +140,7 @@ class DownpourServer(Server): ...@@ -140,7 +140,7 @@ class DownpourServer(Server):
""" """
Return downpour server program_desc Return downpour server program_desc
""" """
return self.server_ return self._server
class DownpourWorker(Worker): class DownpourWorker(Worker):
...@@ -155,7 +155,7 @@ class DownpourWorker(Worker): ...@@ -155,7 +155,7 @@ class DownpourWorker(Worker):
def __init__(self, window): def __init__(self, window):
self.window = window self.window = window
self.worker_ = pslib.DownpourTrainerParameter() self._worker = pslib.DownpourTrainerParameter()
def add_sparse_table(self, table_id, learning_rate, slot_key_vars, def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_vars): slot_value_vars):
...@@ -187,7 +187,7 @@ class DownpourWorker(Worker): ...@@ -187,7 +187,7 @@ class DownpourWorker(Worker):
Returns: Returns:
return None return None
""" """
table = self.worker_.dense_table.add() table = self._worker.dense_table.add()
table.table_id = table_id table.table_id = table_id
table.dense_variable_name.extend( table.dense_variable_name.extend(
filter(lambda x: x.find("embedding") == -1, filter(lambda x: x.find("embedding") == -1,
...@@ -200,4 +200,4 @@ class DownpourWorker(Worker): ...@@ -200,4 +200,4 @@ class DownpourWorker(Worker):
""" """
Return downpour worker program_desc Return downpour worker program_desc
""" """
return self.worker_ return self._worker
...@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer ...@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer
class DistributedOptimizerImplBase(object): class DistributedOptimizerImplBase(object):
def __init__(self, optimizer): def __init__(self, optimizer):
self.optimizer_ = optimizer self._optimizer = optimizer
self.learning_rate_ = optimizer._learning_rate self._learning_rate = optimizer._learning_rate
self.regularization_ = optimizer.regularization self._regularization = optimizer.regularization
def minimize(self, def minimize(self,
losses, losses,
...@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
# todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): add more optimizers here as argument
# todo(guru4elephant): make learning_rate as a variable # todo(guru4elephant): make learning_rate as a variable
super(DistributedAdam, self).__init__(optimizer) super(DistributedAdam, self).__init__(optimizer)
self.window_ = 1 self._window = 1
self.type = "downpour" self.type = "downpour"
self.data_norm_name = [ self.data_norm_name = [
".batch_size", ".batch_square_sum", ".batch_sum", ".batch_size", ".batch_square_sum", ".batch_sum",
...@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
server = DownpourServer() server = DownpourServer()
worker = DownpourWorker(self.window_) worker = DownpourWorker(self.window_)
sparse_table_index = 0 sparse_table_index = 0
server.add_sparse_table(sparse_table_index, self.learning_rate_, server.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb) prefetch_slots, prefetch_slots_emb)
worker.add_sparse_table(sparse_table_index, self.learning_rate_, worker.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb) prefetch_slots, prefetch_slots_emb)
dense_table_index = 1 dense_table_index = 1
program_configs = {} program_configs = {}
...@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
data_norm_grads.append(i[1]) data_norm_grads.append(i[1])
if not is_data_norm_data: if not is_data_norm_data:
grads.append(i[1]) grads.append(i[1])
server.add_dense_table(dense_table_index, self.learning_rate_, server.add_dense_table(dense_table_index, self._learning_rate,
params, grads) params, grads)
worker.add_dense_table(dense_table_index, self.learning_rate_, worker.add_dense_table(dense_table_index, self._learning_rate,
params, grads) params, grads)
program_configs[program_id]["pull_dense"] = [dense_table_index] program_configs[program_id]["pull_dense"] = [dense_table_index]
program_configs[program_id]["push_dense"] = [dense_table_index] program_configs[program_id]["push_dense"] = [dense_table_index]
...@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
if len(data_norm_params) != 0 and len(data_norm_grads) != 0: if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
dense_table_index += 1 dense_table_index += 1
server.add_data_norm_table(dense_table_index, server.add_data_norm_table(dense_table_index,
self.learning_rate_, self._learning_rate,
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
worker.add_dense_table(dense_table_index, self.learning_rate_, worker.add_dense_table(dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
#program_config.pull_dense_table_id.extend([dense_table_index]) #program_config.pull_dense_table_id.extend([dense_table_index])
#program_config.push_dense_table_id.extend([dense_table_index]) #program_config.push_dense_table_id.extend([dense_table_index])
......
...@@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None): ...@@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None):
@templatedoc(op_type="yolov3_loss") @templatedoc(op_type="yolov3_loss")
def yolov3_loss(x, def yolov3_loss(x,
gtbox, gt_box,
gtlabel, gt_label,
anchors, anchors,
anchor_mask, anchor_mask,
class_num, class_num,
ignore_thresh, ignore_thresh,
downsample_ratio, downsample_ratio,
gtscore=None, gt_score=None,
use_label_smooth=True, use_label_smooth=True,
name=None): name=None):
""" """
...@@ -524,12 +524,12 @@ def yolov3_loss(x, ...@@ -524,12 +524,12 @@ def yolov3_loss(x,
Args: Args:
x (Variable): ${x_comment} x (Variable): ${x_comment}
gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4],
in the third dimenstion, x, y, w, h should be stored in the third dimenstion, x, y, w, h should be stored
and x, y, w, h should be relative value of input image. and x, y, w, h should be relative value of input image.
N is the batch number and B is the max box number in N is the batch number and B is the max box number in
an image. an image.
gtlabel (Variable): class id of ground truth boxes, shoud be in shape gt_label (Variable): class id of ground truth boxes, shoud be in shape
of [N, B]. of [N, B].
anchors (list|tuple): ${anchors_comment} anchors (list|tuple): ${anchors_comment}
anchor_mask (list|tuple): ${anchor_mask_comment} anchor_mask (list|tuple): ${anchor_mask_comment}
...@@ -537,7 +537,7 @@ def yolov3_loss(x, ...@@ -537,7 +537,7 @@ def yolov3_loss(x,
ignore_thresh (float): ${ignore_thresh_comment} ignore_thresh (float): ${ignore_thresh_comment}
downsample_ratio (int): ${downsample_ratio_comment} downsample_ratio (int): ${downsample_ratio_comment}
name (string): the name of yolov3 loss. Default None. name (string): the name of yolov3 loss. Default None.
gtscore (Variable): mixup score of ground truth boxes, shoud be in shape gt_score (Variable): mixup score of ground truth boxes, shoud be in shape
of [N, B]. Default None. of [N, B]. Default None.
use_label_smooth (bool): ${use_label_smooth_comment} use_label_smooth (bool): ${use_label_smooth_comment}
...@@ -558,13 +558,13 @@ def yolov3_loss(x, ...@@ -558,13 +558,13 @@ def yolov3_loss(x,
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32')
gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32') gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label,
gtscore=gtscore, anchors=anchors, gt_score=gt_score, anchors=anchors,
anchor_mask=anchor_mask, class_num=80, anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32) ignore_thresh=0.7, downsample_ratio=32)
""" """
...@@ -572,11 +572,11 @@ def yolov3_loss(x, ...@@ -572,11 +572,11 @@ def yolov3_loss(x,
if not isinstance(x, Variable): if not isinstance(x, Variable):
raise TypeError("Input x of yolov3_loss must be Variable") raise TypeError("Input x of yolov3_loss must be Variable")
if not isinstance(gtbox, Variable): if not isinstance(gt_box, Variable):
raise TypeError("Input gtbox of yolov3_loss must be Variable") raise TypeError("Input gtbox of yolov3_loss must be Variable")
if not isinstance(gtlabel, Variable): if not isinstance(gt_label, Variable):
raise TypeError("Input gtlabel of yolov3_loss must be Variable") raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if gtscore is not None and not isinstance(gtscore, Variable): if gt_score is not None and not isinstance(gt_score, Variable):
raise TypeError("Input gtscore of yolov3_loss must be Variable") raise TypeError("Input gtscore of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple): if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple") raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
...@@ -602,11 +602,11 @@ def yolov3_loss(x, ...@@ -602,11 +602,11 @@ def yolov3_loss(x,
inputs = { inputs = {
"X": x, "X": x,
"GTBox": gtbox, "GTBox": gt_box,
"GTLabel": gtlabel, "GTLabel": gt_label,
} }
if gtscore: if gt_score:
inputs["GTScore"] = gtscore inputs["GTScore"] = gt_score
attrs = { attrs = {
"anchors": anchors, "anchors": anchors,
...@@ -1542,7 +1542,7 @@ def multi_box_head(inputs, ...@@ -1542,7 +1542,7 @@ def multi_box_head(inputs,
.. code-block:: python .. code-block:: python
mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
inputs=[conv1, conv2, conv3, conv4, conv5, conv5], inputs=[conv1, conv2, conv3, conv4, conv5, conv6],
image=images, image=images,
num_classes=21, num_classes=21,
min_ratio=20, min_ratio=20,
......
...@@ -196,6 +196,7 @@ __all__ = [ ...@@ -196,6 +196,7 @@ __all__ = [
'npair_loss', 'npair_loss',
'pixel_shuffle', 'pixel_shuffle',
'fsp_matrix', 'fsp_matrix',
'continuous_value_model',
] ]
kIgnoreIndex = -100 kIgnoreIndex = -100
...@@ -5720,12 +5721,21 @@ def hsigmoid(input, ...@@ -5720,12 +5721,21 @@ def hsigmoid(input,
raise ValueError( raise ValueError(
"num_classes must not be less than 2 with default tree") "num_classes must not be less than 2 with default tree")
if (not is_custom) and (is_sparse):
print("Sparse mode should not be used without custom tree")
is_sparse = False
if (not is_custom) and ((path_table is not None) or
(path_code is not None)):
raise ValueError(
"only num_classes should be passed without custom tree")
if (is_custom) and (path_code is None): if (is_custom) and (path_code is None):
raise ValueError("path_code should not be None with costum tree") raise ValueError("path_code should not be None with custom tree")
elif (is_custom) and (path_table is None): elif (is_custom) and (path_table is None):
raise ValueError("path_table should not be None with costum tree") raise ValueError("path_table should not be None with custom tree")
elif (is_custom) and (num_classes is None): elif (is_custom) and (num_classes is None):
raise ValueError("num_classes should not be None with costum tree") raise ValueError("num_classes should not be None with custom tree")
else: else:
pass pass
...@@ -6268,6 +6278,8 @@ def sampled_softmax_with_cross_entropy(logits, ...@@ -6268,6 +6278,8 @@ def sampled_softmax_with_cross_entropy(logits,
sampled_label = helper.create_variable_for_type_inference(dtype='int64') sampled_label = helper.create_variable_for_type_inference(dtype='int64')
sampled_softlabel = helper.create_variable_for_type_inference( sampled_softlabel = helper.create_variable_for_type_inference(
dtype=logits.dtype) dtype=logits.dtype)
logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype)
labels_dim = helper.create_variable_for_type_inference(dtype=label.type)
helper.append_op( helper.append_op(
type='sample_logits', type='sample_logits',
...@@ -6281,7 +6293,9 @@ def sampled_softmax_with_cross_entropy(logits, ...@@ -6281,7 +6293,9 @@ def sampled_softmax_with_cross_entropy(logits,
'Samples': samples, 'Samples': samples,
'Probabilities': probabilities, 'Probabilities': probabilities,
'SampledLabels': sampled_label, 'SampledLabels': sampled_label,
'SampledLogits': sampled_logits 'SampledLogits': sampled_logits,
'LogitsDim': logits_dim,
'LabelsDim': labels_dim
}, },
attrs={ attrs={
'use_customized_samples': use_customized_samples, 'use_customized_samples': use_customized_samples,
...@@ -11202,3 +11216,54 @@ def fsp_matrix(x, y): ...@@ -11202,3 +11216,54 @@ def fsp_matrix(x, y):
input_param_name='x')) input_param_name='x'))
helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out}) helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
return out return out
def continuous_value_model(input, cvm, use_cvm=True):
"""
**continuous_value_model layers**
continuous value model(cvm). Now, it only considers show and click value in CTR project.
We assume that input is an embedding vector with cvm_feature, whose shape is [N * D] (D is 2 + embedding dim).
If use_cvm is True, it will log(cvm_feature), and output shape is [N * D].
If use_cvm is False, it will remove cvm_feature from input, and output shape is [N * (D - 2)].
This layer accepts a tensor named input which is ID after embedded(lod level is 1), cvm is a show_click info.
Args:
input (Variable): a 2-D LodTensor with shape [N x D], where N is the batch size, D is 2 + the embedding dim. lod level = 1.
cvm (Variable): a 2-D Tensor with shape [N x 2], where N is the batch size, 2 is show and click.
use_cvm (bool): use cvm or not. if use cvm, the output dim is the same as input
if don't use cvm, the output dim is input dim - 2(remove show and click)
(cvm op is a customized op, which input is a sequence has embedd_with_cvm default, so we need an op named cvm to decided whever use it or not.)
Returns:
Variable: A 2-D LodTensor with shape [N x D], if use cvm, D is equal to input dim, if don't use cvm, D is equal to input dim - 2.
Examples:
.. code-block:: python
input = fluid.layers.data(name="input", shape=[-1, 1], lod_level=1, append_batch_size=False, dtype="int64")#, stop_gradient=False)
label = fluid.layers.data(name="label", shape=[-1, 1], append_batch_size=False, dtype="int64")
embed = fluid.layers.embedding(
input=input,
size=[100, 11],
dtype='float32')
ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1)
show_clk = fluid.layers.cast(fluid.layers.concat([ones, label], axis=1), dtype='float32')
show_clk.stop_gradient = True
input_with_cvm = fluid.layers.continuous_value_model(embed, show_clk, True)
"""
helper = LayerHelper('cvm', **locals())
out = helper.create_variable(dtype=input.dtype)
helper.append_op(
type='cvm',
inputs={'X': [input],
'CVM': [cvm]},
outputs={'Y': [out]},
attrs={"use_cvm": use_cvm})
return out
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册