Commit eb02eb94 authored by SunGaofeng

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into affine_modify

@@ -71,7 +71,8 @@ option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(ON_INFER "Turn on inference optimization." OFF)
-option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
+option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF)
+option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
 option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
......
@@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
                      -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                      -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
                      -DCMAKE_INSTALL_LIBDIR=lib
+                     -DBUILD_SHARED_LIBS=OFF
      CMAKE_CACHE_ARGS
                      -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
                      -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
......
@@ -118,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4'))
 paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c'))
 paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca'))
+paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca'))
+paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa'))
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
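The two entries added above expose boolean reductions in the layers API. A minimal usage sketch (variable names are illustrative): `reduce_all`/`reduce_any` reduce a bool tensor with logical AND/OR, over all dimensions when `dim` is left as None.

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[4], dtype='float32')
y = fluid.layers.data(name='y', shape=[4], dtype='float32')
mask = fluid.layers.less_than(x, y)       # element-wise bool tensor

all_true = fluid.layers.reduce_all(mask)  # True iff every element of mask is True
any_true = fluid.layers.reduce_any(mask)  # True iff at least one element is True
```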
@@ -125,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
-paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990'))
+paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b'))
 paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32'))
 paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab'))
@@ -204,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha
 paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
 paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
 paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
+paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2'))
 paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
 paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
 paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25'))
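The new `rank` entry above complements `shape`: it returns the number of dimensions of the input as a tensor. A hedged sketch:

```python
import paddle.fluid as fluid

# With the default append_batch_size=True, the variable's shape becomes
# [-1, 3, 32, 32], so its rank is 4.
x = fluid.layers.data(name='x', shape=[3, 32, 32], dtype='float32')
r = fluid.layers.rank(x)
```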
@@ -236,7 +239,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
-paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
@@ -272,6 +275,7 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
+paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
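`linspace`, added next to `range` above, generates `num` evenly spaced values over a closed interval. A minimal sketch:

```python
import paddle.fluid as fluid

# Expected to produce [0.0, 2.5, 5.0, 7.5, 10.0].
vals = fluid.layers.linspace(0.0, 10.0, 5, 'float32')
```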
@@ -297,12 +301,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs
 paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
 paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
 paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7'))
-paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff'))
+paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
+paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
+paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
+paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
+paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
 paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
 paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
@@ -361,8 +365,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
-paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
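This hunk drops `append_LARS` from the public layers API and refreshes the `cosine_decay` docstring hash. A hedged sketch of how the cosine schedule is typically plugged into an optimizer (the hyperparameter values are illustrative):

```python
import paddle.fluid as fluid

# Cosine-anneals the learning rate from 0.1 toward 0 over 120 epochs,
# advancing once per step with 1000 steps per epoch.
lr = fluid.layers.cosine_decay(learning_rate=0.1, step_each_epoch=1000,
                               epochs=120)
sgd = fluid.optimizer.SGD(learning_rate=lr)
```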
......
@@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) {
   }
   VLOG(3) << "file_idx_=" << *file_idx_;
   *filename = filelist_[(*file_idx_)++];
-  // LOG(ERROR) << "pick file:" << *filename;
   return true;
 }
@@ -466,6 +465,17 @@ void MultiSlotDataFeed::Init(
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        // if the slot is dense, reserve a leading 0 as a batch-size placeholder
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
     }
   }
   feed_vec_.resize(use_slots_.size());
@@ -752,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
 #endif
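A minimal Python sketch of the shape handling introduced above (function names are illustrative, not Paddle API): at Init time a dense slot whose declared shape has a positive leading entry gets a 0 prepended as a batch-size placeholder, and at feed time that placeholder is overwritten with the actual batch size before Resize.

```python
def build_slot_shape(is_dense, declared_shape):
    """Mirror of MultiSlotDataFeed::Init's shape handling (illustrative)."""
    local_shape = []
    if is_dense and declared_shape[0] > 0:
        local_shape.append(0)  # placeholder for the batch dimension
    local_shape.extend(declared_shape)
    return local_shape

def resize_for_batch(slot_shape, batch_size):
    """Mirror of PutToFeedVec: fill in the real batch size before Resize."""
    slot_shape = list(slot_shape)
    slot_shape[0] = batch_size
    return slot_shape

# A dense slot declared with shape [128]:
shape = build_slot_shape(True, [128])   # -> [0, 128]
shape = resize_for_batch(shape, 32)     # -> [32, 128]
```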
@@ -785,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init(
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
     }
   }
   feed_vec_.resize(use_slots_.size());
@@ -940,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
 #endif
......
@@ -142,6 +142,7 @@ class DataFeed {
   // object)
   std::vector<std::string> all_slots_;
   std::vector<std::string> all_slots_type_;
+  std::vector<std::vector<int>> use_slots_shape_;
   std::vector<int>
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
......
@@ -19,6 +19,7 @@ message Slot {
   required string type = 2;
   optional bool is_dense = 3 [ default = false ];
   optional bool is_used = 4 [ default = false ];
+  repeated int32 shape = 5;  // we can define N-D Tensor
 }
 message MultiSlotDesc { repeated Slot slots = 1; }
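With the new `shape` field, a slot can declare an N-D dense layout up front instead of relying on the old `total_instance / batch_size` inference. A hypothetical text-format config, embedded in a Python string purely for illustration (the slot name and dimensions are assumptions, not from this commit):

```python
# Hypothetical MultiSlotDesc text-format snippet using the new `shape` field.
slot_desc = """
slots {
  name: "dense_feature"
  type: "float"
  is_dense: true
  is_used: true
  shape: 13
  shape: 7
}
"""
```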
......
@@ -64,9 +64,12 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
             node->Op()->GetNullableAttr("epmap"));
         auto height_section = boost::get<std::vector<int64_t>>(
             node->Op()->GetNullableAttr("sections"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
         send_varname_to_ctx[send_var_name] =
             operators::distributed::RpcContext(send_var_name, send_varnames,
-                                               epmap, height_section);
+                                               epmap, height_section,
+                                               trainer_id);
         VLOG(3) << "find and init an send op: "
                 << send_varname_to_ctx[send_var_name];
       } else if (node->Name() == "recv") {
@@ -75,9 +78,11 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
             node->Op()->GetNullableAttr("recv_varnames"));
         auto epmap = boost::get<std::vector<std::string>>(
             node->Op()->GetNullableAttr("epmap"));
+        auto trainer_id =
+            boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
         recv_varname_to_ctx[recv_var_name] =
             operators::distributed::RpcContext(recv_var_name, recv_varnames,
-                                               epmap, {});
+                                               epmap, {}, trainer_id);
         nodes_to_delete.push_back(node);
         VLOG(3) << "find and remove an recv op: "
                 << recv_varname_to_ctx[recv_var_name];
......
@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                  "mode.";
       strategy_.fuse_all_optimizer_ops_ = false;
     } else {
-      VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
-      AppendPass("alloc_continuous_space_for_grad_pass");
       // NOTE: fuse_all_xx_ops will count the number of xx operator first,
       // if the number is zero, fuse_all_reduce_ops will do nothing.
       // Currently, only one type of optimization algorithm can be fused.
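The pass builder above no longer appends `alloc_continuous_space_for_grad_pass` itself; the fuse-optimizer passes now set up fused gradient space on demand (see `InitFusedGradsAndAllocSpaceForGrads` below). From Python the whole path is driven through the build strategy; a hedged sketch (executor wiring omitted):

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True  # exercises the fuse_adam/sgd_op_pass path
```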
@@ -150,6 +148,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("runtime_context_cache_pass");
     }

+    if (strategy_.cache_expected_kernel_) {
+      VLOG(10) << "Add expected_kernel_cache_pass";
+      AppendPass("expected_kernel_cache_pass");
+    }
+
     AppendMultiDevPass(strategy_);

     if (strategy_.fuse_all_reduce_ops_) {
@@ -337,3 +340,4 @@ USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
 USE_PASS(runtime_context_cache_pass);
+USE_PASS(expected_kernel_cache_pass);
@@ -83,11 +83,11 @@ struct BuildStrategy {
   bool sync_batch_norm_{false};

-  bool memory_optimize_{true};
-  // TODO(dzhwinter):
-  // make enable_inplace, memory_optimize_
-  // memory_early_delete_ true by default
-  bool enable_inplace_{true};
+  // FIXME(liuwei1031): disable memory_optimize and enable_inplace in 1.4;
+  // to open them by default, we need to solve the fetch variable issue
+  bool memory_optimize_{false};
+  bool enable_inplace_{false};

   bool enable_sequential_execution_{false};
@@ -108,6 +108,7 @@ struct BuildStrategy {
   bool remove_unnecessary_lock_{true};

   bool cache_runtime_context_{false};
+  bool cache_expected_kernel_{true};

   // NOTE:
   // Before you add new options, think if it's a general strategy that works
......
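Since `memory_optimize_` and `enable_inplace_` now default to `false`, programs that relied on the old defaults must opt back in from Python. A hedged sketch (the attribute names follow the Python `BuildStrategy` bindings of this era):

```python
import paddle.fluid as fluid

build_strategy = fluid.BuildStrategy()
# Both defaulted to True before this change; re-enable explicitly if the
# fetch-variable issue mentioned in the FIXME does not affect your program.
build_strategy.memory_optimize = True
build_strategy.enable_inplace = True
```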
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }

 const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
-  return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
+  return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
 }

 void FuseAdamOpPass::FuseOptimizerOps(
@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps(
   VLOG(10) << "Insert adam to graph ";
   OpDesc adam_desc(adam_ops[0]->Op()->Block());
   adam_desc.SetType("adam");
-  adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
+  adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
   adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
   adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});

   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
+  adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
   adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
   adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));

-  adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
   adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
   adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
   adam_desc.SetAttr("beta1", beta1);
......
@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);

   const std::string fuse_op_type = GetOpType();
-  const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  aux_var_names.emplace_back(kParam);
+  aux_var_names.emplace_back(kGrad);

   // Step 1: Get the specified op and auxiliary variables.
   std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     result.Set(kFusedVars, new FusedVars);
   }
   std::unordered_map<std::string, std::string> fused_vars_name;
-  fused_vars_name.reserve(aux_var_names.size() + 1);
+  fused_vars_name.reserve(aux_var_names.size());
   auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
   const std::string prefix(kFusedVarNamePrefix);
   // NOTE: the fused_var_name should be unique.
@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   }

   // Step 3: Get the fused Gradient's name
-  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
-  if (!result.Has(kFusedGrads)) {
-    PADDLE_THROW(
-        "The alloc_continuous_space_for_grad_pass should be called before this "
-        "pass.");
-  }
-  auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
-  auto &fused_vars = result.Get<FusedVars>(kFusedVars);
-  auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
-  PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
-  fused_vars_name.emplace("Grad", fused_grad);
-
-  // Step 4: Sort the parameters and auxiliary variables according
-  // to parameters' name to make variables' name correspond correctly.
-  PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
-  PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
-                    "The size of params_grads and aux_var_set are not equal.");
-  SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
-
-  // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
+  bool grad_fused = false;
+  if (result.Has(kParamsAndGrads)) {
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    PADDLE_ENFORCE_EQ(
+        params_grads.size(), aux_var_set.at(kGrad).size(),
+        "The number of gradients and optimizer ops is not equal.");
+    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
+                                                 aux_var_set.at(kGrad).end());
+    size_t same_grad_num = 0;
+    for (auto &p_g : params_grads) {
+      if (opt_grad_set.count(p_g.second)) {
+        ++same_grad_num;
+      }
+    }
+
+    // NOTE(zcd): the gradient of kParamsAndGrads may be different from
+    // kGrad.
+    if (same_grad_num == aux_var_set.at(kGrad).size()) {
+      if (!result.Has(kFusedGrads)) {
+        PADDLE_THROW(
+            "The alloc_continuous_space_for_grad_pass should be called before "
+            "this pass.");
+      }
+      auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
+      auto &fused_vars = result.Get<FusedVars>(kFusedVars);
+      auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
+      PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
+      fused_vars_name[kGrad] = fused_grad;
+
+      // Sort the parameters and auxiliary variables according
+      // to parameters' name to make variables' name correspond correctly.
+      SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
+      grad_fused = true;
+    }
+  }
+
+  // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
   // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
+  aux_var_names.pop_back();
+  if (!grad_fused) {
+    InitFusedGradsAndAllocSpaceForGrads(
+        places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
+        fused_vars_name.at(kGrad), &result);
+  }
   InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
                                     aux_var_set, fused_vars_name);

-  // Step 6: Fuse optimizer Ops and Scale Ops
+  // Step 5: Fuse optimizer Ops and Scale Ops
   FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);

-  // Step 7: Remove optimizer Ops
+  // Step 6: Remove optimizer Ops
   for (auto &opt_op : opt_ops) {
     graph->RemoveNode(opt_op);
   }
 }

+void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
+    const std::vector<platform::Place> &places,
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<std::string> &params,
+    const std::vector<std::string> &grads, const std::string &fused_grad_name,
+    ir::Graph *result) const {
+  // Get Var Nodes
+  std::unordered_map<std::string, ir::Node *> vars;
+  for (ir::Node *node : result->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // Note: The graph may have the same name node. For example, parameter
+      // is the input of operator and it also is the output of optimizer;
+      vars.emplace(node->Var()->Name(), node);
+    }
+  }
+  // Init Grads
+  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+    auto &scope = *it;
+    VLOG(10) << "Init " << fused_grad_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
+                   "%s has existed in scope.", fused_grad_name);
+    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
+    for (auto &grad_var_name : grads) {
+      auto iter = vars.find(grad_var_name);
+      PADDLE_ENFORCE(iter != vars.end());
+      PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+      PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                        proto::VarType::LOD_TENSOR);
+      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
+    }
+  }
+  // Define Ops
+  ProgramDesc program_desc;
+  auto *global_block = program_desc.MutableBlock(0);
+  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
+                             false, false);
+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}

 void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::vector<platform::Place> &places,
     const std::vector<Scope *> &local_scopes,
@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::unordered_map<std::string, std::vector<std::string>>
         &aux_var_set,
     const std::unordered_map<std::string, std::string> &fused_vars_name) const {
-  VLOG(10) << "Init FusedVars.";
-  // Alloc parameters and auxiliary vars in the respective scope.
-  size_t idx = local_scopes.size();
-  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
-       ++iter, --idx) {
-    auto &scope = *iter;
-    for (auto &var_name : aux_var_names) {
-      auto fused_var_name = fused_vars_name.at(var_name);
-      VLOG(10) << "Init " << fused_var_name;
-      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                     "%s has exist in scope[%d]", fused_var_name, idx);
-      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-    }
+  // Init Vars
+  for (auto &var_name : aux_var_names) {
+    auto &fused_var_name = fused_vars_name.at(var_name);
+    InitVars(local_scopes, fused_var_name);
   }

+  // Define Ops
   ProgramDesc program_desc;
   auto *global_block = program_desc.MutableBlock(0);
   for (auto &var_name : aux_var_names) {
-    AppendAllocContinuousSpace(aux_var_set.at(var_name),
-                               fused_vars_name.at(var_name), true,
-                               global_block);
+    AppendAllocContinuousSpace(
+        aux_var_set.at(var_name), aux_var_set.at(var_name),
+        fused_vars_name.at(var_name), global_block, true);
   }

+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}
+
+void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
+                                     const std::vector<Scope *> &local_scopes,
+                                     const BlockDesc &global_block) const {
   for (size_t i = 0; i < local_scopes.size(); ++i) {
-    for (auto &op_desc : global_block->AllOps()) {
+    for (auto &op_desc : global_block.AllOps()) {
       auto op = OpRegistry::CreateOp(*op_desc);
       op->Run(*local_scopes[i], places[i]);
     }
   }
 }
+
+void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
+                                   const std::string &fused_var_name) const {
+  VLOG(10) << "Init FusedVars.";
+  // Alloc parameters and auxiliary vars in the respective scope.
+  size_t idx = local_scopes.size();
+  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
+       ++iter, --idx) {
+    auto &scope = *iter;
+    VLOG(10) << "Init " << fused_var_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                   "%s has exist in scope[%d]", fused_var_name, idx);
+    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+  }
+}

 void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
 }

 void FuseOptimizerOpPass::AppendAllocContinuousSpace(
-    const std::vector<std::string> &args, const std::string &out_arg,
-    bool copy_data, BlockDesc *global_block) const {
+    const std::vector<std::string> &in_args,
+    const std::vector<std::string> &out_args, const std::string &fused_out_arg,
+    BlockDesc *global_block, bool copy_data, bool check_name) const {
   auto op_desc = global_block->AppendOp();
   op_desc->SetType("alloc_continuous_space");
-  op_desc->SetInput("Input", args);
-  op_desc->SetOutput("Output", args);
-  op_desc->SetOutput("FusedOutput", {out_arg});
+  op_desc->SetInput("Input", in_args);
+  op_desc->SetOutput("Output", out_args);
+  op_desc->SetOutput("FusedOutput", {fused_out_arg});
   op_desc->SetAttr("copy_data", copy_data);
-  op_desc->SetAttr("check_name", true);
+  op_desc->SetAttr("check_name", check_name);
 }

 void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
......
@@ -27,6 +27,10 @@ namespace paddle {
 namespace framework {
 namespace details {

+constexpr char kGrad[] = "Grad";
+constexpr char kParam[] = "Param";
+constexpr char kLearningRate[] = "LearningRate";
+
 class FuseOptimizerOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override;
@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass {
       std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
       const;

-  void AppendAllocContinuousSpace(const std::vector<std::string> &args,
-                                  const std::string &out_arg, bool copy_data,
-                                  BlockDesc *global_block) const;
+  void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
+                                  const std::vector<std::string> &out_args,
+                                  const std::string &fused_out_arg,
+                                  BlockDesc *global_block, bool copy_data,
+                                  bool check_name = true) const;
+
+  void InitFusedGradsAndAllocSpaceForGrads(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::vector<std::string> &params,
+      const std::vector<std::string> &grads, const std::string &fused_grad_name,
+      ir::Graph *result) const;

   void InitFusedVarsAndAllocSpaceForVars(
       const std::vector<platform::Place> &places,
@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass {
           &aux_var_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name)
       const;
+
+  void RunInitOps(const std::vector<platform::Place> &places,
+                  const std::vector<Scope *> &local_scopes,
+                  const BlockDesc &global_block) const;
+
+  void InitVars(const std::vector<Scope *> &local_scopes,
+                const std::string &fused_var_name) const;
 };

 }  // namespace details
......
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }

 const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
-  return {"Param"};
+  return {};
 }

 void FuseSgdOpPass::FuseOptimizerOps(
@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps(
   // Add fused scale
   OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
   Sgd_desc.SetType("sgd");
-  Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
-  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
+  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});

   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
+  Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));

   // NOTE: multi_devices_pass requires that every op should have a role.
   Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
......
@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
   VLOG(1) << "set num_threads: " << strategy_.num_threads_
           << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
-    executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
+    executors_.emplace_back(new details::FastThreadedSSAGraphExecutor(
         strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
   }
 }
......
@@ -14,12 +14,12 @@
 #pragma once

+#include <memory>
 #include <string>
 #include <vector>
 #include "ThreadPool.h"
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"

 namespace paddle {
@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<platform::Place> places_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;

-  std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
+  std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>>
+      executors_;

   ExceptionHolder exception_holder_;
 };
......
@@ -21,40 +21,40 @@ namespace framework {
 void DownpourWorker::Initialize(const TrainerDesc& desc) {
   param_ = desc.downpour_param();
-  for (size_t i = 0; i < param_.sparse_table_size(); ++i) {
+  for (int i = 0; i < param_.sparse_table_size(); ++i) {
     uint64_t table_id =
         static_cast<uint64_t>(param_.sparse_table(i).table_id());
     TableParameter table = param_.sparse_table(i);
     sparse_key_names_[table_id].resize(table.sparse_key_name_size());
-    for (size_t j = 0; j < table.sparse_key_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
       sparse_key_names_[table_id][j] = table.sparse_key_name(j);
     }
     sparse_value_names_[table_id].resize(table.sparse_value_name_size());
-    for (size_t j = 0; j < table.sparse_value_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
       sparse_value_names_[table_id][j] = table.sparse_value_name(j);
     }
     sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
-    for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
     }
     label_var_name_[table_id] = table.label_var_name();
   }
-  for (size_t i = 0; i < param_.dense_table_size(); ++i) {
+  for (int i = 0; i < param_.dense_table_size(); ++i) {
     uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
     auto table = param_.dense_table(i);
     dense_value_names_[table_id].resize(table.dense_value_name_size());
-    for (size_t j = 0; j < table.dense_value_name_size(); ++j) {
+    for (int j = 0; j < table.dense_value_name_size(); ++j) {
       dense_value_names_[table_id][j] = table.dense_value_name(j);
     }
     dense_grad_names_[table_id].resize(table.dense_grad_name_size());
-    for (size_t j = 0; j < table.dense_grad_name_size(); ++j) {
+    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
       dense_grad_names_[table_id][j] = table.dense_grad_name(j);
     }
   }

   skip_ops_.resize(param_.skip_ops_size());
-  for (size_t i = 0; i < param_.skip_ops_size(); ++i) {
+  for (int i = 0; i < param_.skip_ops_size(); ++i) {
     skip_ops_[i] = param_.skip_ops(i);
   }
@@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
   LoDTensor* tensor = var->GetMutable<LoDTensor>();
   int64_t* label_ptr = tensor->data<int64_t>();

-  int global_index = 0;
+  size_t global_index = 0;
   for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
     VLOG(3) << "sparse_key_names_[" << i
             << "]: " << sparse_key_names_[table_id][i];
     Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
     LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
     int64_t* ids = tensor->data<int64_t>();
-    int fea_idx = 0;
+    size_t fea_idx = 0;
     // tensor->lod()[0].size() == batch_size + 1
     for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
       for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
@@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
     auto& tensor_lod = tensor->lod()[0];
     LoD data_lod{tensor_lod};
     tensor_emb->set_lod(data_lod);
-    for (auto index = 0u; index < len; ++index) {
+    for (int index = 0; index < len; ++index) {
       if (ids[index] == 0u) {
         memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
                sizeof(float) * table.emb_dim());
@@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() {
    read_time += timeline.ElapsedSec();
    total_time += timeline.ElapsedSec();
    VLOG(3) << "program config size: " << param_.program_config_size();
    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
         ++i) {
      uint64_t tid = static_cast<uint64_t>(
          param_.program_config(0).pull_sparse_table_id(i));
@@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
    }
    if (need_to_push_sparse_) {
      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_sparse_table_id(i));
        TableParameter table;
@@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
    if (need_to_push_dense_) {
      timeline.Start();
      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        fleet_ptr_->PushDenseVarsAsync(
@@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
    }
    if (need_to_push_dense_) {
      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
@@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() {
  int cur_batch;
  while ((cur_batch = device_reader_->Next()) > 0) {
    // pull sparse here
    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
         ++i) {
      uint64_t tid = static_cast<uint64_t>(
          param_.program_config(0).pull_sparse_table_id(i));
@@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() {
    if (need_to_push_sparse_) {
      // push gradients here
      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_sparse_table_id(i));
        TableParameter table;
@@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() {
    }
    if (need_to_push_dense_) {
      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        fleet_ptr_->PushDenseVarsAsync(
@@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() {
    }
    if (need_to_push_dense_) {
      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
......
@@ -23,7 +23,7 @@ namespace ir {
void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Applies Expected Kernel Cache strategy.";
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp() && n->Op()) {
      n->Op()->SetAttr(kEnableCacheExpectedKernel, true);
    }
  }
......
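The extra `n->Op()` test above matters because an IR node can report `IsOp()` while carrying a null op description, and calling `SetAttr` through it would dereference null. A minimal standalone sketch of the guard, with a hypothetical `Node`/`OpDesc` pair:

#include <iostream>
#include <vector>

struct OpDesc { const char* type; };
struct Node {
  bool is_op;
  OpDesc* op;  // may be null even when is_op is true
  bool IsOp() const { return is_op; }
  OpDesc* Op() const { return op; }
};

int main() {
  OpDesc relu{"relu"};
  std::vector<Node> nodes = {{true, &relu}, {true, nullptr}, {false, nullptr}};
  for (const Node& n : nodes) {
    if (n.IsOp() && n.Op()) {  // second test avoids dereferencing a null desc
      std::cout << n.Op()->type << "\n";  // prints only "relu"
    }
  }
  return 0;
}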
@@ -31,10 +31,10 @@ namespace paddle {
namespace framework {
namespace ir {
namespace {
void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
                               ir::NodeComp> &adj_list,
                ir::Node *node, std::unordered_set<ir::Node *> *visited,
                std::vector<ir::Node *> *ret) {
  visited->insert(node);
  for (auto adj : adj_list.at(node)) {
@@ -50,7 +50,8 @@ void SortHelper(
bool HasCircleHelper(
    ir::Node *node,
    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
        &adj_list,
    std::unordered_set<ir::Node *> *visited,
    std::unordered_set<ir::Node *> *in_trace,
    std::vector<std::vector<ir::Node *>> *circles) {
@@ -84,7 +85,8 @@ bool HasCircleHelper(
}
bool HasCircleInternal(
    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
        &adj_list,
    std::vector<std::vector<ir::Node *>> *circles) {
  std::unordered_set<ir::Node *> visited;
  std::unordered_set<ir::Node *> in_trace;
@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph,
}
std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
      adj_list = BuildOperationAdjList(graph);
  PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
  std::unordered_set<ir::Node *> visited;
  std::vector<ir::Node *> ret;
@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
      SortHelper(adj_list, adj.first, &visited, &ret);
    }
  }
  return ret;
}
// Build operator inlink edge table.
std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
BuildOperationAdjList(const Graph &graph) {
  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
      adj_list;
  for (auto &n : graph.Nodes()) {
    if (!n->IsOp()) continue;
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
    }
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                << " -> " << n->Name() << reinterpret_cast<void *>(n)
                << " via " << var->Name() << reinterpret_cast<void *>(var);
        adj_list[n].insert(adj_n);
      }
    }
  }
  return adj_list;
}
......
@@ -16,6 +16,7 @@ limitations under the License. */
#include <map>
#include <memory>
#include <set>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
@@ -25,6 +26,13 @@ namespace paddle {
namespace framework {
namespace ir {
// Compare nodes via node id.
struct NodeComp {
  bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
    return node1->id() < node2->id();
  }
};
// Test if the graph contains circle.
bool HasCircle(const Graph &graph);
@@ -57,8 +65,8 @@ std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
void CleanIndividualNodes(Graph *graph);
// Build an adjacency list of operations for the `graph`.
std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
BuildOperationAdjList(const Graph &graph);
template <typename T>
std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
......
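The switch from `std::unordered_set` to `std::set<ir::Node *, ir::NodeComp>` throughout these two files makes the adjacency list iterate neighbors in ascending node-id order, so `TopologySortOperations` produces the same order on every run instead of depending on pointer hashing. A minimal standalone sketch of the idea, with a hypothetical `Node`:

#include <iostream>
#include <map>
#include <set>

struct Node { int id_; int id() const { return id_; } };

// Order node pointers by their stable id, not by their addresses.
struct NodeComp {
  bool operator()(Node* const& a, Node* const& b) const {
    return a->id() < b->id();
  }
};

int main() {
  Node a{1}, b{2}, c{3};
  std::map<Node*, std::set<Node*, NodeComp>, NodeComp> adj;
  adj[&c] = {&b, &a};  // inserted out of order on purpose
  for (Node* n : adj[&c]) std::cout << n->id() << " ";  // prints "1 2"
  std::cout << "\n";
  return 0;
}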
@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
  outputs_ = outputs;
  attrs_ = attrs;
  need_update_ = true;
  block_ = nullptr;
}
OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
......
@@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    PADDLE_ENFORCE(!member_->use_cuda_,
                   "gpu mode does not support async_mode_ now!");
    graphs.push_back(graph);
    for (size_t i = 1; i < places.size(); ++i) {
      auto *tmp_graph = new ir::Graph(graph->OriginProgram());
      async_graphs_.emplace_back(tmp_graph);
      graphs.push_back(tmp_graph);
@@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
                                 {member_->local_scopes_[0]}, 1,
                                 member_->use_cuda_, member_->nccl_ctxs_.get());
    for (size_t i = 1; i < member_->places_.size(); ++i) {
      graphs[i] =
          build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
                               {member_->local_scopes_[i]}, 1,
......
@@ -76,7 +76,7 @@ message PullDenseWorkerParameter {
message TableParameter {
  // dense table only
  optional uint64 table_id = 1;
  repeated string dense_value_name = 2;
  repeated string dense_grad_name = 3;
  repeated int32 push_dense_wait_times = 5;
......
@@ -45,12 +45,16 @@ class InferVarTypeContext {
  virtual bool HasInput(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    auto& inputs = op_->Inputs();
    auto input = inputs.find(name);
    return input != inputs.end() && !input->second.empty();
  }
  virtual bool HasOutput(const std::string& name) const {
    PADDLE_ENFORCE_NOT_NULL(op_);
    auto& outputs = op_->Outputs();
    auto output = outputs.find(name);
    return output != outputs.end() && !output->second.empty();
  }
  virtual const std::vector<std::string>& Input(const std::string& name) const {
......
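The rewritten `HasInput`/`HasOutput` close a real gap: the inputs map can contain a key that maps to an empty name list (an optional slot declared but never fed), and the old `count(name) > 0` reported such slots as present. A minimal standalone sketch (the map alias mirrors the shape of Paddle's `VariableNameMap`, an assumption here):

#include <iostream>
#include <map>
#include <string>
#include <vector>

using VariableNameMap = std::map<std::string, std::vector<std::string>>;

// Present only if the key exists AND it maps to at least one variable name.
bool HasInput(const VariableNameMap& inputs, const std::string& name) {
  auto it = inputs.find(name);
  return it != inputs.end() && !it->second.empty();
}

int main() {
  VariableNameMap inputs;
  inputs["X"] = {"x0"};
  inputs["Bias"] = {};  // declared but not fed
  std::cout << HasInput(inputs, "X") << " "     // 1
            << HasInput(inputs, "Bias") << " "  // 0, count() would say 1
            << HasInput(inputs, "Y") << "\n";   // 0
  return 0;
}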
@@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      return false;
    }
    PADDLE_ENFORCE_NOT_NULL(input_ptr);
    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
......
@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
    memory_owned_ = other.memory_owned_;
  } else {
    Resize(other.length());
    PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr));
    memcpy(data_, other.data(), other.length());
    length_ = other.length();
    memory_owned_ = true;
......
@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix the bug that result diff in multi-thread.
  // TODO(Superjomn) re-implement a real clone here.
  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      return false;
    }
    PADDLE_ENFORCE_NOT_NULL(input_ptr);
    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
  }
  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
  PADDLE_ENFORCE_NOT_NULL(
      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }
......
@@ -47,6 +47,7 @@ struct DataRecord {
      num_lines++;
      std::vector<std::string> data;
      split(line, '\t', &data);
      PADDLE_ENFORCE(data.size() >= 4);
      // load title1 data
      std::vector<int64_t> title1_data;
      split_to_int64(data[0], ' ', &title1_data);
......
@@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) {
}
// Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  if (use_mkldnn) {
    cfg.EnableMKLDNN();
  }

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareNativeAndAnalysis(
      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}

TEST(Analyzer_Transformer, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif

}  // namespace inference
}  // namespace paddle
@@ -29,6 +29,8 @@ pool3d
prelu
quantize
rank_loss
reduce_all
reduce_any
reduce_max
reduce_mean
reduce_min
......
@@ -24,6 +24,7 @@
 **/
#include "paddle/fluid/operators/detection/gpc.h"
#include "paddle/fluid/platform/enforce.h"
namespace gpc {
@@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
  gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
                   const_cast<char *>("Bounding box creation"));
  PADDLE_ENFORCE_NOT_NULL(box);
  /* Construct contour bounding boxes */
  for (c = 0; c < p->num_contours; c++) {
@@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
  /* Create an extended hole array */
  gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
                  const_cast<char *>("contour hole addition"));
  PADDLE_ENFORCE_NOT_NULL(extended_hole);
  /* Create an extended contour array */
  gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
  /* Build scanbeam table from scanbeam tree */
  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                     const_cast<char *>("sbt creation"));
  PADDLE_ENFORCE_NOT_NULL(sbt);
  build_sbt(&scanbeam, sbt, sbtree);
  scanbeam = 0;
  free_sbtree(&sbtree);
@@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
  /* Build scanbeam table from scanbeam tree */
  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                     const_cast<char *>("sbt creation"));
  PADDLE_ENFORCE_NOT_NULL(sbt);
  build_sbt(&scanbeam, sbt, sbtree);
  scanbeam = 0;
  free_sbtree(&sbtree);
......
@@ -9,6 +9,9 @@ else()
endif()
configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool)
cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder)
# FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
if(WITH_GRPC)
@@ -20,7 +23,7 @@ if(WITH_GRPC)
                 collective_client.cc collective_server.cc
                 ${GRPC_SRCS}
      PROTO send_recv.proto
      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder)
  set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
namespace paddle {
namespace operators {
namespace distributed {
std::once_flag AsyncSparseParamUpdateRecorder::init_flag_;
std::unique_ptr<AsyncSparseParamUpdateRecorder>
AsyncSparseParamUpdateRecorder::recorder_(nullptr);
} // namespace distributed
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <future> // NOLINT
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <ThreadPool.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace distributed {
class ConcurrentSet {
public:
ConcurrentSet() : pool_(new ::ThreadPool(1)) {}
~ConcurrentSet() {}
std::future<void> Update(const std::vector<int64_t>& rows) {
auto task = [this, rows] {
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& id : rows) {
sstream << id << ", ";
}
sstream << "]";
VLOG(3) << "update ids -> " << sstream.str();
}
for (auto row : rows) {
set_.insert(row);
}
};
return pool_->enqueue(std::move(task));
}
std::future<void> GetAndClear(std::vector<int64_t>* result) {
auto task = [this, &result] {
result->clear();
for (auto& id : set_) {
result->push_back(id);
}
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& id : *result) {
sstream << id << ", ";
}
sstream << "]";
VLOG(3) << "result ids size: " << result->size() << " "
<< sstream.str();
}
set_.clear();
};
return pool_->enqueue(std::move(task));
}
private:
std::unordered_set<int64_t> set_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
};
class AsyncSparseParamUpdateRecorder {
using TrainerToRows = std::vector<std::unique_ptr<ConcurrentSet>>;
public:
AsyncSparseParamUpdateRecorder(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param)
: trainer_num_(trainer_num), grad_to_param_(grad_to_param) {
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& item : grad_to_param) {
sstream << item.first << ":" << item.second << ", ";
}
sstream << "]";
VLOG(3) << "trainer_num: " << trainer_num
<< " grad_to_param_: " << sstream.str();
}
for (auto& iter : grad_to_param) {
param_to_grad_[iter.second] = iter.first;
auto& param_name = iter.second;
param_to_updated_rows_[param_name] = TrainerToRows();
auto& trainer_to_rows = param_to_updated_rows_[param_name];
for (auto i = 0; i < trainer_num; ++i) {
trainer_to_rows.emplace_back(new ConcurrentSet());
}
}
}
~AsyncSparseParamUpdateRecorder() = default;
void Update(const std::string& grad_name,
const std::vector<int64_t>& update_rows) {
VLOG(3) << "update grad: " << grad_name
<< " row size: " << update_rows.size();
auto& param_name = grad_to_param_.at(grad_name);
auto& trainer_to_rows = param_to_updated_rows_.at(param_name);
std::vector<std::future<void>> fs;
for (auto& set : trainer_to_rows) {
fs.push_back(set->Update(update_rows));
}
for (auto& f : fs) {
f.wait();
}
}
void GetAndClear(const std::string& param_name, int trainer_id,
std::vector<int64_t>* result) {
VLOG(3) << "GetAndClear param: " << param_name
<< " for trainer: " << trainer_id;
PADDLE_ENFORCE_LT(trainer_id, trainer_num_);
param_to_updated_rows_.at(param_name)[trainer_id]
->GetAndClear(result)
.wait();
}
bool HasParam(const std::string& param_name) {
return param_to_grad_.find(param_name) != param_to_grad_.end();
}
bool HasGrad(const std::string& grad_name) {
return grad_to_param_.find(grad_name) != grad_to_param_.end();
}
private:
const int trainer_num_;
std::unordered_map<std::string, std::string> grad_to_param_;
std::unordered_map<std::string, std::string> param_to_grad_;
std::unordered_map<std::string, TrainerToRows> param_to_updated_rows_;
// init recorder
public:
static void Init(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param) {
InitImpl(trainer_num, grad_to_param);
}
static AsyncSparseParamUpdateRecorder* GetInstance() {
return recorder_.get();
}
private:
  // InitImpl is called by Init; the singleton is created only on the first call.
static void InitImpl(
int trainer_num,
const std::unordered_map<std::string, std::string>& grad_to_param) {
if (recorder_ == nullptr) {
recorder_.reset(
new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param));
}
}
static std::once_flag init_flag_;
static std::unique_ptr<AsyncSparseParamUpdateRecorder> recorder_;
};
} // namespace distributed
} // namespace operators
} // namespace paddle
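A minimal usage sketch of the recorder above, in the shape the server-side handlers later in this commit use it (the parameter and gradient names here are made up):

#include <cstdint>
#include <vector>
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"

using paddle::operators::distributed::AsyncSparseParamUpdateRecorder;

void RecorderExample() {
  // One-time setup on the pserver: two trainers, one sparse parameter.
  AsyncSparseParamUpdateRecorder::Init(2, {{"emb@GRAD", "emb"}});

  // On each received sparse gradient, remember which rows it touched.
  AsyncSparseParamUpdateRecorder::GetInstance()->Update("emb@GRAD", {3, 7});

  // When trainer 0 pulls "emb", hand back (and reset) its dirty rows;
  // trainer 1's copy of the dirty set stays intact until it pulls.
  std::vector<int64_t> rows;
  AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear("emb", 0, &rows);
}

One caveat visible in the header: `InitImpl` guards re-initialization with a plain null check rather than `std::call_once` on the declared `init_flag_`, so `Init` is only safe to call before the server threads start.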
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include <algorithm>
#include "gtest/gtest.h"
namespace paddle {
namespace operators {
namespace distributed {
TEST(ConcurrentSet, All) {
ConcurrentSet concurrent_set;
std::vector<int64_t> in1 = {1, 2, 3, 4};
std::vector<int64_t> in2 = {2, 3, 5, 6};
std::vector<std::future<void>> futures;
futures.push_back(concurrent_set.Update(in1));
futures.push_back(concurrent_set.Update(in2));
for (auto &f : futures) {
f.wait();
}
std::unordered_set<int64_t> in;
std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
std::vector<int64_t> ret;
concurrent_set.GetAndClear(&ret).wait();
std::unordered_set<int64_t> out;
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
EXPECT_EQ(in, out);
concurrent_set.GetAndClear(&ret).wait();
EXPECT_EQ(ret.size(), 0);
}
TEST(AsyncSparseParamUpdateRecorder, All) {
std::unordered_map<std::string, std::string> grad_to_param;
grad_to_param["grad1"] = "param1";
grad_to_param["grad2"] = "param2";
int trainer_num = 10;
AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param);
std::vector<int64_t> in1 = {1, 2, 3, 4};
std::vector<int64_t> in2 = {2, 3, 5, 6};
std::unordered_set<int64_t> in;
std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
recorder.Update("grad1", in1);
recorder.Update("grad1", in2);
EXPECT_TRUE(recorder.HasParam("param1"));
EXPECT_TRUE(recorder.HasParam("param2"));
EXPECT_FALSE(recorder.HasParam("param3"));
EXPECT_TRUE(recorder.HasGrad("grad1"));
EXPECT_TRUE(recorder.HasGrad("grad2"));
EXPECT_FALSE(recorder.HasGrad("grad3"));
std::vector<int64_t> ret;
EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret));
for (int i = 0; i < trainer_num; ++i) {
std::vector<int64_t> ret;
std::unordered_set<int64_t> out;
recorder.GetAndClear("param1", i, &ret);
std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
EXPECT_EQ(in, out);
recorder.GetAndClear("param1", i, &ret);
EXPECT_EQ(ret.size(), 0);
}
}
} // namespace distributed
} // namespace operators
} // namespace paddle
@@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
                                     const framework::Scope& scope,
                                     const std::string& var_name,
                                     const std::string& out_var_name,
                                     const std::string& table_name,
                                     int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
                      time_out);
......
@@ -21,8 +21,10 @@ limitations under the License. */
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <unordered_map>
#include <vector>
#include "brpc/channel.h"
@@ -66,6 +68,7 @@ class BRPCClient : public RPCClient {
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& out_var_name,
                            const std::string& table_name = "",
                            int64_t time_out = FLAGS_rpc_deadline) override;
  VarHandlePtr AsyncGetMonomerBarrier(
@@ -107,13 +110,11 @@ class BRPCClient : public RPCClient {
  void SendComplete() override;
 private:
  VarHandlePtr _AsyncGetVar(
      const std::string& ep, const platform::DeviceContext& ctx,
      const framework::Scope& scope, const std::string& var_name,
      const std::string& out_var_name, const std::string& method_name,
      const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline);
  void Proceed();
  ChannelQueuePtr GetChannel(const std::string& ep);
......
@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
             "max grad num to send before recv parameters");
DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
DEFINE_int32(communicator_send_wait_times, 5,
             "times that send thread will wait if merge num does not reach "
             "max_merge_var_num");
DEFINE_int32(communicator_max_merge_var_num, 20,
             "max var num to merge and send");
DEFINE_bool(communicator_fake_rpc, false,
@@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
          << FLAGS_communicator_max_send_grad_num_before_recv;
  VLOG(0) << "communicator_thread_pool_size: "
          << FLAGS_communicator_thread_pool_size;
  VLOG(0) << "communicator_send_wait_times: "
          << FLAGS_communicator_send_wait_times;
  VLOG(0) << "communicator_max_merge_var_num: "
          << FLAGS_communicator_max_merge_var_num;
  VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
@@ -101,20 +106,32 @@ void Communicator::SendThread() {
        VLOG(3) << var_name << " merge and send";
        std::vector<std::shared_ptr<Variable>> vars;
        size_t merged_var_num = 0;
        size_t wait_times = 0;
        while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
          if (var_queue->Size() == 0) {
            VLOG(3) << "wait_times -> " << wait_times;
            if (wait_times >= FLAGS_communicator_send_wait_times) {
              break;
            }
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            wait_times++;
            continue;
          } else {
            wait_times = 0;
            vars.push_back(var_queue->Pop());
            // only count the send number of the first var
            if (var_name == send_varname_to_queue_.begin()->first) {
              grad_num_.fetch_add(1, std::memory_order_relaxed);
            }
            merged_var_num++;
          }
        }
        auto before_merge = GetCurrentUS();
        MergeVars(var_name, vars, send_scope_.get());
        auto after_merge = GetCurrentUS();
        VLOG(3) << "merge " << merged_var_num << " " << var_name
                << " use time " << after_merge - before_merge;
        auto send_functor = distributed::ParameterSend<float>();
        auto &ctx = send_varname_to_ctx_.at(var_name);
        if (!FLAGS_communicator_fake_rpc) {
......
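The new loop above bounds how long the send thread waits for more gradients to merge: an empty queue is polled at most `communicator_send_wait_times` times, 10 ms apart, and any arrival resets the counter. A standalone sketch of the same pattern, with a plain `std::deque` standing in for the blocking queue and made-up limits:

#include <chrono>
#include <deque>
#include <iostream>
#include <thread>
#include <vector>

int main() {
  std::deque<int> queue = {1, 2, 3};
  const int max_merge = 20, max_wait = 5;
  std::vector<int> merged;
  int wait_times = 0;
  while (static_cast<int>(merged.size()) < max_merge) {
    if (queue.empty()) {
      if (wait_times++ >= max_wait) break;  // stop waiting for stragglers
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      continue;
    }
    wait_times = 0;  // reset the backoff whenever data arrives
    merged.push_back(queue.front());
    queue.pop_front();
  }
  std::cout << "merged " << merged.size() << " vars\n";  // merged 3 vars
  return 0;
}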
@@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name,
  auto* out_var = scope->Var(var_name);
  if (var0->IsType<framework::LoDTensor>()) {
    auto dims = var0->Get<framework::LoDTensor>().dims();
    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
    // init output tensor
    auto* out_t = out_var->GetMutable<framework::LoDTensor>();
......
@@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                     const framework::Scope& scope,
                                     const std::string& var_name,
                                     const std::string& out_varname,
                                     const std::string& table_name,
                                     int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname,
                      "/sendrecv.SendRecvService/GetVariable", table_name,
                      time_out);
}
VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
@@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
  return _AsyncGetVar(
      ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname,
      "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out);
}
VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
@@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
    const framework::Scope& scope, const std::string& var_name,
    int64_t time_out) {
  return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name,
                      "/sendrecv.SendRecvService/GetMonomerVariable", "",
                      time_out);
}
VarHandlePtr GRPCClient::_AsyncGetVar(
    const std::string& ep, const platform::DeviceContext& ctx,
    const framework::Scope& scope, const std::string& method,
    const std::string& var_name, const std::string& out_varname,
    const std::string& rpc_path, const std::string& table_name,
    int64_t time_out) {
  const platform::DeviceContext* p_ctx = &ctx;
  const std::string ep_val = ep;
  const std::string var_name_val = var_name;
  const std::string out_varname_val = out_varname;
  const std::string table_name_val = table_name;
  const framework::Scope* p_scope = &scope;
  const auto ch = GetChannel(ep_val);
  GetProcessor* s = new GetProcessor(ch);
@@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
  VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
  s->Prepare(h, time_out);
  framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method,
                      p_ctx, h, rpc_path, this] {
    // prepare input
    sendrecv::VariableMessage req;
    req.set_varname(var_name_val);
    req.set_out_varname(out_varname_val);
    req.set_trainer_id(trainer_id_);
    req.set_table_name(table_name_val);
    ::grpc::ByteBuffer buf;
    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
    // stub context
    s->response_call_back_ = ProcGetResponse;
    platform::RecordRPCEvent record_event(method);
    auto call =
        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
    call->StartCall();
    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
    if (UNLIKELY(platform::IsProfileEnabled())) {
      h->Wait();
    }
  });
  req_count_++;
......
@@ -23,9 +23,11 @@ limitations under the License. */
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <mutex>  // NOLINT
#include <string>
#include <thread>  // NOLINT
#include <unordered_map>
#include <vector>
#include "grpc++/channel.h"
@@ -187,6 +189,7 @@ class GRPCClient : public RPCClient {
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& out_varname,
                            const std::string& table_name = "",
                            int64_t time_out = FLAGS_rpc_deadline) override;
  VarHandlePtr AsyncGetVarNoBarrier(
@@ -239,7 +242,8 @@ class GRPCClient : public RPCClient {
      const std::string& ep, const platform::DeviceContext& ctx,
      const framework::Scope& scope, const std::string& method,
      const std::string& var_name, const std::string& out_varname,
      const std::string& rpc_path, const std::string& table_name = "",
      int64_t time_out = FLAGS_rpc_deadline);
 private:
  grpc::CompletionQueue cq_;
......
@@ -137,6 +137,7 @@ class RequestGet final : public RequestBase {
    // proc request.
    std::string varname = request_.varname();
    std::string out_varname = request_.out_varname();
    std::string table_name = request_.table_name();
    int trainer_id = request_.trainer_id();
    VLOG(4) << "RequestGet " << out_varname << " from " << varname;
@@ -145,19 +146,23 @@ class RequestGet final : public RequestBase {
    framework::Variable* invar = nullptr;
    framework::Variable* outvar = nullptr;
    tmp_scope_ = std::move(scope->NewTmpScope());
    request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar,
                             trainer_id, out_varname, table_name);
    VLOG(1) << "before SerializeToByteBuffer";
    if (outvar) {
      SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
                            &reply_);
    }
    VLOG(1) << "after SerializeToByteBuffer";
    Finish(reply_, &responder_);
  }
 protected:
  sendrecv::VariableMessage request_;
  ::grpc::ByteBuffer reply_;
  std::unique_ptr<framework::Scope> tmp_scope_;
  ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
};
......
@@ -42,27 +42,23 @@ using DDim = framework::DDim;
template <typename T>
void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
                                  const framework::Scope &scope) {
  VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
  std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto &cpu_ctx = *pool.Get(platform::CPUPlace());
  distributed::RPCClient *rpc_client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
  auto *recv_var = scope.FindVar(rpc_ctx.var_name);
  // recv all vars to local scope
  if (recv_var->IsType<framework::LoDTensor>()) {
    std::vector<distributed::VarHandlePtr> rets;
    for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
      auto &recv_var_name = rpc_ctx.splited_var_names[i];
      local_scope->Var(recv_var_name);
      VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
      rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
                                             *local_scope.get(), recv_var_name,
@@ -78,23 +74,61 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
  // concat recved tensor into one var
  {
    size_t output_offset = 0;
    size_t row_offset = 0;
    framework::Tensor *recv_tensor =
        recv_var->GetMutable<framework::LoDTensor>();
    auto dev_ctx = paddle::platform::CPUDeviceContext();
    int64_t recv_numel = 0;
    for (auto &recv_var_name : rpc_ctx.splited_var_names) {
      auto *recv_var = local_scope->FindVar(recv_var_name);
      if (recv_var->IsType<framework::LoDTensor>()) {
        auto &in = recv_var->Get<framework::LoDTensor>();
        recv_numel += in.numel();
        auto in_stride = framework::stride_numel(in.dims());
        auto out_stride = framework::stride_numel(recv_tensor->dims());
        StridedNumelCopyWithAxis<T>(
            dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride,
            in.data<T>(), in_stride, in_stride[0]);
        output_offset += in_stride[0];
      } else if (recv_var->IsType<framework::SelectedRows>()) {
        auto &recv_slr = recv_var->Get<framework::SelectedRows>();
        auto &recv_dims = recv_tensor->dims();
        int64_t width = recv_dims[1];
        recv_numel += recv_slr.height() * width;
        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width);
        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size());
        VLOG(3) << "recv slr " << recv_var_name << " dims "
                << recv_slr.value().dims();
        if (VLOG_IS_ON(3)) {
          std::ostringstream sstream;
          sstream << "[";
          for (auto &row_id : recv_slr.rows()) {
            sstream << row_id << ", ";
          }
          sstream << "]";
          VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " "
                  << sstream.str();
        }
        for (size_t i = 0; i < recv_slr.rows().size(); ++i) {
          auto row_id = recv_slr.rows()[i] + row_offset;
          PADDLE_ENFORCE_LT(row_id, recv_dims[0]);
          memcpy(recv_tensor->data<T>() + row_id * width,
                 recv_slr.value().data<T>() + i * width, sizeof(T) * width);
        }
        row_offset += recv_slr.height();
      } else {
        PADDLE_THROW("unsupported received var type");
      }
    }
    auto numel = recv_tensor->numel();
    if (recv_numel != numel) {
      LOG(FATAL) << "recv_numel: " << recv_numel << " actual numel: " << numel;
    }
    PADDLE_ENFORCE_EQ(recv_numel, numel);
  }
  VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
}
template struct ParameterRecv<float>;
......
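The SelectedRows branch above scatters each received row into its absolute slot of the dense parameter, offsetting row ids by the heights of the shards already placed. A minimal standalone sketch of that row scatter, with flat vectors standing in for `rows()`/`value()` and made-up shapes:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const int64_t height = 4, width = 3;
  std::vector<float> dense(height * width, 0.f);  // the dense parameter

  std::vector<int64_t> rows = {0, 2};              // which rows were received
  std::vector<float> values = {1, 1, 1, 2, 2, 2};  // rows.size() x width

  int64_t row_offset = 0;  // advances by each shard's height
  for (size_t i = 0; i < rows.size(); ++i) {
    int64_t row_id = rows[i] + row_offset;
    std::memcpy(dense.data() + row_id * width, values.data() + i * width,
                sizeof(float) * width);
  }
  std::cout << dense[2 * width] << "\n";  // 2: row 2 was filled in place
  return 0;
}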
@@ -47,7 +47,7 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
  auto &cpu_ctx = *pool.Get(platform::CPUPlace());
  distributed::RPCClient *rpc_client =
      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
  auto *send_var = scope.FindVar(rpc_ctx.var_name);
  size_t out_num = rpc_ctx.splited_var_names.size();
......
@@ -18,7 +18,9 @@
#include <condition_variable>  // NOLINT
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
@@ -180,6 +182,10 @@ class RequestHandler {
    grad_to_prepared_ctx_ = g;
  }
  void SetSparseGradToParam(std::unordered_map<std::string, std::string>* g) {
    sparse_grad_to_param_ = g;
  }
  void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
  // Get attributes.
@@ -228,6 +234,7 @@ class RequestHandler {
  std::unordered_map<std::string,
                     std::shared_ptr<framework::ExecutorPrepareContext>>*
      grad_to_prepared_ctx_;
  std::unordered_map<std::string, std::string>* sparse_grad_to_param_;
  RPCServer* rpc_server_;
};
......
@@ -22,6 +22,7 @@
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include "paddle/fluid/operators/distributed/rpc_server.h"
#include "paddle/fluid/string/piece.h"
#include "paddle/fluid/string/printf.h"
@@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname,
          "async mode should not recv BATCH_BARRIER_MESSAGE or "
          "COMPLETE_MESSAGE");
    }
    if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) {
      auto& grad_slr =
          scope->FindVar(varname)->Get<framework::SelectedRows>();
      AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname,
                                                            grad_slr.rows());
    }
    executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                  scope);
    return true;
@@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname,
                               const int trainer_id,
                               const std::string& out_var_name,
                               const std::string& table_name) {
  VLOG(3) << "RequestGetHandler:" << varname
          << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id
          << " table_name: " << table_name;
  if (sync_mode_) {
    if (varname == FETCH_BARRIER_MESSAGE) {
...@@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname, ...@@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname,
VLOG(3) << "copying " << varname << " to " << param_bak_name; VLOG(3) << "copying " << varname << " to " << param_bak_name;
framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
} }
*outvar = scope_->FindVar(varname); if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
!table_name.empty()) {
std::vector<int64_t> updated_rows;
AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
varname, trainer_id, &updated_rows);
if (VLOG_IS_ON(3)) {
std::ostringstream sstream;
sstream << "[";
for (auto& row_id : updated_rows) {
sstream << row_id << ", ";
}
sstream << "]";
VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
<< sstream.str();
}
auto& origin_tensor =
scope_->FindVar(varname)->Get<framework::LoDTensor>();
auto* origin_tensor_data = origin_tensor.data<float>();
auto& dims = origin_tensor.dims();
*outvar = scope->Var();
auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
out_slr->set_rows(updated_rows);
out_slr->set_height(dims[0]);
auto out_dims = framework::make_ddim(
{static_cast<int64_t>(updated_rows.size()), dims[1]});
auto* data = out_slr->mutable_value()->mutable_data<float>(
out_dims, origin_tensor.place());
auto width = dims[1];
for (auto i = 0; i < updated_rows.size(); ++i) {
PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
sizeof(float) * width);
}
} else {
*outvar = scope_->FindVar(varname);
}
} }
} }
return true; return true;
......
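On the GET side, when the recorder reports updated rows for a parameter and a table_name is given, the handler answers with a compact SelectedRows instead of the whole dense tensor: only the rows touched since the trainer's last GET are copied out. A minimal sketch of that gather, again with illustrative (non-Paddle) names and plain buffers:

#include <cstdint>
#include <cstring>
#include <vector>

// Gather `updated_rows` of a row-major [height x width] parameter into a
// compact buffer; row i of the result holds param[updated_rows[i]].
std::vector<float> GatherUpdatedRows(const std::vector<float>& param,
                                     int64_t width,
                                     const std::vector<int64_t>& updated_rows) {
  std::vector<float> out(updated_rows.size() * width);
  for (size_t i = 0; i < updated_rows.size(); ++i) {
    std::memcpy(out.data() + i * width,
                param.data() + updated_rows[i] * width,
                sizeof(float) * width);
  }
  return out;
}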
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <condition_variable> // NOLINT #include <condition_variable> // NOLINT
#include <memory>
#include <string> #include <string>
#include "gflags/gflags.h" #include "gflags/gflags.h"
...@@ -44,6 +45,7 @@ class RPCClient { ...@@ -44,6 +45,7 @@ class RPCClient {
const framework::Scope& scope, const framework::Scope& scope,
const std::string& var_name, const std::string& var_name,
const std::string& out_varname, const std::string& out_varname,
const std::string& table_name = "",
int64_t time_out = FLAGS_rpc_deadline) = 0; int64_t time_out = FLAGS_rpc_deadline) = 0;
virtual VarHandlePtr AsyncGetVarNoBarrier( virtual VarHandlePtr AsyncGetVarNoBarrier(
...@@ -96,6 +98,7 @@ class RPCClient { ...@@ -96,6 +98,7 @@ class RPCClient {
// Init is called by GetInstance. // Init is called by GetInstance.
template <typename T> template <typename T>
static void Init(int trainer_id) { static void Init(int trainer_id) {
VLOG(0) << "init rpc client with trainer_id " << trainer_id;
trainer_id_ = trainer_id; trainer_id_ = trainer_id;
if (rpc_client_.get() == nullptr) { if (rpc_client_.get() == nullptr) {
rpc_client_.reset(new T()); rpc_client_.reset(new T());
......
...@@ -27,23 +27,26 @@ struct RpcContext { ...@@ -27,23 +27,26 @@ struct RpcContext {
RpcContext(const std::string &name, const std::vector<std::string> &names, RpcContext(const std::string &name, const std::vector<std::string> &names,
const std::vector<std::string> &emap, const std::vector<std::string> &emap,
const std::vector<int64_t> &sections) const std::vector<int64_t> &sections, int id)
: var_name(name), : var_name(name),
splited_var_names(names), splited_var_names(names),
epmap(emap), epmap(emap),
height_sections(sections) {} height_sections(sections),
trainer_id(id) {}
RpcContext(const RpcContext &ctx) { RpcContext(const RpcContext &ctx) {
var_name = ctx.var_name; var_name = ctx.var_name;
splited_var_names = ctx.splited_var_names; splited_var_names = ctx.splited_var_names;
epmap = ctx.epmap; epmap = ctx.epmap;
height_sections = ctx.height_sections; height_sections = ctx.height_sections;
trainer_id = ctx.trainer_id;
} }
std::string var_name; std::string var_name;
std::vector<std::string> splited_var_names; std::vector<std::string> splited_var_names;
std::vector<std::string> epmap; std::vector<std::string> epmap;
std::vector<int64_t> height_sections; std::vector<int64_t> height_sections;
int trainer_id;
}; };
inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) {
......
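Since RpcContext now carries the trainer id, every call site has to pass it explicitly; the send_op and recv_op changes below do exactly that. A hedged usage sketch of the extended constructor, with made-up variable, endpoint, and section values:

// All argument values here are illustrative.
distributed::RpcContext rpc_ctx(
    "emb",                                 // var_name
    {"emb.block0", "emb.block1"},          // splited_var_names
    {"127.0.0.1:6174", "127.0.0.1:6175"},  // epmap
    {4096, 4096},                          // height_sections
    0);                                    // trainer_id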
...@@ -2,9 +2,9 @@ include(operators) ...@@ -2,9 +2,9 @@ include(operators)
set(DISTRIBUTE_DEPS "") set(DISTRIBUTE_DEPS "")
if(WITH_GRPC) if(WITH_GRPC)
set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
else() else()
set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
if(WITH_BRPC_RDMA) if(WITH_BRPC_RDMA)
find_library(IBVERBS_LIBRARY NAMES ibverbs) find_library(IBVERBS_LIBRARY NAMES ibverbs)
ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
......
...@@ -24,8 +24,10 @@ limitations under the License. */ ...@@ -24,8 +24,10 @@ limitations under the License. */
#include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/distributed.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
#include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h"
#include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
...@@ -292,6 +294,8 @@ static void FillRequestCtx( ...@@ -292,6 +294,8 @@ static void FillRequestCtx(
std::unordered_map<std::string, std::unordered_map<std::string,
std::shared_ptr<framework::ExecutorPrepareContext>> std::shared_ptr<framework::ExecutorPrepareContext>>
*prefetch_ctx, *prefetch_ctx,
std::unordered_map<std::string, std::string>
*sparse_grad_name_to_param_name,
std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx, std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
distributed::RPCServer *rpc_server) { distributed::RPCServer *rpc_server) {
h->SetScope(scope); h->SetScope(scope);
...@@ -299,6 +303,7 @@ static void FillRequestCtx( ...@@ -299,6 +303,7 @@ static void FillRequestCtx(
h->SetExecutor(executor); h->SetExecutor(executor);
h->SetProgram(program); h->SetProgram(program);
h->SetPrefetchPreparedCtx(prefetch_ctx); h->SetPrefetchPreparedCtx(prefetch_ctx);
h->SetSparseGradToParam(sparse_grad_name_to_param_name);
h->SetRPCServer(rpc_server); h->SetRPCServer(rpc_server);
h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
} }
...@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
} }
auto f = // parse the kSparseGradToParam attr: sparse_grad_name -> param_name
std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
&executor, program, &prefetch_var_name_to_prepared_ctx, auto sparse_grad_name_to_param_name_str =
ckpt_pre_context, rpc_service_.get()); Attr<std::vector<std::string>>(kSparseGradToParam);
for (const auto &sparse_grad_name_and_param_name :
sparse_grad_name_to_param_name_str) {
std::vector<std::string> pieces;
split(sparse_grad_name_and_param_name, ':', &pieces);
PADDLE_ENFORCE_EQ(pieces.size(), 2);
VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
<< ", param_name = " << pieces[1];
sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
}
auto f = std::bind(
FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor,
program, &prefetch_var_name_to_prepared_ctx,
&sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get());
f(request_send_handler_.get()); f(request_send_handler_.get());
f(request_get_handler_.get()); f(request_get_handler_.get());
...@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, ...@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
prefetch_block_id_list, checkpoint_block_id); prefetch_block_id_list, checkpoint_block_id);
} else { } else {
distributed::AsyncSparseParamUpdateRecorder::Init(
fan_in, sparse_grad_name_to_param_name);
RunAsyncLoop(&executor, program, &recv_scope); RunAsyncLoop(&executor, program, &recv_scope);
} }
} }
...@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId, AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
"prefetch blocks to run on server side.") "prefetch blocks to run on server side.")
.SetDefault({}); .SetDefault({});
AddAttr<std::vector<std::string>>(
kSparseGradToParam,
"sparse grad name to param name. like: 'emb@Grad:emb'")
.SetDefault({});
AddAttr<int>("Fanin", "How many clients send to this server.") AddAttr<int>("Fanin", "How many clients send to this server.")
.SetDefault(1); .SetDefault(1);
AddAttr<int>(kCheckpointBlockId, AddAttr<int>(kCheckpointBlockId,
......
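Each kSparseGradToParam entry is one string of the form '<sparse_grad_name>:<param_name>' (e.g. 'emb@Grad:emb'), split on ':' into exactly two pieces. A self-contained sketch of the same parsing, with Paddle's own split() helper replaced by std::getline purely for illustration:

#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
  const std::vector<std::string> attr = {"emb@Grad:emb", "fc_w@Grad:fc_w"};
  std::unordered_map<std::string, std::string> sparse_grad_to_param;
  for (const auto& entry : attr) {
    std::istringstream ss(entry);
    std::string grad_name, param_name;
    std::getline(ss, grad_name, ':');  // piece before ':'
    std::getline(ss, param_name);      // piece after ':'
    sparse_grad_to_param[grad_name] = param_name;
  }
  std::cout << sparse_grad_to_param["emb@Grad"] << "\n";  // prints: emb
  return 0;
}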
...@@ -16,8 +16,10 @@ limitations under the License. */ ...@@ -16,8 +16,10 @@ limitations under the License. */
#include <stdint.h> #include <stdint.h>
#include <atomic> #include <atomic>
#include <memory>
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map>
#include <utility> #include <utility>
#include <vector> #include <vector>
...@@ -35,6 +37,7 @@ namespace operators { ...@@ -35,6 +37,7 @@ namespace operators {
constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kOptimizeBlocks[] = "optimize_blocks";
constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
constexpr char kCheckpointBlockId[] = "checkpint_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id";
constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
void RunServer(std::shared_ptr<distributed::RPCServer> service); void RunServer(std::shared_ptr<distributed::RPCServer> service);
......
...@@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase { ...@@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &ctx = *pool.Get(place); auto &ctx = *pool.Get(place);
auto trainer_id = Attr<int>("trainer_id");
distributed::RPCClient *rpc_client = distributed::RPCClient *rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>( distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
Attr<int>("trainer_id"));
std::vector<std::string> recv_varnames = std::vector<std::string> recv_varnames =
Attr<std::vector<std::string>>("recv_varnames"); Attr<std::vector<std::string>>("recv_varnames");
if (recv_varnames.size() > 0) { if (recv_varnames.size() > 0) {
auto recv_functor = distributed::ParameterRecv<float>(); auto recv_functor = distributed::ParameterRecv<float>();
auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {},
trainer_id);
recv_functor(rpc_ctx, scope); recv_functor(rpc_ctx, scope);
} else { } else {
if (with_barrier) { if (with_barrier) {
......
...@@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase { ...@@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase {
auto epmap = Attr<std::vector<std::string>>("epmap"); auto epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_mode"); int sync_send = Attr<int>("sync_mode");
auto trainer_id = Attr<int>("trainer_id");
auto send_varnames = Attr<std::vector<std::string>>("send_varnames"); auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
auto height_sections = Attr<std::vector<int64_t>>("sections"); auto height_sections = Attr<std::vector<int64_t>>("sections");
...@@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase { ...@@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase {
if (distributed::Communicator::GetInstance() == nullptr) { if (distributed::Communicator::GetInstance() == nullptr) {
auto send_functor = distributed::ParameterSend<float>(); auto send_functor = distributed::ParameterSend<float>();
auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap,
height_sections); height_sections, trainer_id);
send_functor(rpc_ctx, scope, true); send_functor(rpc_ctx, scope, true);
} else { } else {
distributed::Communicator::GetInstance()->Send(ins[0], scope); distributed::Communicator::GetInstance()->Send(ins[0], scope);
...@@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase { ...@@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase {
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
distributed::RPCClient* rpc_client = distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>( distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
Attr<int>("trainer_id"));
std::vector<distributed::VarHandlePtr> rets; std::vector<distributed::VarHandlePtr> rets;
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/linspace_op.h"
namespace paddle {
namespace operators {
class LinspaceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Start"),
"Input(Start) of LinspaceOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Stop"),
"Input(Stop) of LinspaceOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Num"),
"Input(Num) of LinspaceOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(OUt) of LinspaceOp should not be null.");
auto s_dims = ctx->GetInputDim("Start");
PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
"The shape of Input(Start) should be [1].");
auto e_dims = ctx->GetInputDim("Stop");
PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
"The shape of Input(Stop) should be [1].");
auto step_dims = ctx->GetInputDim("Num");
PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
"The shape of Input(Num) should be [1].");
ctx->SetOutputDim("Out", {-1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
framework::LibraryType library_{framework::LibraryType::kPlain};
framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(),
layout_, library_);
}
};
class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Start",
"First entry in the sequence. It is a tensor of shape [1], should "
"be of type float32 or float64.");
AddInput("Stop",
"Last entry in the sequence. It is a tensor of shape [1], should "
"be of type float32 or float64.");
AddInput("Num",
"Number of entry in the sequence. It is a tensor of shape [1], "
"should be of type int32.");
AddOutput("Out", "A sequence of numbers.");
AddComment(R"DOC(
Returns a fixed number of evenly spaced values within a given interval. The first entry is Start and the last entry is Stop. When Num is 1, only Start is returned. This mirrors numpy's linspace function.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
ops::CPULinspaceKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/linspace_op.h"
#include "paddle/fluid/platform/cuda_primitives.h"
namespace paddle {
namespace operators {
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
__global__ void LinspaceSpecialKernel(T start, T* out) {
out[0] = start;
}
template <typename T>
class CUDALinspaceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto* start_t = context.Input<framework::Tensor>("Start");
auto* stop_t = context.Input<framework::Tensor>("Stop");
auto* num_t = context.Input<framework::Tensor>("Num");
auto* out = context.Output<framework::Tensor>("Out");
framework::Tensor n;
framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
T start = n.data<T>()[0];
framework::TensorCopy(*stop_t, platform::CPUPlace(), &n);
T stop = n.data<T>()[0];
framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
int32_t num = n.data<int32_t>()[0];
PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
out->Resize(framework::make_ddim({num}));
T* out_data = out->mutable_data<T>(context.GetPlace());
T step = 0;
if (num != 1) {
step = (stop - start) / (num - 1);
}
auto stream = context.cuda_device_context().stream();
int block = 512;
int grid = (num + block - 1) / block;
LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
ops::CUDALinspaceKernel<double>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename T>
class CPULinspaceKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
auto* out = context.Output<framework::Tensor>("Out");
PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
out->Resize(framework::make_ddim({num}));
T* out_data = out->mutable_data<T>(context.GetPlace());
if (num > 1) {
T step = (stop - start) / (num - 1);
T value = start;
for (int i = 0; i < num; ++i) {
out_data[i] = value;
value += step;
}
} else {
out_data[0] = start;
}
}
};
} // namespace operators
} // namespace paddle
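Both kernels implement essentially the same arithmetic: step = (Stop - Start) / (Num - 1) and out[i] = Start + i * step, with the Num == 1 case returning just Start. For instance Start=0, Stop=1, Num=5 gives step 0.25 and the sequence 0, 0.25, 0.5, 0.75, 1. A standalone sketch of that logic outside the framework (Num > 0 is assumed, as the kernels enforce):

#include <cstdio>
#include <vector>

std::vector<float> Linspace(float start, float stop, int num) {
  std::vector<float> out(num);
  if (num > 1) {
    const float step = (stop - start) / (num - 1);
    for (int i = 0; i < num; ++i) out[i] = start + step * i;
  } else {
    out[0] = start;  // Num == 1: only Start is returned
  }
  return out;
}

int main() {
  for (float v : Linspace(0.f, 1.f, 5)) std::printf("%g ", v);  // 0 0.25 0.5 0.75 1
  return 0;
}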
...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
if (!ctx->HasInput("Y")) { if (!ctx->HasInput("Y")) {
auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod"); auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
PADDLE_ENFORCE_GT(level0.size(), 1, PADDLE_ENFORCE_GT(level0.size(), 0,
"If Input(Y) not provided, the target lod should be " "If Input(Y) not provided, the target lod should be "
"specified by attribute `target_lod`."); "specified by attribute `target_lod`.");
} else { } else if (ctx->IsRuntime()) {
ctx->ShareLoD("Y", "Out"); ctx->ShareLoD("Y", "Out");
} }
...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel {
} }
}; };
class LoDResetOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(framework::InferVarTypeContext *ctx) const override {
auto x_var_name = ctx->Input("X").front();
auto out_var_name = ctx->Output("Out").front();
if (ctx->HasInput("Y")) {
auto y_var_name = ctx->Input("Y").front();
auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1);
ctx->SetLoDLevel(out_var_name, y_lod_level);
} else {
ctx->SetLoDLevel(out_var_name, 1);
}
ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name));
ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR);
}
};
class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, ...@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
ops::LoDResetGradDescMaker); ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference);
REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
ops::LoDResetGradNoNeedBufferVarInference); ops::LoDResetGradNoNeedBufferVarInference);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>, lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
ops::LoDResetKernel<paddle::platform::CPUPlace, double>, ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
......
...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
"Target LoD should be a vector end with the " "Target LoD should be a vector end with the "
"first dimension of Input(X)."); "first dimension of Input(X).");
for (size_t i = 0; i < level0.size() - 1; ++i) { for (size_t i = 0; i < level0.size() - 1; ++i) {
PADDLE_ENFORCE(level0[i + 1] > level0[i], PADDLE_ENFORCE(level0[i + 1] >= level0[i],
"Target LoD should be an ascending vector."); "Target LoD should be an ascending vector.");
} }
......
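Relaxing the kernel check from strictly ascending (>) to non-decreasing (>=) lets a target LoD contain empty sequences: on a 5-row input, target_lod = {0, 2, 2, 5} now describes sequence lengths 2, 0, and 3, which the old check rejected at the repeated offset. A minimal sketch of the relaxed validation rule (names are illustrative, not the framework's):

#include <cassert>
#include <vector>

// Valid iff `lod` is non-decreasing and ends at the first dimension of X,
// mirroring the relaxed PADDLE_ENFORCE above.
bool IsValidTargetLoD(const std::vector<int>& lod, int x_first_dim) {
  if (lod.empty() || lod.back() != x_first_dim) return false;
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    if (lod[i + 1] < lod[i]) return false;  // the old rule also rejected equality
  }
  return true;
}

int main() {
  assert(IsValidTargetLoD({0, 2, 2, 5}, 5));   // empty middle sequence: now OK
  assert(!IsValidTargetLoD({0, 3, 2, 5}, 5));  // decreasing: still rejected
  return 0;
}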
...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states"; ...@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states";
constexpr char kParameters[] = "parameters"; constexpr char kParameters[] = "parameters";
constexpr char kOutputs[] = "outputs"; constexpr char kOutputs[] = "outputs";
constexpr char kStepScopes[] = "step_scopes"; constexpr char kStepScopes[] = "step_scopes";
constexpr char kHasStates[] = "has_states";
constexpr char kExStates[] = "ex_states"; constexpr char kExStates[] = "ex_states";
constexpr char kStates[] = "states"; constexpr char kStates[] = "states";
constexpr char kStepBlock[] = "sub_block"; constexpr char kStepBlock[] = "sub_block";
...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase { ...@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
bool has_state = Attr<bool>(kHasStates);
auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope)); auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
VLOG(3) << "Static RNN input sequence length = " << seq_len; VLOG(3) << "Static RNN input sequence length = " << seq_len;
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase { ...@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase {
inside->Resize(framework::make_ddim(dims)); inside->Resize(framework::make_ddim(dims));
}); });
if (i == 0) { if (has_state) {
// Link initial states --> ex_states if (i == 0) {
LinkTensor(scope, Inputs(kInitialStates), &cur_scope, // Link initial states --> ex_states
Attr<std::vector<std::string>>(kExStates)); LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
} else { Attr<std::vector<std::string>>(kExStates));
auto &ex_scope = scopes.ExScope(); } else {
// Link ex_scope::state --> cur_scope::ex_state auto &ex_scope = scopes.ExScope();
LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates), // Link ex_scope::state --> cur_scope::ex_state
&cur_scope, Attr<std::vector<std::string>>(kExStates)); LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
&cur_scope, Attr<std::vector<std::string>>(kExStates));
}
} }
// Every inputs are linked now, execute! // Every inputs are linked now, execute!
...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase { ...@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase {
std::vector<std::string>() /*skip_ref_cnt_vars*/, std::vector<std::string>() /*skip_ref_cnt_vars*/,
true /*force_disable_gc*/); true /*force_disable_gc*/);
// get device context from pool
platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
// Copy inside::output -> outside::output // Copy inside::output -> outside::output
// outside::output[seq_offset: seq_offset + 1] = inside::output // outside::output[seq_offset: seq_offset + 1] = inside::output
this->LinkTensorWithCallback( this->LinkTensorWithCallback(
...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase {
private: private:
void RunImpl(const framework::Scope &scope, void RunImpl(const framework::Scope &scope,
const platform::Place &place) const override { const platform::Place &place) const override {
auto seq_len = static_cast<size_t>(GetSequenceLength(scope)); bool has_state = Attr<bool>(kHasStates);
const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));
StepScopes scopes = CreateStepScopes(scope, seq_len); StepScopes scopes = CreateStepScopes(scope, seq_len);
auto reverse = Attr<bool>(kReverse); auto reverse = Attr<bool>(kReverse);
framework::Executor executor(place); framework::Executor executor(place);
auto *block = Attr<framework::BlockDesc *>(kStepBlock); auto *block = Attr<framework::BlockDesc *>(kStepBlock);
auto *program = block->Program(); auto *program = block->Program();
// get device context from pool // get device context from pool
...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase {
size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
auto &cur_scope = scopes.CurScope(); auto &cur_scope = scopes.CurScope();
// Link outside::output_grads --> inside::output_grads // Link outside::output_grads --> inside::output_grads
// inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
LinkTensorWithCallback( LinkTensorWithCallback(
...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase {
VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
} }
// Link states if (has_state) {
// if cur_scope::cur_state_grad in out_grads: // Link states
// cur_scope::cur_state_grad += ex_scope::ex_state_grad // if cur_scope::cur_state_grad in out_grads:
// else: // cur_scope::cur_state_grad += ex_scope::ex_state_grad
// ex_scope::ex_state_grad --> cur_scope::cur_state_grad // else:
if (step_id != 0) { // not at beginning // ex_scope::ex_state_grad --> cur_scope::cur_state_grad
auto &ex_scope = scopes.ExScope(); if (step_id != 0) { // not at beginning
auto ex_state_grads = auto &ex_scope = scopes.ExScope();
GradVarLists(Attr<std::vector<std::string>>(kExStates)); auto ex_state_grads =
auto cur_state_grads = GradVarLists(Attr<std::vector<std::string>>(kExStates));
GradVarLists(Attr<std::vector<std::string>>(kStates)); auto cur_state_grads =
GradVarLists(Attr<std::vector<std::string>>(kStates));
PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
for (size_t i = 0; i < ex_state_grads.size(); ++i) { PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
auto &cur_grad = cur_state_grads[i]; for (size_t i = 0; i < ex_state_grads.size(); ++i) {
auto &ex_grad = ex_state_grads[i]; auto &cur_grad = cur_state_grads[i];
auto &ex_tensor = auto &ex_grad = ex_state_grads[i];
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>(); auto &ex_tensor =
ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto *cur_grad_var = cur_scope.Var(cur_grad); VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
auto cur_grad_tensor = auto *cur_grad_var = cur_scope.Var(cur_grad);
cur_grad_var->GetMutable<framework::LoDTensor>(); auto cur_grad_tensor =
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); cur_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
}
} }
} }
...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase {
} }
auto new_inside_name = cur_scope.Rename(inside_grad_name); auto new_inside_name = cur_scope.Rename(inside_grad_name);
// sum gradient
// sum gradient
auto sum_op = framework::OpRegistry::CreateOp( auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {pg_names[param_id], new_inside_name}}}, "sum", {{"X", {pg_names[param_id], new_inside_name}}},
{{"Out", {pg_names[param_id]}}}, {{"Out", {pg_names[param_id]}}},
...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase { ...@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase {
true /*is_backward*/); true /*is_backward*/);
VLOG(5) << "Link outside gradient finished "; VLOG(5) << "Link outside gradient finished ";
if (step_id + 1 == seq_len) { // at_end if (has_state) {
// copy initialize states gradient from inside to outside if (step_id + 1 == seq_len) { // at_end
LinkTensorWithCallback( // copy initialize states gradient from inside to outside
cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)), LinkTensorWithCallback(
scope, Outputs(kInitStateGrads), cur_scope,
[&](const framework::LoDTensor &inside, GradVarLists(Attr<std::vector<std::string>>(kExStates)), scope,
framework::LoDTensor *outside) { Outputs(kInitStateGrads),
outside->Resize(inside.dims()); [&](const framework::LoDTensor &inside,
outside->mutable_data(place, inside.type()); framework::LoDTensor *outside) {
framework::TensorCopy(inside, place, dev_ctx, outside); outside->Resize(inside.dims());
}, outside->mutable_data(place, inside.type());
true /*is_backward*/); framework::TensorCopy(inside, place, dev_ctx, outside);
VLOG(5) << "Link initialize state gradient finished "; },
true /*is_backward*/);
VLOG(5) << "Link initialize state gradient finished ";
}
} }
scopes.Next(); scopes.Next();
} }
// Delete the scope of StepScopes
dev_ctx.Wait();
auto *var = scope.FindVar(Input(kStepScopes));
PADDLE_ENFORCE(var != nullptr);
auto step_scopes = var->GetMutable<StepScopeVar>();
for (auto *sub_scope : *step_scopes) {
const_cast<framework::Scope &>(scope).DeleteScope(sub_scope);
}
} }
private: private:
...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { ...@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddOutput(kStepScopes, AddOutput(kStepScopes,
"StepScopes contain all local variables in each time step."); "StepScopes contain all local variables in each time step.");
AddAttr<bool>(kHasStates, "Whether has states.").SetDefault(false);
AddAttr<std::vector<std::string>>(kExStates, AddAttr<std::vector<std::string>>(kExStates,
string::Sprintf( string::Sprintf(
R"DOC(The ex-state variable names. R"DOC(The ex-state variable names.
...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
class RecurrentGradOpShapeInference : public framework::InferShapeBase { class RecurrentGradOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
std::vector<std::string> input{kInputs, kInitialStates};
std::vector<std::string> output{kOutputs}; std::vector<std::string> output{kOutputs};
for (auto &s : input) {
// NOTE(zcd): In some case, some of kInputs doesn't have gradient. // In some cases the kInitialStates is empty.
PADDLE_ENFORCE(ctx->HasInputs(s)); // If the kInitialStates is empty, all the states should be empty.
} if (!ctx->HasInputs(kInitialStates)) {
for (auto &s : output) { PADDLE_ENFORCE_EQ(
PADDLE_ENFORCE(ctx->HasInputs(s)); ctx->Attrs().Get<std::vector<std::string>>(kExStates).size(), 0,
"The Attr(%s) should be empty.", kExStates);
PADDLE_ENFORCE_EQ(
ctx->Attrs().Get<std::vector<std::string>>(kStates).size(), 0,
"The Attr(%s) should be empty.", kStates);
} }
for (auto &s : input) {
ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); PADDLE_ENFORCE(ctx->HasInputs(kInputs),
"The input(%s) should not be empty.", kInputs);
PADDLE_ENFORCE(ctx->HasInputs(kOutputs),
"The input(%s) should not be empty.", kOutputs);
// In some cases the kInitialStates is empty.
if (ctx->HasInputs(kInitialStates)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInitialStates));
ctx->SetOutputsDim(framework::GradVarName(kInitialStates),
ctx->GetInputsDim(kInitialStates));
} }
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)),
"The output of(%s) should not be empty.",
framework::GradVarName(kInputs));
ctx->SetOutputsDim(framework::GradVarName(kInputs),
ctx->GetInputsDim(kInputs));
// In some cases the kParameters is empty.
if (ctx->HasInputs(kParameters)) { if (ctx->HasInputs(kParameters)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)),
"The output of(%s) should not be empty.",
framework::GradVarName(kParameters));
ctx->SetOutputsDim(framework::GradVarName(kParameters), ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters)); ctx->GetInputsDim(kParameters));
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
REGISTER_REDUCE_OP(reduce_all);
REGISTER_OP_CPU_KERNEL(reduce_all,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AllFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_all,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
bool, ops::AllFunctor>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
struct AllFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->all(dim);
}
};
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
REGISTER_REDUCE_OP(reduce_any);
REGISTER_OP_CPU_KERNEL(reduce_any,
ops::ReduceKernel<paddle::platform::CPUDeviceContext,
bool, ops::AnyFunctor>);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
REGISTER_OP_CUDA_KERNEL(reduce_any,
ops::ReduceKernel<paddle::platform::CUDADeviceContext,
bool, ops::AnyFunctor>);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
namespace paddle {
namespace operators {
struct AnyFunctor {
template <typename DeviceContext, typename X, typename Y, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
y->device(place) = x->any(dim);
}
};
} // namespace operators
} // namespace paddle
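AllFunctor and AnyFunctor delegate to Eigen's boolean reductions: reduce_all is a logical AND and reduce_any a logical OR over the reduced dimensions. A plain C++ sketch of the row-wise case (reducing dim 1 of a [rows x cols] bool matrix); the names are illustrative only, not part of the operator API:

#include <algorithm>
#include <vector>

// Reduce each row of a row-major [rows x cols] bool matrix.
// is_all == true  -> logical AND (reduce_all semantics)
// is_all == false -> logical OR  (reduce_any semantics)
std::vector<bool> ReduceRows(const std::vector<bool>& x, int rows, int cols,
                             bool is_all) {
  std::vector<bool> y(rows);
  for (int r = 0; r < rows; ++r) {
    const auto first = x.begin() + r * cols;
    const auto last = first + cols;
    y[r] = is_all ? std::all_of(first, last, [](bool v) { return v; })
                  : std::any_of(first, last, [](bool v) { return v; });
  }
  return y;
}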
...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase { ...@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
"Cannot find out_var in scope, out_var_name is %s", "Cannot find out_var in scope, out_var_name is %s",
out_name); out_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
auto *out_tensor = out_var->GetMutable<framework::LoDTensor>(); auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
auto &mem_tensor = mem_var->Get<framework::LoDTensor>(); auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
framework::TensorCopySync(mem_tensor, dev_place, out_tensor); framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor);
out_tensor->set_lod(mem_tensor.lod()); out_tensor->set_lod(mem_tensor.lod());
} }
}; };
...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
"Cannot find in_grad_var in scope, name is %s", "Cannot find in_grad_var in scope, name is %s",
in_grad_var_name); in_grad_var_name);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
if (out_grad_var == nullptr) { if (out_grad_var == nullptr) {
VLOG(5) << "Using fill constant 0 as starting gradient"; VLOG(5) << "Using fill constant 0 as starting gradient";
auto in_var_name = Input("X"); auto in_var_name = Input("X");
...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { ...@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
} else { } else {
auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>(); auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>(); auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx,
in_grad_tensor);
in_grad_tensor->set_lod(out_grad_tensor.lod()); in_grad_tensor->set_lod(out_grad_tensor.lod());
} }
} }
......
...@@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> { ...@@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
auto* x_g = context.Output<Tensor>(framework::GradVarName("X")); auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
auto* y_g = context.Output<Tensor>(framework::GradVarName("Y")); auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));
PADDLE_ENFORCE_NOT_NULL(x_g);
PADDLE_ENFORCE_NOT_NULL(y_g);
auto sub_result = EigenMatrix<T>::From(*in0); auto sub_result = EigenMatrix<T>::From(*in0);
auto out_grad = EigenMatrix<T>::From(*in1); auto out_grad = EigenMatrix<T>::From(*in1);
...@@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> { ...@@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
// propagate back to input // propagate back to input
auto& eigen_place = auto& eigen_place =
*context.template device_context<DeviceContext>().eigen_device(); *context.template device_context<DeviceContext>().eigen_device();
if (x_g) {
x_g->mutable_data<T>(context.GetPlace());
// eigen matrix
auto x_grad =
EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
// dimensions are same with subResult
x_grad.device(eigen_place) = grad_mat;
}
if (y_g) { x_g->mutable_data<T>(context.GetPlace());
y_g->mutable_data<T>(context.GetPlace()); // eigen matrix
auto x_grad =
PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
"First dimension of gradient must be greater or " // dimensions are same with subResult
"equal than first dimension of target."); x_grad.device(eigen_place) = grad_mat;
if (sub_result.dimensions()[0] == y_dims[0]) { y_g->mutable_data<T>(context.GetPlace());
auto y_grad =
EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols})); PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
y_grad.device(eigen_place) = -1 * grad_mat; "First dimension of gradient must be greater or "
} else { "equal than first dimension of target.");
auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
auto y_grad = EigenVector<T>::Flatten(*y_g); if (sub_result.dimensions()[0] == y_dims[0]) {
y_grad.device(eigen_place) = col_sum_res; auto y_grad =
} EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
y_grad.device(eigen_place) = -1 * grad_mat;
} else {
auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
auto y_grad = EigenVector<T>::Flatten(*y_g);
y_grad.device(eigen_place) = col_sum_res;
} }
} }
}; };
......
...@@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, ...@@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name,
return; return;
} }
#define PrintLoDTensorCallback(cpp_type, proto_type) \ framework::LoDTensor printed_tensor;
do { \ printed_tensor.set_lod(tensor->lod());
if (tensor->type() == proto_type) { \ printed_tensor.Resize(tensor->dims());
print_lod_tensor<cpp_type>(var_name, *tensor, print_info); \ if (platform::is_cpu_place(tensor->place())) {
return; \ printed_tensor.ShareDataWith(*tensor);
} \ } else {
platform::CPUPlace place;
framework::TensorCopy(*tensor, place, &printed_tensor);
}
#define PrintLoDTensorCallback(cpp_type, proto_type) \
do { \
if (tensor->type() == proto_type) { \
print_lod_tensor<cpp_type>(var_name, printed_tensor, print_info); \
return; \
} \
} while (0) } while (0)
_ForEachDataType_(PrintLoDTensorCallback); _ForEachDataType_(PrintLoDTensorCallback);
VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type(); VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type();
} }
} // end namespace platform } // end namespace platform
......
...@@ -1366,6 +1366,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1366,6 +1366,10 @@ All parameter, weight, gradient are variables in Paddle.
"cache_runtime_context", "cache_runtime_context",
[](const BuildStrategy &self) { return self.cache_runtime_context_; }, [](const BuildStrategy &self) { return self.cache_runtime_context_; },
[](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
.def_property(
"cache_expected_kernel",
[](const BuildStrategy &self) { return self.cache_expected_kernel_; },
[](BuildStrategy &self, bool b) { self.cache_expected_kernel_ = b; })
.def("_finalize_strategy_and_create_passes", .def("_finalize_strategy_and_create_passes",
[](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> { [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
return self.CreatePassesFromStrategy(true); return self.CreatePassesFromStrategy(true);
......
...@@ -202,6 +202,7 @@ function cmake_gen() { ...@@ -202,6 +202,7 @@ function cmake_gen() {
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
-DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF}
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}
...@@ -234,6 +235,7 @@ EOF ...@@ -234,6 +235,7 @@ EOF
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
-DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} \
-DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
-DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
-DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
......
...@@ -175,6 +175,7 @@ def __bootstrap__(): ...@@ -175,6 +175,7 @@ def __bootstrap__():
read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_thread_pool_size')
read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_max_merge_var_num')
read_env_flags.append('communicator_fake_rpc') read_env_flags.append('communicator_fake_rpc')
read_env_flags.append('communicator_send_wait_times')
if core.is_compiled_with_brpc(): if core.is_compiled_with_brpc():
read_env_flags.append('max_body_size') read_env_flags.append('max_body_size')
#set brpc max body size #set brpc max body size
......
...@@ -136,6 +136,7 @@ class DatasetBase(object): ...@@ -136,6 +136,7 @@ class DatasetBase(object):
slot_var.name = var.name slot_var.name = var.name
if var.lod_level == 0: if var.lod_level == 0:
slot_var.is_dense = True slot_var.is_dense = True
slot_var.shape.extend(var.shape)
if var.dtype == core.VarDesc.VarType.FP32: if var.dtype == core.VarDesc.VarType.FP32:
slot_var.type = "float" slot_var.type = "float"
elif var.dtype == core.VarDesc.VarType.INT64: elif var.dtype == core.VarDesc.VarType.INT64:
......
...@@ -26,8 +26,8 @@ class DeviceWorker(object): ...@@ -26,8 +26,8 @@ class DeviceWorker(object):
""" """
Init. Init.
""" """
self.program_ = None self._program = None
self.infer_ = None self._infer = None
def _set_infer(self, infer=False): def _set_infer(self, infer=False):
""" """
...@@ -36,7 +36,7 @@ class DeviceWorker(object): ...@@ -36,7 +36,7 @@ class DeviceWorker(object):
Args: Args:
infer(bool): whether to do inference infer(bool): whether to do inference
""" """
self.infer_ = infer self._infer = infer
def _set_fleet_desc(self, fleet_desc): def _set_fleet_desc(self, fleet_desc):
""" """
...@@ -45,7 +45,7 @@ class DeviceWorker(object): ...@@ -45,7 +45,7 @@ class DeviceWorker(object):
Args: Args:
fleet_desc(PSParameter): pslib.PSParameter object fleet_desc(PSParameter): pslib.PSParameter object
""" """
self.fleet_desc_ = fleet_desc self._fleet_desc = fleet_desc
def _set_program(self, program): def _set_program(self, program):
""" """
...@@ -54,7 +54,7 @@ class DeviceWorker(object): ...@@ -54,7 +54,7 @@ class DeviceWorker(object):
Args: Args:
program(Program): a Program object program(Program): a Program object
""" """
self.program_ = program self._program = program
def _gen_worker_desc(self, trainer_desc): def _gen_worker_desc(self, trainer_desc):
""" """
...@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker): ...@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker):
trainer_desc(TrainerDesc): a TrainerDesc object trainer_desc(TrainerDesc): a TrainerDesc object
""" """
trainer_desc.device_worker_name = "HogwildWorker" trainer_desc.device_worker_name = "HogwildWorker"
if self.infer_: if self._infer:
# just ignore feed op for inference model # just ignore feed op for inference model
trainer_desc.hogwild_param.skip_ops.extend(["feed"]) trainer_desc.hogwild_param.skip_ops.extend(["feed"])
...@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker): ...@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker):
trainer_desc(TrainerDesc): a TrainerDesc object trainer_desc(TrainerDesc): a TrainerDesc object
""" """
dense_table_set = set() dense_table_set = set()
program_id = str(id(self.program_)) program_id = str(id(self._program))
if self.program_ == None: if self._program == None:
print("program of current device worker is not configured") print("program of current device worker is not configured")
exit(-1) exit(-1)
opt_info = self.program_._fleet_opt opt_info = self._program._fleet_opt
program_configs = opt_info["program_configs"] program_configs = opt_info["program_configs"]
downpour = trainer_desc.downpour_param downpour = trainer_desc.downpour_param
...@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker): ...@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker):
trainer_desc.device_worker_name = "DownpourWorker" trainer_desc.device_worker_name = "DownpourWorker"
pull_thread = trainer_desc.pull_dense_param pull_thread = trainer_desc.pull_dense_param
pull_thread.device_num = trainer_desc.thread_num pull_thread.device_num = trainer_desc.thread_num
for i in self.fleet_desc_.trainer_param.dense_table: for i in self._fleet_desc.trainer_param.dense_table:
if i.table_id in dense_table_set: if i.table_id in dense_table_set:
dense_table = pull_thread.dense_table.add() dense_table = pull_thread.dense_table.add()
dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_value_name.extend(i.dense_variable_name)
...@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker): ...@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker):
i.table_id i.table_id
sparse_table = downpour.sparse_table.add() sparse_table = downpour.sparse_table.add()
sparse_table.table_id = \ sparse_table.table_id = \
self.fleet_desc_.trainer_param.sparse_table[0].table_id self._fleet_desc.trainer_param.sparse_table[0].table_id
sparse_table.sparse_key_name.extend( sparse_table.sparse_key_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_key) self._fleet_desc.trainer_param.sparse_table[0].slot_key)
sparse_table.sparse_value_name.extend( sparse_table.sparse_value_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_value) self._fleet_desc.trainer_param.sparse_table[0].slot_value)
sparse_table.sparse_grad_name.extend( sparse_table.sparse_grad_name.extend(
self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient) self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
sparse_table.emb_dim = \ sparse_table.emb_dim = \
self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[ self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
0].accessor.fea_dim - 2 0].accessor.fea_dim - 2
sparse_table.fea_dim = sparse_table.emb_dim + 2 sparse_table.fea_dim = sparse_table.emb_dim + 2
# TODO(guru4elephant): hard code here, need to improve # TODO(guru4elephant): hard code here, need to improve
sparse_table.label_var_name = "click" sparse_table.label_var_name = "click"
for i in self.fleet_desc_.trainer_param.dense_table: for i in self._fleet_desc.trainer_param.dense_table:
if i.table_id in dense_table_set: if i.table_id in dense_table_set:
dense_table = downpour.dense_table.add() dense_table = downpour.dense_table.add()
dense_table.table_id = i.table_id dense_table.table_id = i.table_id
dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_value_name.extend(i.dense_variable_name)
dense_table.dense_grad_name.extend( dense_table.dense_grad_name.extend(
i.dense_gradient_variable_name) i.dense_gradient_variable_name)
downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op) downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op)
if self.infer_: if self._infer:
downpour.push_dense = False downpour.push_dense = False
downpour.push_sparse = False downpour.push_sparse = False
......
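For orientation, the renamed attributes are all set through the small `_set_*` helpers before `_gen_worker_desc` fills in the protobuf. A rough usage sketch, assuming a TrainerDesc protobuf instance and a pslib `fleet_desc` have already been built elsewhere:

    worker = DownpourSGD()                 # a DeviceWorker subclass
    worker._set_program(main_program)      # program must carry _fleet_opt, else exit(-1)
    worker._set_fleet_desc(fleet_desc)     # pslib.PSParameter describing the tables
    worker._set_infer(True)                # inference: push_dense/push_sparse stay off
    worker._gen_worker_desc(trainer_desc)  # fills downpour_param, pull_dense_param, ...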
...@@ -712,10 +712,6 @@ class Executor(object): ...@@ -712,10 +712,6 @@ class Executor(object):
if dataset == None: if dataset == None:
raise RuntimeError("dataset is needed and should be initialized") raise RuntimeError("dataset is needed and should be initialized")
if not isinstance(self.place, core.CPUPlace):
raise RuntimeError("infer_from_dataset is verified on CPUPlace"
"We will open CUDAPlace in the future")
scope, trainer = self._prepare_trainer( scope, trainer = self._prepare_trainer(
program=program, program=program,
dataset=dataset, dataset=dataset,
...@@ -796,10 +792,6 @@ class Executor(object): ...@@ -796,10 +792,6 @@ class Executor(object):
if dataset == None: if dataset == None:
raise RuntimeError("dataset is need and should be initialized") raise RuntimeError("dataset is need and should be initialized")
if not isinstance(self.place, core.CPUPlace):
raise RuntimeError("train_from_dataset is verified on CPUPlace"
"We will open CUDAPlace in the future")
scope, trainer = self._prepare_trainer( scope, trainer = self._prepare_trainer(
program=program, program=program,
dataset=dataset, dataset=dataset,
......
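With the `CPUPlace` guard removed, `train_from_dataset` and `infer_from_dataset` are no longer rejected on other places; the only hard requirement left is an initialized dataset. A hedged end-to-end sketch (dataset setup per the fluid Dataset API; `x`, `y` and the file name are placeholders):

    import paddle.fluid as fluid

    exe = fluid.Executor(fluid.CPUPlace())   # other places are no longer rejected here
    exe.run(fluid.default_startup_program())

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_use_var([x, y])              # the feed variables of the network
    dataset.set_filelist(["train_0.txt"])    # hypothetical data file
    dataset.load_into_memory()

    exe.train_from_dataset(program=fluid.default_main_program(), dataset=dataset)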
...@@ -23,10 +23,10 @@ class RoleMakerBase(object): ...@@ -23,10 +23,10 @@ class RoleMakerBase(object):
""" """
def __init__(self): def __init__(self):
self.role_maker_name_ = "" self._role_maker_name = ""
self.trainer_endpoints_ = [] self._trainer_endpoints = []
self.pserver_endpoints_ = [] self._pserver_endpoints = []
self.role_is_generated_ = False self._role_is_generated = False
def _is_worker(self): def _is_worker(self):
""" """
...@@ -45,20 +45,20 @@ class RoleMakerBase(object): ...@@ -45,20 +45,20 @@ class RoleMakerBase(object):
get and return the local ip get and return the local ip
""" """
import socket import socket
self.ip_ = socket.gethostbyname(socket.gethostname()) self._ip = socket.gethostbyname(socket.gethostname())
return self.ip_ return self._ip
def _get_trainer_endpoints(self): def _get_trainer_endpoints(self):
""" """
return trainer endpoints return trainer endpoints
""" """
return self.trainer_endpoints_ return self._trainer_endpoints
def _get_pserver_endpoints(self): def _get_pserver_endpoints(self):
""" """
return pserver endpoints return pserver endpoints
""" """
return self.pserver_endpoints_ return self._pserver_endpoints
def _generate_role(self): def _generate_role(self):
""" """
...@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase): ...@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase):
def __init__(self): def __init__(self):
super(MPIRoleMaker, self).__init__() super(MPIRoleMaker, self).__init__()
from mpi4py import MPI from mpi4py import MPI
self.comm_ = MPI.COMM_WORLD self._comm = MPI.COMM_WORLD
self.MPI = MPI self.MPI = MPI
self.ips_ = None self._ips = None
def _get_rank(self): def _get_rank(self):
""" """
return rank return rank
""" """
self.rank_ = self.comm_.Get_rank() self._rank = self._comm.Get_rank()
return self.rank_ return self._rank
def _get_size(self): def _get_size(self):
""" """
return size return size
""" """
self.size_ = self.comm_.Get_size() self._size = self._comm.Get_size()
return self.size_ return self._size
def _all_gather(self, obj): def _all_gather(self, obj):
""" """
all_gather(obj) will call MPI's allgather function all_gather(obj) will call MPI's allgather function
""" """
self._barrier_all() self._barrier_all()
return self.comm_.allgather(obj) return self._comm.allgather(obj)
def _worker_gather(self, obj): def _worker_gather(self, obj):
""" """
worker_gather(obj) will call MPI's allgather function worker_gather(obj) will call MPI's allgather function
""" """
if self._is_worker(): if self._is_worker():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
return self.node_type_comm_.allgather(obj) return self._node_type_comm.allgather(obj)
return None return None
def _barrier_all(self): def _barrier_all(self):
""" """
barrier_all() will call MPI's barrier_all function barrier_all() will call MPI's barrier_all function
""" """
self.comm_.barrier() self._comm.barrier()
def _get_ips(self): def _get_ips(self):
""" """
collect current distributed job's ip list collect current distributed job's ip list
""" """
if self.ips_ == None: if self._ips == None:
self.ips_ = self.comm_.allgather(self._get_local_ip()) self._ips = self._comm.allgather(self._get_local_ip())
return self.ips_ return self._ips
def _finalize(self): def _finalize(self):
""" """
finalize the current MPI instance. finalize the current MPI instance.
""" """
self.comm_.finalize() self._comm.finalize()
class MPISymetricRoleMaker(MPIRoleMaker): class MPISymetricRoleMaker(MPIRoleMaker):
...@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker):
def __init__(self): def __init__(self):
super(MPISymetricRoleMaker, self).__init__() super(MPISymetricRoleMaker, self).__init__()
self.node_type_ = None self._node_type = None
self.proc_per_node_ = 2 self._proc_per_node = 2
def _check_role_generation(self): def _check_role_generation(self):
if not self.role_is_generated_: if not self._role_is_generated:
sys.stderr.write("generate_role() should be called first") sys.stderr.write("generate_role() should be called first")
sys.exit(-1) sys.exit(-1)
return False return False
...@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return whether current process is worker assigned by role maker return whether current process is worker assigned by role maker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.node_type_ == 1 return self._node_type == 1
return False return False
def _is_server(self): def _is_server(self):
...@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return whether current process is server assigned by role maker return whether current process is server assigned by role maker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.node_type_ == 0 return self._node_type == 0
return False return False
def _worker_num(self): def _worker_num(self):
...@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return the index of worker return the index of worker
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.rank_ / self.proc_per_node_ return self._rank / self._proc_per_node
return 0 return 0
def _server_index(self): def _server_index(self):
...@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
return the index of server return the index of server
""" """
if self._check_role_generation(): if self._check_role_generation():
return self.rank_ / self.proc_per_node_ return self._rank / self._proc_per_node
return 0 return 0
def _barrier_worker(self): def _barrier_worker(self):
...@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
""" """
if self._check_role_generation(): if self._check_role_generation():
if self._is_worker(): if self._is_worker():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
def _barrier_server(self): def _barrier_server(self):
""" """
...@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker): ...@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker):
""" """
if self._check_role_generation(): if self._check_role_generation():
if self._is_server(): if self._is_server():
self.node_type_comm_.barrier() self._node_type_comm.barrier()
def _generate_role(self): def _generate_role(self):
""" """
generate the current process's role generate the current process's role
""" """
if not self.role_is_generated_: if not self._role_is_generated:
# TODO(guru4elephant): only allow to be called once # TODO(guru4elephant): only allow to be called once
self.trainer_endpoints_ = self._get_ips() self._trainer_endpoints = self._get_ips()
self.pserver_endpoints_ = self._get_ips() self._pserver_endpoints = self._get_ips()
if 0 == self._get_rank() % self.proc_per_node_ % 2: if 0 == self._get_rank() % self._proc_per_node % 2:
self.node_type_ = 0 self._node_type = 0
else: else:
self.node_type_ = 1 self._node_type = 1
self.node_type_comm_ = self.comm_.Split(self.node_type_) self._node_type_comm = self._comm.Split(self._node_type)
self.role_is_generated_ = True self._role_is_generated = True
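The role split in `_generate_role` is pure arithmetic on the MPI rank: with `_proc_per_node = 2`, even ranks become servers (`node_type` 0) and odd ranks become workers, while `_worker_index`/`_server_index` divide the rank by `_proc_per_node`. A quick illustration:

    proc_per_node = 2
    for rank in range(6):
        node_type = 0 if rank % proc_per_node % 2 == 0 else 1
        index = rank // proc_per_node   # the real code uses '/', which is fine on Python 2
        print(rank, "server" if node_type == 0 else "worker", index)
    # ranks 0/2/4 -> servers 0/1/2, ranks 1/3/5 -> workers 0/1/2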
...@@ -64,9 +64,9 @@ class Fleet(object): ...@@ -64,9 +64,9 @@ class Fleet(object):
def __init__(self): def __init__(self):
self._opt_info = None # for fleet only self._opt_info = None # for fleet only
self.role_maker_ = None self._role_maker = None
self.local_ip_ = 0 self._local_ip = 0
self.is_initialized_ = False self._is_initialized = False
def init(self): def init(self):
# TODO(guru4elephant) # TODO(guru4elephant)
...@@ -78,22 +78,22 @@ class Fleet(object): ...@@ -78,22 +78,22 @@ class Fleet(object):
current node's role, e.g. worker, server, etc. current node's role, e.g. worker, server, etc.
""" """
if not self.is_initialized_: if not self._is_initialized:
self.role_maker_ = MPISymetricRoleMaker() self._role_maker = MPISymetricRoleMaker()
self.role_maker_._generate_role() self._role_maker._generate_role()
self._fleet_ptr = fluid.core.Fleet() self._fleet_ptr = fluid.core.Fleet()
self.is_initialized_ = True self._is_initialized = True
def stop(self): def stop(self):
""" """
stop(): will be called after a user finishes his/her training task. Fleet instance will be stop(): will be called after a user finishes his/her training task. Fleet instance will be
destroyed when stop() is called. destroyed when stop() is called.
""" """
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
self._fleet_ptr.stop_server() self._fleet_ptr.stop_server()
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.role_maker_._finalize() self._role_maker._finalize()
def init_pserver(self): def init_pserver(self):
""" """
...@@ -110,15 +110,15 @@ class Fleet(object): ...@@ -110,15 +110,15 @@ class Fleet(object):
sys.exit(-1) sys.exit(-1)
self._fleet_ptr.init_server(self._dist_desc_str, self._fleet_ptr.init_server(self._dist_desc_str,
self.role_maker_._get_rank()) self._role_maker._get_rank())
self.local_ip_ = self._fleet_ptr.run_server() self._local_ip = self._fleet_ptr.run_server()
# barrier_all for init_server # barrier_all for init_server
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) self._all_ips = self._role_maker._all_gather(self._local_ip)
self._fleet_ptr.gather_servers(self.all_ips_, self._fleet_ptr.gather_servers(self._all_ips,
self.role_maker_._get_size()) self._role_maker._get_size())
# barrier_all for init_worker, wait all workers start # barrier_all for init_worker, wait all workers start
self.role_maker_._barrier_all() self._role_maker._barrier_all()
else: else:
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
...@@ -151,21 +151,21 @@ class Fleet(object): ...@@ -151,21 +151,21 @@ class Fleet(object):
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
# barrier_all for init_server, wait for server starts # barrier_all for init_server, wait for server starts
self.role_maker_._barrier_all() self._role_maker._barrier_all()
self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) self._all_ips = self._role_maker._all_gather(self._local_ip)
self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips,
self.role_maker_._get_size(), self._role_maker._get_size(),
self.role_maker_._get_rank()) self._role_maker._get_rank())
# barrier_all for init_worker # barrier_all for init_worker
self.role_maker_._barrier_all() self._role_maker._barrier_all()
# prepare for client to client communication # prepare for client to client communication
info = self._fleet_ptr.get_clients_info() info = self._fleet_ptr.get_clients_info()
all_info = self.role_maker_._worker_gather(info[0]) all_info = self._role_maker._worker_gather(info[0])
self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.gather_clients(all_info)
self._fleet_ptr.create_client2client_connection() self._fleet_ptr.create_client2client_connection()
# barrier for init model # barrier for init model
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
tables = self._dist_desc.trainer_param.dense_table tables = self._dist_desc.trainer_param.dense_table
for prog, scope in zip(programs, scopes): for prog, scope in zip(programs, scopes):
prog_id = str(id(prog)) prog_id = str(id(prog))
...@@ -192,7 +192,7 @@ class Fleet(object): ...@@ -192,7 +192,7 @@ class Fleet(object):
int(table.table_id), int(table.table_id),
var_name_list) var_name_list)
# barrier for init model done # barrier for init model done
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
else: else:
print("You should run DistributedOptimizer.minimize() first") print("You should run DistributedOptimizer.minimize() first")
sys.exit(-1) sys.exit(-1)
...@@ -201,39 +201,39 @@ class Fleet(object): ...@@ -201,39 +201,39 @@ class Fleet(object):
""" """
return the worker number of the current job return the worker number of the current job
""" """
return self.role_maker_._worker_num() return self._role_maker._worker_num()
def get_server_num(self): def get_server_num(self):
""" """
return the server number of the current job return the server number of the current job
""" """
return self.role_maker_._server_num() return self._role_maker._server_num()
def get_worker_index(self): def get_worker_index(self):
""" """
return the mpi rank of current worker return the mpi rank of current worker
""" """
return self.role_maker_._worker_index() return self._role_maker._worker_index()
def is_worker(self): def is_worker(self):
""" """
return whether current node is a worker return whether current node is a worker
""" """
return self.role_maker_._is_worker() return self._role_maker._is_worker()
def is_server(self): def is_server(self):
""" """
return whether current node is pserver return whether current node is pserver
""" """
return self.role_maker_._is_server() return self._role_maker._is_server()
def init_pserver_model(self): def init_pserver_model(self):
""" """
init pserver model called from pserver init pserver model called from pserver
""" """
if self.role_maker_._is_first_worker(): if self._role_maker._is_first_worker():
self._fleet_ptr.init_model() self._fleet_ptr.init_model()
self.role_maker_._barrier_worker() self._role_maker._barrier_worker()
def save_pserver_model(self, save_path): def save_pserver_model(self, save_path):
""" """
......
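Taken together, the renamed members trace the intended lifecycle of a `Fleet` instance. A hedged sketch, assuming the module-level `fleet` singleton and that `DistributedOptimizer.minimize()` has already produced the distributed descriptors (the `init_worker(programs, scopes)` signature is inferred from the loop over `programs`/`scopes` above):

    fleet.init()                  # builds MPISymetricRoleMaker and generates roles
    if fleet.is_server():
        fleet.init_pserver()      # starts the server, gathers all server IPs
    elif fleet.is_worker():
        fleet.init_worker(programs, scopes)  # connects to servers; first worker inits model
        # ... run training ...
    fleet.stop()                  # first worker stops servers, then MPI finalizes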
...@@ -42,13 +42,13 @@ class DownpourServer(Server): ...@@ -42,13 +42,13 @@ class DownpourServer(Server):
""" """
def __init__(self): def __init__(self):
self.server_ = pslib.ServerParameter() self._server = pslib.ServerParameter()
self.server_.downpour_server_param.service_param.start_server_port = 0 self._server.downpour_server_param.service_param.start_server_port = 0
self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
self.server_.downpour_server_param.service_param.start_server_port = 0 self._server.downpour_server_param.service_param.start_server_port = 0
self.server_.downpour_server_param.service_param.server_thread_num = 12 self._server.downpour_server_param.service_param.server_thread_num = 12
def add_sparse_table(self, table_id, learning_rate, slot_key_vars, def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_var): slot_value_var):
...@@ -62,7 +62,7 @@ class DownpourServer(Server): ...@@ -62,7 +62,7 @@ class DownpourServer(Server):
Returns: Returns:
return None return None
""" """
table = self.server_.downpour_server_param.downpour_table_param.add() table = self._server.downpour_server_param.downpour_table_param.add()
table.table_id = table_id table.table_id = table_id
table.table_class = "DownpourSparseTable" table.table_class = "DownpourSparseTable"
table.type = pslib.PS_SPARSE_TABLE table.type = pslib.PS_SPARSE_TABLE
...@@ -123,7 +123,7 @@ class DownpourServer(Server): ...@@ -123,7 +123,7 @@ class DownpourServer(Server):
Returns: Returns:
return None return None
""" """
table = self.server_.downpour_server_param.downpour_table_param.add() table = self._server.downpour_server_param.downpour_table_param.add()
table.table_id = table_id table.table_id = table_id
table.table_class = "DownpourDenseTable" table.table_class = "DownpourDenseTable"
table.type = pslib.PS_DENSE_TABLE table.type = pslib.PS_DENSE_TABLE
...@@ -140,7 +140,7 @@ class DownpourServer(Server): ...@@ -140,7 +140,7 @@ class DownpourServer(Server):
""" """
Return downpour server program_desc Return downpour server program_desc
""" """
return self.server_ return self._server
class DownpourWorker(Worker): class DownpourWorker(Worker):
...@@ -155,7 +155,7 @@ class DownpourWorker(Worker): ...@@ -155,7 +155,7 @@ class DownpourWorker(Worker):
def __init__(self, window): def __init__(self, window):
self.window = window self.window = window
self.worker_ = pslib.DownpourTrainerParameter() self._worker = pslib.DownpourTrainerParameter()
def add_sparse_table(self, table_id, learning_rate, slot_key_vars, def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
slot_value_vars): slot_value_vars):
...@@ -187,7 +187,7 @@ class DownpourWorker(Worker): ...@@ -187,7 +187,7 @@ class DownpourWorker(Worker):
Returns: Returns:
return None return None
""" """
table = self.worker_.dense_table.add() table = self._worker.dense_table.add()
table.table_id = table_id table.table_id = table_id
table.dense_variable_name.extend( table.dense_variable_name.extend(
filter(lambda x: x.find("embedding") == -1, filter(lambda x: x.find("embedding") == -1,
...@@ -200,4 +200,4 @@ class DownpourWorker(Worker): ...@@ -200,4 +200,4 @@ class DownpourWorker(Worker):
""" """
Return downpour worker program_desc Return downpour worker program_desc
""" """
return self.worker_ return self._worker
...@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer ...@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer
class DistributedOptimizerImplBase(object): class DistributedOptimizerImplBase(object):
def __init__(self, optimizer): def __init__(self, optimizer):
self.optimizer_ = optimizer self._optimizer = optimizer
self.learning_rate_ = optimizer._learning_rate self._learning_rate = optimizer._learning_rate
self.regularization_ = optimizer.regularization self._regularization = optimizer.regularization
def minimize(self, def minimize(self,
losses, losses,
...@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
# todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): add more optimizers here as argument
# todo(guru4elephant): make learning_rate as a variable # todo(guru4elephant): make learning_rate as a variable
super(DistributedAdam, self).__init__(optimizer) super(DistributedAdam, self).__init__(optimizer)
self.window_ = 1 self._window = 1
self.type = "downpour" self.type = "downpour"
self.data_norm_name = [ self.data_norm_name = [
".batch_size", ".batch_square_sum", ".batch_sum", ".batch_size", ".batch_square_sum", ".batch_sum",
...@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
server = DownpourServer() server = DownpourServer()
worker = DownpourWorker(self.window_) worker = DownpourWorker(self._window)
sparse_table_index = 0 sparse_table_index = 0
server.add_sparse_table(sparse_table_index, self.learning_rate_, server.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb) prefetch_slots, prefetch_slots_emb)
worker.add_sparse_table(sparse_table_index, self.learning_rate_, worker.add_sparse_table(sparse_table_index, self._learning_rate,
prefetch_slots, prefetch_slots_emb) prefetch_slots, prefetch_slots_emb)
dense_table_index = 1 dense_table_index = 1
program_configs = {} program_configs = {}
...@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
data_norm_grads.append(i[1]) data_norm_grads.append(i[1])
if not is_data_norm_data: if not is_data_norm_data:
grads.append(i[1]) grads.append(i[1])
server.add_dense_table(dense_table_index, self.learning_rate_, server.add_dense_table(dense_table_index, self._learning_rate,
params, grads) params, grads)
worker.add_dense_table(dense_table_index, self.learning_rate_, worker.add_dense_table(dense_table_index, self._learning_rate,
params, grads) params, grads)
program_configs[program_id]["pull_dense"] = [dense_table_index] program_configs[program_id]["pull_dense"] = [dense_table_index]
program_configs[program_id]["push_dense"] = [dense_table_index] program_configs[program_id]["push_dense"] = [dense_table_index]
...@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase): ...@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
if len(data_norm_params) != 0 and len(data_norm_grads) != 0: if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
dense_table_index += 1 dense_table_index += 1
server.add_data_norm_table(dense_table_index, server.add_data_norm_table(dense_table_index,
self.learning_rate_, self._learning_rate,
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
worker.add_dense_table(dense_table_index, self.learning_rate_, worker.add_dense_table(dense_table_index, self._learning_rate,
data_norm_params, data_norm_grads) data_norm_params, data_norm_grads)
#program_config.pull_dense_table_id.extend([dense_table_index]) #program_config.pull_dense_table_id.extend([dense_table_index])
#program_config.push_dense_table_id.extend([dense_table_index]) #program_config.push_dense_table_id.extend([dense_table_index])
......
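The table layout that `DistributedAdam` builds is worth spelling out: one shared sparse table at index 0 (added to both server and worker), then one dense table per program starting at index 1, and finally an optional data-norm table at the next free index. A schematic of the calls, mirroring the method body above (variable names as in that code):

    server.add_sparse_table(0, lr, prefetch_slots, prefetch_slots_emb)
    worker.add_sparse_table(0, lr, prefetch_slots, prefetch_slots_emb)

    dense_table_index = 1
    server.add_dense_table(dense_table_index, lr, params, grads)
    worker.add_dense_table(dense_table_index, lr, params, grads)

    if data_norm_params and data_norm_grads:
        dense_table_index += 1
        server.add_data_norm_table(dense_table_index, lr, data_norm_params, data_norm_grads)
        worker.add_dense_table(dense_table_index, lr, data_norm_params, data_norm_grads)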
...@@ -267,8 +267,44 @@ class StaticRNN(object): ...@@ -267,8 +267,44 @@ class StaticRNN(object):
""" """
StaticRNN class. StaticRNN class.
StaticRNN class is used to create a StaticRNN. The RNN will have its The StaticRNN can process a batch of sequence data. The length of each
own parameters like inputs, outputs, memories, status and length. sample sequence must be equal. The StaticRNN will have its own parameters
like inputs, outputs, memories. **Note that the first dimension of inputs
represents the sequence length, and all input sequences must have the
same length. The meaning of each axis of input and output is the same.**
Examples:
>>> import paddle.fluid as fluid
>>> import paddle.fluid.layers as layers
>>>
>>> vocab_size, hidden_size=10000, 200
>>> x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
>>> x_emb = layers.embedding(
>>> input=x,
>>> size=[vocab_size, hidden_size],
>>> dtype='float32',
>>> is_sparse=False)
>>> x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
>>>
>>> rnn = fluid.layers.StaticRNN()
>>> with rnn.step():
>>> word = rnn.step_input(x_emb)
>>> prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word)
>>> hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu')
>>> rnn.update_memory(prev, hidden) # set prev to hidden
>>> rnn.step_output(hidden)
>>>
>>> result = rnn()
The StaticRNN will unfold the sequence into time steps. Users need to define
how to process each time step during the :code:`with` step.
The :code:`memory` is used to stage data across time steps. The initial
value of memory can be a variable that is filled with a constant value or
a specified variable.
The StaticRNN can mark multiple variables as its output. Use `rnn()` to
get the output sequence.
""" """
BEFORE_RNN_BLOCK = 0 BEFORE_RNN_BLOCK = 0
IN_RNN_BLOCK = 1 IN_RNN_BLOCK = 1
...@@ -284,6 +320,9 @@ class StaticRNN(object): ...@@ -284,6 +320,9 @@ class StaticRNN(object):
self.seq_len = None self.seq_len = None
def step(self): def step(self):
"""
The block for user to define operators in RNN.
"""
return BlockGuardWithCompletion(self) return BlockGuardWithCompletion(self)
def _assert_in_rnn_block_(self, method): def _assert_in_rnn_block_(self, method):
...@@ -298,13 +337,28 @@ class StaticRNN(object): ...@@ -298,13 +337,28 @@ class StaticRNN(object):
init_batch_dim_idx=0, init_batch_dim_idx=0,
ref_batch_dim_idx=1): ref_batch_dim_idx=1):
""" """
Create a memory variable for static rnn.
If the :code:`init` is not None, :code:`memory` will be initialized by
this Variable. If the :code:`init` is None, :code:`shape` and :code:`batch_ref`
must be set, and this function will create the :code:`init` Variable internally.
Args: Args:
init: boot memory, if not set, a shape, batch_ref must be provided init(Variable|None): The initialized variable. If it is not set,
shape: shape of the boot memory :code:`shape` and :code:`batch_ref` must be provided.
batch_ref: batch size reference variable Default: None.
init_value: the init value of boot memory shape(list|tuple): The shape of the boot memory. NOTE the shape
init_batch_dim_idx: the index of batch size in init's dimension does not contain batch_size. Default: None.
ref_batch_dim_idx: the index of batch size in batch_ref's dimension batch_ref(Variable|None): The batch size reference Variable.
Default: None.
init_value(float): the init value of boot memory. Default: 0.0.
init_batch_dim_idx(int): the batch_size axis of the
:code:`init` Variable. Default: 0.
ref_batch_dim_idx(int): the batch_size axis of the
:code:`batch_ref` Variable. Default: 1.
Returns:
The memory variable.
""" """
self._assert_in_rnn_block_('memory') self._assert_in_rnn_block_('memory')
if init is None: if init is None:
...@@ -343,6 +397,16 @@ class StaticRNN(object): ...@@ -343,6 +397,16 @@ class StaticRNN(object):
return pre_mem return pre_mem
def step_input(self, x): def step_input(self, x):
"""
Mark a sequence as a StaticRNN input.
Args:
x(Variable): The input sequence, the shape of x
should be [seq_len, ...].
Returns:
The current time step in the input sequence.
"""
self._assert_in_rnn_block_('step_input') self._assert_in_rnn_block_('step_input')
if not isinstance(x, Variable): if not isinstance(x, Variable):
raise TypeError("step input takes a Variable") raise TypeError("step input takes a Variable")
...@@ -357,6 +421,15 @@ class StaticRNN(object): ...@@ -357,6 +421,15 @@ class StaticRNN(object):
return ipt return ipt
def step_output(self, o): def step_output(self, o):
"""
Mark a sequence as a StaticRNN output.
Args:
o(Variable): The output sequence.
Returns:
None.
"""
self._assert_in_rnn_block_('step_output') self._assert_in_rnn_block_('step_output')
if not isinstance(o, Variable): if not isinstance(o, Variable):
raise TypeError("step output takes a Variable") raise TypeError("step output takes a Variable")
...@@ -376,10 +449,30 @@ class StaticRNN(object): ...@@ -376,10 +449,30 @@ class StaticRNN(object):
self.outputs.append(out_var) self.outputs.append(out_var)
def output(self, *outputs): def output(self, *outputs):
"""
Mark the StaticRNN output variables.
Args:
outputs: The output Variables.
Returns:
None
"""
for each in outputs: for each in outputs:
self.step_output(each) self.step_output(each)
def update_memory(self, mem, var): def update_memory(self, mem, var):
"""
Update the memory from :code:`mem` to :code:`var`. NOTE that the shape and data
type of :code:`mem` and :code:`var` must be the same.
Args:
mem(Variable): the memory variable.
var(Variable): the plain variable generated in RNN block.
Returns:
None
"""
if not isinstance(mem, Variable) or not isinstance(var, Variable): if not isinstance(mem, Variable) or not isinstance(var, Variable):
raise TypeError("update memory should take variables") raise TypeError("update memory should take variables")
self.memories[mem.name].mem = var self.memories[mem.name].mem = var
...@@ -419,6 +512,9 @@ class StaticRNN(object): ...@@ -419,6 +512,9 @@ class StaticRNN(object):
for m in self.memories: for m in self.memories:
local_inputs.add(m) local_inputs.add(m)
# NOTE(zcd): the params have two categories of variables.
# - the variables that are the out of StaticRnn.
# - the variables that are the parameters of some layers, for example, conv2d.
params = list() params = list()
for op in rnn_block.ops: for op in rnn_block.ops:
assert isinstance(op, Operator) assert isinstance(op, Operator)
...@@ -435,17 +531,19 @@ class StaticRNN(object): ...@@ -435,17 +531,19 @@ class StaticRNN(object):
inlinks = [parent_block.var(i.name) for i in self.inputs] inlinks = [parent_block.var(i.name) for i in self.inputs]
outlinks = self.outputs outlinks = self.outputs
# NOTE(zcd): the states maybe empty in some case.
boot_memories = [] boot_memories = []
pre_memories = [] pre_memories = []
memories = [] memories = []
for _, mem in six.iteritems(self.memories): for _, mem in six.iteritems(self.memories):
boot_memories.append(mem.init) boot_memories.append(mem.init)
pre_memories.append(mem.pre_mem.name) pre_memories.append(mem.pre_mem.name)
assert mem.mem is not None, "%s should be updated in every step." % (
mem.init.name)
mem_var = rnn_block.var(mem.mem.name) mem_var = rnn_block.var(mem.mem.name)
assert isinstance(mem_var, Variable) assert isinstance(mem_var, Variable)
new_mem = self.helper.create_variable_for_type_inference( new_mem = self.helper.create_variable_for_type_inference(
dtype=mem_var.dtype) dtype=mem_var.dtype)
rnn_block.append_op( rnn_block.append_op(
type='rnn_memory_helper', type='rnn_memory_helper',
inputs={'X': [mem_var]}, inputs={'X': [mem_var]},
...@@ -464,6 +562,7 @@ class StaticRNN(object): ...@@ -464,6 +562,7 @@ class StaticRNN(object):
outputs={'outputs': outlinks, outputs={'outputs': outlinks,
'step_scopes': [step_scope]}, 'step_scopes': [step_scope]},
attrs={ attrs={
'has_states': len(pre_memories) > 0,
'ex_states': pre_memories, 'ex_states': pre_memories,
'states': memories, 'states': memories,
'sub_block': rnn_block 'sub_block': rnn_block
......
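The two ways of creating a memory described in the new docstring look like this in practice (a sketch in the spirit of the class-level example above; `boot_state` is a hypothetical pre-built variable):

    with rnn.step():
        word = rnn.step_input(x_emb)
        # (a) let StaticRNN build the boot memory from shape + batch_ref
        prev = rnn.memory(shape=[-1, hidden_size], batch_ref=word, init_value=0.0)
        # (b) or initialize it from an existing variable
        # prev = rnn.memory(init=boot_state)
        hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu')
        rnn.update_memory(prev, hidden)   # every memory must be updated each step
        rnn.step_output(hidden)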
...@@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr ...@@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr
__all__ = [ __all__ = [
'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS', 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay',
'cosine_decay', 'linear_lr_warmup' 'linear_lr_warmup'
] ]
...@@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): ...@@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
training progresses. By using this function, the learning rate will be decayed by training progresses. By using this function, the learning rate will be decayed by
the following cosine decay strategy. the following cosine decay strategy.
decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1) .. math::
decayed\_lr = learning\_rate * 0.5 * (\\cos(epoch * \\frac{\\pi}{epochs}) + 1)
Args: Args:
learning_rate(Variable|float): The initial learning rate. learning_rate(Variable|float): The initial learning rate.
step_each_epoch(int): the number of steps in an epoch. step_each_epoch(int): the number of steps in an epoch.
epochs(int): the number of epochs. epochs(int): the number of epochs.
Returns: Returns:
Variable: The decayed learning rate. Variable: The decayed learning rate.
Examples:
..code-block:: python Examples:
.. code-block:: python
base_lr = 0.1 base_lr = 0.1
lr = fluid.layers.cosine_decay( lr = fluid.layers.cosine_decay(
learning_rate = base_lr, step_each_epoch=10000, epochs=120) learning_rate = base_lr, step_each_epoch=10000, epochs=120)
""" """
with default_main_program()._lr_schedule_guard(): with default_main_program()._lr_schedule_guard():
if imperative_base.enabled(): if imperative_base.enabled():
decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,
...@@ -381,50 +383,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): ...@@ -381,50 +383,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
return decayed_lr return decayed_lr
def append_LARS(params_grads, learning_rate, weight_decay):
"""
Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
each layer.
Args:
learning_rate: A learning rate Variable. This
is the global learning rate for LARS.
weight_decay: A Python `float` number.
Returns:
The decayed learning rate
Examples:
.. code-block:: python
learning_rate *= local_gw_ratio * sqrt(sumsq(param))
/ (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
"""
assert not imperative_base.enabled(
), "append_LARS is NOT supported in dygraph mode now"
def _balanced_weight(param_norm, grad_norm):
if weight_decay == 1.0:
return grad_norm + param_norm
else:
return grad_norm + weight_decay * param_norm
for param, grad in params_grads:
with param.block.program.optimized_guard(
[param, grad]), name_scope("optimizer"):
param_lr = param.optimize_attr['learning_rate']
param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
if type(param_lr) == float and param_lr == 1.0:
decayed_lr = learning_rate * param_norm \
/ _balanced_weight(param_norm, grad_norm)
else:
decayed_lr = learning_rate * param_lr * param_norm \
/ _balanced_weight(param_norm, grad_norm)
# set back param local learning rate
param.optimize_attr['learning_rate'] = decayed_lr
def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
""" """
Applies linear learning rate warmup before the normal learning rate Applies linear learning rate warmup before the normal learning rate
......
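Plugging numbers into the corrected formula makes the schedule concrete (in the layer, epoch is derived from the global step and `step_each_epoch`): with `base_lr = 0.1` and `epochs = 120`, the rate starts at 0.1, halves to 0.05 at epoch 60, and approaches 0 at epoch 120:

    import math

    base_lr, epochs = 0.1, 120
    for epoch in (0, 60, 120):
        print(epoch, base_lr * 0.5 * (math.cos(epoch * math.pi / epochs) + 1))
    # 0 -> 0.1, 60 -> 0.05, 120 -> ~0.0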
...@@ -73,6 +73,8 @@ __all__ = [ ...@@ -73,6 +73,8 @@ __all__ = [
'reduce_max', 'reduce_max',
'reduce_min', 'reduce_min',
'reduce_prod', 'reduce_prod',
'reduce_all',
'reduce_any',
'sequence_first_step', 'sequence_first_step',
'sequence_last_step', 'sequence_last_step',
'sequence_slice', 'sequence_slice',
...@@ -159,6 +161,7 @@ __all__ = [ ...@@ -159,6 +161,7 @@ __all__ = [
'sum', 'sum',
'slice', 'slice',
'shape', 'shape',
'rank',
'logical_and', 'logical_and',
'logical_or', 'logical_or',
'logical_xor', 'logical_xor',
...@@ -4738,6 +4741,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): ...@@ -4738,6 +4741,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
return out return out
def reduce_all(input, dim=None, keep_dim=False, name=None):
"""
Computes the ``logical and`` of tensor elements over the given dimension.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (list|int|None): The dimension along which the logical and is computed.
If :attr:`None`, compute the logical and over all elements of
:attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The reduced Tensor variable.
Examples:
.. code-block:: python
# x is a bool Tensor variable with following elements:
# [[True, False]
# [True, True]]
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_all(x) # False
fluid.layers.reduce_all(x, dim=0) # [True, False]
fluid.layers.reduce_all(x, dim=-1) # [False, True]
fluid.layers.reduce_all(x, dim=1,
keep_dim=True) # [[False], [True]]
"""
helper = LayerHelper('reduce_all', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list):
dim = [dim]
helper.append_op(
type='reduce_all',
inputs={'X': input},
outputs={'Out': out},
attrs={
'dim': dim if dim != None else [0],
'keep_dim': keep_dim,
'reduce_all': True if dim == None else False
})
return out
def reduce_any(input, dim=None, keep_dim=False, name=None):
"""
Computes the ``logical or`` of tensor elements over the given dimension.
Args:
input (Variable): The input variable which is a Tensor or LoDTensor.
dim (list|int|None): The dimension along which the logical or is computed.
If :attr:`None`, compute the logical or over all elements of
:attr:`input` and return a Tensor variable with a single element,
otherwise must be in the range :math:`[-rank(input), rank(input))`.
If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true.
name(str|None): A name for this layer(optional). If set None, the layer
will be named automatically.
Returns:
Variable: The reduced Tensor variable.
Examples:
.. code-block:: python
# x is a bool Tensor variable with following elements:
# [[True, False]
# [False, False]]
# Each example is followed by the corresponding output tensor.
fluid.layers.reduce_any(x) # True
fluid.layers.reduce_any(x, dim=0) # [True, False]
fluid.layers.reduce_any(x, dim=-1) # [True, False]
fluid.layers.reduce_any(x, dim=1,
keep_dim=True) # [[True], [False]]
"""
helper = LayerHelper('reduce_any', **locals())
out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
if dim is not None and not isinstance(dim, list):
dim = [dim]
helper.append_op(
type='reduce_any',
inputs={'X': input},
outputs={'Out': out},
attrs={
'dim': dim if dim != None else [0],
'keep_dim': keep_dim,
'reduce_all': True if dim == None else False
})
return out
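Both reductions behave like their numpy counterparts, which makes the docstring examples easy to verify offline:

    import numpy as np

    x = np.array([[True, False], [True, True]])
    np.all(x)                         # False  (reduce over all elements)
    np.all(x, axis=0)                 # [ True False]
    np.all(x, axis=-1)                # [False  True]
    np.all(x, axis=1, keepdims=True)  # [[False] [ True]]

    y = np.array([[True, False], [False, False]])
    np.any(y)                         # True
    np.any(y, axis=0)                 # [ True False]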
def split(input, num_or_sections, dim=-1, name=None): def split(input, num_or_sections, dim=-1, name=None):
""" """
Split the input tensor into multiple sub-tensors. Split the input tensor into multiple sub-tensors.
...@@ -4819,7 +4922,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): ...@@ -4819,7 +4922,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
the dimension to normalization is rank(X) + axis. -1 is the the dimension to normalization is rank(X) + axis. -1 is the
last dimension. last dimension.
epsilon(float): The epsilon value is used to avoid division by zero, \ epsilon(float): The epsilon value is used to avoid division by zero, \
the default value is 1e-10. the default value is 1e-12.
name(str|None): A name for this layer(optional). If set None, the layer \ name(str|None): A name for this layer(optional). If set None, the layer \
will be named automatically. will be named automatically.
...@@ -9237,6 +9340,32 @@ def shape(input): ...@@ -9237,6 +9340,32 @@ def shape(input):
return out return out
def rank(input):
"""
**Rank Layer**
Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor.
Args:
input (Variable): The input variable.
Returns:
Variable: The rank of the input variable.
Examples:
.. code-block:: python
input = layers.data(
name="input", shape=[3, 100, 100], dtype="float32")
rank = layers.rank(input) # 4
"""
ndims = len(input.shape)
out = assign(np.array(ndims, 'int32'))
return out
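Note that the rank is taken from the variable's static shape at network-construction time and baked into the program as a constant. That is also why the docstring example yields 4: `layers.data` prepends the batch axis. A sketch:

    input = fluid.layers.data(name="input", shape=[3, 100, 100], dtype="float32")
    print(input.shape)            # (-1, 3, 100, 100): the batch axis is prepended
    r = fluid.layers.rank(input)  # a 0-D int32 tensor holding len(input.shape) == 4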
def _elementwise_op(helper): def _elementwise_op(helper):
op_type = helper.layer_type op_type = helper.layer_type
x = helper.kwargs.get('x', None) x = helper.kwargs.get('x', None)
...@@ -11002,7 +11131,7 @@ def pixel_shuffle(x, upscale_factor): ...@@ -11002,7 +11131,7 @@ def pixel_shuffle(x, upscale_factor):
Returns: Returns:
Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input. Out(Variable): Reshaped tensor according to the new dimension.
Raises: Raises:
......
...@@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc ...@@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc
import numpy import numpy
__all__ = [ __all__ = [
'create_tensor', 'create_tensor', 'create_parameter', 'create_global_var', 'cast',
'create_parameter', 'tensor_array_to_tensor', 'concat', 'sums', 'assign',
'create_global_var', 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
'cast', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
'tensor_array_to_tensor', 'range', 'linspace'
'concat',
'sums',
'assign',
'fill_constant_batch_size_like',
'fill_constant',
'argmin',
'argmax',
'argsort',
'ones',
'zeros',
'reverse',
'has_inf',
'has_nan',
'isfinite',
'range',
] ]
...@@ -826,3 +811,45 @@ def range(start, end, step, dtype): ...@@ -826,3 +811,45 @@ def range(start, end, step, dtype):
'Step': step}, 'Step': step},
outputs={'Out': [out]}) outputs={'Out': [out]})
return out return out
def linspace(start, stop, num, dtype):
"""
Return a fixed number of evenly spaced values within a given interval.
The first entry is start, and the last entry is stop. When num is 1, only start is returned, analogous to numpy's linspace function.
Args:
start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
num(int|Variable): Number of entries in the sequence. It is an int scalar, or a tensor of shape [1] with type int32.
dtype(string): 'float32'|'float64', the data type of the output tensor.
Returns:
Variable: The tensor variable storing a 1-D tensor.
Examples:
.. code-block:: python
data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0]
data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0]
"""
helper = LayerHelper("linspace", **locals())
if not isinstance(start, Variable):
start = fill_constant([1], dtype, start)
if not isinstance(stop, Variable):
stop = fill_constant([1], dtype, stop)
if not isinstance(num, Variable):
num = fill_constant([1], 'int32', num)
out = helper.create_variable_for_type_inference(dtype=start.dtype)
helper.append_op(
type='linspace',
inputs={'Start': start,
'Stop': stop,
'Num': num},
outputs={'Out': [out]})
return out
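The endpoint behaviour matches numpy's linspace, which is a convenient reference for the two docstring examples:

    import numpy as np

    np.linspace(0, 10, 5)   # array([ 0. ,  2.5,  5. ,  7.5, 10. ])
    np.linspace(0, 10, 1)   # array([0.])  -- only start is returned when num == 1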
...@@ -6,4 +6,6 @@ foreach(src ${TEST_OPS}) ...@@ -6,4 +6,6 @@ foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py) py_test(${src} SRCS ${src}.py)
endforeach() endforeach()
add_subdirectory(high-level-api) if(WITH_HIGH_LEVEL_API_TEST)
add_subdirectory(high-level-api)
endif()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*_new_api.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test # This test is buggy
foreach(src ${TEST_OPS}) # py_test(test_understand_sentiment_dynamic_rnn SRCS
py_test(${src} SRCS ${src}.py) # test_understand_sentiment_dynamic_rnn.py SERIAL)
endforeach() LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn_new_api)
add_subdirectory(fit_a_line) if(NOT APPLE)
add_subdirectory(recognize_digits) # default test
add_subdirectory(image_classification) foreach(src ${TEST_OPS})
add_subdirectory(understand_sentiment) py_test(${src} SRCS ${src}.py)
add_subdirectory(label_semantic_roles) endforeach()
add_subdirectory(word2vec) else()
add_subdirectory(recommender_system) foreach(src ${TEST_OPS})
add_subdirectory(machine_translation) if(${src} STREQUAL "test_image_classification_vgg_new_api")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_image_classification_resnet_new_api")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_recognize_digits_conv_new_api")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_recognize_digits_mlp_new_api")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
if(NOT APPLE)
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_image_classification_vgg")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_image_classification_resnet")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
if(NOT APPLE)
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
else()
foreach(src ${TEST_OPS})
if(${src} STREQUAL "test_recognize_digits_conv")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
elseif(${src} STREQUAL "test_recognize_digits_mlp")
message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
else()
py_test(${src} SRCS ${src}.py)
endif()
endforeach()
endif()
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
# default test
foreach(src ${TEST_OPS})
py_test(${src} SRCS ${src}.py)
endforeach()
This diff has been collapsed.
This diff has been collapsed.