diff --git a/doc/_images/bi_lstm1.jpg b/doc/_images/bi_lstm1.jpg deleted file mode 100644 index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000 Binary files a/doc/_images/bi_lstm1.jpg and /dev/null differ diff --git a/doc/_images/cifar.png b/doc/_images/cifar.png deleted file mode 100644 index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000 Binary files a/doc/_images/cifar.png and /dev/null differ diff --git a/doc/_images/curve.jpg b/doc/_images/curve.jpg deleted file mode 100644 index baa35ae7f0a0b6c246f3a0d331735477ab8bcd70..0000000000000000000000000000000000000000 Binary files a/doc/_images/curve.jpg and /dev/null differ diff --git a/doc/_images/encoder-decoder-attention-model1.png b/doc/_images/encoder-decoder-attention-model1.png deleted file mode 100644 index 79f911d4ba12ac0c0d1a936c9df639c302786914..0000000000000000000000000000000000000000 Binary files a/doc/_images/encoder-decoder-attention-model1.png and /dev/null differ diff --git a/doc/_images/feature.jpg b/doc/_images/feature.jpg deleted file mode 100644 index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000 Binary files a/doc/_images/feature.jpg and /dev/null differ diff --git a/doc/_images/graph_construction_example_all.png b/doc/_images/graph_construction_example_all.png new file mode 100644 index 0000000000000000000000000000000000000000..261611a5721f9aa97874f7e6d897fe48cf667db2 Binary files /dev/null and b/doc/_images/graph_construction_example_all.png differ diff --git a/doc/_images/graph_construction_example_forward_backward.png b/doc/_images/graph_construction_example_forward_backward.png new file mode 100644 index 0000000000000000000000000000000000000000..4c69687f4a6a181138f3df72ce5e8aa48487b5be Binary files /dev/null and b/doc/_images/graph_construction_example_forward_backward.png differ diff --git a/doc/_images/graph_construction_example_forward_only.png b/doc/_images/graph_construction_example_forward_only.png new file mode 100644 index 0000000000000000000000000000000000000000..e668c16e0cac73acb4e5dc2b1827557ae77126b4 Binary files /dev/null and b/doc/_images/graph_construction_example_forward_only.png differ diff --git a/doc/_images/image_classification.png b/doc/_images/image_classification.png deleted file mode 100644 index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000 Binary files a/doc/_images/image_classification.png and /dev/null differ diff --git a/doc/_images/lenet.png b/doc/_images/lenet.png deleted file mode 100644 index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000 Binary files a/doc/_images/lenet.png and /dev/null differ diff --git a/doc/_images/lstm.png b/doc/_images/lstm.png deleted file mode 100644 index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000 Binary files a/doc/_images/lstm.png and /dev/null differ diff --git a/doc/_images/network_arch.png b/doc/_images/network_arch.png deleted file mode 100644 index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000 Binary files a/doc/_images/network_arch.png and /dev/null differ diff --git a/doc/_images/paddleci.png b/doc/_images/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/_images/paddleci.png differ diff --git a/doc/_images/parameters.png b/doc/_images/parameters.png deleted file mode 100644 index 
2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/_images/parameters.png and /dev/null differ diff --git a/doc/_images/plot.png b/doc/_images/plot.png deleted file mode 100644 index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000 Binary files a/doc/_images/plot.png and /dev/null differ diff --git a/doc/_images/pprof_1.png b/doc/_images/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/_images/pprof_1.png differ diff --git a/doc/_images/pprof_2.png b/doc/_images/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/_images/pprof_2.png differ diff --git a/doc/_images/rec_regression_network.png b/doc/_images/rec_regression_network.png deleted file mode 100644 index 7d2b54d4fcf560cd5b667628f0012c3822efd9b2..0000000000000000000000000000000000000000 Binary files a/doc/_images/rec_regression_network.png and /dev/null differ diff --git a/doc/_images/stacked_lstm.jpg b/doc/_images/stacked_lstm.jpg deleted file mode 100644 index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000 Binary files a/doc/_images/stacked_lstm.jpg and /dev/null differ diff --git a/doc/_sources/about/index_en.rst.txt b/doc/_sources/about/index_en.rst.txt deleted file mode 100644 index 065c430cdea802ed3c9f487cd00255b85a5598a5..0000000000000000000000000000000000000000 --- a/doc/_sources/about/index_en.rst.txt +++ /dev/null @@ -1,14 +0,0 @@ -ABOUT -======= - -PaddlPaddle is an easy-to-use, efficient, flexible and scalable deep learning platform, -which is originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu. - -PaddlePaddle is now open source but far from complete, which is intended to be built upon, improved, scaled, and extended. -We hope to build an active open source community both by providing feedback and by actively contributing to the source code. - - -Credits --------- - -We owe many thanks to `all contributors and developers `_ of PaddlePaddle! diff --git a/doc/_sources/api/index_en.rst.txt b/doc/_sources/api/index_en.rst.txt index 25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332..e6f632e1a5b9c4b50b7c6aa96a120030bd6ce338 100644 --- a/doc/_sources/api/index_en.rst.txt +++ b/doc/_sources/api/index_en.rst.txt @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/_sources/api/v1/trainer_config_helpers/activations.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/activations.rst.txt deleted file mode 100644 index 269e6491e7ebe3899c3fb24fca756a393043473b..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/activations.rst.txt +++ /dev/null @@ -1,108 +0,0 @@ -=========== -Activations -=========== - -BaseActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: BaseActivation - :noindex: - -AbsActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: AbsActivation - :noindex: - -ExpActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: ExpActivation - :noindex: - -IdentityActivation -================== - -.. 
automodule:: paddle.trainer_config_helpers.activations - :members: IdentityActivation - :noindex: - -LinearActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: LinearActivation - :noindex: - -LogActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: LogActivation - :noindex: - -SquareActivation -================ - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SquareActivation - :noindex: - -SigmoidActivation -================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SigmoidActivation - :noindex: - -SoftmaxActivation -================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SoftmaxActivation - :noindex: - -SequenceSoftmaxActivation -========================= - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SequenceSoftmaxActivation - :noindex: - -ReluActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: ReluActivation - :noindex: - -BReluActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: BReluActivation - :noindex: - -SoftReluActivation -================== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: SoftReluActivation - :noindex: - -TanhActivation -============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: TanhActivation - :noindex: - -STanhActivation -=============== - -.. automodule:: paddle.trainer_config_helpers.activations - :members: STanhActivation - :noindex: diff --git a/doc/_sources/api/v1/trainer_config_helpers/attrs.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/attrs.rst.txt deleted file mode 100644 index ac63127bf7d9db6351365ab7b58f43db12347a8e..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/attrs.rst.txt +++ /dev/null @@ -1,5 +0,0 @@ -Parameter Attributes -======================= - -.. automodule:: paddle.trainer_config_helpers.attrs - :members: diff --git a/doc/_sources/api/v1/trainer_config_helpers/data_sources.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/data_sources.rst.txt deleted file mode 100644 index b9dd4dda01ae59d1260356aff50ddf298d02c87f..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/data_sources.rst.txt +++ /dev/null @@ -1,7 +0,0 @@ -.. _api_trainer_config_helpers_data_sources: - -DataSources -=========== - -.. automodule:: paddle.trainer_config_helpers.data_sources - :members: diff --git a/doc/_sources/api/v1/trainer_config_helpers/evaluators.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/evaluators.rst.txt deleted file mode 100644 index 11dc735164284d6ed1d661fab1e7690d263b3a7c..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/evaluators.rst.txt +++ /dev/null @@ -1,108 +0,0 @@ -.. _api_trainer_config_helpers_evaluators: - -========== -Evaluators -========== - -Base -==== -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: evaluator_base - :noindex: - -Classification -============== - -classification_error_evaluator ------------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: classification_error_evaluator - :noindex: - -auc_evaluator -------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: auc_evaluator - :noindex: - -ctc_error_evaluator -------------------- -.. 
automodule:: paddle.trainer_config_helpers.evaluators - :members: ctc_error_evaluator - :noindex: - -chunk_evaluator ---------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: chunk_evaluator - :noindex: - -precision_recall_evaluator --------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: precision_recall_evaluator - :noindex: - -Rank -==== - -pnpair_evaluator ----------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: pnpair_evaluator - :noindex: - -Utils -===== - -sum_evaluator -------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: sum_evaluator - :noindex: - -column_sum_evaluator --------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: column_sum_evaluator - :noindex: - -Print -===== - -classification_error_printer_evaluator --------------------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: classification_error_printer_evaluator - :noindex: - -gradient_printer_evaluator --------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: gradient_printer_evaluator - :noindex: - -maxid_printer_evaluator ------------------------ -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: maxid_printer_evaluator - :noindex: - -maxframe_printer_evaluator ---------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: maxframe_printer_evaluator - :noindex: - -seqtext_printer_evaluator -------------------------- -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: seqtext_printer_evaluator - :noindex: - -value_printer_evaluator ------------------------ -.. automodule:: paddle.trainer_config_helpers.evaluators - :members: value_printer_evaluator - :noindex: - diff --git a/doc/_sources/api/v1/trainer_config_helpers/layers.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/layers.rst.txt deleted file mode 100644 index 24389c2d8574dfda4bec9298776aa6b1aee51535..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/layers.rst.txt +++ /dev/null @@ -1,508 +0,0 @@ -.. _api_trainer_config_helpers_layers: - -====== -Layers -====== - -Base -====== - -LayerType ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: LayerType - :noindex: - -LayerOutput ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: LayerOutput - :noindex: - -Data layer -=========== - -.. _api_trainer_config_helpers_layers_data_layer: - -data_layer ----------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: data_layer - :noindex: - -Fully Connected Layers -====================== - -.. _api_trainer_config_helpers_layers_fc_layer: - -fc_layer --------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: fc_layer - :noindex: - -selective_fc_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: selective_fc_layer - :noindex: - -Conv Layers -=========== - -conv_operator -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_operator - :noindex: - -conv_projection ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_projection - :noindex: - -conv_shift_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: conv_shift_layer - :noindex: - -img_conv_layer --------------- -.. 
automodule:: paddle.trainer_config_helpers.layers - :members: img_conv_layer - :noindex: - -.. _api_trainer_config_helpers_layers_context_projection: - -context_projection ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: context_projection - :noindex: - -Image Pooling Layer -=================== - -img_pool_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_pool_layer - :noindex: - -spp_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: spp_layer - :noindex: - -maxout_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: maxout_layer - :noindex: - -Norm Layer -========== - -img_cmrnorm_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: img_cmrnorm_layer - :noindex: - -batch_norm_layer ---------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: batch_norm_layer - :noindex: - -sum_to_one_norm_layer ---------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_to_one_norm_layer - :noindex: - -Recurrent Layers -================ - -recurrent_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: recurrent_layer - :noindex: - -lstmemory ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: lstmemory - :noindex: - -grumemory ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: grumemory - :noindex: - -Recurrent Layer Group -===================== - -memory ------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: memory - :noindex: - -recurrent_group ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: recurrent_group - :noindex: - -lstm_step_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: lstm_step_layer - :noindex: - -gru_step_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: gru_step_layer - :noindex: - -beam_search ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: beam_search - :noindex: - -get_output_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: get_output_layer - :noindex: - -Mixed Layer -=========== - -.. _api_trainer_config_helpers_layers_mixed_layer: - -mixed_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: mixed_layer - :noindex: - -.. _api_trainer_config_helpers_layers_embedding_layer: - -embedding_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: embedding_layer - :noindex: - -scaling_projection ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: scaling_projection - :noindex: - -dotmul_projection ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: dotmul_projection - :noindex: - -dotmul_operator ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: dotmul_operator - :noindex: - -full_matrix_projection ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: full_matrix_projection - :noindex: - -identity_projection -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: identity_projection - :noindex: - - -table_projection ----------------------- -.. 
automodule:: paddle.trainer_config_helpers.layers - :members: table_projection - :noindex: - -trans_full_matrix_projection ----------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: trans_full_matrix_projection - :noindex: - -Aggregate Layers -================ - -.. _api_trainer_config_helpers_layers_pooling_layer: - -pooling_layer -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: pooling_layer - :noindex: - -.. _api_trainer_config_helpers_layers_last_seq: - -last_seq --------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: last_seq - :noindex: - -.. _api_trainer_config_helpers_layers_first_seq: - -first_seq ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: first_seq - :noindex: - -concat_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: concat_layer - :noindex: - -seq_concat_layer ----------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: seq_concat_layer - :noindex: - -Reshaping Layers -================ - -block_expand_layer ------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: block_expand_layer - :noindex: - -.. _api_trainer_config_helpers_layers_expand_layer: - -expand_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: expand_layer - :noindex: - -repeat_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: repeat_layer - :noindex: - -rotate_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: rotate_layer - :noindex: - -seq_reshape_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: seq_reshape_layer - :noindex: - -Math Layers -=========== - -addto_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: addto_layer - :noindex: - -linear_comb_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: linear_comb_layer - :noindex: - -interpolation_layer -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: interpolation_layer - :noindex: - -bilinear_interp_layer ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: bilinear_interp_layer - :noindex: - -power_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: power_layer - :noindex: - -scaling_layer -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: scaling_layer - :noindex: - -slope_intercept_layer ----------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: slope_intercept_layer - :noindex: - -tensor_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: tensor_layer - :noindex: - -.. _api_trainer_config_helpers_layers_cos_sim: - -cos_sim -------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cos_sim - :noindex: - -trans_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: trans_layer - :noindex: - -Sampling Layers -=============== - -maxid_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: maxid_layer - :noindex: - -sampling_id_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: sampling_id_layer - :noindex: - -Slicing and Joining Layers -========================== - -pad_layer ------------ -.. 
automodule:: paddle.trainer_config_helpers.layers - :members: pad_layer - :noindex: - -.. _api_trainer_config_helpers_layers_cost_layers: - -Cost Layers -=========== - -cross_entropy -------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cross_entropy - :noindex: - -cross_entropy_with_selfnorm ---------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: cross_entropy_with_selfnorm - :noindex: - -multi_binary_label_cross_entropy --------------------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: multi_binary_label_cross_entropy - :noindex: - -mse_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: mse_cost - :noindex: - -huber_cost ----------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: huber_cost - :noindex: - -lambda_cost ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: lambda_cost - :noindex: - -rank_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: rank_cost - :noindex: - -sum_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_cost - :noindex: - -crf_layer ------------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: crf_layer - :noindex: - -crf_decoding_layer -------------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: crf_decoding_layer - :noindex: - -ctc_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: ctc_layer - :noindex: - -warp_ctc_layer --------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: warp_ctc_layer - :noindex: - -nce_layer ------------ -.. automodule:: paddle.trainer_config_helpers.layers - :members: nce_layer - :noindex: - -hsigmoid ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: hsigmoid - :noindex: - -Check Layer -============ - -eos_layer ------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: eos_layer - :noindex: diff --git a/doc/_sources/api/v1/trainer_config_helpers/networks.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/networks.rst.txt deleted file mode 100644 index edb53acbf0c31532aa34bda044066fed72eaa426..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/networks.rst.txt +++ /dev/null @@ -1,123 +0,0 @@ -======== -Networks -======== - -The networks module contains pieces of neural network that combine multiple layers. - -NLP -=== - -sequence_conv_pool ------------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: sequence_conv_pool - :noindex: - -.. _api_trainer_config_helpers_network_text_conv_pool: - -text_conv_pool --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: text_conv_pool - :noindex: - -Images -====== - -img_conv_bn_pool ----------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: img_conv_bn_pool - :noindex: - -img_conv_group --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: img_conv_group - :noindex: - -.. _api_trainer_config_helpers_network_simple_img_conv_pool: - -simple_img_conv_pool --------------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_img_conv_pool - :noindex: - -vgg_16_network ---------------- -.. 
automodule:: paddle.trainer_config_helpers.networks - :members: vgg_16_network - :noindex: - -Recurrent -========= - -LSTM ----- - -lstmemory_unit -`````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: lstmemory_unit - :noindex: - -lstmemory_group -``````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: lstmemory_group - :noindex: - -simple_lstm -``````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_lstm - :noindex: - -bidirectional_lstm -`````````````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: bidirectional_lstm - :noindex: - -GRU ---- - -gru_unit -```````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: gru_unit - :noindex: - -gru_group -````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: gru_group - :noindex: - -simple_gru -`````````` -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_gru - :noindex: - -simple_attention ----------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: simple_attention - :noindex: - -Miscs -===== - -dropout_layer --------------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: dropout_layer - :noindex: - -outputs -------- -.. automodule:: paddle.trainer_config_helpers.networks - :members: outputs - :noindex: diff --git a/doc/_sources/api/v1/trainer_config_helpers/optimizers.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/optimizers.rst.txt deleted file mode 100644 index d2f4958c92b8e3b7426945f1af07112ab4071136..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/optimizers.rst.txt +++ /dev/null @@ -1,61 +0,0 @@ -.. _api_trainer_config_helpers_optimizers: - -========== -Optimizers -========== - -BaseSGDOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: BaseSGDOptimizer - :noindex: - -MomentumOptimizer -================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: MomentumOptimizer - :noindex: - -AdamOptimizer -============= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamOptimizer - :noindex: - -AdamaxOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdamaxOptimizer - :noindex: - -AdaGradOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdaGradOptimizer - :noindex: - -DecayedAdaGradOptimizer -======================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: DecayedAdaGradOptimizer - :noindex: - -AdaDeltaOptimizer -================= -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: AdaDeltaOptimizer - :noindex: - -RMSPropOptimizer -================ -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: RMSPropOptimizer - :noindex: - -.. _api_trainer_config_helpers_optimizers_settings: - -settings -======== -.. automodule:: paddle.trainer_config_helpers.optimizers - :members: settings - :noindex: diff --git a/doc/_sources/api/v1/trainer_config_helpers/poolings.rst.txt b/doc/_sources/api/v1/trainer_config_helpers/poolings.rst.txt deleted file mode 100644 index 66566809d26f59263597b5286c5b27e0bbc9415a..0000000000000000000000000000000000000000 --- a/doc/_sources/api/v1/trainer_config_helpers/poolings.rst.txt +++ /dev/null @@ -1,33 +0,0 @@ -======== -Poolings -======== - -BasePoolingType -=============== -.. 
automodule:: paddle.trainer_config_helpers.poolings - :members: BasePoolingType - :noindex: - -AvgPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: AvgPooling - :noindex: - -MaxPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: MaxPooling - :noindex: - -SumPooling -========== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: SumPooling - :noindex: - -SquareRootNPooling -================== -.. automodule:: paddle.trainer_config_helpers.poolings - :members: SquareRootNPooling - :noindex: diff --git a/doc/_sources/api/v2/config/activation.rst.txt b/doc/_sources/api/v2/config/activation.rst.txt index eca3ce03bcdc599edca802d8dfca48d4f28275a2..5317e66b64bbd85c61f19700a9d2c1d239dee573 100644 --- a/doc/_sources/api/v2/config/activation.rst.txt +++ b/doc/_sources/api/v2/config/activation.rst.txt @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: diff --git a/doc/_sources/api/v2/config/evaluators.rst.txt b/doc/_sources/api/v2/config/evaluators.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ac972fb193a2fb525edc507f7ba1303d2c8eabe --- /dev/null +++ b/doc/_sources/api/v2/config/evaluators.rst.txt @@ -0,0 +1,110 @@ +.. _api_v2: + +========== +Evaluators +========== + +Classification +============== + +classification_error +-------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error + :noindex: + +auc +--- +.. automodule:: paddle.v2.evaluator + :members: auc + :noindex: + +ctc_error +--------- +.. automodule:: paddle.v2.evaluator + :members: ctc_error + :noindex: + +chunk +----- +.. automodule:: paddle.v2.evaluator + :members: chunk + :noindex: + +precision_recall +---------------- +.. automodule:: paddle.v2.evaluator + :members: precision_recall + :noindex: + +Rank +==== + +pnpair +------ +.. automodule:: paddle.v2.evaluator + :members: pnpair + :noindex: + +Utils +===== + +sum +--- +.. automodule:: paddle.v2.evaluator + :members: sum + :noindex: + +column_sum +---------- +.. automodule:: paddle.v2.evaluator + :members: column_sum + :noindex: + +Print +===== + +classification_error_printer +---------------------------- +.. automodule:: paddle.v2.evaluator + :members: classification_error_printer + :noindex: + +gradient_printer +---------------- +.. automodule:: paddle.v2.evaluator + :members: gradient_printer + :noindex: + +maxid_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: maxid_printer + :noindex: + +maxframe_printer +---------------- +.. automodule:: paddle.v2.evaluator + :members: maxframe_printer + :noindex: + +seqtext_printer +--------------- +.. automodule:: paddle.v2.evaluator + :members: seqtext_printer + :noindex: + +value_printer +------------- +.. automodule:: paddle.v2.evaluator + :members: value_printer + :noindex: + +Detection +===== + +detection_map +------------- +.. automodule:: paddle.v2.evaluator + :members: detection_map + :noindex: diff --git a/doc/_sources/api/v2/config/layer.rst.txt b/doc/_sources/api/v2/config/layer.rst.txt index 2a02baf17ba0d1119a8d222024616ef8ae33f8d5..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644 --- a/doc/_sources/api/v2/config/layer.rst.txt +++ b/doc/_sources/api/v2/config/layer.rst.txt @@ -54,18 +54,23 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. 
autoclass:: paddle.v2.layer.context_projection :noindex: +row_conv +-------- +.. autoclass:: paddle.v2.layer.row_conv + :noindex: + Image Pooling Layer =================== img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -77,6 +82,11 @@ maxout .. autoclass:: paddle.v2.layer.maxout :noindex: +roi_pool +-------- +.. autoclass:: paddle.v2.layer.roi_pool + :noindex: + Norm Layer ========== @@ -94,12 +104,17 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm :noindex: - + +row_l2_norm +----------- +.. autoclass:: paddle.v2.layer.row_l2_norm + :noindex: + Recurrent Layers ================ @@ -130,7 +145,7 @@ recurrent_group --------------- .. autoclass:: paddle.v2.layer.recurrent_group :noindex: - + lstm_step --------- .. autoclass:: paddle.v2.layer.lstm_step @@ -145,12 +160,12 @@ beam_search ------------ .. autoclass:: paddle.v2.layer.beam_search :noindex: - + get_output ---------- .. autoclass:: paddle.v2.layer.get_output :noindex: - + Mixed Layer =========== @@ -193,6 +208,10 @@ identity_projection .. autoclass:: paddle.v2.layer.identity_projection :noindex: +slice_projection +------------------- +.. autoclass:: paddle.v2.layer.slice_projection + :noindex: table_projection ---------------- @@ -203,10 +222,15 @@ trans_full_matrix_projection ---------------------------- .. autoclass:: paddle.v2.layer.trans_full_matrix_projection :noindex: - + Aggregate Layers ================ +AggregateLevel +-------------- +.. autoclass:: paddle.v2.layer.AggregateLevel + :noindex: + .. _api_v2.layer_pooling: pooling @@ -238,6 +262,21 @@ seq_concat .. autoclass:: paddle.v2.layer.seq_concat :noindex: +seq_slice +--------- +.. autoclass:: paddle.v2.layer.seq_slice + :noindex: + +kmax_sequence_score +------------------- +.. autoclass:: paddle.v2.layer.kmax_sequence_score + :noindex: + +sub_nested_seq +-------------- +.. autoclass:: paddle.v2.layer.sub_nested_seq + :noindex: + Reshaping Layers ================ @@ -248,6 +287,11 @@ block_expand .. _api_v2.layer_expand: +ExpandLevel +----------- +.. autoclass:: paddle.v2.layer.ExpandLevel + :noindex: + expand ------ .. autoclass:: paddle.v2.layer.expand @@ -291,6 +335,16 @@ bilinear_interp .. autoclass:: paddle.v2.layer.bilinear_interp :noindex: +dot_prod +--------- +.. autoclass:: paddle.v2.layer.dot_prod + :noindex: + +out_prod +-------- +.. autoclass:: paddle.v2.layer.out_prod + :noindex: + power ----- .. autoclass:: paddle.v2.layer.power @@ -301,6 +355,16 @@ scaling .. autoclass:: paddle.v2.layer.scaling :noindex: +clip +---- +.. autoclass:: paddle.v2.layer.clip + :noindex: + +resize +------ +.. autoclass:: paddle.v2.layer.resize + :noindex: + slope_intercept --------------- .. autoclass:: paddle.v2.layer.slope_intercept @@ -318,11 +382,21 @@ cos_sim .. autoclass:: paddle.v2.layer.cos_sim :noindex: +l2_distance +----------- +.. autoclass:: paddle.v2.layer.l2_distance + :noindex: + trans ----- .. autoclass:: paddle.v2.layer.trans :noindex: +scale_shift +----------- +.. autoclass:: paddle.v2.layer.scale_shift + :noindex: + Sampling Layers =============== @@ -336,6 +410,19 @@ sampling_id .. autoclass:: paddle.v2.layer.sampling_id :noindex: +multiplex +--------- +.. autoclass:: paddle.v2.layer.multiplex + :noindex: + +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. 
autoclass:: paddle.v2.layer.factorization_machine + :noindex: + Slicing and Joining Layers ========================== @@ -364,9 +451,14 @@ multi_binary_label_cross_entropy_cost .. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: -huber_cost ----------- -.. autoclass:: paddle.v2.layer.huber_cost +huber_regression_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_regression_cost + :noindex: + +huber_classification_cost +------------------------- +.. autoclass:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost @@ -374,9 +466,9 @@ lambda_cost .. autoclass:: paddle.v2.layer.lambda_cost :noindex: -mse_cost +square_error_cost -------- -.. autoclass:: paddle.v2.layer.mse_cost +.. autoclass:: paddle.v2.layer.square_error_cost :noindex: rank_cost @@ -419,10 +511,49 @@ hsigmoid .. autoclass:: paddle.v2.layer.hsigmoid :noindex: -Check Layer +smooth_l1_cost +-------------- +.. autoclass:: paddle.v2.layer.smooth_l1_cost + :noindex: + +multibox_loss +-------------- +.. autoclass:: paddle.v2.layer.multibox_loss + :noindex: + +Check Layer ============ eos --- .. autoclass:: paddle.v2.layer.eos :noindex: + +Miscs +===== + +dropout +-------------- +.. autoclass:: paddle.v2.layer.dropout + :noindex: + +Activation with learnable parameter +=================================== + +prelu +-------- +.. autoclass:: paddle.v2.layer.prelu + :noindex: + +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + +Detection output Layer +====================== + +detection_output +---------------- +.. autoclass:: paddle.v2.layer.detection_output + :noindex: diff --git a/doc/_sources/api/v2/config/networks.rst.txt b/doc/_sources/api/v2/config/networks.rst.txt index 6f209bc95bec7279051118bb857a96515e0371a9..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644 --- a/doc/_sources/api/v2/config/networks.rst.txt +++ b/doc/_sources/api/v2/config/networks.rst.txt @@ -44,6 +44,12 @@ simple_img_conv_pool :members: simple_img_conv_pool :noindex: +small_vgg +--------- +.. automodule:: paddle.v2.networks + :members: small_vgg + :noindex: + vgg_16_network --------------- .. automodule:: paddle.v2.networks @@ -101,17 +107,26 @@ simple_gru :members: simple_gru :noindex: +simple_gru2 +``````````` +.. automodule:: paddle.v2.networks + :members: simple_gru2 + :noindex: + +bidirectional_gru +`````````````````` +.. automodule:: paddle.v2.networks + :members: bidirectional_gru + :noindex: + simple_attention ---------------- .. automodule:: paddle.v2.networks :members: simple_attention :noindex: -Miscs -===== - -dropout_layer --------------- +dot_product_attention +--------------------- .. automodule:: paddle.v2.networks - :members: dropout_layer + :members: dot_product_attention :noindex: diff --git a/doc/_sources/api/v2/data.rst.txt b/doc/_sources/api/v2/data.rst.txt index fef87c4fbdb452771ecdb361c6eeae5b32bcee14..b56c7332cc284649c7e04328e51a7faa78593a39 100644 --- a/doc/_sources/api/v2/data.rst.txt +++ b/doc/_sources/api/v2/data.rst.txt @@ -2,112 +2,9 @@ Data Reader Interface and DataSets ================================== +.. toctree:: + :maxdepth: 1 -DataTypes -========= - -.. automodule:: paddle.v2.data_type - :members: - :noindex: - -DataFeeder -========== - -.. automodule:: paddle.v2.data_feeder - :members: - :noindex: - -Reader -====== - -.. automodule:: paddle.v2.reader - :members: - :noindex: - -.. automodule:: paddle.v2.reader.creator - :members: - :noindex: - -minibatch -========= - -.. 
automodule:: paddle.v2.minibatch - :members: - :noindex: - -Dataset -======= - -.. automodule:: paddle.v2.dataset - :members: - :noindex: - -mnist -+++++ - -.. automodule:: paddle.v2.dataset.mnist - :members: - :noindex: - -cifar -+++++ - -.. automodule:: paddle.v2.dataset.cifar - :members: - :noindex: - -conll05 -+++++++ - -.. automodule:: paddle.v2.dataset.conll05 - :members: get_dict,get_embedding,test - :noindex: - -imdb -++++ - -.. automodule:: paddle.v2.dataset.imdb - :members: - :noindex: - -imikolov -++++++++ - -.. automodule:: paddle.v2.dataset.imikolov - :members: - :noindex: - -movielens -+++++++++ - -.. automodule:: paddle.v2.dataset.movielens - :members: - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.MovieInfo - :noindex: - -.. autoclass:: paddle.v2.dataset.movielens.UserInfo - :noindex: - -sentiment -+++++++++ - -.. automodule:: paddle.v2.dataset.sentiment - :members: - :noindex: - -uci_housing -+++++++++++ - -.. automodule:: paddle.v2.dataset.uci_housing - :members: - :noindex: - -wmt14 -+++++ - -.. automodule:: paddle.v2.dataset.wmt14 - :members: - :noindex: - + data/data_reader.rst + data/image.rst + data/dataset.rst diff --git a/doc/_sources/api/v2/data/data_reader.rst.txt b/doc/_sources/api/v2/data/data_reader.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ccfec9c284877a7576e9751526b169a4ac78d8e --- /dev/null +++ b/doc/_sources/api/v2/data/data_reader.rst.txt @@ -0,0 +1,36 @@ +===================== +Data Reader Interface +===================== + + +DataTypes +========= + +.. automodule:: paddle.v2.data_type + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. automodule:: paddle.v2.minibatch + :members: + :noindex: diff --git a/doc/_sources/api/v2/data/dataset.rst.txt b/doc/_sources/api/v2/data/dataset.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8ecc5bb1d855e0ded3719943ab3adb810de365 --- /dev/null +++ b/doc/_sources/api/v2/data/dataset.rst.txt @@ -0,0 +1,75 @@ +Dataset +======= + +.. automodule:: paddle.v2.dataset + :members: + :noindex: + +mnist ++++++ + +.. automodule:: paddle.v2.dataset.mnist + :members: + :noindex: + +cifar ++++++ + +.. automodule:: paddle.v2.dataset.cifar + :members: + :noindex: + +conll05 ++++++++ + +.. automodule:: paddle.v2.dataset.conll05 + :members: get_dict,get_embedding,test + :noindex: + +imdb +++++ + +.. automodule:: paddle.v2.dataset.imdb + :members: + :noindex: + +imikolov +++++++++ + +.. automodule:: paddle.v2.dataset.imikolov + :members: + :noindex: + +movielens ++++++++++ + +.. automodule:: paddle.v2.dataset.movielens + :members: + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.MovieInfo + :noindex: + +.. autoclass:: paddle.v2.dataset.movielens.UserInfo + :noindex: + +sentiment ++++++++++ + +.. automodule:: paddle.v2.dataset.sentiment + :members: + :noindex: + +uci_housing ++++++++++++ + +.. automodule:: paddle.v2.dataset.uci_housing + :members: + :noindex: + +wmt14 ++++++ + +.. 
automodule:: paddle.v2.dataset.wmt14 + :members: + :noindex: diff --git a/doc/_sources/api/v2/data/image.rst.txt b/doc/_sources/api/v2/data/image.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..97651ffa6be56cf3ecaca2caca38a353fa5c1f49 --- /dev/null +++ b/doc/_sources/api/v2/data/image.rst.txt @@ -0,0 +1,5 @@ +Image Interface +=============== + +.. automodule:: paddle.v2.image + :members: diff --git a/doc/_sources/api/v2/fluid.rst.txt b/doc/_sources/api/v2/fluid.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..43fc19dc492bbc119f2356034b81c65e443db2fa --- /dev/null +++ b/doc/_sources/api/v2/fluid.rst.txt @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + diff --git a/doc/_sources/api/v2/fluid/data_feeder.rst.txt b/doc/_sources/api/v2/fluid/data_feeder.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fa78f7dfb04c13be7eb83b7fd35cb03f2f4a7fa --- /dev/null +++ b/doc/_sources/api/v2/fluid/data_feeder.rst.txt @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/_sources/api/v2/fluid/evaluator.rst.txt b/doc/_sources/api/v2/fluid/evaluator.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..a23f3301d0331e0ea3733f06444515eb4680cd31 --- /dev/null +++ b/doc/_sources/api/v2/fluid/evaluator.rst.txt @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/_sources/api/v2/fluid/executor.rst.txt b/doc/_sources/api/v2/fluid/executor.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a283538c120cfa1ef646c390bb71c6251c23675 --- /dev/null +++ b/doc/_sources/api/v2/fluid/executor.rst.txt @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/_sources/api/v2/fluid/initializer.rst.txt b/doc/_sources/api/v2/fluid/initializer.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f587837e9873370722062404f511654a9460587 --- /dev/null +++ b/doc/_sources/api/v2/fluid/initializer.rst.txt @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. 
automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/_sources/api/v2/fluid/layers.rst.txt b/doc/_sources/api/v2/fluid/layers.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e5fec13bf9062dc7a7187b1334c8f5486a980b --- /dev/null +++ b/doc/_sources/api/v2/fluid/layers.rst.txt @@ -0,0 +1,302 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +data +--------- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +--------- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--------- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +--------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +--------- +.. autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +--------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +--------- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +--------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +--------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +--------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +--------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +pool2d +--------- +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +--------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +--------- +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lstm +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm + :noindex: + + +lod_rank_table +--------- +.. 
autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +--------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +--------- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +--------- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +--------- +.. autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +--------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +--------- +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +--------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +--------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +--------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + diff --git a/doc/_sources/api/v2/fluid/nets.rst.txt b/doc/_sources/api/v2/fluid/nets.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c3d075422de29c96e25458e831133a30270dd39 --- /dev/null +++ b/doc/_sources/api/v2/fluid/nets.rst.txt @@ -0,0 +1,22 @@ +=========== +Nets +=========== + +simple_img_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +----------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + diff --git a/doc/_sources/api/v2/fluid/optimizer.rst.txt b/doc/_sources/api/v2/fluid/optimizer.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..233762fcdfb39e592740adef6721a556fae3feef --- /dev/null +++ b/doc/_sources/api/v2/fluid/optimizer.rst.txt @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamaxOptimizer + :noindex: + + +DecayedAdagradOptimizer +----------- +.. 
automodule:: paddle.v2.fluid.optimizer + :members: DecayedAdagradOptimizer + :noindex: + diff --git a/doc/_sources/api/v2/fluid/param_attr.rst.txt b/doc/_sources/api/v2/fluid/param_attr.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca0c8af9e8c4f2271de7a131ad0d27c0e8635f50 --- /dev/null +++ b/doc/_sources/api/v2/fluid/param_attr.rst.txt @@ -0,0 +1,11 @@ +=========== +ParamAttr +=========== + + + +ParamAttr +----------- +.. automodule:: paddle.v2.fluid.param_attr + :members: ParamAttr + :noindex: diff --git a/doc/_sources/api/v2/fluid/profiler.rst.txt b/doc/_sources/api/v2/fluid/profiler.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d4042d1f41c12c4a551ba6576559d612116872a --- /dev/null +++ b/doc/_sources/api/v2/fluid/profiler.rst.txt @@ -0,0 +1,10 @@ +=========== +Profiler +=========== + + + +Profiler +----------- +.. autofunction:: paddle.v2.fluid.profiler.cuda_profiler + :noindex: diff --git a/doc/_sources/api/v2/fluid/regularizer.rst.txt b/doc/_sources/api/v2/fluid/regularizer.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..3af2b07d2ae55d99df705fbf1ad2402eee05c435 --- /dev/null +++ b/doc/_sources/api/v2/fluid/regularizer.rst.txt @@ -0,0 +1,25 @@ +=========== +Regularizer +=========== + +WeightDecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: WeightDecayRegularizer + :noindex: + + +L2DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L2DecayRegularizer + :noindex: + + + +L1DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L1DecayRegularizer + + diff --git a/doc/_sources/api/v2/model_configs.rst.txt b/doc/_sources/api/v2/model_configs.rst.txt index a5fae7e29e56d9e236d489353a5c8967d1954641..992b559cbd87244612521d4c96f84f997d6c4196 100644 --- a/doc/_sources/api/v2/model_configs.rst.txt +++ b/doc/_sources/api/v2/model_configs.rst.txt @@ -6,6 +6,7 @@ Model Configuration config/activation.rst config/layer.rst + config/evaluators.rst config/optimizer.rst config/pooling.rst config/networks.rst diff --git a/doc/_sources/design/api.md.txt b/doc/_sources/design/api.md.txt index 8185d2af0ea264a2e7b4e28b9ed05279e4a22014..e6a4638d9100d9b07c3ee6b92b530a17eae1c162 100644 --- a/doc/_sources/design/api.md.txt +++ b/doc/_sources/design/api.md.txt @@ -3,7 +3,7 @@ ## Ingredients As our design principle is starting from the essence: how could we -allow users to express and solve their problems at neural networks. +allow users to express and solve their problems as neural networks. Some essential concepts that our API have to provide include: 1. A *topology* is an expression of *layers*. @@ -233,7 +233,7 @@ paddle.dist_train(model, num_parameter_servers=15) ``` -The pseudo code if `paddle.dist_train` is as follows: +The pseudo code of `paddle.dist_train` is as follows: ```python def dist_train(topology, parameters, trainer, reader, ...):
diff --git a/doc/_sources/design/auto_gradient_check.md.txt b/doc/_sources/design/auto_gradient_check.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f9991541bc51c6e13ffce4e9cec60f73dc800121
--- /dev/null
+++ b/doc/_sources/design/auto_gradient_check.md.txt
@@ -0,0 +1,146 @@
+## Auto Gradient Checker Design
+
+## Background:
+- Generally, it is easy to check whether the forward computation of an Operator is correct or not. However, backpropagation is a notoriously difficult algorithm to debug and get right:
+  1. You should get the right backpropagation formula according to the forward computation.
+  2. You should implement it correctly in C++.
+  3. It is difficult to prepare test data.
+
+- Auto gradient checking computes a numerical gradient using only the forward Operator and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. The numerical gradient checker only needs the forward operator.
+  2. Users only need to prepare the input data for the forward Operator.
+
+## Mathematical Theory
+The following two documents from Stanford give a detailed explanation of how to compute a numerical gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numeric Gradient Implementation
+### Python Interface
+```python
+def get_numerical_gradient(op,
+                           input_values,
+                           output_name,
+                           input_to_check,
+                           delta=0.005,
+                           local_scope=None):
+    """
+    Get the numeric gradient for an operator's input.
+
+    :param op: C++ operator instance; could be a network.
+    :param input_values: The input variables. Should be a dictionary whose
+        keys are variable names and whose values are numpy arrays.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable with respect to which to compute the gradient.
+    :param delta: The perturbation value for the numeric gradient method. The
+        smaller delta is, the more accurate the result will be, but if delta is
+        too small, the computation will suffer from numerical stability problems.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+```
+
+### Explanation:
+
+- Why `output_name` is needed:
+  - An Operator may have multiple Outputs, and one can get an independent gradient from each Output. So the caller should specify the name of the output variable.
+
+- Why `input_to_check` is needed:
+  - One operator may have multiple inputs. A Gradient Op can calculate the gradients of these inputs at the same time, but the numeric gradient has to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute the gradients of multiple inputs, call `get_numeric_gradient` multiple times.
+
+
+### Core Algorithm Implementation
+
+
+```python
+    # We only compute the gradient of one element at a time.
+    # We use a for loop to compute the gradient of each element.
+    for i in xrange(tensor_size):
+        # Get one input element by its index i.
+        origin = tensor_to_check.get_float_element(i)
+
+        # Add delta to it, run the op and then get the new value of the result tensor.
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        # Subtract delta from this element, run the op and get the new value of the result tensor.
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        # Restore the old value.
+        tensor_to_check.set_float_element(i, origin)
+
+        # Compute the gradient of this element and store it into a numpy array.
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # Reshape the gradient result to the shape of the source tensor.
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+```
+
+## Auto Gradient Checker Framework
+
+Each Operator Kernel has three kinds of gradient:
+
+1. Numerical gradient
+2. CPU kernel gradient
GPU kernel gradient (if supported) + +The numerical gradient relies only on the forward Operator, so we use it as the reference value. Gradient checking is performed in the following three steps: + +1. calculate the numerical gradient +2. calculate the CPU kernel gradient with the backward Operator and compare it with the numerical gradient +3. calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported) + +#### Python Interface + +```python + def check_grad(self, + forward_op, + input_vars, + inputs_to_check, + output_name, + no_grad_set=None, + only_cpu=False, + max_relative_error=0.005): + """ + :param forward_op: used to create backward_op + :param input_vars: numpy values of the input variables. The following + computation will use these variables. + :param inputs_to_check: the input variables with respect to which to compute the gradient. + :param output_name: The final output variable name. + :param max_relative_error: The relative tolerance parameter. + :param no_grad_set: used when creating backward ops + :param only_cpu: only compute and check the gradient on the CPU kernel. + :return: + """ +``` + +### How to check whether two numpy arrays are close enough +If `abs_numerical_grad` is nearly zero, we use the absolute error for `numerical_grad` instead of the relative error: + +```python +numerical_grad = ... +operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor()) + +abs_numerical_grad = numpy.abs(numerical_grad) +# if abs_numerical_grad is nearly zero, then use abs error for numerical_grad, not relative +# error. +abs_numerical_grad[abs_numerical_grad < 1e-3] = 1 + +diff_mat = numpy.abs(numerical_grad - operator_grad) / abs_numerical_grad +max_diff = numpy.max(diff_mat) +``` + + +#### Notes: +The input data for the auto gradient checker should be reasonably scaled to avoid numerical stability problems. + + +#### Refs: + +- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization) +- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96) diff --git a/doc/_sources/design/block.md.txt b/doc/_sources/design/block.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..4066122c0e8dfa33776796c3d205ba5aec9e0f52 --- /dev/null +++ b/doc/_sources/design/block.md.txt @@ -0,0 +1,336 @@ +# Design Doc: Block and Scope + +## The Representation of Computation + +Both deep learning systems and programming languages help users describe computation procedures. These systems use various representations of computation: + +- Caffe, Torch, and Paddle: sequences of layers. +- TensorFlow, Caffe2, MXNet: graphs of operators. +- PaddlePaddle: nested blocks, like C++ and Java programs. + +## Block in Programming Languages and Deep Learning + +In programming languages, a block is a pair of curly braces that includes local variable definitions and a sequence of instructions or operators.
+ +Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning: + +| programming languages | PaddlePaddle | +|-----------------------|-----------------------| +| for, while loop | RNN, WhileOp | +| if, if-else, switch | IfElseOp, SwitchOp | +| sequential execution | a sequence of layers | + +A key difference is that a C++ program describes a one-pass computation, whereas a deep learning program describes both the forward and backward passes. + +## Stack Frames and the Scope Hierarchy + +The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs: + +| programming languages | PaddlePaddle | +|-----------------------|---------------------------------| +| stack | scope hierarchy | +| stack frame | scope | +| push at entering block| push at entering block | +| pop at leaving block | destroy when minibatch completes| + +1. In traditional programs: + + - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables. + - After the execution leaves the right curly brace, the runtime pops the frame. + - The maximum number of frames in the stack is the maximum depth of nested blocks. + +1. In PaddlePaddle: + + - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables. + - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass. So it has a stack forest known as a *scope hierarchy*. + - The height of the highest tree is the maximum depth of nested blocks. + - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy. + +## Use Blocks in C++ and PaddlePaddle Programs + +Let us consolidate the discussion by presenting some examples. + +### Blocks with `if-else` and `IfElseOp` + +The following C++ program shows how blocks are used with the `if-else` structure: + +```c++ +namespace pd = paddle; + +int x = 10; +int y = 1; +int z = 10; +bool cond = false; +int o1, o2; +if (cond) { + int z = x + y; + o1 = z; + o2 = pd::layer::softmax(z); +} else { + int d = pd::layer::fc(z); + o1 = d; + o2 = d+1; +} + +``` + +An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows: + +```python +import paddle as pd + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`. + +The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle program are mini-batches of instances.
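To make the last point concrete, here is a minimal NumPy sketch — not the PaddlePaddle API — of how a condition can partition a mini-batch between the two branches and scatter the branch outputs back into mini-batch order. The branch computations are stand-ins for `add_scalar` and `fc`:

```python
import numpy as np

# A hypothetical mini-batch of 3 instances, mirroring the example above.
x = np.array([[10.0], [20.0], [30.0]])
cond = (x > 15).reshape(-1)        # [False, True, True]

def softmax(v):
    e = np.exp(v - v.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

# "true block": runs only on the instances where cond holds.
d_true = x[cond] + 1.0             # stand-in for add_scalar(x, y)
o2_true = softmax(d_true)

# "false block": runs on the remaining instances.
d_false = 0.5 * x[~cond]           # stand-in for fc(z)
o2_false = d_false + 1.0

# Outputs are scattered back into the original mini-batch order.
o1 = np.empty_like(x)
o1[cond], o1[~cond] = d_true, d_false
o2 = np.empty_like(x)
o2[cond], o2[~cond] = o2_true, o2_false
```

Each instance thus flows through exactly one branch, while the outputs `o1` and `o2` remain full mini-batches.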
+ + +### Blocks with `for` and `RNNOp` + +The following RNN model in PaddlePaddle is from the [RNN design doc](./rnn.md): + +```python +x = sequence([10, 20, 30]) # shape=[None, 1] +m = var(0) # shape=[1] +W = var(0.314, param=true) # shape=[1] +U = var(0.375, param=true) # shape=[1] + +rnn = pd.rnn() +with rnn.step(): + h = rnn.memory(init = m) + h_prev = rnn.previous_memory(h) + a = layer.fc(W, x) + b = layer.fc(U, h_prev) + s = pd.add(a, b) + act = pd.sigmoid(s) + rnn.update_memory(h, act) + rnn.output(a, b) +o1, o2 = rnn() +``` +It has the following equivalent C++ program: + +```c++ +float x[] = {10, 20, 30}; +float m = 0; +float W = 0.314; +float U = 0.375; + +float mem[sizeof(x) / sizeof(x[0]) + 1]; +float o1[sizeof(x) / sizeof(x[0]) + 1]; +float o2[sizeof(x) / sizeof(x[0]) + 1]; +for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) { + float xi = x[i-1]; + if (i == 1) mem[0] = m; + float a = W * xi; + float b = U * mem[i-1]; + float s = a + b; + float act = sigmoid(s); + mem[i] = act; + o1[i] = act; + o2[i] = b; +} +``` + +## Compilation and Execution + +Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference. + +The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file. + +## The "Binary Executable File Format" + +The definition of the protobuf message is as follows: + +```protobuf +message BlockDesc { + repeated VarDesc vars = 1; + repeated OpDesc ops = 2; +} +``` + +The step net in the above RNN example would look like + +``` +BlockDesc { + vars = { + VarDesc {...} // x + VarDesc {...} // h + VarDesc {...} // a + VarDesc {...} // b + VarDesc {...} // s + VarDesc {...} // act + } + ops = { + OpDesc {...} // matmul + OpDesc {...} // add_two + OpDesc {...} // sigmoid + } +}; +``` + +Also, the RNN operator in the above example is serialized into a protobuf message of type `OpDesc` and would look like: + +``` +OpDesc { + inputs = {0} // the index of x in vars of BlockDesc above + outputs = {5, 3} // indices of act and b in vars of BlockDesc above + attrs { + "states" : {1} // the index of h + "step_net" : + } +}; +``` + +This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block. + + +## The Compilation of Blocks + +During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator). + +A VarDesc in a block should have its own name scope to prevent local variables from affecting the parent block's name scope. +A child block's name scope should inherit the parent's, so that an OpDesc in the child block can reference a VarDesc stored in the parent block. For example: + +```python +a = pd.Variable(shape=[20, 20]) +b = pd.fc(a, params=["fc.w", "fc.b"]) + +rnn = pd.create_rnn() +with rnn.stepnet(): + x = a.as_step_input() + # reuse fc's parameter + fc_without_b = pd.get_variable("fc.w") + rnn.output(fc_without_b) + +out = rnn() +``` +The method `pd.get_variable` retrieves a Variable by name. The Variable may be stored in a parent block but retrieved in a child block, so each block should have a variable scope that supports inheritance.
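As a toy illustration of this lookup rule — with hypothetical class and method names, not PaddlePaddle's actual API — a scope that falls back to its parent can be sketched in a few lines:

```python
class Scope(object):
    """A minimal sketch of hierarchical variable lookup."""
    def __init__(self, parent=None):
        self.vars = {}
        self.parent = parent

    def new_var(self, name, value):
        self.vars[name] = value
        return value

    def find_var(self, name):
        # Look up locally first, then recursively in ancestor scopes.
        if name in self.vars:
            return self.vars[name]
        if self.parent is not None:
            return self.parent.find_var(name)
        raise KeyError(name)

root = Scope()
root.new_var("fc.w", "global parameter")
step_scope = Scope(parent=root)   # scope of an RNN step block
assert step_scope.find_var("fc.w") == "global parameter"
```

A child block can thus reuse `fc.w` without copying it, which is exactly what `pd.get_variable` relies on.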
+ +In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc. + +To store the definitions of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers. + +`SymbolTable` can do the following: + +- store the definitions (some names and attributes) of variables and operators, +- verify if a variable was declared, +- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers). + + +```c++ +// Information in SymbolTable is enough to trace the dependency graph. So maybe +// the Eval() interface taking a SymbolTable is enough. +class SymbolTable { + public: + SymbolTable(SymbolTable* parent) : parent_(parent) {} + + OpDesc* NewOp(const string& name=""); + + // TODO determine whether name is generated by python or C++. + // Currently assume that a unique name will be generated by C++ if the + // argument name is left default. + VarDesc* Var(const string& name=""); + + // find a VarDesc by name; if recursive is true, search the parent's SymbolTable + // recursively. + // this interface is introduced to support InferShape: find protobuf messages + // of variables and operators, and pass pointers into InferShape. + // + // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should + // be proposed and embedded into pybind to enable python operation on C++ pointers. + VarDesc* FindVar(const string& name, bool recursive=true); + + OpDesc* FindOp(const string& name); + + BlockDesc Compile() const; + + private: + SymbolTable* parent_; + + map<string, OpDesc> ops_; + map<string, VarDesc> vars_; +}; +``` + +After all the descriptions of variables and operators have been added to the SymbolTable, +the block has enough information to run. + +The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions. + + +```c++ +namespace { + +class Block : public OperatorBase { +public: + Block(const BlockDesc& desc) : desc_(desc) {} + + void InferShape(const framework::Scope& scope) const override { + if (!symbols_ready_) { + CreateVariables(scope); + CreateOperators(); + } + // should run InferShape first. + for (auto& op : runtime_table_.ops()) { + op->InferShape(scope); + } + } + + void Run(const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) const override { + PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first."); + for (auto& op : runtime_table_.ops()) { + op->Run(scope, dev_ctx); + } + } + + void CreateVariables(const framework::Scope& scope); + void CreateOperators(); + + // some other necessary interfaces of NetOp are listed below + // ... + +private: + BlockDesc desc_; + bool symbols_ready_{false}; +}; +``` + +## The Execution of Blocks + +Block inherits from OperatorBase, which has a Run method. +Block's Run method will run its operators sequentially. + +There is another important interface called `Eval`, which takes some arguments called targets, generates a minimal graph that treats the targets as end points, and creates a new Block. After `Run`, `Eval` will fetch the latest values and return the targets. + +The definition of Eval is as follows: + +```c++ +// clean a block description by targets using the corresponding dependency graph. +// return a new BlockDesc with a minimal number of operators. +// NOTE: The return type is not a Block but the block's description so that this can be distributed +// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets); + +void Block::Eval(const vector<string>& targets, + const framework::Scope& scope, + const platform::DeviceContext& dev_ctx) { + BlockDesc min_desc = Prune(desc_, targets); + Block min_block(min_desc); + min_block.Run(scope, dev_ctx); +} +``` diff --git a/doc/_sources/design/build_system/README.md.txt b/doc/_sources/design/build_system/README.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf0e4dddc1b640ecbce489f65820aaf8a4b3b1e7 --- /dev/null +++ b/doc/_sources/design/build_system/README.md.txt @@ -0,0 +1,152 @@ +A few months ago when we were trying to replace CMake with Bazel, @emailweixu suggested that we rewrite those handy Bazel functions using CMake. Now it seems that it's the right time to get this done, as we are facing problems from the porting of Majel and the development of the new parameter server using Go and C++. + +Here are some initial thoughts. Your comments are welcome! + +### Required CMake Functions + +I think we need only the following few CMake functions to make a project description lean and clean: + +| C++ | CUDA C++ | Go | +|---|---|---| +| cc_library | nv_library | go_library | +| cc_binary | nv_binary | go_binary | +| cc_test | nv_test | go_test | + +- The `_library` functions generate .a files from source code. +- The `_binary` functions generate executable binary files. +- The `_test` functions generate executable unit test files. They work like `_binary` but link `-lgtest` and `-lgtest_main`. + +The difference between `nv_` functions and `cc_` functions is that the former use `nvcc` instead of the system-default C++ compiler. + +Both `nv_` and `cc_` functions enable C++11 (-std=c++11). + +Also, + +- to describe external dependencies, we need `external_library`. +- to build shared libraries, we need `shared_library`. + +### An Example Project + +Suppose that we have the aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: + +- tensor.h +- tensor.cc +- tensor_test.cc +- ops.h +- ops.cu +- ops_test.cu +- api.go +- api_test.go + +Suppose that ops.cu depends on CUDNN. + +```cmake +# cc_binary parses tensor.cc and figures out that the target also depends +# on tensor.h. +cc_binary(tensor + SRCS + tensor.cc) + +# The dependency on target tensor implies that if any of +# tensor{.h,.cc,_test.cc} is changed, tensor_test needs to be re-built. +cc_test(tensor_test + SRCS + tensor_test.cc + DEPS + tensor) + +# I don't have a clear idea what parameters external_library needs to +# have. @gangliao as a CMake expert would have better ideas. +external_library(cudnn + ....) + +# Suppose that ops.cu depends on external target CUDNN. Also, ops.cu +# includes global functions that take Tensor as their parameters, so +# ops depends on tensor. This implies that if any of tensor.{h,.cc}, +# ops.{h,cu} is changed, ops needs to be re-built. +nv_library(ops + SRCS + ops.cu + DEPS + tensor + cudnn) # cudnn is defined later. + +nv_test(ops_test + SRCS + ops_test.cu + DEPS + ops) + +# Because api.go defines a Go wrapper to ops and tensor, it depends on +# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or +# api.go is changed, api needs to be re-built. +go_library(api + SRCS + api.go + DEPS + tensor # Because ops depends on tensor, this line is optional. + ops) + +go_test(api_test + SRCS + api_test.go + DEPS + api) + + +# This builds libapi.so.
shared_library might use CMake target +# api_shared so as to distinguish it from the above target api. +shared_library(api + DEPS + api) + +``` + +### Implementation + +As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. + +### Using Package Manager For Go + +Building Go binaries and libraries requires satisfying their dependencies; generally +we can do `go get ./...` to download and compile all external dependencies. The +problems are: + +1. `go get` will always get the latest code from the default branch of the + remote repo, so changes in dependencies might break the build. This is very + different from what we already have in `cmake/external`, which downloads a + specific version or commit id of the dependency. +1. Some locations cannot access external dependencies through the internet, as mentioned + in https://github.com/PaddlePaddle/Paddle/issues/2605. Package management + tools can package the dependencies as a "vendor" package, which can be mirrored + on many cloud file-hosting services, so users who want to compile Paddle by themselves can + download this "vendor" package from a mirror site. + +#### Choose A Suitable Tool + +As mentioned by @wangkuiyi, [here](https://github.com/golang/go/wiki/PackageManagementTools) +is a list of dozens of Go package managers. We choose the tool using the following principles: + +- Most "active" projects, with more stars and more pull requests or commits +- Widely used projects + +After comparing all these projects, we shall choose between the two most popular +tools: Godep and Glide. + +Here's a brief comparison between Godep and Glide: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are +also many complaints about using `Godep`. A new "official" package +management tool has been started at https://github.com/golang/dep to resolve +such problems, but it's currently at the Alpha stage. So the best choice now is +obviously Glide. + +#### Manage Go Packages + +- Dependencies: `go/glide.yaml` will store the dependencies and their versions that are + directly imported by Paddle. `go/glide.lock` will store all dependencies recursively, + with their commit ids. Builds will "lock" to these packages if we don't `glide up` + them. +- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake` + will download the code corresponding to `go/glide.lock`. If we put a vendor folder + under `go/`, cmake will just check the commit ids of the packages under the folder; + if the commit ids match, there will be no download at all. diff --git a/doc_cn/_sources/design/dist/README.md.txt b/doc/_sources/design/cluster_train/README.md.txt similarity index 62% rename from doc_cn/_sources/design/dist/README.md.txt rename to doc/_sources/design/cluster_train/README.md.txt index 1788208bcabca30f66cb1c80e80f6b824c0d9579..177a5f5d54bd924fab34795219ce1f7b270c8e25 100644 --- a/doc_cn/_sources/design/dist/README.md.txt +++ b/doc/_sources/design/cluster_train/README.md.txt @@ -15,33 +15,37 @@ This poses technical challenges to PaddlePaddle: A training job will be created once a user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes: -1. the *master process*, which dispatches tasks to +1.
the *master server process*, which dispatches tasks to 1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via -1. one or more *parameter server processes*, where each holds a shard of the global model. +1. one or more *parameter server processes*, where each holds a shard of the global model, and receives the uploaded gradients from every *trainer process*, so it can run the optimization functions to update its parameters. Their relation is illustrated in the following graph: -### Master Process +By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) for training user-defined neural network topologies. -The master process will: +When training with sync SGD, the parameter servers wait for all trainers to finish their gradient updates and then send the updated parameters to the trainers; training cannot proceed until every trainer has received the updated parameters. This creates a synchronization point among trainers. When training with async SGD, each trainer uploads its gradients and downloads new parameters individually, without synchronizing with other trainers. Async SGD will be faster in terms of time per pass, but will have more noise in the gradients since trainers are likely to have a stale model. + +### Master Server Process + +The master server process will: - Partition a dataset into [tasks](#task) and dispatch tasks to trainers. - Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into the next pass. -#### Task +#### Task A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size. #### Task Queue -The master process has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master process. Each master process has three task queues. +The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues. -- The todo queue holds tasks to be dispatched. When a job starts, the master process fills in the todo queue with all tasks. +- The todo queue holds tasks to be dispatched. When a job starts, the master server fills in the todo queue with all tasks. - The pending queue holds tasks that are currently being trained by trainers. - The done queue holds tasks that are already trained. @@ -50,17 +54,18 @@ The life cycle of a single task is illustrated below: 1. When a new pass of training starts, all tasks will be placed in the todo queue. -1. The master process will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion. -1. The trainer will work on its tasks and tell the master process once a task is completed. The master process will dispatch a new task to that trainer. -1. If a task timeout. the master process will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. -1. The master process will move completed task to the done queue.
When the todo queue is empty, the master process will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero. +1. Upon a trainer's request for a new task, the master server will dispatch a task from the todo queue to it, put the task in the pending queue, and wait for completion. +1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will then dispatch a new task to that trainer. +1. If a task fails for any reason in the trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue. The timeout count for that task will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded. +1. The master server will move each completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero. ### Trainer Process The trainer process will: -- Receive tasks from the master. -- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers. +- Request tasks from the master. +- Work on the tasks. +- Upload gradients to parameter servers, and update the local model by downloading new parameters from parameter servers. ### Parameter Server Process @@ -78,7 +83,7 @@ The communication pattern between the trainers and the parameter servers depends - Synchronous Stochastic Gradient Descent (sync-SGD) The parameter servers will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting the new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch. - + - Asynchronous Stochastic Gradient Descent (async-SGD) There will be no synchronization between different trainers; a parameter server updates its parameters as soon as it receives a new gradient: @@ -89,7 +94,7 @@ The communication pattern between the trainers and the parameter servers depends ## Fault Tolerant -The training job will pause if the master processes is dead, or any of the parameter server process is dead. They will be started by [Kubernetes](https://kubernetes.io/) and recover in few minutes. Please refer to [fault recovery](#fault-recovery). +The training job will pause if the master server process is dead, or if any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery). The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm: @@ -109,28 +114,26 @@ Now we will introduce how each process recovers from a failure, the graph below -### Master Process +### Master Server Process When the master is started by Kubernetes, it executes the following steps at startup: 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations. 1. Recovers the task queues from etcd if they already exist; otherwise, the master will create them. -1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers. -1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update. +1.
Writes its IP address to */master/addr* so that trainers can discover it. +1. Listens for trainers' task requests, dispatches one task upon each request, and updates the task queue using an etcd transaction to ensure the lock is held during the update. -The master process will kill itself if its etcd lease expires. - -When the master process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes. +When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes. ### Trainer Process When the trainer is started by Kubernetes, it executes the following steps at startup: -1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count. -1. Generates a unique ID, and sets key `/trainer/` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline. -1. Waits for tasks from the master to start training. +1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count */ps_desired*. +1. Finds and watches */master/addr* to get the master's address. +1. Requests tasks from the master to start training. -If trainer's etcd lease expires, it will try set key `/trainer/` again so that the master process can discover the trainer again. +When a trainer fails, Kubernetes will try to restart it. The recovered trainer will fetch tasks from the master and go on training. ### Parameter Server Process @@ -140,11 +143,11 @@ When the parameter server is started by Kubernetes, it executes the following st 1. Search through etcd keys `/ps/` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name. The desired number of parameter servers is 3: - + - + The third parameter server joined: - + 1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index). @@ -153,6 +156,13 @@ When the parameter server is started by Kubernetes, it executes the following st If the parameter server's etcd lease expires, the parameter server will kill itself. +## Parameter Server Checkpointing +See [here](./checkpointing.md) + +## Storing and dispatching training data +See [here](./data_dispatch.md) + + ## Dynamic Scaling ### Trainer Scaling diff --git a/doc/_sources/design/cluster_train/checkpointing.md.txt b/doc/_sources/design/cluster_train/checkpointing.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200 --- /dev/null +++ b/doc/_sources/design/cluster_train/checkpointing.md.txt @@ -0,0 +1,44 @@ +## Model parameter checkpointing +Checkpointing the model parameters effectively guards against single-point, or simultaneous multi-point, failures of the parameter servers. A checkpoint periodically saves to disk a complete image of the model data held in each parameter server's memory, so that training can restart from an intermediate state. For an uninterruptible training job without backups, fault tolerance can be achieved by periodically saving a snapshot of each parameter server's data to a ***distributed storage service***, for example keeping the latest snapshot every 10 minutes and deleting earlier ones. On a single-point failure, we only need to recover that node, or migrate it to another node and start it, to resume the training job. + + + +### Snapshot-saving design + +Notes: + +* After a parameter server starts in the cluster, it automatically mounts the distributed storage directory and saves snapshots into that directory. +* ***Note: each parameter server saves its checkpoint independently; for now we do not consider synchronizing all parameter servers to save a global checkpoint at one specific point in time, since doing so still could not eliminate randomness.*** + +Checkpoint-saving procedure: + +1.
When the "every 10 minutes" condition is met, the parameter server acquires the `read_lock` on the parameters memory and starts a new thread to save a checkpoint. If a checkpoint-saving thread is already running, the request is ignored. Since updating the parameters requires the `write_lock` on the parameters memory, the parameter server pauses parameter updates and waits while the snapshot is being written. +2. The parameter server generates a UUID and writes the snapshot data to a new file (named with this UUID) in the designated directory. After the snapshot is written, it computes the file's MD5 sum, then writes the following JSON content to `/checkpoints/[pserver_id]` in etcd: `{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}`. +3. Delete the snapshot files in the disk directory whose names are not the current UUID. +4. Release the lock on the parameters memory and stop the checkpoint-saving thread. + +Users should note that in a real environment, a running training job may saturate the network bandwidth between the trainers and the parameter servers; if a parameter server also needs to access distributed storage over the network to save a snapshot, it may cause network congestion and intermittent stalls in training. + +### Recovering from a snapshot + +When a parameter server starts for the first time, or is restarted by Kubernetes after a failure at any time, it needs to roll back to the last checkpoint: + + 1. Read the etcd node `/checkpoints/[pserver_id]` to get the file UUID of the latest checkpoint. + 1. Load the checkpoint snapshot file named with that UUID from disk, and load the parameters in it. + 1. If either of the two steps above fails, initialize the parameters using the initialization method defined by the startup arguments. + 1. Start serving. + +## TODO List +### Speculative/accelerated execution (TODO) +In a heterogeneous cluster, some trainers may run so slowly that they drag down the speed of the whole cluster (e.g. Trainer 1 in the figure). In that case, the master is responsible for starting a new trainer (Accelerate Trainer 2) that uses the same training data block. Whichever trainer finishes the block first wins, and the slower one is killed. + +### Dynamic scaling up/down +For now we only consider dynamically scaling up the number of trainers, which reduces system complexity. + +## Terminology +* model: all the parameters obtained from deep learning training; with this neural network one can make predictions on new data +* parameters: the parameters in a neural network, including the weights w and the biases b. A neural network model consists of a large number of parameters +* shard: one piece of a whole that has been split into multiple parts +* model shard: a neural network's parameters split into multiple parts, with each shard stored on one of the parameter servers +* parameter block: multiple parameter blocks constitute one model shard +* single-point failure: at any moment at most one server may fail. Since the probability of two machines in a cluster failing at the same time is extremely low ((mean failure rate * mean time to repair)^2), tolerating more than one simultaneous failure is only considered for special online systems. diff --git a/doc/_sources/design/cluster_train/data_dispatch.md.txt b/doc/_sources/design/cluster_train/data_dispatch.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e --- /dev/null +++ b/doc/_sources/design/cluster_train/data_dispatch.md.txt @@ -0,0 +1,160 @@ +## Storage and dispatch of training data + +### Concepts + +### Process overview +Training datasets in production environments are usually large and stored on distributed storage such as Hadoop HDFS, Ceph, or AWS S3. These distributed storage services usually split the data into many shards stored across multiple nodes, so that various data-processing tasks can be run in the cloud, including: + +* data preprocessing tasks +* Paddle training tasks +* online model inference services +
+ +
+ +The figure above shows the data flow of a real production application (face recognition). Production log data is stored both as a real-time stream (Kafka) and as offline data (HDFS), and multiple distributed data-processing tasks run in the cluster, such as streaming processing (online data process) and offline batch processing (offline data process), to preprocess the data and provide it to Paddle as training data. Users can also upload labeled data to the distributed storage to supplement the training data. The models produced by deep learning training on top of Paddle are served to the online face recognition application. + +### Training data storage +We choose [CephFS](http://docs.ceph.com/docs/master/cephfs/) as the storage system. + +- Whether from the perspective of the [PFSClient](../file_manager/README.md) or of a task running in a [Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/), users uniformly access their own data via `/pfs/$DATACENTER/home/$USER`. +- Public datasets are stored under `/pfs/$DATACENTER/common` + - mounted read-only +
+ +
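For illustration, the path conventions above imply a directory layout like the following; the datacenter name, user name, and dataset files are hypothetical examples:

```text
/pfs/
└── datacenter1/
    ├── home/
    │   └── alice/                       # per-user data, read-write for alice
    │       └── random_images-00000-of-00099
    └── common/                          # shared public datasets, read-only mount
        └── imagenet/
            └── imagenet-00000-of-00999
```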
+ +### File preprocessing + + +Before training starts, the dataset needs to be converted into the storage format used by PaddlePaddle distributed training, [RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947). We provide two ways to convert: + +1. users convert locally and then upload +1. users upload the data first, then run the conversion program on the cluster + +The generated file names follow this format: + +```text +name_prefix-aaaaa-of-bbbbb +``` + +"aaaaa" and "bbbbb" are both five-digit numbers. Each file is one shard of the dataset; "aaaaa" is the index of the shard, and "bbbbb" is the maximum shard index. + +For example, the ImageNet dataset might be split into 1000 shards, with the file names: +```text +imagenet-00000-of-00999 +imagenet-00001-of-00999 +... +imagenet-00999-of-00999 +``` + +#### Conversion library + +Whether converting locally or in the cloud, we provide a Python conversion library with the interface: +```python +def convert(output_path, reader, num_shards, name_prefix) +``` + +- `output_path`: directory in which output files will be saved. +- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances. +- `num_shards`: the number of shards that the dataset will be partitioned into. +- `name_prefix`: the name prefix of generated files. + +`reader` yields one data instance at a time; an instance can be a single value, or multiple values expressed as a tuple: + +```python +yield 1 # a single value +yield numpy.random.uniform(-1, 1, size=28*28) # a single value +yield numpy.random.uniform(-1, 1, size=28*28), 0 # multiple values +``` + +Each value can be an integer, a float, a string, a list composed of them, or a numpy.ndarray. Values of other types will be serialized into strings by Pickle. + +### Example programs + +#### Using the conversion library + +The `reader` produced by the following `reader_creator` yields one data instance at a time, and each data instance contains two values: a numpy.ndarray and an integer: +```python +def reader_creator(): + def reader(): + for i in range(1000): + yield numpy.random.uniform(-1, 1, size=28*28), 0 # multiple values + return reader +``` + +Passing the `reader` generated by `reader_creator` into the `convert` function completes the conversion: +```python +convert("./", reader_creator(), 100, "random_images") +``` + +The command above generates 100 files in the current directory: +```text +random_images-00000-of-00099 +random_images-00001-of-00099 +... +random_images-00099-of-00099 +``` + +#### Training + + +PaddlePaddle provides a dedicated [data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc) that generates a data reader for given `RecordIO` files. **The reader is used in the same way locally and in the cloud**: + +```python +# ... +reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*") +batch_reader = paddle.batch(reader, 128) +trainer.train(batch_reader, ...)
+``` + +The data instances produced by the reader in the code above are exactly the same as the data instances produced by the reader when the dataset was generated. + +### Uploading training files + +Use the following command to upload local data to the storage cluster. + +```bash +paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/ +``` + +For example, to upload the converted random_images dataset from the earlier example to `/home/` in the cloud, use: + +```bash +paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/ +``` + +The `$DATACENTER` configuration needs to be written into a config file, for example: + +``` +# config file +[datacenter_1] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter1.paddlepaddle.org + +[datacenter_2] +username=user +usercert=user.pem +userkey=user-key.pem +endpoint=datacenter2.paddlepaddle.org +``` +## TODO +### File access permissions +Control user permissions + +- users can share their own data with others + +### File access method +Access data directly through a remote API instead of mounting, for example: + +``` +f = open('/pfs/datacenter_name/home/user_name/test1.dat') +``` + + +### Support user-defined data preprocessing jobs diff --git a/doc/_sources/design/cluster_train/large_model_dist_train.md.txt b/doc/_sources/design/cluster_train/large_model_dist_train.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c4b5bc24c854b7062d509249bea9c50d42bd5f1 --- /dev/null +++ b/doc/_sources/design/cluster_train/large_model_dist_train.md.txt @@ -0,0 +1,101 @@ +# Analysis of large model distributed training in Paddle + +***NOTE: These are only notes on how we implemented this scheme in V1, not a new design.*** + +## What is it + +We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory during training. So we need to put them on several servers, and fetch them row by row instead of fetching all of the parameters. + +## How to use + +Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes. + +Accordingly, configure your embedding layers like: + +```python +SPARSE_REMOTE=True + +w1 = data_layer(name="w1", size=dict_size) +emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +w2 = data_layer(name="w2", size=dict_size) +emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE)) +... +``` + +## Implementation details + +```c++ +enum MatType { + MAT_NORMAL, + MAT_NORMAL_SHARED, + MAT_VALUE_SHARED, + MAT_SPARSE_ROW_IDS, + MAT_SPARSE_ROW_AUTO_GROW, + MAT_CACHE_ROW, + MAT_SPARSE_ROW, + MAT_SPARSE_ROW_PREFETCH, + MAT_SPARSE_ROW_PREFETCH_FULL_SIZE, +}; +``` + +`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only rows of the matrix during training. + +In `trainer_internal.cpp:L93 trainOneBatch`: + +```c++ + if (config_->getOptConfig().use_sparse_remote_updater()) { + REGISTER_TIMER("prefetch"); + gradientMachine_->prefetch(inArgs); + parameterUpdater_->getParametersRemote(); + } +``` + +When doing the actual network forward and backward passes, at the beginning of each batch, the trainer will try to download one row of data from the pserver. + +In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`: + +```c++ +if (fullSize) { + ...
+} else { +getParams = [&] { + parameterClient_->getParameterSparse( + /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType); +}; +applyL1 = [](Parameter& para, real decayRate) { + para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate); +}; +} +``` + +Calling `parameterClient_->getParameterSparse` will do a remote call to the pserver's `getParameterSparse`: + +```c++ +void ParameterServer2::getParameterSparse(const SendParameterRequest& request, + std::vector<Buffer>& inputBuffers, + SendParameterResponse* response, + std::vector<Buffer>* outputBuffers) { + (void)inputBuffers; + auto& buffer = *readWriteBuffer_; + size_t numReals = 0; + for (const auto& block : request.blocks()) { + numReals += getParameterConfig(block).dims(1); + } + buffer.resize(numReals); + + VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals; + + ReadLockGuard guard(parameterMutex_); + size_t offset = 0; + for (const auto& block : request.blocks()) { + size_t width = getParameterConfig(block).dims(1); + Buffer buf = {buffer.data() + offset, width}; + int type = request.send_back_parameter_type(); + sendBackParameterSparse(block, type, response, &buf, width, outputBuffers); + offset += width; + } +} +``` + +`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object), +and the `getParameterSparse` remote call then returns only one row of data to the client. diff --git a/doc/_sources/design/cluster_train/master_server.md.txt b/doc/_sources/design/cluster_train/master_server.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bf3c506f101361875043f8bfd97972b8c981a22 --- /dev/null +++ b/doc/_sources/design/cluster_train/master_server.md.txt @@ -0,0 +1,91 @@ +# Design Doc: Master Server + +For an overview of the master server's role, please refer to the [distributed training design doc](./README.md). In this design doc we will discuss the master server in more detail. The master will be implemented in [Go](https://golang.org/). + +## Dataset + + + +A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, and each chunk contains some records. + +## Task Queue + +As mentioned in the [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *chunks* from one or multiple files. The master server maintains *task queues* to track the training progress. + +### Task Queue Creation + +1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored. + + The RPC interface is: + ```go + func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error { + } + ``` +1. The master server will scan through each RecordIO file to generate the *chunk index* and learn how many chunks each file has. A chunk can be referenced by the file path and the index of the chunk within the file. The chunk index is an in-memory data structure that enables fast access to each chunk, and the index of the chunk within the file is an integer starting from 0, representing the n-th chunk within the file. + + The definition of the chunk is: + ```go + type Chunk struct { + Idx int // index of the chunk within the file + Path string + Index recordio.Index // chunk index + } + ``` +1.
Chunks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no elements. + + The definition of the task is: + ```go + type Task struct { + Index int + Chunks []Chunk + } + ``` + + The elements in the task queues are of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)) and a task: + ```go + type TaskEntry struct { + NumTimeout int + Task Task + } + ``` + + The definition of the task queues is: + ```go + type TaskQueues struct { + Todo []TaskEntry + Pending map[int]TaskEntry // map from task index to task entry + Done []TaskEntry + } + ``` + +### Task Queue Persistence + +The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change. + +We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress it with gzip, and save it into etcd synchronously under the key `/task_queues`. + +### Task Dispatch + +The trainer will make an RPC call to the master to get a new task when: + +- the trainer first starts, or +- the trainer finishes a task. + +The RPC interface is: +```go +func (m *RPCServer) GetTask(finished *Task, result *Task) error { +} +``` +Argument `finished` will be `nil` when the trainer has just started. + +During the RPC call the master will do the following: + +- Make a copy of the task queues, and update the copy reflecting the finished tasks and the new pending tasks. +- Synchronize the copy of the task queues with etcd using a transaction conditioned on holding the master lock. +- Replace the task queues with the copy and report to the trainer with the new tasks if succeeded, or discard the copy and report the error to the trainer if failed. + +### Task Retry Logic + +When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry is still in the pending queue, its timeout counter will increase by one, and the task will be moved back to the todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task. + +Please note that a timed-out task could be completed after it has been dispatched for retry, so it is possible for a task to be processed multiple times. We do not try to prevent this from happening, since it's fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient descent algorithm. diff --git a/doc/_sources/design/cluster_train/pserver_client.md.txt b/doc/_sources/design/cluster_train/pserver_client.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..474b8c572cd92fc87e9f7f3f2b19d12cccd158de --- /dev/null +++ b/doc/_sources/design/cluster_train/pserver_client.md.txt @@ -0,0 +1,171 @@ +# Design Doc: The Client Library of Parameter Server + +For an overview of the trainer's role, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
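Before moving on: to make the task life cycle and retry logic described above concrete, here is an illustrative Python simulation of the three queues. The real master server is implemented in Go, and the dictionary layout, queue operations, and the `MAX_TIMEOUTS` constant here are invented for the sketch:

```python
MAX_TIMEOUTS = 3

todo = [{"index": i, "num_timeout": 0} for i in range(8)]
pending = {}   # task index -> task entry
done = []

def dispatch():
    """Serve a trainer's request for a new task."""
    global todo, done
    if not todo and not pending:
        # The pass is finished: recycle the done queue and reset counters.
        todo, done = done, []
        for entry in todo:
            entry["num_timeout"] = 0
    entry = todo.pop(0)
    pending[entry["index"]] = entry
    return entry

def complete(index):
    """A trainer reported that the task finished successfully."""
    done.append(pending.pop(index))

def timeout(index):
    """The task failed or exceeded its deadline."""
    entry = pending.pop(index)
    entry["num_timeout"] += 1
    if entry["num_timeout"] <= MAX_TIMEOUTS:
        todo.append(entry)     # retry later
    # otherwise the task is likely poisonous and is discarded

task = dispatch()
complete(task["index"])
```

Note how a retried task keeps its timeout counter across passes only until the pass ends, matching the reset-to-zero rule described earlier.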
+ +## Parameter Partition + +Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameters* require slightly different treatment: + +### Sparse Parameter + +A sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading: it does not have a sparse representation; it has the same representation as a dense vector. + +Because a sparse parameter is updated sparsely, the trainer will have to partition it. And because the parameter server will merge all sparse parameter shards into the same file when saving the parameter, a special naming convention is needed: + +If a sparse parameter is partitioned into n shards, they should be named as: + +```text +name:sparse-0 +name:sparse-1 +... +name:sparse-n-1 +``` + +The library is unaware of the partition and treats each parameter independently. Only when saving parameters will the parameter servers merge the sparse parameters according to the naming convention. + +## Model Optimization Using Gradients + +There are two ways to perform model optimization using gradients: + +- On Client + + The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be sent to the parameter servers. The parameter servers will just update the parameters using the difference, without any optimization using gradients (such as Adam and L1 regularization). + +- On Parameter Server + + The client will send accumulated gradients to the parameter servers, and the parameter servers will do the optimization using gradients. + +## L1 and L2 Regularization + +PaddlePaddle allows L1 or L2 regularization to be specified per parameter, so when the trainer initializes a parameter, it needs to include a parameter configuration when L1 or L2 regularization is necessary. + +## Parameter Initialization + +The parameters on the parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization; the other trainers will wait for the completion of initialization and get the parameters from the parameter servers. + +### Trainer Selection + +To select the trainer for initialization, every trainer will try to get a distributed lock; whoever owns the lock will do the initialization. As illustrated below: + + + +### Trainer Selection Process + +The trainer selection process is encapsulated in the C API function: +```c +int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto); +``` +The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' calls to `paddle_begin_init_params` will return 0. `paddle_get_params` will be blocked until initialization is completed.
As illustrated below: + + + +## C Interface + +```c +typedef enum { + PADDLE_ELEMENT_TYPE_INT32 = 0, + PADDLE_ELEMENT_TYPE_UINT32 = 1, + PADDLE_ELEMENT_TYPE_INT64 = 2, + PADDLE_ELEMENT_TYPE_UINT64 = 3, + PADDLE_ELEMENT_TYPE_FLOAT32 = 4, + PADDLE_ELEMENT_TYPE_FLOAT64 = 5, +} paddle_element_type; + +typedef struct { + char* name; + paddle_element_type element_type; + unsigned char* content; + int content_len; +} paddle_parameter, paddle_gradient; + +typedef int paddle_pserver_client; + +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if the current pserver client is selected to initialize all parameter servers. + */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); + +/** + * @brief paddle_begin_init_params begins to initialize parameters on + * parameter servers. + * + * paddle_begin_init_params will be called from multiple trainers; + * only one trainer will be selected to initialize the parameters on + * the parameter servers. Other trainers need to get the initialized + * parameters from the parameter servers using @paddle_get_params. + * + * @return 1 if the trainer is selected to initialize the parameter + * servers, otherwise 0. + */ +int paddle_begin_init_params(paddle_pserver_client client); + +/** + * @brief paddle_init_param initializes the parameter on parameter + * servers. + * + * @param param the parameter to initialize. + * @param param_config_proto the configuration for the parameter. + * @param config_len the length of param_config_proto + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); + +/** + * @brief paddle_finish_init_params tells the parameter servers that the + * client has sent all parameters to the parameter servers as initialization. + * + * @return 0 if successful, otherwise -1. On failure, the trainer + * needs to restart the entire initialization process (starting from + * @paddle_begin_init_param). Or simply exit the program and wait for + * the cluster management system to restart the trainer. + */ +int paddle_finish_init_params(paddle_pserver_client client); + +/** + * @brief paddle_send_grads sends gradients to parameter servers for + * updating parameters. + * + * @param grads the array of gradients to send. + * @param len the length of the gradient array. + * @return 0 if successful, otherwise -1. + */ +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); + +/** + * @brief paddle_get_params gets parameters from parameter servers. + * + * paddle_get_params will block until parameters are initialized on + * the parameter servers. + * + * @param dst the destination array of parameter pointers to save to. + * The parameter pointers must be pre-populated with the required parameter names, + * and the content of each parameter must be pre-allocated to the size of the + * required parameter on the pserver. + * @param len the length of the names array and the paddle_parameter + * array. + * @return 0 if successful, otherwise -1. + */ +int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len); + +/** + * @brief paddle_save_model asks the parameter servers to save the + * parameters to the given path. + * + * @param path the path to save parameters. + * @return 0 if successful, otherwise -1. + */ +int paddle_save_model(paddle_pserver_client client, const char* path); +``` diff --git a/doc/_sources/design/cluster_train/remote_parameter_updater.md.txt b/doc/_sources/design/cluster_train/remote_parameter_updater.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77 --- /dev/null +++ b/doc/_sources/design/cluster_train/remote_parameter_updater.md.txt @@ -0,0 +1,21 @@ +# Design Doc: Remote Parameter Updater for Cluster Train + +For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters. + +## Parameter Updater + +The Parameter Updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here. + +### Remote Parameter Updater + +The Remote Parameter Updater manages parameters through remote parameter servers, with the client that communicates with the pservers ([The Client Library of Parameter Server Design Doc](pserver_client.md)). + +In the PaddlePaddle Python V2 API, the trainer is implemented in Python, and the trainer will hold an instance of the parameter updater and call its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG. + +#### Sparse Remote Parameter Updater + +Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in the next stage. + +### Interface Design + +TBD diff --git a/doc/_sources/design/cluster_train/save_model.md.txt b/doc/_sources/design/cluster_train/save_model.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..b755185c81ad617b9c85c47de0f5f65d2201c658 --- /dev/null +++ b/doc/_sources/design/cluster_train/save_model.md.txt @@ -0,0 +1,111 @@ +# Design Doc: Save Model + +## Overview + +The model is the output of the training process. There are two +ways a user can obtain a model: + +- Save model triggered by user code: user code asks PaddlePaddle to + save a model. +- Convert the model from a checkpoint: the model is converted from the + pservers' periodic checkpoints. In this way, the user can cancel a + job at any time, and still have a relatively fresh model (we + checkpoint around every 5 minutes). + +### Trainer Saving Model vs. Pservers Saving Model + +Both trainers and pservers have access to the model. So the model can +be saved from a trainer or from pservers. We need to decide where the model +is saved from. + +#### Dense Update vs. Sparse Update + +There are two types of model update methods: dense update and sparse +update (when the model parameter is configured to be sparse). + +- Dense update + + Every trainer has its own full copy of the model. Every model + update will update the entire model.
+ +- Sparse update + + The training input is sparse, and the trainer does not have the + entire model. It will only download the sub-model related + to the input. When updating the model, only the sub-model related to + the training input is updated. + + +#### Pservers Saving Model + +The benefit of letting pservers save the model is that they have the entire +model all the time. However, since pservers are on different nodes, a +merging process is required to merge the model shards into a single +model. This requires the pservers to write models to a distributed +filesystem, making the checkpoint shards visible to the merge program. + +#### Trainer Saving Model + +The benefit of letting one trainer save the model is that it does not +require a distributed filesystem, and it reuses the same save-model +logic as when training locally - except when doing sparse update, where the +trainer needs to download the entire model during the saving process. + +#### Conclusion + +Given that trainer saving model does not require a distributed filesystem, +and is an intuitive extension of saving the model when training +locally, we decide to let the trainer save the model when doing +distributed training. + + +### Convert Model from Checkpoint + +TODO + + +## Timeline + +We first implement the trainer saving the model. Converting the latest +snapshot to a model will be a TODO for the future. + + +## Trainer Save Model + +### Trainer Election + +One trainer will be elected as the one to save the model. When using +etcd, the trainer ID is a randomly generated UUID; the trainer will +contact the master server requesting to save the model, and find out +if it is elected. When the master server is not used, unique +trainer IDs will be given by the administrator, and the trainer whose ID +is "0" is elected to save the model. + +### Model Save Path + +Each trainer will be given the directory to save the model. The +elected trainer will save the model to +`given-directory/trainerID`. Since the trainer ID is unique, this +prevents concurrent saves to the same file when multiple trainers +are elected to save the model after a split-brain problem happens. + +### What Happens When Model Is Saving + +It takes some time to save a model, so we need to define what will happen +while the model is being saved. + +When doing dense update, the trainer uses the local model. Pservers +do not need to pause model updates. + +When doing sparse update, the trainer needs to download the entire +model while saving. To get the most accurate model, the model updates +need to be paused before the download starts and resumed after the +download finishes. Otherwise, the trainer gets a model that is +"polluted": some parts of the model are old, and some parts of the model are +new. + +It's unclear whether the "polluted" model will be inferior, due to the +stochastic nature of deep learning, and pausing the model updates will +add more complexity to the system. Since supporting sparse update is a +TODO item, we defer the evaluation of whether to pause the model updates +during model saving to the future. diff --git a/doc/_sources/design/cluster_train/submit-job.md.txt b/doc/_sources/design/cluster_train/submit-job.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d --- /dev/null +++ b/doc/_sources/design/cluster_train/submit-job.md.txt @@ -0,0 +1,127 @@ +# Submit a Distributed Training Job + +The user can submit a distributed training job with Python code, rather than with a command-line interface.
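As a brief illustration of the save-path convention described above, the sketch below shows how an elected trainer might write under its own subdirectory so that even a mistaken double election cannot race on one file. The function name and file layout are hypothetical, not PaddlePaddle's API:

```python
import os

def maybe_save_model(trainer_id, elected_id, save_dir, model_bytes):
    # Only the elected trainer saves; everyone else is a no-op.
    if trainer_id != elected_id:
        return None
    # Per-trainer subdirectory: even if a split-brain election picks two
    # "elected" trainers, they write to different paths instead of racing.
    path = os.path.join(save_dir, str(trainer_id))
    if not os.path.exists(path):
        os.makedirs(path)
    model_path = os.path.join(path, "model.tar")
    with open(model_path, "wb") as f:
        f.write(model_bytes)   # serialized model bytes
    return model_path
```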
+
+## Runtime Environment On Kubernetes
+
+For a distributed training job, there are two Docker images, called the *runtime Docker image* and the *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image.
+
+### Base Docker Image
+
+Usually, the base Docker image is the PaddlePaddle product Docker image, including the paddle binary files and the Python package. And of course, users can specify any image name hosted on any Docker registry to which they have access.
+
+### Runtime Docker Image
+
+The trainer package that the user uploads and some Python dependencies are packaged into a runtime Docker image based on the base Docker image.
+
+- Handle Python Dependencies
+
+  You need to provide a requirements.txt file in your `trainer-package` folder. Example:
+
+  ```txt
+  pillow
+  protobuf==3.1.0
+  ```
+  See more [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements. An example project looks like:
+  ```bash
+  paddle_example
+    |-quick_start
+      |-trainer.py
+      |-dataset.py
+      |-requirements.txt
+  ```
+
+## Submit Distributed Training Job With Python Code
+
+
+- `paddle.job.dist_train()` will call the Job Server API `/v1/packages` to upload the trainer package and save it on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job.
+- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, the Job Server will submit the PaddlePaddle distributed job to Kubernetes.
+- *NOTE*: For the first version, we will not prepare the runtime Docker image; instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version either.
+
+You can call `paddle.job.dist_train` and provide the distributed training configuration as the parameters:
+```python
+paddle.job.dist_train(
+    trainer=dist_trainer(),
+    paddle_job=PaddleJob(
+        job_name = "paddle-cloud",
+        entry_point = "python %s"%__file__,
+        trainer_package = "/example/word2vec",
+        image = "yancey1989/paddle-job",
+        trainers = 10,
+        pservers = 3,
+        trainer_cpu = 1,
+        trainer_gpu = 1,
+        trainer_mem = "10G",
+        pserver_cpu = 1,
+        pserver_mem = "2G"
+    ))
+```
+
+The parameter `trainer` of `paddle.job.dist_train` is a function, and you can implement it as follows:
+```python
+def dist_trainer():
+    def trainer_creator():
+        trainer = paddle.v2.trainer.SGD(...)
+        trainer.train(...)
+    return trainer_creator
+```
+
+The pseudo code of `paddle.job.dist_train` is as follows:
+```python
+def dist_train(trainer, paddle_job):
+    # if the code is running on cloud, set RUNNING_ON_CLOUD=YES
+    if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
+        # submit the paddle job
+        paddle_job.submit()
+    else:
+        # start the training
+        trainer()
+```
+### PaddleJob Parameters
+parameter | type | explanation
+ --- | --- | ---
+job_name | str | the unique name for the training job
+entry_point | str | entry point for the startup trainer process
+trainer_package | str | trainer package file path to which the user has access
+image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image)
+pservers|int| Parameter Server process count
+trainers|int| Trainer process count
+pserver_cpu|int| CPU count for each Parameter Server process
+pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_cpu|int| CPU count for each Trainer process
+trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_gpu|int| GPU count for each Trainer process; if you only want CPU, do not set this parameter
+
+### Deploy Parameter Server, Trainer and Master Process
+  - Deploy the PaddlePaddle Parameter Server processes as a Kubernetes ReplicaSet.
+  - Deploy the PaddlePaddle Trainer processes as a Kubernetes Job.
+  - Deploy the PaddlePaddle Master processes as a Kubernetes ReplicaSet.
+
+## Job Server
+
+- RESTful API
+
+  The Job Server provides a RESTful HTTP API for receiving the trainer package and displaying
+  PaddlePaddle job related information.
+  - `POST /v1/package` receive the trainer package and save it on CephFS
+  - `POST /v1/trainer/job` submit a trainer job
+  - `GET /v1/jobs/` list all jobs
+  - `GET /v1/jobs/<job-name>` the status of a job
+  - `DELETE /v1/jobs/<job-name>` delete a job
+  - `GET /v1/version` job server version
+
+- Build Runtime Docker Image on Kubernetes
+
+  `paddle.job.dist_train` will upload the trainer package to the Job Server, save it on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training.
+
+  There are some benefits of building the runtime Docker image on the Job Server:
+  - On Paddle Cloud, users run the trainer code in a Jupyter Notebook, which is a Kubernetes Pod; if we wanted to execute `docker build` in the Pod, we would have to mount the host's `docker.sock` into the Pod, and the user's code would connect to the host's Docker Engine directly, which is not safe.
+  - Users only need to upload the trainer package files; they do not need to install a Docker engine or a Docker registry as dependencies.
+  - If we switch to another image type, such as rkt, users do not need to care about it.
+
+- Deploy Parameter Server, Trainer and Master Processes
+
+  `POST /v1/trainer/job` receives the distributed training parameters and deploys the job as follows:
+  - Deploy the PaddlePaddle Parameter Server processes as a Kubernetes ReplicaSet.
+  - Deploy the PaddlePaddle Trainer processes as a Kubernetes Job.
+  - Deploy the PaddlePaddle Master processes as a Kubernetes ReplicaSet.
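To make the REST flow above concrete, here is a minimal client-side sketch of talking to the Job Server. The host name, authentication, and JSON field names are assumptions added for illustration; only the endpoint paths come from the API list above.

```python
# Hypothetical sketch of the Job Server REST flow described above.
# The host and JSON field names are assumptions; only the endpoint
# paths (/v1/package, /v1/trainer/job, /v1/jobs/) come from this doc.
import requests

JOB_SERVER = "http://jobserver.example.com"  # assumed address

def submit_job(package_path, job_spec):
    # 1. Upload the trainer package (saved on CephFS by the server).
    with open(package_path, "rb") as f:
        requests.post("%s/v1/package" % JOB_SERVER, files={"package": f})
    # 2. Submit the distributed training job described by job_spec.
    resp = requests.post("%s/v1/trainer/job" % JOB_SERVER, json=job_spec)
    return resp.json()

def job_status(job_name):
    # Query the status of a single job by name.
    return requests.get("%s/v1/jobs/%s" % (JOB_SERVER, job_name)).json()
```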
diff --git a/doc/_sources/design/evaluator.md.txt b/doc/_sources/design/evaluator.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..11cc129d56905a9ee666da92fbe6f8559c6d325a --- /dev/null +++ b/doc/_sources/design/evaluator.md.txt @@ -0,0 +1,58 @@
+## Evaluator Design
+
+### Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over every N passes/batches the user wants.
+
+### Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy of one minibatch of data when run once.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/multi-GPU training, aggregate the values from different devices.
+
+### Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
+
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+        """
+        Different evaluators may have different metric states. E.g., Accuracy needs two variables, the total and correct sample counts. Auc needs four variables, `true_positives`,
+        `true_negatives`, `false_positives` and `false_negatives`. So every evaluator should create its needed variables and append them to the main_program.
+
+        The initialization of Evaluator should be responsible for:
+        creating the metric states and appending them to the main_program
+        """
+        pass
+
+    def _update_ops(self, input, label, **kwargs):
+        """
+        Add mini-batch evaluator calculation operators to the main_program.
+        Add increment operators to accumulate the metric states.
+        """
+
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset the metric states at the beginning of each pass/user-specified batch number.
+        Execute the reset_program to reset the states.
+        """
+
+
+    def eval(self, executor, eval_program=None):
+        """
+        Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
+        Execute the eval_program and return the result.
+        """
+        return eval_result
+```
diff --git a/doc/_sources/design/executor.md.txt b/doc/_sources/design/executor.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5fb6c5c3c1da3c112ce63878322083dd5c42b70 --- /dev/null +++ b/doc/_sources/design/executor.md.txt @@ -0,0 +1,23 @@
+# Executor Design Doc
+
+## Motivation
+
+We use the executor to do the runtime evaluation of a `ProgramDesc`.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`. The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. And the `Scope` is the container of all the variable instances, which is persistent throughout different runs.
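As a rough illustration of this contract, the following is a minimal, runnable Python sketch of an executor walking a block. The class and field names here are invented for the example; the real implementation is the C++ `Executor` linked below.

```python
# A minimal, runnable Python sketch of the executor contract described
# above: take a program, a block id, and a scope, then run the block's
# operators in order. All names are invented for illustration only.
class Op(object):
    def __init__(self, name, fn, inputs, outputs):
        self.name, self.fn = name, fn
        self.inputs, self.outputs = inputs, outputs

class Block(object):
    def __init__(self, vars, ops):
        self.vars = vars  # variable names declared by this block
        self.ops = ops    # operators, in program order

class Executor(object):
    def run(self, program, block_id, scope):
        block = program[block_id]   # the entrance block
        for name in block.vars:     # instantiate variables in the scope
            scope.setdefault(name, None)
        for op in block.ops:        # run operators in sequence
            args = [scope[v] for v in op.inputs]
            for out, val in zip(op.outputs, op.fn(*args)):
                scope[out] = val
        return scope

# Usage: a one-op block computing y = x + 1
program = [Block(["x", "y"], [Op("add1", lambda x: (x + 1,), ["x"], ["y"])])]
scope = {"x": 41}
print(Executor().run(program, 0, scope)["y"])  # prints 42
```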
+
+### What does the executor do?
+
+It evaluates all the operators in the `block_id`-th block of a `ProgramDesc`.
+
+### What does the executor NOT do?
+
+It does not do runtime optimization, meaning that it does not intelligently parse the dependencies of each op to choose which ones to run and in which order.
+
+It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
+
+## Implementation
+
+`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
diff --git a/doc/_sources/design/file_manager/README.md.txt b/doc/_sources/design/file_manager/README.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..3df10d801e568834729f902aace483d033340e2d --- /dev/null +++ b/doc/_sources/design/file_manager/README.md.txt @@ -0,0 +1,87 @@
+# FileManager Design Doc
+## Goal
+In this document, we describe the design of a system named FileManager, which makes it convenient for users to upload their own training data for distributed training.
+
+Main features include:
+
+- common command-line commands for managing files and directories
+- resumable upload and download of large files
+
+## Glossary
+- PFS: short for `Paddlepaddle cloud File System`, an abstraction of the user's file storage space, as opposed to the local filesystem. Currently we build it on CephFS.
+- [CephFS](http://docs.ceph.com/docs/master/cephfs/): a POSIX-compatible filesystem.
+- Chunk: the unit into which a file is logically divided.
+
+## Modules
+### Architecture Diagram
+
+
+### PFSClient
+- Features: detailed design [link](./pfs/pfsclient.md)
+  - provides commands for users to manage files
+  - needs to run cross-platform
+
+- Mutual authentication
+  The PFSClient and the Ingress perform mutual authentication [tls](#tls), so a user needs to register at `cloud.paddlepaddle.org` first, apply for user space, and download the system-generated CA (certificate authority), Key, and CRT (CA-signed certificate) to the local machine before using the PFSClient.
+
+### [Ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/)
+- Features:
+  provides a layer-7 reverse proxy and sticky-session-based load balancing.
+
+- Passing through the user identity
+  The Ingress needs to pass the PFSClient's identity through to the PFSServer; for the configuration method, see [link](http://www.integralist.co.uk/posts/clientcertauth.html#3)
+
+### PFSServer
+The PFSServer provides a RESTful API, receives and handles file management requests from the PFSClient, and returns the results to the PFSClient.
+
+RESTful API
+
+- /api/v1/files
+  - `GET /api/v1/files`: Get metadata of files or directories.
+  - `POST /api/v1/files`: Create files or directories.
+  - `PATCH /api/v1/files`: Update files or directories.
+  - `DELETE /api/v1/files`: Delete files or directories.
+
+- /api/v1/file/chunks
+  - `GET /api/v1/file/chunks`: Get the chunk metadata of a file.
+
+- /api/v1/storage/files
+  - `GET /api/v1/storage/files`: Download files or directories.
+  - `POST /api/v1/storage/files`: Upload files or directories.
+
+- /api/v1/storage/file/chunks
+  - `GET /api/v1/storage/file/chunks`: Download chunk data.
+  - `POST /api/v1/storage/file/chunks`: Upload chunk data.
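As the next section explains, files move between the client and PFS as chunks. The following hypothetical Python sketch shows how a client might drive the chunk endpoints above to resume an interrupted upload; the host name and JSON field names are assumptions, and only the URL paths come from the API list.

```python
# Hypothetical client-side sketch of the chunk endpoints listed above.
# The host and JSON field names are assumptions; only the URL paths
# come from this design doc.
import binascii
import requests

PFS = "https://cloud.paddlepaddle.org"  # assumed gateway address
CHUNK_SIZE = 256 * 1024  # 256KB default chunk size (see next section)

def upload_changed_chunks(local_path, pfs_path):
    # Ask the server for the existing chunk metadata (offset, checksum).
    meta = requests.get("%s/api/v1/file/chunks" % PFS,
                        params={"path": pfs_path}).json()
    remote = {c["offset"]: c["checksum"] for c in meta.get("chunks", [])}
    with open(local_path, "rb") as f:
        offset = 0
        while True:
            data = f.read(CHUNK_SIZE)
            if not data:
                break
            # Re-upload only the chunks whose CRC32 checksum differs.
            if remote.get(offset) != binascii.crc32(data) & 0xffffffff:
                requests.post("%s/api/v1/storage/file/chunks" % PFS,
                              params={"path": pfs_path, "offset": offset},
                              data=data)
            offset += len(data)
```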
+
+## File Transfer Optimizations
+
+### Chunked file transfer
+User files can be fairly large, so uploading them to the cloud or downloading them to the local machine can take a long time, and the network may be unstable during the transfer. To deal with these problems, we introduce the concept of a Chunk: a Chunk consists of the offset within the file, the data, the data length, and a checksum. Uploads and downloads are both implemented as operations on Chunks. Since a Chunk is small (256KB by default), a single transfer completes quickly and is less error-prone. After transferring the last Chunk, the PFSClient checks whether the MD5 of the destination file matches that of the source file.
+
+A typical Chunk looks like this:
+
+```
+type Chunk struct {
+    fileOffset int64
+    checksum   uint32
+    len        uint32
+    data       []byte
+}
+```
+
+### Creating sparse files
+When the destination file does not exist, or its size differs from the source file's, we can use [Fallocate](https://golang.org/pkg/syscall/#Fallocate) to create a sparse file, and then write multiple Chunks concurrently.
+
+### Overwriting the inconsistent parts
+The key to the file transfer is that the PFSClient compares the checksums of the source and destination file Chunks; the inconsistent Chunks are then downloaded or uploaded by the PFSClient. This way, the parts that have already been transferred successfully do not need to be transferred again.
+
+## User workflow
+See [link](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md)
+
+## Skeleton generation
+We use [swagger](https://github.com/swagger-api/swagger-codegen) to generate the skeleton of the PFSClient and PFSServer, so that we can put more of our energy into the logic itself.
+
+## References
+- [TLS complete guide](https://github.com/k8sp/tls/blob/master/tls.md)
+- [aws.s3](http://docs.aws.amazon.com/cli/latest/reference/s3/)
+- [linux man document](https://linux.die.net/man/)
diff --git a/doc/_sources/design/file_manager/pfs/pfsclient.md.txt b/doc/_sources/design/file_manager/pfs/pfsclient.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..56bc70c54bbc92b78d66e04fb495b1300cf8ebe0 --- /dev/null +++ b/doc/_sources/design/file_manager/pfs/pfsclient.md.txt @@ -0,0 +1,129 @@
+# PFSClient
+
+## Description
+The `pfs` command is a Command Line Interface for managing your files on PaddlePaddle Cloud.
+
+## Synopsis
+```
+paddle [options] pfs <subcommand> [parameters]
+```
+
+## Options
+```
+--profile (string)
+  Use a specific profile from your credential file.
+
+--help (string)
+  Display more information about a command
+
+--version
+  Output version information and exit
+
+--debug
+  Show detailed debugging log
+
+--only-show-errors (boolean)
+  Only errors and warnings are displayed. All other output is suppressed.
+```
+
+## Path Arguments
+When using a command, we need to specify path arguments. There are two path argument types: `localpath` and `pfspath`.
+
+A `pfspath` begins with `/pfs`, e.g.: `/pfs/$DATACENTER/home/$USER/folder`.
+
+[Here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/cluster_train/data_dispatch.md#上传训练文件) is how to configure datacenters.
+
+## Order of Path Arguments
+Commonly, if there are two path arguments, the first is the source and the second is the destination.
+
+## Subcommands
+- rm - remove files or directories
+
+```
+Synopsis:
+  rm [-r] [-v] <PFSPath> ...
+
+Options:
+  -r
+    Remove directories and their contents recursively
+  -v
+    Cause rm to be verbose, showing files after they are removed.
+
+Examples:
+  paddle pfs rm /pfs/$DATACENTER/home/$USER/file
+  paddle pfs rm -r /pfs/$DATACENTER/home/$USER/folder
+```
+- mv - move (rename) files
+
+```
+Synopsis:
+  mv [-f | -n] [-v] <LocalPath> <PFSPath>
+  mv [-f | -n] [-v] <LocalPath> ... <PFSPath>
+  mv [-f | -n] [-v] <PFSPath> <LocalPath>
+  mv [-f | -n] [-v] <PFSPath> ... <LocalPath>
+  mv [-f | -n] [-v] <PFSPath> <PFSPath>
+  mv [-f | -n] [-v] <PFSPath> ... <PFSPath>
+
+Options:
+  -f
+    Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
+  -n
+    Do not overwrite an existing file. (The -n option overrides previous -f options.)
+  -v
+    Cause mv to be verbose, showing files after they are moved.
+
+Examples:
+  paddle pfs mv ./text1.txt /pfs/$DATACENTER/home/$USER/text1.txt
+```
+- cp - copy files or directories
+
+```
+Synopsis:
+  cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve--links] <LocalPath> ... <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <LocalPath>
+  cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <LocalPath>
+  cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> <PFSPath>
+  cp [-r] [-f | -n] [-v] [--preserve--links] <PFSPath> ... <PFSPath>
+
+Options:
+  -r
+    Copy directories recursively
+  -f
+    Do not prompt for confirmation before overwriting the destination path. (The -f option overrides previous -n options.)
+  -n
+    Do not overwrite an existing file. (The -n option overrides previous -f options.)
+  -v
+    Cause cp to be verbose, showing files after they are copied.
+  --preserve--links
+    Preserve links when copying links
+
+Examples:
+  paddle pfs cp ./file /pfs/$DATACENTER/home/$USER/file
+  paddle pfs cp /pfs/$DATACENTER/home/$USER/file ./file
+```
+- ls - list files
+
+```
+Synopsis:
+  ls [-r] <PFSPath> ...
+
+Options:
+  -r
+    List directory(ies) recursively
+
+Examples:
+  paddle pfs ls /pfs/$DATACENTER/home/$USER/file
+  paddle pfs ls /pfs/$DATACENTER/home/$USER/folder
+```
+
+- mkdir - make directory(ies)
+Create intermediate directory(ies) as required.
+
+```
+Synopsis:
+  mkdir <PFSPath> ...
+
+Examples:
+  paddle pfs mkdir /pfs/$DATACENTER/home/$USER/folder
+```
diff --git a/doc/_sources/design/float16.md.txt b/doc/_sources/design/float16.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ea95ed6b5d6792171569b6ff76d09be92fcb13e --- /dev/null +++ b/doc/_sources/design/float16.md.txt @@ -0,0 +1,105 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits in memory. float16 is half the size of the traditional 32-bit single-precision format (float) and has lower precision and a smaller range.
+
+When high-precision computation is not required, using the float16 data type could potentially
+
+- reduce storage space, memory bandwidth, and power usage;
+- increase the chance of data fitting into a smaller cache of lower latency;
+- provide arithmetic speed-ups if supported by hardware.
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardware, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of the compute method specialized for float16 in the operator kernel. It should be compatible with various natively supported float16 implementations, including `__half` for CUDA, `float16_t` for ARM, and `Eigen::half` for Eigen, to make writing customized float16 kernels easier.
+
+### Compiler
+- nvcc supports the `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as a storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as an arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPUs with compute capability >= 5.3.
+- `__fp16` is supported as a storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as an arithmetic type after ARMv8.2-A (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which was announced in May 2017. There seem to be no application processors currently available on the market that adopt this architecture. It is reported that Qualcomm Snapdragon 845 uses the Cortex-A75 design and will be available in mobile devices in early 2018).
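The precision/range trade-off described above is easy to observe from software. The following small numpy snippet (an illustration added here, not part of the original survey) shows float16's rounding error and narrow range.

```python
# A small numpy illustration (not part of the original survey) of the
# float16 precision and range trade-offs described above.
import numpy as np

print(float(np.float16(0.1)))     # 0.0999755859375: 0.1 is not exactly representable
print(np.finfo(np.float16).eps)   # ~0.000977: roughly 3 decimal digits of precision
print(np.finfo(np.float16).max)   # 65504.0: the largest finite float16 value
print(np.float16(70000.0))        # overflows to inf
```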
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using CUDA intrinsics. It falls back to software emulation on the CPU for calculation, and there is no special treatment for ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires an ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely, CUDA 7.5, 8.0, and 9.0.
+CUDA 7.5 and 8.0 define `__half` as a simple struct that holds a `uint16_t` datum (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+  unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as the following:
+```
+typedef struct __CUDA_ALIGN__(2) {
+  unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+  unsigned short __x;
+public:
+  // constructors and conversion operators from/to
+  // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__
+__half operator+(const __half &lh, const __half &rh) {
+  return __hadd(lh, rh);
+}
+
+// Other overloaded operators
+```
+This new design makes `c = a + b` work correctly for the CUDA half data type.
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` value internally.
+```
+struct float16 {
+  uint16_t x;
+};
+```
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various lengths, float, and double.
+  - constructors / assignment operators that take input from `__half` on CUDA, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and half-precision data types on CUDA, ARM and Eigen.
+  - overloaded arithmetic operators for CUDA, ARM, and non-ARM CPU, respectively. These operators will take advantage of the CUDA and ARM intrinsics on the corresponding hardware.
+
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provide one-to-one conversion between float32 and float16. These two functions use different conversion routines based on the current hardware: CUDA/ARM intrinsics will be used when the corresponding hardware is available.
If the hardware or compiler level does not support float32-to-float16 conversion, software emulation will be performed to do the conversion.
+
+## To do
+After the float16 class is available, some of the future items are below:
+
+- Update pybind/tensor_py.h to bind the C++ float16 with numpy float16.
+
+- Modify the `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
+
+- Create a type-casting operator that can convert the data in a tensor between float16 and other types.
diff --git a/doc/_sources/design/functions_operators_layers.md.txt b/doc/_sources/design/functions_operators_layers.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..984b59f4c6971dfb6f46dfe342f2751f392c0e88 --- /dev/null +++ b/doc/_sources/design/functions_operators_layers.md.txt @@ -0,0 +1,100 @@
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine-grained operators into a coarse-grained one. For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine-grained operations are known as operators, and some coarse-level ones are known as layers. But we need a well-defined separation.
+
+In general, operators are very fine-grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators, which are C++ classes and can be created from Python bindings by name. A C macro can do this. For example, the following macro invocation
+
+```c++
+#define MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp, "mul");
+```
+
+so that in Python we can create the operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse-level C++ operator class by composing the functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+  void Run(...) {
+    add(mul(Input("X"), Input("W")), Input("b"));
+  }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well. To do so, we need a higher-level Python wrapping of operator creation than `paddle.cpp.create_operator`. This higher-level operator API should be compatible with the layer API.
+
+Let's explain using an example. Suppose that we are going to compose the FC using mul and add in Python; we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("mul", input=[X1, X2], output=O)
+    return O
+
+def operator.add(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("add", input=[X1, X2], output=O)
+    return O
+```
+
+The above code snippets are automatically generated.
Given them, users can define
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    return operator.add(operator.mul(X, W), b)
+```
+
+If we didn't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be complicated:
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    O1 = Var()
+    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+    O2 = Var()
+    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+    return O2
+```
+
+We'd like to have Python bindings to operators in package `paddle.operator`, and Python compositions of operators in package `paddle.layer`. So we have the following concepts in the above illustrative example:
+
+
+| C++ functions/functors | mul          | add          |             |          |
+|------------------------|--------------|--------------|-------------|----------|
+| C++ operator class     | mulOp        | addOp        | FCOp        |          |
+| Python binding         | operator.mul | operator.add | operator.fc |          |
+| Python function        |              |              |             | layer.fc |
+
+
+This is how we differentiate layers and operators in PaddlePaddle:
+
+- those defined in C++ that have a lightweight Python wrapper in module `operators` are operators; whereas
+- those that don't have C++ implementations but are Python compositions of C++ operators are known as layers.
diff --git a/doc/_sources/design/gan_api.md.txt b/doc/_sources/design/gan_api.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55 --- /dev/null +++ b/doc/_sources/design/gan_api.md.txt @@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor, and so forth.
+
+In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+
+[Figure: GAN training flow diagram]
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrows should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss, marked in red and green, are the two targets we would like to run.
+
+The operators, layers, and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+[Figure: DC-GAN generator architecture]
+Figure 2. Photo borrowed from the original DC-GAN paper.
+
+## The Conditional-GAN might be a class.
+In this design we adopt the popular open-source designs in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+
+- DCGAN(object): contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize the hyper-parameters (like the conv dimension and so forth), and declare the model parameters of the discriminator and the generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+Build the whole GAN model and define the training losses for both the generator and the discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependencies in the engine executor. (Very critical; otherwise GAN can't be trained correctly.)
+- Different optimizers responsible for optimizing different losses.
+
+To be more detailed, we introduce our design of DCGAN as follows:
+
+### Class member function: Initializer
+- Set up the hyper-parameters, including the conditional dimension, noise dimension, batch size, and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+
+    # hyper parameters
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variables also support initialization using numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variables also support initialization using numpy data
+    self.D_W2 = pd.Variable(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variables also support initialization using numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variables also support initialization using numpy data
+    self.G_W2 = pd.Variable(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member function: Generator
+- Given a noisy input z, returns a fake image, as the code below shows.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+
+    if self.y_dim:  # the conditional GAN concatenates the label to the noise
+      z = pd.layer.concat(1, [z, y])
+
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)
+    G_im = pd.layer.tanh(G_h2)
+    return G_im
+```
+
+### Class member function: Discriminator
+- Given an image, returns a binary logit telling whether it is real or fake.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batch_norm(D_h0)
+    D_h0_relu = pd.layer.lrelu(D_h0_bn)
+
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batch_norm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build the generator and discriminators;
+- Define two training losses, for the discriminator and the generator, respectively.
+If we have an execution dependency engine to back-trace all tensors, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(pd.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.G)
+
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size)))
+```
+
+If we do not have a dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(pd.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
+
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusions and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice if it is shared by the graphs.
+- It requires the ability to create a block at any time, rather than only in if-else or RNN.
+
+## Main function for the demo:
+Generally, the user of GAN just needs to do the following things:
+- Define a DCGAN object;
+- Build the DCGAN model;
+- Specify two optimizers for the two different losses, with respect to their different parameters.
+```python
+# pd for short, should be more concise.
+import paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we used a dependency engine as tensorflow does,
+    # the code would be slightly different:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+        dcgan = DCGAN()
+        dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = load_mnist()
+
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+        d_optim = pd.train.Adam(lr = .001, beta= .1)
+        d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block().g_block():
+        g_optim = pd.train.Adam(lr = .001, beta= .1)
+        g_step = g_optim.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+
+    # training
+    for epoch in xrange(10000):
+        for batch_id in range(N / batch_size):
+            idx = ...
+            # sample a batch
+            batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+            # sample z
+            batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+            if batch_id % 2 == 0:
+                sess.run(d_step,
+                         feed_dict = {dcgan.images: batch_im,
+                                      dcgan.y: batch_label,
+                                      dcgan.z: batch_z})
+            else:
+                sess.run(g_step,
+                         feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine vs. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage, and then train the discriminator in the second stage?
diff --git a/doc/_sources/design/graph.md.txt b/doc/_sources/design/graph.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..7519a65df835a39fe14f6ef45530afff170191ff --- /dev/null +++ b/doc/_sources/design/graph.md.txt @@ -0,0 +1,70 @@
+# Design Doc: Computations as a Graph
+
+A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
+
+This document explains the construction of such a graph in three steps:
+
+- construct the forward part
+- construct the backward part
+- construct the optimization part
+
+## The Construction of a Graph
+
+Let us take the problem of image classification as a simple example. The application program that trains the model looks like:
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+### Forward Part
+
+The first four lines of the above program build the forward part of the graph.
+
+![](images/graph_construction_example_forward_only.png)
+
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x. `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that it runs at most once. By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
+
+In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`. These protobuf messages are saved in a `BlockDesc` protobuf message.
+
+### Backward Part
+
+The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
+
+`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
+
+![](images/graph_construction_example_forward_backward.png)
+
+According to the chain rule of gradient computation, `ConstructBackwardGraph` would
+
+1. create a gradient operator G for each operator F,
+1. make all inputs, outputs, and outputs' gradients of F inputs of G,
+1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and
+1. make all these gradients outputs of G.
+
+### Optimization Part
+
+For each parameter, like W and b created by `layer.fc`, marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient. This results in the complete graph:
+
+![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle. A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor for the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions. A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article.
We can imagine that in `ops`, we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/_sources/design/graph_survey.md.txt b/doc/_sources/design/graph_survey.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c6db08f463ae0a2b94fc4546f123a1d7c151870 --- /dev/null +++ b/doc/_sources/design/graph_survey.md.txt @@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write network topology conveniently. This doc mainly focuses on the symbolic APIs of the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or JSON.
+
+### Mxnet
+
+The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++ and exports it to Python using the C API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is a help class used to represent the operator node in a Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to the python front-end (while Graph is not) to enable quick tests and deployment. Conceptually, a symbol is the final operation of a graph and thus includes all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+
+
+
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is an op field in the NodeAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, `std::vector<NodeEntry> outputs`, and NodeEntry contains a pointer to Node. We can follow the Node pointers to recover the whole Graph.
+
+A Symbol can also be saved to a JSON file.
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+
+>>> fc1 = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of the symbolic API is `Tensor`. TensorFlow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`.
It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+
+A simple example is as follows:
+
+```python
+    # Build a dataflow graph.
+    c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+    d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+    e = tf.matmul(c, d)
+
+    # Construct a `Session` to execute the graph.
+    sess = tf.Session()
+
+    # Execute the graph and store the value that `e` represents in `result`.
+    result = sess.run(e)
+```
+
+
+The main methods of `Tensor` are as follows:
+
+
+```python
+@property
+def op(self):
+  """The `Operation` that produces this tensor as an output."""
+  return self._op
+
+@property
+def dtype(self):
+  """The `DType` of elements in this tensor."""
+  return self._dtype
+
+@property
+def graph(self):
+  """The `Graph` that contains this tensor."""
+  return self._op.graph
+
+@property
+def name(self):
+  """The string name of this tensor."""
+  if not self._op.name:
+    raise ValueError("Operation was not named: %s" % self._op)
+  return "%s:%d" % (self._op.name, self._value_index)
+
+@property
+def device(self):
+  """The name of the device on which this tensor will be produced, or None."""
+  return self._op.device
+```
+
+
+A Tensor can be taken as a target to be run by a session. A Tensor contains all the information of the Graph and tracks data dependencies.
+
+
+Here is a detailed example:
+
+
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+<tensorflow.python.framework.ops.Graph object at 0x...>
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+<tensorflow.python.framework.ops.Graph object at 0x...>
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+<tensorflow.python.framework.ops.Graph object at 0x...>
+```
+
+### Dynet
+
+
+The core concept of the symbolic API is `Expression`, and Dynet defines the `Expression` class in C++.
+
+
+A simple example is as follows:
+
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+
+The input data and parameters are also represented by Expressions. Every basic Expression corresponds to a Node, and input data is also a Node.
+
+Expression has a data member ComputationGraph, and the ComputationGraph is modified during the user's configuring process. An Expression can be a running target, because an Expression contains all of its dependencies.
+
+
+Here is a detailed example:
+
+write topology in C++
+
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+
+Expression pred = W * xs[i];
+cg.print_graphviz();
+
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+
+compile and print
+
+```
+# first print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+}
+# third print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+  N2 [label="v2 = -1.88387 - v1"];
+  N1 -> N2;
+  N3 [label="v3 = -v2"];
+  N2 -> N3;
+  N4 [label="v4 = square(v3)"];
+  N3 -> N4;
+}
+```
+
+### Conclusion
+
+
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; this level of concept has the following features:
+
+- Users write topology with a symbolic API, and all return values are Expressions, including input data and parameters.
+- An Expression corresponds to a global Graph, and Expressions can also be composed.
+- An Expression tracks all of its dependencies and can be taken as a run target.
diff --git a/doc/_sources/design/if_else_op.md.txt b/doc/_sources/design/if_else_op.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..26d140f06db4ecefa86be015eaa731ffddc6910c --- /dev/null +++ b/doc/_sources/design/if_else_op.md.txt @@ -0,0 +1,51 @@
+# The `IfElse` Operator
+
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
+
+- the TensorFlow version takes a scalar boolean value as the condition, so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition; instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+A challenge in implementing the `IfElse` operator is to infer which variables are to be split, or, say, to identify the variables of the mini-batch or those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+```
diff --git a/doc/_sources/design/infer_var_type.md.txt b/doc/_sources/design/infer_var_type.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a --- /dev/null +++ b/doc/_sources/design/infer_var_type.md.txt @@ -0,0 +1,78 @@
+# Design Doc: InferVarType
+
+## The Problem Posed
+
+The variables in our design can hold variant types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+
+For example, a `lookup table` operator takes two `LoDTensor`s: one is a float tensor as the embedding table, and the other is an int tensor as the word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is a `LoDTensor`; otherwise, the `sum` operator will generate `SelectedRows` as its output.
+
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameters) or be inferred by the operator at compile time.
+
+## Proposed Solution
+
+The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
+
+
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+
+It takes an operator description as its input, infers the output variable types, and stores them in the block description.
+
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace the `infer_var_type_` field.
The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+
+The default `InferVarType` will set the output type as `LoDTensor`. It can be obtained from `GetInferVarType()`.
+
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+
+## Register InferVarType
+
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+};
+```
+
+Operator developers can write a specialized `VarTypeInferer` as follows.
+
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+};
+```
+
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
diff --git a/doc/_sources/design/model_format.md.txt b/doc/_sources/design/model_format.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..e29129fddf775939c9f7a8b49d850d523e6e5a45 --- /dev/null +++ b/doc/_sources/design/model_format.md.txt @@ -0,0 +1,36 @@
+# Design Doc: Model Format
+
+## Motivation
+
+A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
+
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large-size parameters and efficient serialization/deserialization of parameters.
+
+## Implementation
+
+The topology is saved as plain text in a detailed, self-contained protobuf file.
+
+The parameters are saved as a binary file. As we all know, a protobuf message has a size limit of [64M](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for this task.
+
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description information proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header. It contains all the necessary information, such as the `dims` and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a continuous memory buffer.
For speed, we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is as follows.
+
+The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
+
+|field name | type | description |
+| --- | --- | --- |
+| version | uint32_t | Version of the saved file. Always 0 now. |
+| tensor desc length | uint32_t | TensorDesc (protobuf message) length in bytes. |
+| tensor desc | void* | TensorDesc protobuf binary message |
+| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+| lod_level | uint64_t | Level of LoD |
+| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
+| data of lod[0] | uint64_t* | [Optional] lod[0].data() |
+| ... | ... | ... |
+
+
+
+## Summary
+
+- We introduce a model format.
+- The model, represented by its forward-pass computation procedure, is saved in a **ProgramDesc** protobuf message.
+- A bunch of specified-format binary tensors describes the **parameters**.
diff --git a/doc_cn/_sources/design/multi_language_interface/why_plain_c.md.txt b/doc/_sources/design/multi_language_interface/00.why_plain_c.md.txt similarity index 94% rename from doc_cn/_sources/design/multi_language_interface/why_plain_c.md.txt rename to doc/_sources/design/multi_language_interface/00.why_plain_c.md.txt index a3f41ca7b93de8a55d927c88812802ef12246182..a1443093342c5a3ed698fb6b52a751dfc7cb5319 100644 --- a/doc_cn/_sources/design/multi_language_interface/why_plain_c.md.txt +++ b/doc/_sources/design/multi_language_interface/00.why_plain_c.md.txt @@ -58,32 +58,32 @@ typedef void* paddle_matrix; typedef int paddle_error;
 extern "C"
-paddle_error paddle_matrix_shape(paddle_matrix matrix,
-                                 uint64_t* width,
-                                 uint64_t* height);
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
 ```
 
 And this C interface is implemented in C++, in the file `paddle_matrix.cpp`:
 
 ```cpp
-#include "paddle/math/matrix.hpp"
+#include "paddle/math/matrix.h"
 extern "C"
 paddle_error paddle_matrix_shape(paddle_matrix matrix,
                                  uint64_t *width,
                                  uint64_t *height) {
-  auto m = (paddle::math::matrix*)(matrix);
+  auto m = (paddle::capi::CMatrix*)(matrix);
   *width = m->width();
   *height = m->height();
 }
 ```
 
-where the content of the file `paddle/math/matrix.hpp` is:
+where the content of the file `paddle/capi/CMatrix.hpp` is:
 
 ```cpp
 namespace paddle {
 namespace math {
 
-class Matrix {
-  //...
+class CMatrix { + std::shared_ptr mat; }; } // namespace math @@ -113,6 +113,6 @@ class Matrix { | 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | -## 简单实现 +## 实现 -TBD +参考[Inference implementation](01.inference_implementation.md) diff --git a/doc/_sources/design/multi_language_interface/01.inference_implementation.md.txt b/doc/_sources/design/multi_language_interface/01.inference_implementation.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29 --- /dev/null +++ b/doc/_sources/design/multi_language_interface/01.inference_implementation.md.txt @@ -0,0 +1,131 @@ +# C-API 模型推断实现文档 + +本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API,通过模型推断API的实现作为一个样例,来进行讨论。至于为什么需要C-API,请参考[Why Plain C](./00.why_plain_c.md)。 + +## Table of Contents + * [C-API 模型推断实现文档](#c-api-模型推断实现文档) + * [暴露接口原则](#暴露接口原则) + * [目录结构](#目录结构) + * [实现方式](#实现方式) + * [capi.h](#capih) + * [具体某种类型的头文件](#具体某种类型的头文件) + * [capi_private.h](#capi_privateh) + * [具体某种类型的实现文件](#具体某种类型的实现文件) + * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib) + * [libpaddle_capi_whole.a](#libpaddle_capi_wholea) + * [examples](#examples) + * [编译选项](#编译选项) + + +## 暴露接口原则 + +1. 所有的接口均为C接口。即使用`extern "C"` +2. 除构造某种类型的函数(`paddle_matrix_create`等),其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。 +3. 所有类型名为`paddle_类型名`,所有与类型相关的函数,函数名为`paddle_类型名_函数名` +4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言,那么 + * 为了暴露的接口尽量简单。只暴露概念的接口,而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。 + * 暴露这个概念必要函数。`必要`是指,即完成某一个任务的最少函数。 +5. 不在`capi`接口层做过多封装。 + * 如果某一个Paddle概念必须要暴露,但是又过于琐碎。不在`capi`这一层进行封装,而是直接修改Paddle Core。让Paddle核心中,这一概念不再琐碎。 + + +## 目录结构 + +```text +Paddle + `-- paddle + `-- capi + `-- examples # The example project for C-API. + `-- tests # unittests for C-API + `-- capi.h # C-API header file. + `-- capi_private.h # The shared header file between implementation sources. + `-- matrix.{h, cpp} + `-- gradient_machine.{h, cpp} + `-- ... +``` + + +Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件,均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即,安装后的目录结构为 + +```text +`-- include + `-- paddle + `-- capi.h + `-- matrix.h + `-- gradient_machine.h + `-- ... +`-- lib + `-- libpaddle_capi_shared.{so, dylib} # In mac, dynamic libary's file name extention is `dylib` + `-- libpaddle_capi_whole.a # static library for all symbols of Paddle. +``` + +## 实现方式 + +下面分别介绍某一类文件的实现方式。 + +### capi.h + +`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中,引入了类型的头文件,`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时,使用相对路径的引用方式。即`#include "matrix.h"` + +### 具体某种类型的头文件 + +具体某种类型的头文件,即例如`matrix.h`,`gradient_machine.h`等。在这些头文件中,包含了某种类型的类型定义和暴露的全部函数。 + +这个头文件不假设其他文件的引用顺序,即使用户直接引用某种类型的头文件,也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型,例如`gradient_machine`需要引用`matrix`,则直接引入另一种类型的头文件,即`#include "matrix.h"`。 + +### capi_private.h + +`capi_prviate.h`是各个实现中共享的头文件,他主要包含了实际暴露的类型结构。在用户使用C-API时,Paddle的类型全部退化成`void *`,即`typedef paddle_matrix void*`。但,对于每种C-API暴露的类型,均是在`capi_private.h`中实现的结构体。 + +```cpp +struct CMatrix { + int type = MatrixType; + std::shared_ptr mat; +}; +``` + +通常,这个结构体包含两个项目。 + +* `type`是一个类型的标志。对于每种类型,type字段均不尽相同。这样,即使C-API接受的类型全是`void *`,我们也可以确定每一个参数的类型。 + + ```cpp + void some_c_api_function(void* some_instance) { + int* type = (int *) some_instance; + switch (*type) { + case MatrixType: + CMatrix* mat = (CMatrix *) some_instance; + ... + ... 
+ } + } + ``` +* 这个结构体中的另一个项目是,Paddle Core中这一类型接口的智能指针(shared_ptr)。 + * 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例,而不必在意Paddle Core是否还在使用这个实例。 + * 例如,用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后,直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数,这个参数也不会一并删除。 + +### 具体某种类型的实现文件 + +具体某种类型的实现文件,即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中,使用C++ 11实现了C-API的接口,并且使用`extern "C"`导出这些接口。在实现过程中,对输入参数的安全性进行了必要的判断,并将C-API接口的参数转发给`Paddle Core`。 + +### libpaddle\_capi_shared.{so, dylib} + +`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。 + +### libpaddle\_capi_whole.a + +`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。 + + +### examples + +在样例中,使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。 + +## 编译选项 + +C-API的编译选项默认关闭,打开这个编译选项,需要在cmake的时候,设置 + +```bash +cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF +``` + +编译C-API的时候推荐Paddle不嵌入Python解释器,也不生成`SWIG`接口,具体原因参考[Why Plain C](./00.why_plain_c.md)。 diff --git a/doc/_sources/design/ops/rnn.md.txt b/doc/_sources/design/ops/rnn.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f4854793fa1f0b02e4dc17b51a48a972be61c06 --- /dev/null +++ b/doc/_sources/design/ops/rnn.md.txt @@ -0,0 +1,153 @@ +# RNNOp design + +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. + +## RNN Algorithm Implementation + +
+(figure omitted)
+ +The above diagram shows an RNN unrolled into a full network. + +There are several important concepts here: + +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. + +### Step-scope + +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. + +
+(figure omitted)
+Figure 2 illustrates the RNN's data flow.
+ +Please be aware that every step runs the same step-net. Each step does the following: + +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. + +The RNN operator will compose its output from step outputs in each of the step scopes. + +### Memory and Ex-memory + +Let's give more details about memory and ex-memory using a simple example: + +$$ +h_t = U h_{t-1} + W x_t +$$, + +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. + +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. + +### Usage in Python + +For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + +We can define an RNN's step-net using a Block: + +```python +import paddle as pd + +X = some_op() # x is some operator's output and is a LoDTensor +a = some_op() + +# declare parameters +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +rnn = pd.create_rnn_op(output_num=1) +with rnn.stepnet(): + x = rnn.add_input(X) + # declare a memory (rnn's step) + h = rnn.add_memory(init=a) + # h.pre_state(), the previous memory of rnn + new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) + # update current memory + h.update(new_state) + # indicate that h variables in all step scopes should be merged + rnn.add_outputs(h) + +out = rnn() +``` + +Python API functions in above example: + +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. + +### Nested RNN and LoDTensor + +An RNN whose step-net includes other RNN operators is known as an *nested RNN*. + +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. + +The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. + +
+(figure omitted)
+ +```python +import paddle as pd + +W = pd.Variable(shape=[20, 30]) +U = pd.Variable(shape=[20, 30]) + +W0 = pd.Variable(shape=[20, 30]) +U0 = pd.Variable(shape=[20, 30]) + +# a is output of some op +a = some_op() + +# chapter_data is a set of 128-dim word vectors +# the first level of LoD is sentence +# the second level of LoD is a chapter +chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) + +def lower_level_rnn(paragraph): + ''' + x: the input + ''' + rnn = pd.create_rnn_op(output_num=1) + with rnn.stepnet(): + sentence = rnn.add_input(paragraph, level=0) + h = rnn.add_memory(shape=[20, 30]) + h.update( + pd.matmul(W, sentence) + pd.matmul(U, h.pre_state())) + # get the last state as sentence's info + rnn.add_outputs(h) + return rnn + +top_level_rnn = pd.create_rnn_op(output_num=1) +with top_level_rnn.stepnet(): + paragraph_data = rnn.add_input(chapter_data, level=1) + low_rnn = lower_level_rnn(paragraph_data) + paragraph_out = low_rnn() + + h = rnn.add_memory(init=a) + h.update( + pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) + top_level_rnn.add_outputs(h) + +# output the last step +chapter_out = top_level_rnn(output_all_steps=False) +``` + +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. + +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step. + + +
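+To summarize the semantics above, here is a minimal NumPy sketch (illustrative only, not the Paddle API; all names and sizes are made up) of what the unrolled computation does for the simple memory equation $h_t = U h_{t-1} + W x_t$:

```python
import numpy as np

T, batch, input_dim, hidden_dim = 4, 2, 8, 16
x = np.random.randn(T, batch, input_dim)    # step inputs x_1 .. x_T
W = np.random.randn(hidden_dim, input_dim)
U = np.random.randn(hidden_dim, hidden_dim)
h = np.zeros((batch, hidden_dim))           # the initial memory value

step_scopes = []                            # one scope of local variables per step
for t in range(T):
    ex_memory = h                           # h_{t-1}: the previous step's memory
    h = x[t].dot(W.T) + ex_memory.dot(U.T)  # h_t = U h_{t-1} + W x_t (batch-major)
    step_scopes.append({"h": h})            # step outputs live in the step-scope

# The RNN op composes its output from the step outputs in each step-scope.
output = np.stack([scope["h"] for scope in step_scopes])  # (T, batch, hidden_dim)
```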
diff --git a/doc/_sources/design/ops/sequence_decoder.md.txt b/doc/_sources/design/ops/sequence_decoder.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..9db5fb8e9a9f89b004bf71ddc064cd976c0d0bee --- /dev/null +++ b/doc/_sources/design/ops/sequence_decoder.md.txt @@ -0,0 +1,229 @@ +# Design: Sequence Decoder Generating LoDTensors +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. + +This documentation describes how to implement the sequence decoder as an operator. + +## Beam Search based Decoder +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. + +In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. + +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. + +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . + +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; +the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. + +## Changing LoD's absolute offset to relative offsets +The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. + +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. + +The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows +```python +[[0, 3, 9] + [0, 2, 3, 3, 3, 9]] +``` +The first level tells that there are two sequences: +- the first's offset is `[0, 3)` +- the second's offset is `[3, 9)` + +while on the second level, there are several empty sequences that both begin and end at `3`. +It is impossible to tell how many empty second-level sequences exist in the first-level sequences. + +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. + +So let's introduce another format of LoD, +it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. + +For example, to represent the same sequences of the above data + +```python +[[0, 3, 6] + [0, 2, 3, 3, 3, 9]] +``` + +the first level represents that there are two sequences, +their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. 
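+To see why relative offsets make empty sequences recoverable, here is a toy Python sketch (not Paddle code; the offsets are made up for illustration and assume the top level indexes the lower level's sequence list):

```python
relative_lod = [
    [0, 1, 4],        # top level: owns lower-level sequences [0, 1) and [1, 4)
    [0, 2, 2, 2, 5],  # lower level: begin/end offsets in the underlying tensor
]

top, low = relative_lod
for i in range(len(top) - 1):
    spans = [(low[j], low[j + 1]) for j in range(top[i], top[i + 1])]
    empties = sum(1 for begin, end in spans if begin == end)
    print("sequence %d -> sub-sequences %s (%d empty)" % (i, spans, empties))
# sequence 0 -> sub-sequences [(0, 2)] (0 empty)
# sequence 1 -> sub-sequences [(2, 2), (2, 2), (2, 5)] (2 empty)
```

Counting the empty `(begin, end)` pairs owned by each top-level sequence is now unambiguous, which is exactly what the absolute-offset format cannot provide.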
+ +The second level is the same with the relative offset example because the lower level is a tensor. +It is easy to find out the second sequence in the first-level LoD has two empty sequences. + +The following examples are based on relative-offset LoD. + +## Usage in a simple machine translation model +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. + +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. + +**Encoder** +```python +import paddle as pd + +dict_size = 8000 +source_dict_size = dict_size +target_dict_size = dict_size +word_vector_dim = 128 +encoder_dim = 128 +decoder_dim = 128 +beam_size = 5 +max_length = 120 + +# encoder +src_word_id = pd.data( + name='source_language_word', + type=pd.data.integer_value_sequence(source_dict_dim)) +src_embedding = pd.embedding(size=source_dict_size, size=word_vector_dim) + +src_word_vec = pd.lookup(src_embedding, src_word_id) + +encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim) + +encoder_ctx = pd.last_seq(encoder_out_seq) +# encoder_ctx_proj is the learned semantic vector +encoder_ctx_proj = pd.fc( + encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None) +``` + +**Decoder** + +```python +def generate(): + decoder = pd.while_loop() + with decoder.step(): + decoder_mem = decoder.memory(init=encoder_ctx) # mark the memory + generated_ids = decoder.memory() # TODO init to batch_size s + generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s + + target_word = pd.lookup(trg_embedding, gendrated_ids) + # expand encoder_ctx's batch to fit target_word's lod + # for example + # decoder_mem.lod is + # [[0 1 3], + # [0 1 3 6]] + # its tensor content is [a1 a2 a3 a4 a5] + # which means there are 2 sentences to translate + # - the first sentence has 1 translation prefixes, the offsets are [0, 1) + # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) + # the target_word.lod is + # [[0, 1, 6] + # [0, 2, 4, 7, 9 12]] + # which means 2 sentences to translate, each has 1 and 5 prefixes + # the first prefix has 2 candidates + # the following has 2, 3, 2, 3 candidates + # the encoder_ctx_expanded's content will be + # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5] + encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word) + decoder_input = pd.fc( + act=pd.activation.Linear(), + input=[target_word, encoder_ctx], + size=3 * decoder_dim) + gru_out, cur_mem = pd.gru_step( + decoder_input, mem=decoder_mem, size=decoder_dim) + scores = pd.fc( + gru_out, + size=trg_dic_size, + bias=None, + act=pd.activation.Softmax()) + # K is an config + topk_scores, topk_ids = pd.top_k(scores, K) + topk_generated_scores = pd.add_scalar(topk_scores, generated_scores) + + selected_ids, selected_generation_scores = decoder.beam_search( + topk_ids, topk_generated_scores) + + # update the states + decoder_mem.update(cur_mem) # tells how to update state + generated_ids.update(selected_ids) + generated_scores.update(selected_generation_scores) + + decoder.output(selected_ids) + decoder.output(selected_generation_scores) + +translation_ids, translation_scores = decoder() +``` +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the result of 
the beam search algorithm. + +In this way, users can customize anything on the input or output of beam search, for example: + +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. + +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). + +Both of them are two-level `LoDTensors`: + +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. + +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. + +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. + +For example, the previous state: + +* LoD is `[0, 1, 3][0, 2, 5, 6]` +* content of tensor is `a1 a2 b1 b2 b3 c1` + +the current state is stored in `encoder_ctx_expanded`: + +* LoD is `[0, 2, 7][0 3 5 8 9 11 11]` +* the content is + - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) + - a2 a2 + - b1 b1 b1 + - b2 + - b3 b3 + - None (c1 has 0 candidates, so c1 is dropped) + +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. + +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: + +```python +decoder.output(selected_ids) +decoder.output(selected_generation_scores) +``` + +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. + +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. + +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. + +## LoD and shape changes during decoding +
+(figure omitted)
+ +According to the image above, the only phase that changes the LoD is beam search. + +## Beam search design +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: + +1. `topk_ids`, the top K candidate ids for each prefix. +2. `topk_scores`, the corresponding scores for `topk_ids` +3. `generated_scores`, the score of the prefixes. + +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. + +It will return three variables: + +1. `selected_ids`, the final candidate beam search function selected for the next step. +2. `selected_scores`, the scores for the candidates. +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). + +## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, +so it is natural to store them in arrays. + +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`. + +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. diff --git a/doc/_sources/design/optimizer.md.txt b/doc/_sources/design/optimizer.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..202b4b65103c0b7c536a9cb466c4120ce134d8c3 --- /dev/null +++ b/doc/_sources/design/optimizer.md.txt @@ -0,0 +1,91 @@ +## Optimizer Design + +### The Problem + +A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: + +1. the forward pass, which computes intermediate results and the cost(s), +1. the backward pass, which derives gradients from intermediate results and costs, and +1. the optimization pass, which update model parameters to optimize the cost(s). + +These works rely on three kinds of operators: + +1. forward operators, +1. gradient operators, and +1. optimization operators. + +It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically. + +In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. + + +### High-level Python API to describe the training process + +1. User write code to describe the network: + + ```python + images = layer.data("images") + labels = layer.data("labels") + w1 = pd.var("w1") + b1 = pd.var("b1") + hidden = layer.fc(images, w=w1, b=b1) + cost = layer.mse(hidden, labels) + ``` + + The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). + + +2. Users create a certain kind of Optimizer with some argument. + + ```python + optimizer = AdagradOptimizer(learing_rate=0.001) + ``` + +3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list. 
+ + ```python + opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1]) + ``` + The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is list of optimization operators that will be run by session. + +4. Users use Session/Executor to run this opt_op_list as target to do training. + + ```python + sess.run(target= opt_op_list, ...) + ``` + +#### Optimizer Python interface: + +```python +class Optimizer(object): + """Optimizer Base class. + + """ + + def __init__(self): + pass + + def create_optimization_pass(self, parameters_and_grads): + """Add optimization operators to update gradients to variables. + + Args: + parameters_and_grads: a list of (variable, gradient) pair to update. + + Returns: + optmization_op_list: a list of optimization operator that will update parameter using gradient. + """ + return None + + def minimize(self, loss, parameter_list): + """Add operations to minimize `loss` by updating `parameter_list`. + + This method combines interface `append_backward_ops()` and + `create_optimization_pass()` into one. + """ + params_grads = self.create_backward_pass(loss, parameter_list) + update_ops = self.create_optimization_pass(params_grads) + return update_ops + +``` + +Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer. diff --git a/doc/_sources/design/parameter_average.md.txt b/doc/_sources/design/parameter_average.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c4edee9fe31d502ea62b9fe5c8757c0a4c5e79f --- /dev/null +++ b/doc/_sources/design/parameter_average.md.txt @@ -0,0 +1,72 @@ +# Averaging Parameter in PaddlePaddle + +## Why Averaging +In a large scale machine learning setup where the size of the training data is huge, it could take us a large number of iterations over the training data before we can achieve the optimal values of parameters of our model. Looking at the problem setup, it is desirable if we can obtain the optimal values of parameters by going through the data in as few passes as we can. + +Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset. + +Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD, is used as the estimator for
$\theta^*$. The averaging is done as follows:

$$\bar{\theta}_t = \frac{1}{t} \sum_{i=1}^{t} \theta_i$$
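+For intuition, here is a small NumPy sketch of this idea over a window of the most recent `N` updates (illustrative only; this naive version stores the whole window, whereas the design described below deliberately uses an approximation to avoid that memory cost):

```python
import numpy as np

class WindowedAverager(object):
    """Naive running average of a parameter over the last N updates."""

    def __init__(self, window_size):
        self.window_size = window_size
        self.snapshots = []                 # recent parameter values

    def update(self, param):
        self.snapshots.append(param.copy())
        if len(self.snapshots) > self.window_size:
            self.snapshots.pop(0)           # drop the oldest snapshot

    def average(self):
        return np.mean(self.snapshots, axis=0)

w = np.zeros(3)
averager = WindowedAverager(window_size=4)
for step in range(10):
    w = w - 0.1 * np.ones(3)                # stand-in for one SGD update
    averager.update(w)

print(averager.average())                   # the averaged copy, for testing/saving
```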
+ +We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above. + +### How to perform Parameter Averaging in PaddlePaddle + +Parameter Averaging in PaddlePaddle works in the following way during training : +1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer +2. The optimizer itself is responsible for updating the parameters. +3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself: + 1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches. + 2. However, saving all the N instances of the parameters in memory is not feasible. + 3. Therefore, an approximation algorithm is used. + +Hence, overall we have have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved. + +During the testing/ saving the model phase, we perform the following steps: +1. Perform the delayed operations. +2. Save current values of the parameters to a temporary variable. +3. Replace the values of the parameters with the averaged values. +4. Perform testing and/or save the parameters. +5. Restore the values of the parameters once done. + +### How to implement Averaging of Parameter in PaddlePaddle + +We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training. + + **Advantages**: + - Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op. + - Makes it easy for the users to customize and extend the framework. + + **Disadvantages**: + - Implementation requires re-writing the averaging methodology in Python. + +### Low-Level implementation + +In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input: +- the optimizer +- the window_size to keep the updates + +The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case when the Trainer/Optimizer runs on the GPU while ParameterAverageOptimizer runs on a CPU. + +The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. + +### Python API implementation for ParameterAverageOptimizer + +Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following: +- Any optimizer (RMSProp , AdaGrad etc.) +- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes an average. 
Move the averaged value to a buffer when window is full to avoid loss of precision. + +Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions. +We will have a wrapper written in Python that will support the functionality and implement the actual core computation in C++ core as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc) + +#### Creation of the ParameterAverageOptimizer operator +There are two ways for creating the ParameterAverageOptimizer op: +1. We create the op immediately while building the computation graph. +2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added. + +The proposal is to add the op immediately while building the computation graph. + +#### High-level API + +In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions. diff --git a/doc/_sources/design/parameters_in_cpp.md.txt b/doc/_sources/design/parameters_in_cpp.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7ac3f17c44ca94a669a8f1e283b291bceb42317 --- /dev/null +++ b/doc/_sources/design/parameters_in_cpp.md.txt @@ -0,0 +1,41 @@ +# Design Doc: The C++ Class `Parameters` + +`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md). + +We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation: +* We just use `memcpy` to share Parameters between topologies, but this is very inefficient. +* We did not support sharing Parameters while training. We just trigger `memcpy` when start training. + +It is necessary that we implement Parameters in CPP side. However, it could result a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current PaddlePaddle implementation, there are three concepts associated with `Parameters`: + +1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`. +It is evident that we should use `paddle::Parameter` when developing `Parameters`. +However, the `Parameter` class contains many functions and does not have a clear interface. +It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`. +When we developing `Parameters`, we only use `create/store Parameter` functionality. +We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation. + +2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`. 
+We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies. +Also, we should handle multi-GPU/CPU training, because `forward` and `backward` would perform on multi-GPUs and multi-CPUs. +`Parameters` should dispatch the parameter value to each device, and gather the parameter gradient from each device. + +3. `paddle::ParameterUpdater`. The ParameterUpdater is used to update parameters in Paddle. +So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD). + + +The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one. + +1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters. + +2. Implementation a `Parameters` class. It just stores the `paddle::Parameter` inside. Make `GradientMachine` uses `Parameters` as a class member. + +3. Make `Parameters` support Multi-CPU and Multi-GPU training to prepare for sharing `Parameter` between topologies. +Because we need share `Parameters` between topologies, it is `Parameters`'s response to exchange Parameters between GPUs. +`GradientMachine` should not handle how to exchange Parameters because `GradientMachine` only used to train one topology and we need to support train many topologies in Paddle, i.e., there could be many GradientMachines use one `Parameters`. + * We should use a global function to exchange Parameters between GPUs, not a member function in `Parameters`. The `MultiGradientMachine` invoke this function, which uses `Parameters` as this function inputs. + * The MultiGradientMachine contains many functionalities. Extracting the Parameters exchanging logic could make MultiGradientMachine clearer and simpler. + +4. Make `Parameters` as an argument for `forward/backward` function, not a data member for `GradientMachine`. For example, `forward` could be `forward(const Parameters& params, ...)` and `backward` could be `backward(Parameters* params, ...)`. After this step, Paddle could share `Parameters` between topologies. + +5. `ParameterUpdater` is invoked by `GradientMachine` and `Trainer`, but it updates `Parameters`. In the end of this code refactoring, we could change `ParameterUpdater` directly uses `Parameters` to make `ParameterUpdater`'s implementation clear. diff --git a/doc/_sources/design/program.md.txt b/doc/_sources/design/program.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd2456787c4e336d357a65255a8274a7c9e465cc --- /dev/null +++ b/doc/_sources/design/program.md.txt @@ -0,0 +1,139 @@ +# Design Doc: PaddlePaddle Programs + +## Compile and Execution + +A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`. + +A simple example PaddlePaddle program can be found in [graph.md](./graph.md): + +```python +x = layer.data("images") +l = layer.data("label") +y = layer.fc(x) +cost = layer.mse(y, l) +optimize(cost) +train(cost, reader=mnist.train()) +``` + +The first five lines of the following PaddlePaddle program generates, or, compiles, the `ProgramDesc` message. The last line runs it. + +## Programs and Blocks + +The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program. 
+ +- program: some nested blocks +- [block](./block.md): + - some local variable definitions, and + - a sequence of operators + +The concept of block comes from usual programs. For example, the following C++ program has three blocks: + +```c++ +int main() { // block 0 + int i = 0; + if (i < 10) { // block 1 + for (int j = 0; j < 10; j++) { // block 2 + } + } + return 0; +} +``` + +The following PaddlePaddle program has three blocks: + +```python +import paddle as pd // block 0 + +x = minibatch([10, 20, 30]) # shape=[None, 1] +y = var(1) # shape=[1], value=1 +z = minibatch([10, 20, 30]) # shape=[None, 1] +cond = larger_than(x, 15) # [false, true, true] + +ie = pd.ifelse() +with ie.true_block(): // block 1 + d = pd.layer.add_scalar(x, y) + ie.output(d, pd.layer.softmax(d)) +with ie.false_block(): // block 2 + d = pd.layer.fc(z) + ie.output(d, d+1) +o1, o2 = ie(cond) +``` + +## `BlockDesc` and `ProgramDesc` + +All protobuf messages are defined in `framework.proto`. + +`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`. + +```protobuf +message BlockDesc { + required int32 parent = 1; + repeated VarDesc vars = 2; + repeated OpDesc ops = 3; +} +``` + +The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks. + +All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array. + +```protobuf +message ProgramDesc { + repeated BlockDesc blocks = 1; +} +``` + + +### Global Block + +The global block is the first one in the above array. + +## Operators that Use Blocks + +In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch. + +The definition of `OpDesc` shows that an operator could have some attributes: + +```protobuf +message OpDesc { + AttrDesc attrs = 1; + ... +} +``` + +and an attribute could be of type block, which is, in fact, a block ID as described above: + +``` +message AttrDesc { + required string name = 1; + + enum AttrType { + INT = 1, + STRING = 2, + ... + BLOCK = ... + } + required AttrType type = 2; + + optional int32 block = 10; // when type == BLOCK + ... +} +``` + +## InferShape + +With this design, the InferShape function should take the following parameters: + +```c++ +void InferShape(int current_block, + int current_operator, + ProgramDesc* program // might change VarDesc values. + ) { + ... +} +``` + +where + +- `current_block` indices into `ProgramDesc::blocks`, +- `current_operator` indices into `BlockDesc::ops`. diff --git a/doc/_sources/design/prune.md.txt b/doc/_sources/design/prune.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a --- /dev/null +++ b/doc/_sources/design/prune.md.txt @@ -0,0 +1,63 @@ +# Prune + +## Motivation + +We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement +`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc` +and generate a pruned `ProgramDesc`. + +## Challenge + +Pruning need to support both variables and operators being evaluation targets. Consider the following +different situations. + +```python +# Case 1: run foward pass. +cost_np = session.run(target=cost) +# Case 2: run backward passing. 
+opts_np, _ = session.run(target=[cost, opt]) +# Case 3: run checkpointing +_ = session.run(target=checkpoint) +``` + +## Solution + +To support evaluation of operators, we add `is_target` field in the `OpDesc`. + +```c++ +message OpDesc { + required string type = 3; + repeated Var inputs = 1; + repeated Var outputs = 2; + repeated Attr attrs = 4; + optional bool is_target = 5 [ default = false ]; +}; +``` + +To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599). +For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being +`fetch_op`'s input. Then we also set `fetch_op` is a target. + +### Algorithm + +If an operator needs to be run, it must fall into one of the following cases: + +1. It is the target. +2. It is depended by some other ops, meaning its output is some other op's input. + +The first case can be checked by `op_desc.is_traget()` . The second case can be implement as + +```c++ +bool HasDependentVar(const OpDesc& op_desc, const std::set& dependent_vars) { + for (auto& var : op_desc.outputs()) { + for (auto& argu : var.arguments()) { + if (dependent_vars.count(argu) != 0) { + return true; + } + } + } + return false; +} +``` + +Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc). diff --git a/doc/_sources/design/python_api.md.txt b/doc/_sources/design/python_api.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c --- /dev/null +++ b/doc/_sources/design/python_api.md.txt @@ -0,0 +1,284 @@ +# Design Doc: Python API + +Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program. + +| Python classes | Protobuf messages | +| --- | --- | +| Program | ProgramDesc | +| Block | BlockDesc | +| Operator | OpDesc | +| Variable | VarDesc | + +Please be aware that these Python classes need to maintain some construction-time information, which are not part of the protobuf messages. + +## Core Concepts + +### Program + +A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s. The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` onlys stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array. For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks. + +Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`. + +```python +class Program(objects): + def __init__(self): + self.desc = core.NewProgram() # a C++ ProgramDesc pointer. 
+ self.blocks = vector() + self.blocks.append(Block(self, -1)) # the global block + self.current_block = 0 # initialized to the global block + + def global_block(): + return self.blocks[0] + + def current_block(): + return self.get_block(self.current_block) + + def rollback(): + self.current_block = self.current_block().parent_idx + + def create_block(): + new_block_idx = len(self.block) + self.blocks.append(Block(self, self.current_block)) + self.current_block = new_block_idx + return current_block() +``` + +`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`. + +`Program` creates the first block as the global block in its constructor. All parameters and their initializer operators are in the global block. + +### Block + +A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes + +1. a map from variable names to an instance of the Python `Variable` class, and +1. a list of `Operator` instances. + +```python +class Block(objects): + def __init__(self, program, parent_idx): + self.desc = core.NewBlock(program.desc) + self.program = program + self.vars = map() + self.ops = vector() + self.parent_idx = parent_idx + + def create_var(self, ...): + return Variable(self, ...) + + def _create_global_var(self, ...): + program.global_block().create_var(...) + + def create_parameter(self, name, ...): + # Parameter is a subclass of variable. See Parameter section for details. + self.vars[name] = Parameter(self._create_global_var(...), ...) + return self.vars[name] + + def append_operator(self, ...): + self.ops.append(Operator(self, ...)) + + def prepend_operator(self, ...): # Parameter's ctor prepands initialize operators. + self.ops.prepend(Operator(self, ...)) +``` + +`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator. + +`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block. + +### Operator + +The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes. + +```python +class Operator(object): + def __init__(self, + block, # Block + type, # string + inputs, # dict + outputs,# dict + attrs # dict + ): + self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs) + core.infer_shape(self.desc, inputs, outputs) + + def type(self): + return self.desc.type() +``` + +`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++. + +### Variable + +Operators take Variables as its inputs and outputs. + +```python +class Variable(object): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + ): + if name is None: + name = unique_name_generator() + self.name = name + self.block = block + self.desc = core.NewVarDesc(block.desc, name, shape, lod_level) + self.writer = None +``` + +Please be aware of `self.writer`, that tracks operator who creates the variable. 
It possible that there are more than one operators who write a variable, but in Python space, each write to a variable is represented by a Variable class. This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**. + +### Parameter + +A parameter is a global variable with an initializer (or load) operator. + +```python +class Parameter(Variable): + def __init__(self, + block=None, # Block + name=None, # string + shape, # tuple + dtype="float32", # string + lod_level=None # int + trainable, # bool + initialize_op_attrs, + optimize_op_attrs): + super(Parameter, self).__init__(block, name, shape, dtype, lod_level) + self.trainable = trainable + self.optimize_op_attrs = optimize_op_attrs + block.prepend(Operator(block, # Block + initialize_op_attrs['type'], # string + None, # no inputs + self, # output is the parameter + initialize_op_attrs) +``` + +When users create a parameter, they can call + +```python +program.create_parameter( + ..., + init_attr={ + type: "uniform_random", + min: -1.0, + max: 1.0, + }) +) +``` + +In above example, `init_attr.type` names an initialize operator. It can also name the load operator + +```python +init_attr={ + type: "load", + filename: "something.numpy", +} +``` + +`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message. + +## Layer Function + +A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers. + +Layer functions take `Variable` and configuration parameters as its input and return the output variable(s). + +For example, `FullyConnected` take one or more variable as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable. + + +### Necessity for reusing code between layer functions + +There are a lot of code that can be reused. Such as + +* Give the default value of configuration. e.g., default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`. and default initialize strategy for bias is to fill zero. +* Append the activation operator. +* Create a temporary variable. +* Create parameter. +* Generate a unique name. +* Add a bias. +* ... + +A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions. 
+ + + +### Comparision between global functions and helper class + +The `FullyConnected` layer will be as follow when we provide global functions: + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + if name is None: + name = unique_name("fc") + input = multiple_input(input) + param_attr = default_param_attr(param_attr) + param_attr = multiple_param_attr(param_attr, len(input)) + + # mul + mul_results = [] + for ipt, attr in zip(input, param_attr): + shape = ipt.shape[1:] + [size] + w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr) + tmp = create_tmp_var(name) + g_program.current_block().append_op("mul", {ipt, w}, {tmp}) + mul_results.append(tmp) + + # add sum + ... + # add bias + ... + # add activation + ... + return out +``` + +We can provide many helpers functions for layer developers. However, there are several disadvantages for global helper functions: + +1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use. +2. Global functions will force layer developers to pass its parameter time by time. + +So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follow. + +```python +def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None): + helper = LayerHelper(locals()) # pass all parameter to LayerHelper + + mul_results = [] + for ipt, param in helper.iter_multiple_input_and_param(): + w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype) + tmp = helper.create_tmp_variable() + helper.append_op('mul', {ipt, w}, {tmp}) + mul_results.append(tmp) + + pre_bias = helper.add_sum(mul_results) + pre_activation = helper.add_bias(pre_bias) + return helper.add_activation(pre_activation) +``` + +We not only use the fewer lines of code to write `fc_layer` but also make the code clearer to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor. + + +### Implementation of layer helper + +We just keep all parameters of a layer function as a dictionary in layer helper as a private data member. Every method of layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even some layer does not contain some operator. For example, The `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` are: + +```python +class LayerHelper(object): + def __init__(self, **kwargs): # kwargs is short for `keyword arguments` + self.kwargs = kwargs + + def add_activation(self, input_var): + act = self.kwargs.get("act", None) # default value is None + if act is None: # do nothing if no act + return input_var + + tmp = self.create_tmp_var(self) + self.append_op(type=act, input=input_var, output=tmp) + return tmp +``` + +## Optimizer + +[Optimizer Design Doc](./optimizer.md) diff --git a/doc/_sources/design/reader/README.md.txt b/doc/_sources/design/reader/README.md.txt index f21f7af520df5171798326818ecb97c3bcd14a12..2cd4b6225b61cf374458e40afabad7745f61ba71 100644 --- a/doc/_sources/design/reader/README.md.txt +++ b/doc/_sources/design/reader/README.md.txt @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. 
To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. +- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) -An example implementation for single item data reader creator: +An example implementation for single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ def reader_creator_random_image(width, height): return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). 
The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), @@ -52,26 +53,28 @@ Here are valid outputs: # a mini batch of three data items, each data item is a list (single column). [([1,1,1],), ([2,2,2],), -([3,3,3],), +([3,3,3],)] ``` Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. [[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straight forward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: ### Prefetch Data -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. +Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). 
-We can do:
+we can do the following:

```python
def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)

reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```

### Shuffle

-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.

Example:
```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)

## Q & A

-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?

-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).

-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.

-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader, isn't it sufficient to give the reader and batch_size as arguments during training?

-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases, using a batch reader is very efficient and helpful.

-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?

-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).

-### How to create custom data reader creator
+### How to create a custom data reader creator?

```python
def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
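+Putting the decorators together, a typical composed input pipeline might look like the following sketch (it uses only the functions described above; the composition order shown is one reasonable choice, not a requirement):
+
+```python
+reader = paddle.dataset.mnist.train()
+reader = paddle.reader.shuffle(reader, 512)    # shuffle with a 512-entry buffer
+reader = paddle.reader.buffered(reader, 100)   # prefetch up to 100 entries
+batch_reader = paddle.batch(reader, 128)       # batch into mini batches of 128
+```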
### How is `paddle.train` implemented

-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:

```python
def train(batch_reader, mapping, batch_size, total_pass):
diff --git a/doc/_sources/design/refactor/distributed_architecture.md.txt b/doc/_sources/design/refactor/distributed_architecture.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682
--- /dev/null
+++ b/doc/_sources/design/refactor/distributed_architecture.md.txt
@@ -0,0 +1,144 @@
+# Design Doc: Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
+
+1. There is a need to write special code that handles tasks which should only be run on a single trainer, e.g., initializing the model, saving the model, etc.
+
+2. Model parallelism is hard: it would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
+
+3. The user can not directly specify the parameter update rule: this would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: a lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
+
+## Analysis
+
+The assumption is that the user writes the trainer program in either Python or C++.
+
+### Limitation 1
+
+There are two basic functionalities in the trainer program:
+
+1. The training logic, such as loading / saving the model and printing out the logs.
+2. The neural network definition, such as the definition of the data layer, the fully connected layer, the cost function and the optimizer.
+
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the training logic and the neural network computation logic are replicated.
+
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** replicate the training logic, the limitation mentioned above can be avoided.
+
+### Limitation 2
+
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the inter-model-shard communication between nodes.
+
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
+
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well.
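+The analogy can be sketched as follows (an illustrative summary of the two pipelines described in this section):
+
+```text
+source code       -> compiler IR     -> optimized IR   -> machine code
+Python NN config  -> PaddlePaddle IR -> transformed IR -> runtime execution
+```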
The compiler optimizes the IR as follows:
+
+*[figure: how a compiler optimizes its IR]*
+
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
+
+*[figure: converting the IR to support model parallelism]*
+
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
+
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
+
+## Distributed Training Architecture
+
+The revamped distributed training architecture can address the above discussed limitations. Below is an illustration of how it does so:
+
+*[figure: the revamped distributed training architecture]*
+
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
+
+### PaddlePaddle Python
+
+PaddlePaddle Python is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
+
+```Python
+paddle.init()
+input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
+img, label = input[0], input[1]
+hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
+prediction = paddle.layer.fc(input=hidden, size=10, act=paddle.activation.Softmax())
+cost = paddle.layer.classification_cost(input=prediction, label=label)
+optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
+session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
+for i in range(1000):
+    _, cost_val = session.eval(targets=[cost, optimizer])
+    print cost_val
+```
+
+The above is what a typical Python trainer program looks like: the neural network topology is built using helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
+
+#### session.eval
+
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation. The targets can be any variable in the computation graph. When the target is, say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
+
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
+
+### PaddlePaddle Converter
+
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed:
+
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
+
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OPs that the `fetch` OP does not depend on.
+
+3. Optimize the computation graph.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtimes according to a placement algorithm and the device constraints specified by the user.
+
+5. Partition the graph according to runtime boundaries and add a `send` / `recv` OP pair on the runtime boundaries.
+
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP report the evaluation results back to the converter, and the converter reports the evaluation results back to PaddlePaddle Python.
+
+The output IRs will be cached to optimize the conversion latency.
+
+
+#### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### PaddlePaddle Runtime
+
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture; the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+*[figure: the local training architecture]*
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read from Python with a [data reader](../reader/README.md). This approach is no longer efficient when training in a distributed fashion, since the Python process no longer runs on the same node as the trainer processes. The Python reader would need to read from the distributed filesystem (assuming it has the required access) and send the data to the trainers, doubling the network traffic.
+
+When doing distributed training, the user can still use a Python data reader: the training data are sent with `session.eval`. However, this should be used for debugging purposes only. Users are encouraged to use the read-data OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/_sources/design/refactor/parameter_server.md.txt b/doc/_sources/design/refactor/parameter_server.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fa3c5d7990213cf2b0d236e66e592dd2699da876
--- /dev/null
+++ b/doc/_sources/design/refactor/parameter_server.md.txt
@@ -0,0 +1,106 @@
+# Design Doc: Operation Graph Based Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this approach, there is no fundamental difference between the trainer and the parameter server: they both run subgraphs, but subgraphs for different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a subgraph.
+Parameter initialization, optimizer computation, network communication and checkpointing are implemented twice, on both the trainer and the parameter server.
+
+It would be great if we could write the code once and use it on both the trainer and the parameter server: it would reduce code duplication and improve extensibility. Given that after the current refactoring we represent everything as a computation graph on the trainer, representing everything as a computation graph on the parameter server becomes a natural extension.
+
+## Design
+
+### Graph Converter
+
+The *graph converter* converts the user-defined operation (OP) graph into subgraphs to be scheduled on different nodes with the following steps:
+
+1. OP placement: the OPs will be placed on different nodes according to a heuristic that minimizes the estimated total computation time. Currently we will use a simple heuristic that puts parameter variables on parameter server workers and everything else on trainer workers.
+
+1. Add communication OPs to enable the communication between nodes.
+
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+
+Below is an example of converting the user-defined graph to the subgraphs for the trainer and the parameter server:
+
+*[figure: the user-defined graph before conversion]*
+
+After converting:
+
+*[figure: the subgraphs for the trainer and the parameter server after conversion]*
+
+1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
+1. Operators are added to the subgraphs.
+   - *Send* sends data to the connected *Recv* operator. The scheduler on the receiving node will only schedule the *Recv* operator to run when the *Send* operator has run (the *Send* OP will mark the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable; it can block until space becomes available in the queue.
+   - *Dequeue* outputs a configurable number of tensors from the queue. It will block until the queue has the required number of tensors.
+
+
+### Benefits
+
+- Model parallelism becomes easier to implement: it is an extension to the trainer-parameter server approach. We already have the communication OPs, but need to extend the graph converter's placement functionality.
+
+- User-defined optimizers are easier to add - the user can now express an optimizer as a subgraph.
+
+- No more duplicated logic inside the trainer and the parameter server, as mentioned in the Background section.
+
+### Challenges
+
+- It might be hard for the graph converter to cut a general graph (without any hint about which subgraph is the optimizer). We may need to label which subgraph inside the OP graph is the optimizer.
+
+- It is important to balance the parameter shards across multiple parameter servers. If a single parameter is very big (some word-embedding, fully connected, or softmax layers), we need to automatically partition the single parameter onto different parameter servers when possible (only element-wise optimizers depend on the parameter variable).
+
+### Discussion
+
+- In the "Async SGD" figure, the "W" variable on the parameter server could be read and written concurrently; what is our locking strategy? E.g., should each variable have a lock C++ method to be invoked by every OP, or should there be a lock OP?
+
+- Can the Enqueue OP be implemented under our current tensor design (putting the input tensor into the queue tensor)?
+
+- The *Dequeue* OP will have a variable number of outputs (depending on the `min_count` attribute); does our current design support it?
+  (A similar question applies to the *Add* OP.)
+
+
+### References:
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/_sources/design/refactor/session.md.txt b/doc/_sources/design/refactor/session.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8
--- /dev/null
+++ b/doc/_sources/design/refactor/session.md.txt
@@ -0,0 +1,180 @@
+# Design Doc: Session
+
+## Abstract
+
+The *session* object encapsulates the environment in which the computation graph is executed.
+
+We will have a *local* session and a *remote* session; they offer the same [interface](#interface). The local session encapsulates the local runtime environment and the remote session encapsulates the cluster runtime environment.
+
+The local runtime environment contains:
+
+1. computation device (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+
+The remote runtime environment contains:
+
+1. computation devices (i.e., CPUs and GPUs on nodes 0, 1) in a cluster, and
+1. the distributed [scope](../scope.md) in a cluster which holds all variables.
+
+The user can create a remote session on Paddle Cloud and evaluate the computation graph with it. In this way, the user can control the remote computation resources in a cluster from their local computer.
+
+
+## Background
+
+The current design has an implicit global session in which `paddle.eval()` is executed. The pain point is:
+
+Since the user is not able to explicitly switch between runtime environments, the user cannot run a topology in two independent environments.
+
+For example, in reinforcement learning, the user may want to have a stale model for inference and a fresh model for training, and only replace the stale model with the fresh model periodically.
+
+Furthermore, we have no concept that encapsulates a remote environment that executes a computation graph.
+
+We need the session object to address the above issues.
+
+
+## Session
+
+A session is an object that owns the runtime environment. All computations are executed through `session.eval()`.
+
+
+### Interface
+
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+
+Evaluates the target Operations or Variables in `targets`.
+
+- *targets*: the evaluation targets. Can be a single Operation or Variable, or a list with Operations or Variables as elements. The value returned by `eval()` has the same shape as the `targets` argument.
+
+  The PaddlePaddle program is represented by the [ProgramDesc](../design/program.md); `eval()` will infer the ProgramDesc from the given targets and run the PaddlePaddle program. Please see [this graph](./distributed_architecture.md#local-training-architecture) for a detailed illustration of the local session and [this graph](./distributed_architecture.md#distributed-training-architecture) for a detailed illustration of the remote session.
+
+- *feed_dict*: a dictionary that contains the tensors which override the edges of the computation graph.
+
+  `feed_dict` can not only provide the input data, it can also override any OP's input:
+
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+
+```python
+close()
+```
+
+Closes the session and releases the scope that the session owns.
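+As a sketch of the reinforcement-learning scenario described in the Background section (illustrative code only; `prediction`, `opt`, `observe`, and `sync_parameters` are assumed names, not part of this interface):
+
+```python
+train_sess = paddle.session()   # fresh model, being trained
+infer_sess = paddle.session()   # stale model, used for inference
+
+for step in range(10000):
+    train_sess.eval(targets=opt)                      # update the fresh model
+    action = infer_sess.eval(targets=prediction,
+                             feed_dict={"state": observe()})
+    if step % 100 == 0:
+        sync_parameters(train_sess, infer_sess)       # hypothetical helper
+
+train_sess.close()
+infer_sess.close()
+```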
+
+
+### Create a Local Session
+
+```python
+session(
+    devices=None
+)
+```
+
+Creates a new session. One session owns one global scope, so creating multiple sessions will create different scopes.
+
+- *devices*: a single `string` or a list of `string`s of device names; the corresponding devices will be the computation devices for `eval()`. If not specified, all available devices (e.g., all GPUs) will be used. The user doesn't need to specify the CPU device since it will always be used. Multiple sessions can use the same device.
+
+
+#### Example
+
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+
+### Create a Remote Session
+
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+
+Creates a Paddle Cloud job. Fails if the job name exists.
+
+```python
+get_cloud_job(
+    name
+)
+```
+
+Gets a Paddle Cloud job.
+
+```python
+remote_session(
+    job
+)
+```
+
+- *job*: the Paddle Cloud job.
+
+#### Example
+
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_session(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
diff --git a/doc/_sources/design/refactorization.md.txt b/doc/_sources/design/refactorization.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f93d6155e1764386b01d2f0df3f141ab75cd55d4
--- /dev/null
+++ b/doc/_sources/design/refactorization.md.txt
@@ -0,0 +1,249 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactoring include:
+
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault-recoverable distributed computing with the help of computation graphs.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation, training and inference of deep learning models by computation graphs.
+
+   1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that:
+
+   1. it can be sent to the cloud for distributed execution, and
+   1. it can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things:
+
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+
+## Description and Realization of Computation Graph
+
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+| | Representation (protobuf messages) | Realization (C++ class objects) |
+|---|---|---|
+|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
+|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
+|Block|BlockDesc|Block|
+
+The word *graph* is interchangeable with *block* in this document. A graph consists of computation steps and local variables, similar to a C++/Java program block or a pair of curly braces (`{` and `}`).
+
+## Compilation and Execution
+
+1. Run a Python program to describe the graph. In particular, the Python application program does the following:
+
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph,
+   1. Add optimization operators to the computation graph,
+   1. Optionally, split the graph for distributed training.
+
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+
+   1. Create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+
+   1. Run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Variable>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits of IR
+
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Automatically send the partitioned IR to different nodes.
+  - Automatic Data Parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic Model Parallelism (planned for future)
+
+---
+
+# Operator/OpWithKernel/OpKernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+
+---
+
+# Operator
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+
+* `Operator` is the fundamental building block of the user interface.
+  * An Operator stores input/output variable names and attributes.
+  * The `InferShape` interface is used to infer the shapes of the output variables based on the shapes of the input variables.
+  * Use `Run` to compute the `output` variables from the `input` variables.
+
+---
+
+# OpWithKernel/Kernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+  * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+  * `OpKernelKey` is the map key. It currently contains only the device place, but may contain the data type later.
+
+---
+
+# Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+  * Make Paddle capable of running without a GPU.
+* Keep one operator (which is a user interface) and create many kernel implementations.
+  * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
+---
+
+# Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+  * Note that `Eigen::Tensor` has a broadcast implementation.
+  * Limit the number of `tensor.device(dev) = ` statements in your code.
+* `thrust::transform` and `std::transform`.
+  * `thrust` has the same API as the C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+  * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, and `reduce_by_key`.
+* Hand-written `GPUKernel` and `CPU` code
+  * Do not write in header (`.h`) files. CPU kernels should be in cpp source (`.cc`) files and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
+---
+# Operator Registration
+
+## Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+## How is registration implemented?
+By maintaining a map whose key is the type name and whose value is the corresponding Op constructor.
+
+---
+# The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
+
+---
+# Related Concepts
+
+### Op_Maker
+Its constructor takes `proto` and `checker`; they are completed during Op_Maker's construction.
+(See [ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) for an example.)
+
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+---
+# Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+   1. call the maker class to complete `proto` and `checker`, and
+   2. use the completed `proto` and `checker` to add a new key-value pair to the `OpInfoMap`.
+
+---
+# Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+
+---
+# Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+  - Shared Variables => insert an `Add` operator to combine gradients
+  - No Gradient => insert a `fill_zero_grad` operator
+  - Recursive NetOp => call `Backward` recursively
+  - RNN Op => recursively call `Backward` on the stepnet
+
+
+---
+# Scope, Variable, Tensor
+
+* `Tensor` is an n-dimensional array with a type.
+  * Only dims and data pointers are stored in `Tensor`.
+  * All operations on `Tensor` are written in `Operator` or global functions.
+  * For the variable-length Tensor design, see [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md).
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+  * `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+  * `map<string, Variable>`
+  * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+
+---
+# Block (in design)
+## The difference between the original RNNOp and Block
+- A Block is more intuitive as an operator than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/runtime separation design paradigm.
+  - During compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`.
+  - When the graph executes, a Block with a `BlockDesc` is passed; it then creates `Op` and `Var` instances and invokes `Run`.
+
+---
+# Milestone
+- Take Paddle/books as the main line; the requirements of the models motivate the framework refactoring.
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - Determine some timelines,
+    - Frequently used Ops need to be migrated first,
+    - Different models can be migrated in parallel.
+- Improve the framework at the same time.
+- Accept imperfection; concentrate on solving the specific problem at the right price.
+
+---
+# Control the migration quality
+- Compare the performance of migrated models with the old ones.
+- Follow the Google C++ style guide.
+- Build an automatic workflow for generating Python/C++ documentation.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentation, and read and improve it from a user's perspective.
diff --git a/doc/_sources/design/register_grad_op.md.txt b/doc/_sources/design/register_grad_op.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c
--- /dev/null
+++ b/doc/_sources/design/register_grad_op.md.txt
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. For some operators, the gradient computation can be written in terms of existing operators. For example, the gradient of the *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator. Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_` is a `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind&);
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forwarding operator.
+
+We should change the register macros at the same time.
+In the current solution, there is no difference between forwarding operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` contains `OpProtoAndCheckerMaker` and `GradOpDescMaker`, we just list them in the same macro. This can be done by a macro containing `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, SumOpGradMaker);
+// Developers can still manually implement the gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of the current `REGISTER_OP` macro cannot be changed; internally, it will invoke `REGISTER_OPERATOR` twice and generate a GradOpDescMaker.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/_sources/design/regularization.md.txt b/doc/_sources/design/regularization.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..21280ac898feb4dd5e5a5d9e88d121e856850f0b
--- /dev/null
+++ b/doc/_sources/design/regularization.md.txt
@@ -0,0 +1,72 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restrict the parameter space that an optimization algorithm can explore.
+
+### Parameter Norm Penalties
+The most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
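+
+Written in LaTeX, with `alpha` weighting the norm penalty `omega` as described below, the penalized objective takes the standard form:
+
+```latex
+\tilde{J}(\theta; X, y) = J(\theta; X, y) + \alpha \, \Omega(\theta)
+```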
+
+The parameter `alpha` is a hyperparameter that weights the contribution of the norm penalty term `omega` relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization:
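+
+In LaTeX, the standard L2 penalty is:
+
+```latex
+\Omega(\theta) = \frac{1}{2} \lVert w \rVert_2^2
+```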
+
+##### L1 Regularization:
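+
+In LaTeX, the standard L1 penalty is:
+
+```latex
+\Omega(\theta) = \lVert w \rVert_1 = \sum_i \lvert w_i \rvert
+```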
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+## Regularization Survey
+
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey).
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level Implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add two ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops, with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen, following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in the Python API.
+
+### Computation Graph
+
+Below is an example of a really simple feed-forward neural network.
+
+*[figure: computation graph of a simple feed-forward network]*
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+*[figure: the same computation graph with regularization operators added]*
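+
+To make the modification concrete, here is a sketch of the graph construction the Python API could emit at the op level (`L2_regularization_op`, `add_op`, the `alpha` attribute, and the parameter names `W1`, `W2` are illustrative assumptions, not the final interface):
+
+```python
+# Hypothetical op-level sketch: compute the L2 penalty of each parameter
+# and fold it into the cost before the backward pass is generated.
+w1_penalty = L2_regularization_op(param=W1, alpha=0.01)
+w2_penalty = L2_regularization_op(param=W2, alpha=0.01)
+regularized_cost = add_op(cost, add_op(w1_penalty, w2_penalty))
+```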
+
+### Python API Implementation for Regularization
+
+Using the low-level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require many lines of code, so we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions.
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph.
+2. We add these ops in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add these ops in a lazy manner just before the backward pass.
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (the type of regularization and the weight of the regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters, and storing regularization properties with Parameters also allows for shared parameters.
+
+#### High-level API
+
+In the PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at TensorFlow's [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
+
diff --git a/doc/_sources/design/releasing_process.md.txt b/doc/_sources/design/releasing_process.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..14c081ea84282e52a2e36475c3c0ea755122d154
--- /dev/null
+++ b/doc/_sources/design/releasing_process.md.txt
@@ -0,0 +1,68 @@
+# PaddlePaddle Release Process
+
+PaddlePaddle uses the git-flow branching model for branch management, and uses the [Semantic Versioning](http://semver.org/) standard for PaddlePaddle version numbers.
+
+Each new PaddlePaddle release follows this process:
+
+1. Create a new branch from the `develop` branch, named `release/<version>`, e.g., `release/0.10.0`.
+1. Tag the version on the new branch as `<version>rc.<patch>`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
+1. For the commits of this version, perform the following operations:
+   * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
+   * Build this version's Docker release image and publish it to DockerHub. If this fails, fix the Docker image build problem, increase the patch number, and go back to step 2.
+   * Build this version's Ubuntu deb package. If this fails, fix the deb packaging problem, increase the patch number, and go back to step 2.
+   * Use the Regression Test List as a checklist to test the functional correctness of the Docker image / Ubuntu package.
+     * If this fails, record all the failing cases, fix all the bugs on the `release/<version>` branch, increase the patch number, and go back to step 2.
+   * Build this version's Python wheel package and publish it to PyPI.
+     * Since pypi.python.org follows the strict naming convention [PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel package must be renamed before uploading with twine, e.g., `linux_x86_64` must be changed to `manylinux1_x86_64`.
+     * The package names on PyPI are paddlepaddle and paddlepaddle_gpu. To upload a GPU build, change name to "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel package: `python setup.py bdist_wheel`.
+     * How to upload:
+       ```
+       cd build/python
+       pip install twine
+       twine upload dist/[package to upload]
+       ```
+1. After step 3 is complete, merge the `release/<version>` branch into the `master` branch and tag the merge commit on `master` with `<version>`. Then merge the `master` branch back into the `develop` branch. Finally, delete the `release/<version>` branch.
+1. Build the Docker release image from the `master` branch and publish it to DockerHub; build the Ubuntu deb package and publish it on the GitHub release page.
+1. Write the Release Notes collaboratively.
+
+
+Note that:
+
+* Once a `release/<version>` branch has been created, merging from `develop` into `release/<version>` is generally no longer allowed. This keeps the `release/<version>` branch closed to new features and makes it easier for testers to verify PaddlePaddle's behavior.
+* While a `release/<version>` branch exists, any bugfix branch must be merged into all three branches: `master`, `develop` and `release/<version>`.
+
+## PaddlePaddle Branching Conventions
+
+PaddlePaddle development follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching conventions, with some adaptations to GitHub's features:
+
+* The main PaddlePaddle repository follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) conventions, where:
+  * the `master` branch is the stable branch. Every version on the `master` branch has passed unit tests and regression tests.
+  * the `develop` branch is the development branch. Every version on the `develop` branch has passed unit tests, but not regression tests.
+  * a `release/<version>` branch is a temporary branch created for each release. Code on this branch is undergoing regression testing.
+
+* Forked repositories of other users do not need to strictly follow the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) conventions; all branches of a forked repository are effectively feature branches.
+  * It is recommended that developers sync the `develop` branch of their fork with the `develop` branch of the main repository.
+  * It is recommended that developers create their own feature branches based on the `develop` branch of their fork.
+  * When a feature branch is finished, open a Pull Request against the main PaddlePaddle repository for code review.
+  * During the review, developers can keep committing code to their feature branch.
+
+* Bugfix branches are also maintained in the developer's own fork. Unlike feature branches, a bugfix branch needs Pull Requests opened against the main repository's `master`, `develop` and any existing `release/<version>` branches at the same time.
+
+## PaddlePaddle Regression Test List
+
+This list describes the features that must be tested before each PaddlePaddle release.
+
+### All Chapters of the PaddlePaddle Book
+
+Each PaddlePaddle release must first guarantee the correctness of all chapters of the PaddlePaddle Book. This includes verifying the correctness of both `paddle_trainer` training and pure-`Python` training of the models.
+
+| | Quick Start | Recognize Digits | Image Classification | Word Vectors | Sentiment Analysis | Semantic Role Labeling | Machine Translation | Personalized Recommendation |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU | | | | | | | | |
+| API.V2 + Docker + CPU | | | | | | | | |
+| `paddle_trainer` + Docker + GPU | | | | | | | | |
+| `paddle_trainer` + Docker + CPU | | | | | | | | |
+| API.V2 + Ubuntu + GPU | | | | | | | | |
+| API.V2 + Ubuntu + CPU | | | | | | | | |
+| `paddle_trainer` + Ubuntu + GPU | | | | | | | | |
+| `paddle_trainer` + Ubuntu + CPU | | | | | | | | |
diff --git a/doc/_sources/design/scope.md.txt b/doc/_sources/design/scope.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4da76eebb74abcd26ec2b8671399e6bc4fb58574
--- /dev/null
+++ b/doc/_sources/design/scope.md.txt
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. In a specific scope, a valid name is uniquely associated with an entity, such as a variable; in another scope, this name may refer to a different entity or to nothing at all. Scope clearly restricts the visibility and validity of names in a program.
+Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
+
+- Scope is an association of a name to a variable.
+- Variables in a parent scope can be retrieved from a local scope.
+
+A detailed explanation of these two attributes follows.
+
+
+## Scope is an association of a name to a variable.
+
+Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in the scope.
+
+
+1. Scope only contains a map from a name to a variable.
+
+   All parameters, data, and states in a Net should be variables and stored inside a scope. Each op should get its inputs and outputs for computation from a scope, such as data buffers, state (momentum), etc.
+
+1. A variable can only be created by a Scope, and a variable can only be retrieved from a Scope. The user cannot create or get a variable outside a scope. This is a constraint of our framework that keeps our framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information needed to run them. `Net` is designed to drive the computation, while Scope only contains a map of variables. There is no computation logic inside a `Scope`; Scope just handles the lifetime management of variables.
+   - `Create` is used to create a Variable by its name and add the mapping relation.
+   - `Get` is used to find a Variable by its name.
+
+1. Every variable belongs to exactly one Scope.
+
+   A variable cannot belong to many scopes. If you want to use variables from a parent scope, you can use the `parent scope`.
+
+1. A Scope should destruct all Variables inside it when it is itself destructed. The user can never store a `Variable` pointer somewhere else.
+
+   Because a Variable can only be retrieved from a Scope, when we destroy a Scope we also need to destroy all the Variables in it. If the user stores a `Variable` pointer in a private data member or some global variable, the pointer becomes invalid when the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* Var(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, a `Scope` in the neural network can also be a local scope. There are two attributes of a local scope:
+
+1. We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
+2. Variables in a parent scope can be retrieved from local scopes of that parent scope, i.e., when the user gets a variable from a scope, the scope first searches for the variable in the current scope. If there is no such variable in the local scope, `scope` keeps searching its parents until the variable is found or there is no parent.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* FindVar(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->FindVar(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_{nullptr};
+};
+```
+
+In the `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When the user `Get`s a variable by its `name`, the `name` is searched inside the current scope. If the variable cannot be found locally and the parent scope is not a `nullptr`, the variable is searched inside that parent scope. The default value of the `parent_` pointer is `nullptr`, meaning that the scope is a global scope when `parent_` is `nullptr`.
+
+A local scope is very useful when we implement Recurrent Neural Networks. Each timestep of an RNN should be a `Net`. The `Net` of each timestep (`StepNet` for short) should use an independent local scope, just like variables in a while loop are inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
+
+# Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* FindVar(const std::string& name) const;
+
+  // return if already contains same name variable.
+  Variable* Var(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+## Only a scope can create a variable
+
+To ensure that `only a scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and make Scope a friend class of Variable. Then only `Var` can construct a `Variable`.
+
+## When a scope is destroyed, all variables inside the scope should be destroyed together
+
+The scope holds unique pointers to all variables. The user can `FindVar` from a scope, but should not hold this pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
+
+## Sharing a parent scope
+
+A local scope contains a `parent_` pointer; scopes thus form a linked list. We use a `shared_ptr` because while a local scope is in use, its parents cannot be destroyed.
+
+Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable directly, because it could not be passed to another scope as the `parent` pointer.
+
+## Orthogonal interface
+
+`FindVar` returns `nullptr` when `name` is not found, so it can be used as a `Contains` method. `Var` returns an `Error` when there is a local name conflict. By combining `FindVar` and `Var`, we can implement `Var` easily.
diff --git a/doc/_sources/design/selected_rows.md.txt b/doc/_sources/design/selected_rows.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5
--- /dev/null
+++ b/doc/_sources/design/selected_rows.md.txt
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a sparse tensor data type designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few rows of this tensor hold non-zero values.
+It is straightforward to represent such a sparse tensor by the following data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  vector<int64_t> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of `SelectedRows`. `rows_` holds the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows.size() /* NUM_ROWS */, ...]`, which supplies the values for each row. The shape of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would be:
+
+```
+x = SelectedRows {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described at compile time, because `rows_` and `value_` depend on the training data. So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` and a `lod_level`. The description of a `SelectedRows` is a Tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like with `LoD` information, the `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or a dense tensor.
+
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look as follows:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+There are several operators that need to be written to support `SelectedRows`. These are:
+
+1. Operators which generate a `SelectedRows` gradient, e.g., the gradient of `TableLookupOp`.
+2. Optimization operators which support a `SelectedRows` gradient, e.g., `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select a suitable kernel for both dense tensors and `SelectedRows`.
diff --git a/doc/_sources/design/simple_op_design.md.txt b/doc/_sources/design/simple_op_design.md.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f
--- /dev/null
+++ b/doc/_sources/design/simple_op_design.md.txt
@@ -0,0 +1,202 @@
+## Interaction between C++ and Python
+
+Users employ the Python API to describe their own network; however, the network construction actually happens in C++, so Protobuf is introduced to send messages between Python and C++.
+
+The interaction between Python and C++ can be simplified into two steps:
+
+1. C++ tells Python how many Ops there are and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.
diff --git a/doc/_sources/design/simple_op_design.md.txt b/doc/_sources/design/simple_op_design.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7aeed7f9b4637e1c29d530f37b42d12500af82f --- /dev/null +++ b/doc/_sources/design/simple_op_design.md.txt @@ -0,0 +1,202 @@

## Interaction between C++ and Python

Users employ the Python API to describe their own networks, but network construction actually happens in C++, so Protobuf is introduced to pass messages between Python and C++.

The interaction between Python and C++ can be simplified into two steps:

1. C++ tells Python how many Ops there are and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.

2. Users invoke the APIs built by Python and provide the necessary parameters. These parameters are sent back to C++ to finish the Op construction.

### Message from C++ to Python

We define a Protobuf message class `OpProto` to hold the message needed in the first step. What should an `OpProto` contain? The question is equivalent to: what information do we need to offer to build a Python API that is legal, user oriented, and able to describe a whole Op?

The following information is necessary:

1. The Op's name and a short comment.
2. The input and output variables: each variable's name, type, and comment.
3. The Op's attributes: each attribute's name, type, comment, **default value** and **value range**.

So `OpProto` can be defined as follows:

```proto
enum AttrType {
  INT = 1;
  FLOAT = 2;
  STRING = 3;
  INTS = 4;
  FLOATS = 5;
  STRINGS = 6;
};

message AttrValue {
  required AttrType type = 1;
  optional int32 iv = 2;
  optional float fv = 3;
  optional string sv = 4;
  repeated int32 ivs = 5;
  repeated float fvs = 6;
  repeated string svs = 7;
};

message AttrProto {
  required string name = 1;
  required string comment = 2;
  required AttrType type = 3;
};

message VarProto {
  required string name = 1;
  required string comment = 2;
  required bool is_tensor = 3;
};

message OpProto {
  repeated VarProto inputs = 1;
  repeated VarProto outputs = 2;
  repeated AttrProto attrs = 3;
  required string type = 4;
  required string comment = 5;
};
```

To generate the Python code automatically:

```python
def create_python_ops_creation_functions():
    op_protos = paddle.framework.OpRegistry.get_all_op_proto()
    for type_name in op_protos:
        op_proto = op_protos[type_name]

        # Bind op_proto as a default argument so each generated function
        # keeps its own proto instead of the loop's last one.
        def __impl__(op_proto=op_proto, **kwargs):  # users must use keyword args
            inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs]
            outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs]
            attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None))
                     for attr in op_proto.attrs]
            opdesc = (inputs, outputs, op_proto.type, attrs)
            return paddle.framework.OpRegistry.CreateOp(opdesc)

        __impl__.__doc__ = create_doc_string(op_proto)
        globals()[type_name] = __impl__

create_python_ops_creation_functions()
```

### Message from Python to C++

To hold the message needed in the second step above, we define the Protobuf message class `OpDesc`. It holds the user-specified parameters that describe an Op.

```proto
message OpDesc {
  required string type = 1;
  repeated string inputs = 2;
  repeated string outputs = 3;
  map<string, AttrValue> attrs = 4;
};
```
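On the C++ side, such an `OpDesc` might be consumed roughly as follows. This is a hedged sketch: the `OpRegistry` and its `gCreators_` map are introduced in the next section, and the `type()` accessor follows standard protobuf-generated C++ code; the `CreateFromDesc` helper is illustrative:

```cpp
// Sketch: turn a received OpDesc into a concrete Op via the registry below.
Op* CreateFromDesc(const OpDesc& desc) {
  // Look up the creator registered for this op type, e.g. "cos".
  auto it = OpRegistry::gCreators_.find(desc.type());
  if (it == OpRegistry::gCreators_.end()) {
    return nullptr;  // unknown op type
  }
  return it->second(desc);  // invoke the stored creator lambda
}
```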
## OpProto Register

Every Op has its own `OpProto`. For convenience, we need to register the protos and record all their information. For each `Op` class, we define a corresponding `OpMaker` class whose constructor implements the building of the `OpProto`. The `OpMaker`'s constructor is invoked by the function `OpRegistry::RegisterOp()`.

```cpp
class OpProtoMaker {
 public:
  OpProtoMaker(OpProto* proto) : proto_(proto) {}

 protected:
  OpProto* proto_;
  void AddInput(const std::string& name, const std::string& desc) {...}
  void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...}
  void AddComment(const std::string& comment) {...}
};

class OpRegistry {
 public:
  using OpCreator = std::function<Op* (const OpDesc& desc)>;

  template <typename OpType, typename OpMaker>
  static void RegisterOp(const std::string& name) {
    gCreators_[name] = [](const OpDesc& desc) {
      return new OpType(desc);
    };
    OpProto& opProto = gProtos_[name];
    OpMaker maker(&opProto);  // the maker fills in the proto in its constructor
  }

  static std::map<std::string, OpCreator> gCreators_;
  static std::map<std::string, OpProto> gProtos_;
};

template <typename OpType, typename OpMaker>
class OpRegister {
 public:
  explicit OpRegister(const std::string& type) {
    OpRegistry::RegisterOp<OpType, OpMaker>(type);
  }
};

#define REGISTER_OP(op_class, op_maker_class, type_name)    \
  class op_class##Register {                                \
   private:                                                 \
    static const OpRegister<op_class, op_maker_class> reg;  \
  };                                                        \
  const OpRegister<op_class, op_maker_class>                \
      op_class##Register::reg(#type_name);

class CosineOp {
  // ...
};

struct CosineOpProtoMaker : public OpProtoMaker {
  CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
    AddInput("input", "input of cosine op");
    AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
    AddType("cos");
    AddComment("This is cos op");
  }
};

REGISTER_OP(CosineOp, CosineOpProtoMaker, cos);
```

In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only `CosineOp` but also its `OpProto`. The default value and value range of the `scale` attribute are registered here as fields of that proto.
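For illustration, here is a hedged sketch of what this `REGISTER_OP` invocation would expand to, given the macro above; the only mechanism relied on is that a static data member's initializer runs at program start-up:

```cpp
// Approximate expansion of REGISTER_OP(CosineOp, CosineOpProtoMaker, cos).
class CosineOpRegister {
 private:
  static const OpRegister<CosineOp, CosineOpProtoMaker> reg;
};
// The static initializer runs before main() and performs the actual work:
// it stores the creator lambda in gCreators_ and builds the OpProto.
const OpRegister<CosineOp, CosineOpProtoMaker> CosineOpRegister::reg("cos");
```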
## Python API

Python APIs are divided into two types: the high-level API and the low-level API.

### High-Level API

The high-level API is called by users directly, so it should keep its style consistent with the existing V2 API.

Here is a sample of how to define an fc layer:

```python
hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid")
```

`hd` is the output of `fc_layer`; it is a `variable` and can be fed into other layers as input.

The definition of `fc_layer()`:

```python
def fc_layer(input, size, with_bias, activation):
    attr_map = {"size": size}
    check_attrs(attr_map)
    w = make_variable('w')
    if with_bias:
        b = make_variable('b')
    else:
        b = None
    fc_output = make_variable('fc_output')
    fc_op(input, w, b, fc_output, attr_map)
    act_output = make_variable('sigmoid_output')
    if activation == "sigmoid":
        sigmoid_op(fc_output, act_output)
    else:
        # handle other activations here
        pass
    return act_output
```

### Low-Level API

In the sample above, `fc_op` and `sigmoid_op` are low-level APIs. They build the `OpDesc` and invoke the corresponding C++ code.

*TODO*

diff --git a/doc/_sources/design/tensor_array.md.txt b/doc/_sources/design/tensor_array.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..37e4f7b90f94fa3eb015e733999cd84c96b2239c --- /dev/null +++ b/doc/_sources/design/tensor_array.md.txt @@ -0,0 +1,271 @@

# Design for TensorArray

This design doc presents the necessity of a new C++ class `TensorArray`. In addition to the very simple C++ implementation

```c++
class TensorArray {
 public:
  explicit TensorArray(const LoDTensor&);
  explicit TensorArray(size_t size);

 private:
  std::vector<LoDTensor> values_;
};
```

we also need to expose it to PaddlePaddle's Python API, because users will want to use it together with our very flexible `WhileLoop` operator. An example of an RNN based on dynamic operators is:

```python
input = pd.data(...)
num_steps = Var(12)

states = TensorArray(size=num_steps)
step_inputs = TensorArray(unstack_from=input)
step_outputs = TensorArray(size=num_steps)

W = Tensor(...)
U = Tensor(...)

default_state = some_op()

step = Var(1)

wloop = paddle.create_whileloop(loop_vars=[step])
with wloop.frame():
    wloop.break_if(pd.equal(step, num_steps))
    pre_state = states.read(step - 1, default_state)
    step_input = step_inputs.read(step)
    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
    states.write(step, state)
    step_outputs.write(step, state)  # output the state
    step.update(step + 1)

output = step_outputs.stack()
```

## Background

Steps are one of the core concepts of RNN. In each time step of an RNN, there are input segments, states, and output segments; all of these components act like arrays. For example, calling `states[step_id]` gets the state of the `step_id`-th time step.

An RNN can be implemented with the following pseudocode:

```c++
Array states;
Array input_segments;
Array output_segments;
Parameter W, U;

step = 1
seq_len = 12
while_loop {
  if (step == seq_len) break;
  states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
  output_segments[step] = states[step];  // take the state as output
  step++;
}
```

According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.

Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op`, which takes tensors as input and splits them into `input_segments`.

Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements a tensor with levels of detail (`LoDTensor` for short). Segmenting a `LoDTensor` is much more complicated than splitting a plain tensor, which makes it necessary to refactor `recurrent_op` with `LoDTensor` segmenting support.

As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.

Its implementation is similar to `recurrent_op`. The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**

Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly, the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.

## Why `TensorArray`

The logic behind splitting the inputs into segments, states and outputs is similar across these RNNs and can be shared in a separate module.

The arrays of `states`, `input_segments` and `output_segments` are exposed to users when writing a dynamic RNN model, as in the pseudocode above. So there should be an array-like container that can store the segments of a tensor or a LoD tensor.

**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor.** This is where the notion of `TensorArray` comes from.

## Introduce TensorArray to unify all three RNNs

`TensorArray` as a concept is borrowed from TensorFlow; it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.

This concept can be used to support our new design of dynamic operations, and it helps to refactor some existing variable-length-sequence-related layers, such as `recurrent_op` and `RecurrentGradientMachine`.

In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), `TensorArray` is used to segment inputs and store states in all time steps.
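The design above fixes only the two constructors. As a hedged illustration, the C++ interface might eventually grow methods mirroring the Python helpers defined below; the `Read`/`Write`/`Stack`/`Unstack` signatures here are assumptions, not part of this design:

```c++
// Illustrative sketch of a fuller TensorArray interface; only the
// constructors come from the design above.
class TensorArray {
 public:
  explicit TensorArray(const LoDTensor&);
  explicit TensorArray(size_t size);

  // Random access for one time step.
  const LoDTensor& Read(size_t index) const;
  void Write(size_t index, const LoDTensor& value);

  // Pack all values into one tensor with an extra leading dimension,
  // and the inverse operation that splits a tensor into steps.
  LoDTensor Stack() const;
  void Unstack(const LoDTensor& source);

  size_t size() const { return values_.size(); }

 private:
  std::vector<LoDTensor> values_;
};
```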
By providing some methods similar to a C++ array, the definition of state-based dynamic models such as RNNs becomes more natural and highly flexible.

## Dynamic operations on TensorArray

`TensorArray` will be used directly when defining dynamic models, so the operators listed below should be implemented:

```python
# several helper operators for TensorArray
def tensor_array_stack(ta, tensor):
    '''
    Given a tensor array `ta`, return a packed `tensor`.
    '''
    pass

def tensor_array_unstack(tensor, ta):
    '''
    Given a `tensor`, unstack it into the tensor array `ta`.
    '''
    pass

def tensor_array_write(ta, index, tensor, data_shared):
    '''
    Given a `tensor` and a scalar tensor `index`, write `tensor` into the
    index-th value of the tensor array `ta`.
    `data_shared` is an attribute that specifies whether to copy or reference
    the tensors.
    '''
    pass

def tensor_array_read(ta, index, tensor):
    '''
    Given a tensor array `ta` and a scalar tensor `index`, read the index-th
    value of `ta` and return it as `tensor`.
    '''
    pass

def tensor_array_size(ta, tensor):
    '''
    Given a tensor array `ta`, return its size as the scalar `tensor`.
    '''
    pass
```

Using so many low-level operators directly is unwieldy for users, so some helper methods should be provided in a Python wrapper to make `TensorArray` easier to use, for example:

```python
class TensorArray:
    def __init__(self, name):
        self.name = name
        self.desc = TensorArrayDesc()

    def stack(self, name=None):
        '''
        Pack the values in a `TensorArray` into a tensor with rank one higher
        than each tensor in `values`.
        `stack` can be used to concatenate all the time steps for an RNN or a
        while-loop.

        @name: str
            the name of the variable to output.
        '''
        tensor = Var(name)
        tensor_array_stack(self.name, tensor)
        return tensor

    def unstack(self, input):
        '''
        Unpack the given dimension of a rank-`R` tensor into rank-`(R-1)`
        tensors.
        `unstack` can be used to split a tensor into time steps for an RNN or
        a while-loop.

        @input: str
            the name of the input tensor
        '''
        tensor_array_unstack(input, self.name)

    def write(self, index, value, data_shared=True):
        '''
        Write `value` into the `index`-th value of the TensorArray.
        If `data_shared` is set to True, then the index-th value in the
        TensorArray will be shared with the tensor passed in.

        @index: str
            name of a scalar tensor
        @value: str
            name of a tensor
        @data_shared: bool
        '''
        tensor_array_write(self.name, index, value, data_shared)

    def read(self, index, output):
        '''
        Read the value at location `index` in the `TensorArray`.

        @index: str
            name of a scalar tensor
        @output:
            name of an output variable
        '''
        tensor_array_read(self.name, index, output)

    def size(self, output):
        '''
        Return the number of values.

        @output: str
            name of a scalar tensor
        '''
        tensor_array_size(self.name, output)
```

## LoDTensor-related Supports

The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input and outputs sequences too.

Since each step of an RNN can only take a tensor-represented batch of data as input, some preprocessing has to be applied to the inputs, such as sorting the sentences by length in descending order, cutting each sentence into words, and packing the words into new batches.
Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`. These two operations are similar to `unstack` and `stack`, except that they operate on variable-length sequences formatted as a LoD tensor rather than a plain tensor.

Some definitions look like:

```python
def unpack(level):
    '''
    Split a LoDTensor at some `level` and generate batches; if set
    `sort_by_length`, will sort by length.

    Returns:
        - a new `TensorArray` whose values are LoDTensors representing
          batches of data.
        - an int32 Tensor, which stores the mapping from the new batch's
          indices to the original LoDTensor's.
    '''
    pass

def pack(level, indices_map):
    '''
    Recover the original LoD-arranged LoDTensor from the values in a
    `TensorArray`, using `level` and `indices_map`.
    '''
    pass
```

With these two methods, an RNN that supports variable-length sentences can be implemented like:

```c++
// input is the variable-length data
LoDTensor sentence_input(xxx);
TensorArray ta;
Tensor indice_map;
Tensor boot_state = xxx;  // to initialize the RNN's first state
TensorArray::unpack(sentence_input, 1 /*level*/, true /*sort_by_length*/,
                    &ta, &indice_map);
TensorArray step_outputs;
TensorArray states;

for (int step = 0; step < ta.size(); step++) {
  auto state = states.read(step);
  // rnnstep is a function which acts like one step of an RNN
  auto step_input = ta.read(step);
  auto step_output = rnnstep(step_input, state);
  step_outputs.write(step_output, true /*data_shared*/);
}

// rnn_output is the final output of the RNN
LoDTensor rnn_output = step_outputs.pack(1 /*level*/, indice_map);
```

The code above shows that by embedding the LoDTensor-related preprocessing operations into `TensorArray`, the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together and is hard to read and extend.

diff --git a/doc/_sources/design/var_desc.md.txt b/doc/_sources/design/var_desc.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f --- /dev/null +++ b/doc/_sources/design/var_desc.md.txt @@ -0,0 +1,69 @@

## Background

PaddlePaddle divides the description of a neural network computation graph into two stages: compile time and runtime.

PaddlePaddle uses proto messages to describe the compile-time graph because:

1. The computation graph should be able to be saved to a file.
1. In distributed training, the graph will be serialized and sent to multiple workers.

The computation graph is constructed by Data Nodes and Operation Nodes. The concepts that represent them are listed in the table below.

| | compile time | runtime |
|---|---|---|
| Data | VarDesc(proto) | Variable(cpp) |
| Operation | OpDesc(proto) | Operator(cpp) |


## Definition of VarDesc

A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
```proto
message VarDesc {
  required string name = 1;
  enum VarType {
    LOD_TENSOR = 0;
    SELECTED_ROWS = 1;
  }
  required VarType type = 2;
  optional LoDTensorDesc lod_desc = 3;
  optional TensorDesc selected_rows_desc = 4;
  optional bool persistable = 5 [ default = false ];
}
```

## Definition of TensorDesc

```proto
enum DataType {
  BOOL = 0;
  INT16 = 1;
  INT32 = 2;
  INT64 = 3;
  FP16 = 4;
  FP32 = 5;
  FP64 = 6;
}

message TensorDesc {
  required DataType data_type = 1;
  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
}
```

A TensorDesc describes both `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).

## Definition of LoDTensorDesc

```proto
message LoDTensorDesc {
  required TensorDesc tensor = 1;
  optional int32 lod_level = 2;
}
```

A LoDTensorDesc contains a tensor description and a `lod_level`.

## Definition of Variable in Python

For Variable in Python, please refer to [`Python API`](./python_api.md).

diff --git a/doc/_sources/getstarted/basic_usage/index_en.rst.txt b/doc/_sources/getstarted/basic_usage/index_en.rst.txt deleted file mode 100644 index 6775da20c2f51000f305b095d40abd27b8fa6c0e..0000000000000000000000000000000000000000 --- a/doc/_sources/getstarted/basic_usage/index_en.rst.txt +++ /dev/null @@ -1,101 +0,0 @@

Simple Linear Regression
========================

PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple of lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, and image captioning.

Problem Background
------------------

Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem, `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is the corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures the relationship between them by observing real estate markets.

Prepare the Data
-----------------

Suppose the true relationship can be characterized as ``Y = 2X + 0.3``. Let's see how to recover this pattern only from observed data. Here is a piece of Python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of the input data types.

 .. code-block:: python

     # dataprovider.py
     from paddle.trainer.PyDataProvider2 import *
     import random

     # define data types of input: 2 real numbers
     @provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
     def process(settings, input_file):
         for i in xrange(2000):
             x = random.random()
             yield [x], [2*x + 0.3]

Train a NeuralNetwork
----------------------

To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer.
Don't worry if you are not familiar with these terminologies; it just means that we start from a random line ``Y' = wX + b`` and gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:

 .. code-block:: python

     # trainer_config.py
     from paddle.trainer_config_helpers import *

     # 1. read data. Suppose you saved above python code as dataprovider.py
     data_file = 'empty.list'
     with open(data_file, 'w') as f: f.writelines(' ')
     define_py_data_sources2(train_list=data_file, test_list=None,
             module='dataprovider', obj='process', args={})

     # 2. learning algorithm
     settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())

     # 3. Network configuration
     x = data_layer(name='x', size=1)
     y = data_layer(name='y', size=1)
     y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
     cost = mse_cost(input=y_predict, label=y)
     outputs(cost)

Some of the most fundamental usages of PaddlePaddle are demonstrated:

- The first part shows how to feed data into PaddlePaddle. In general, PaddlePaddle reads raw data from a list of files and then applies some user-defined processing to obtain the real input. In this case, we only need to create a placeholder file, since we are generating synthetic data on the fly.

- The second part describes the learning algorithm. It defines the way adjustments are made to the model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum-based optimizer will suffice here; it processes 12 data points each time.

- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
   - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used, for ``X`` and ``Y`` respectively.
   - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to the current layer and performs the actual computation specified by the activation function. Computation layers like this are the fundamental building blocks of deeper models.
   - **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model and provide guidance for adjusting the parameters.

Now that everything is ready, you can train the network with a simple command line call:

 .. code-block:: bash

     paddle train --config=trainer_config.py --save_dir=./output --num_passes=30


This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes and save all the models under the path ``./output``. You will see from the messages printed during training that the model cost decreases over time, which indicates we are getting a closer guess.


Evaluate the Model
-------------------

Usually, a different dataset that was left out during the training phase should be used to evaluate the model. However, we are lucky enough to know the real answer, ``w=2, b=0.3``, so a better option is to check the model parameters directly.

In PaddlePaddle, training is just a matter of obtaining a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads the parameters from the last pass:
 .. code-block:: python

     import numpy as np
     import os

     def load(file_name):
         with open(file_name, 'rb') as f:
             f.read(16) # skip header for float type.
             return np.fromfile(f, dtype=np.float32)

     print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
     # w=1.999743, b=0.300137

 .. image:: parameters.png
    :align: center

Although it starts from a random guess, you can see that the value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical to the real answer.

There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.

diff --git a/doc/_sources/getstarted/build_and_install/build_from_source_en.md.txt b/doc/_sources/getstarted/build_and_install/build_from_source_en.md.txt deleted file mode 100644 index 69f4501f370dcc9d603ec54a63d68568d66e832e..0000000000000000000000000000000000000000 --- a/doc/_sources/getstarted/build_and_install/build_from_source_en.md.txt +++ /dev/null @@ -1,222 +0,0 @@

Installing from Sources
==========================

* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
* [4. Build on Centos](#centos)


## Download and Setup

You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).

```bash
git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle
```

## Requirements

To compile the source code, your computer must be equipped with the following dependencies.

- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and a gfortran compiler
- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBLAS or ATLAS
- **Python**: only Python 2.7 is supported

**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! For CUDA 8.0, GCC versions later than 5.3 are not supported!

### Options

PaddlePaddle supports several build options, listed in the table below.
| Option | Description |
| ------ | ----------- |
| WITH_GPU | Compile PaddlePaddle with NVIDIA GPU |
| WITH_AVX | Compile PaddlePaddle with AVX intrinsics |
| WITH_DSO | Compile PaddlePaddle with dynamically linked CUDA |
| WITH_TESTING | Compile PaddlePaddle with unit testing |
| WITH_SWIG_PY | Compile PaddlePaddle with inference API |
| WITH_STYLE_CHECK | Compile PaddlePaddle with style check |
| WITH_PYTHON | Compile PaddlePaddle with the Python interpreter |
| WITH_DOUBLE | Compile PaddlePaddle with double precision |
| WITH_RDMA | Compile PaddlePaddle with RDMA support |
| WITH_TIMER | Compile PaddlePaddle with stats timer |
| WITH_PROFILER | Compile PaddlePaddle with GPU profiler |
| WITH_DOC | Compile PaddlePaddle with documentation |
| WITH_COVERAGE | Compile PaddlePaddle with code coverage |
| COVERALLS_UPLOAD | Package code coverage data to coveralls |
| ON_TRAVIS | Exclude special unit tests on Travis CI |
**Note:**
- The GPU version works best with CUDA Toolkit 8.0 and cuDNN v5.
- Other versions, like CUDA Toolkit 7.0, 7.5 and cuDNN v3, v4, are also supported.
- **To utilize cuDNN v5, CUDA Toolkit 7.5 is a prerequisite, and vice versa.**

As a simple example, consider the following:

1. **BLAS Dependencies (optional)**

   CMake will search for BLAS libraries on the system. If they are not found, OpenBLAS will be downloaded, built and installed automatically. To use a preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.

   ```bash
   # specify MKL
   cmake .. -DMKL_ROOT=<mkl_path>
   # or specify OpenBLAS
   cmake .. -DOPENBLAS_ROOT=<openblas_path>
   ```

2. **Doc Dependencies (optional)**

   To generate PaddlePaddle's documentation, install the dependencies and set `-DWITH_DOC=ON`, as follows:

   ```bash
   pip install 'sphinx>=1.4.0'
   pip install sphinx_rtd_theme recommonmark

   # install doxygen on Ubuntu
   sudo apt-get install doxygen
   # install doxygen on Mac OS X
   brew install doxygen

   # activate docs in cmake
   cmake .. -DWITH_DOC=ON
   ```

## Build on Ubuntu 14.04

### Install Dependencies

- **Paddle Dependencies**

   ```bash
   # necessary
   sudo apt-get update
   sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
   sudo apt-get install -y python python-pip python-numpy libpython-dev bison
   sudo pip install 'protobuf==3.1.0.post1'

   # install cmake 3.4
   curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
   cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
   cd .. && rm -rf cmake-3.4.1
   ```

- **GPU Dependencies (optional)**

   To build the GPU version, you will need the following installed:

   1. a CUDA-capable GPU
   2. a supported version of Linux with a gcc compiler and toolchain
   3. the NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
   4. the NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)

   The CUDA development environment relies on tight integration with the host development environment, including the host compiler and C runtime libraries, and is therefore only supported on distribution versions that have been qualified for this CUDA Toolkit release.

   After downloading the cuDNN library, issue the following commands:

   ```bash
   sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
   sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
   ```
   Then you need to set the LD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc.

   ```bash
   export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
   export PATH=/usr/local/cuda/bin:$PATH
   ```

### Build and Install

As usual, the best option is to create a build folder under the paddle project directory.

```bash
mkdir build && cd build
```

Finally, you can build and install PaddlePaddle:

```bash
# you can add build options here, such as:
cmake .. -DCMAKE_INSTALL_PREFIX=<install_path>
# please use sudo make install if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<install_path>/bin:$PATH
# install PaddlePaddle Python modules.
-sudo pip install /opt/paddle/share/wheels/*.whl -``` -## Build on Centos 7 - -### Install Dependencies - -- **CPU Dependencies** - - ```bash - # necessary - sudo yum update - sudo yum install -y epel-release - sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git - sudo pip install wheel numpy - sudo pip install 'protobuf>=3.0.0' - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a gcc compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake3 .. -DCMAKE_INSTALL_PREFIX= -# please use sudo make install, if you want to install PaddlePaddle into the system -make -j `nproc` && make install -# set PaddlePaddle installation path in ~/.bashrc -export PATH=/bin:$PATH -# install PaddlePaddle Python modules. -sudo pip install /opt/paddle/share/wheels/*.whl -``` diff --git a/doc/_sources/getstarted/build_and_install/build_from_source_en.rst.txt b/doc/_sources/getstarted/build_and_install/build_from_source_en.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..f194f84ce7c961bb8644d7c077a7c71730220ea2 --- /dev/null +++ b/doc/_sources/getstarted/build_and_install/build_from_source_en.rst.txt @@ -0,0 +1,159 @@ +Build from Sources +========================== + +.. _build_step: + +How To Build +---------------- + +PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile +tools. We recommend you to use our pre-built Docker image to run the build +to avoid installing dependencies by yourself. We have several build environment +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + +Then run: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # run the following command to build a CPU-Only binaries if you are using docker + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh + # else run these commands + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +When the compile finishes, you can get the output whl package under +build/python/dist, then you can choose to install the whl on local +machine or copy it to the target machine. + +.. 
code-block:: bash

      pip install build/python/dist/*.whl


.. _run_test:

Run Tests
----------------

If you wish to run the tests, you may follow the steps below:

When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build. Setting :code:`WITH_GPU=ON` also runs the tests on GPU.

.. code-block:: bash

   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh

If you don't use Docker, running ctest will start the tests:

.. code-block:: bash

   mkdir build
   cd build
   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
   make
   ctest
   # run a single test like test_mul_op
   ctest -R test_mul_op


.. _compile_deps:

Compile Dependencies
----------------

PaddlePaddle needs the following dependencies when compiling; other dependencies will be downloaded automatically.

.. csv-table:: PaddlePaddle Compile Dependencies
   :header: "Dependency", "Version", "Description"
   :widths: 10, 15, 30

   "CMake", ">=3.5", ""
   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
   "Python", "2.7.x", "Need libpython2.7.so"
   "pip", ">=9.0", ""
   "numpy", "", ""
   "SWIG", ">=2.0", ""
   "Go", ">=1.8", "Optional"


.. _build_options:

Build Options
----------------

Build options include whether to build binaries for CPU or GPU, which BLAS library to use, etc. You may pass these settings when running cmake. For a detailed cmake tutorial please refer to `here `_.

.. _build_options_bool:

Bool Type Options
----------------

You can add a :code:`-D` argument to pass such options, like:

.. code-block:: bash

   cmake .. -DWITH_GPU=OFF

.. csv-table:: Bool Type Options
   :header: "Option", "Description", "Default"
   :widths: 1, 7, 2

   "WITH_GPU", "Build with GPU support", "ON"
   "WITH_C_API", "Build only the C-API", "OFF"
   "WITH_DOUBLE", "Build with double precision", "OFF"
   "WITH_DSO", "Dynamically load CUDA libraries", "ON"
   "WITH_AVX", "Build with AVX support", "ON"
   "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
   "WITH_STYLE_CHECK", "Check code style when building", "ON"
   "WITH_TESTING", "Build unit tests", "ON"
   "WITH_DOC", "Build documentation", "OFF"
   "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
   "WITH_GOLANG", "Build fault-tolerant parameter server written in Go", "ON"
   "WITH_MKL", "Use MKL as the BLAS library, else use OpenBLAS", "ON"


BLAS
+++++

PaddlePaddle supports `MKL `_ and `OpenBLAS `_ as its BLAS library. By default it uses MKL. If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded and used; for more, see `details `_.

If you choose not to use MKL, then OpenBLAS will be used.

CUDA/cuDNN
+++++++++++

PaddlePaddle will automatically find CUDA and cuDNN when compiling and running. The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture automatically in order to speed up the build.

PaddlePaddle can build with any version later than cuDNN v5.1, and we intend to keep up with the latest cuDNN versions. Be sure to run with the same version of cuDNN that you built with.

Pass Compile Options
++++++++++++++

You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries. When running the cmake command, it will search system paths like :code:`/usr/lib:/usr/local/lib` and then search the paths that you passed to cmake, i.e.

.. code-block:: bash

   cmake ..
-DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.** diff --git a/doc/_sources/getstarted/build_and_install/docker_install_en.rst.txt b/doc/_sources/getstarted/build_and_install/docker_install_en.rst.txt index 03df497506099d2fb758bd0ab437d2c082f2b537..d7acc7aeb744b19d83acb520d07c8551168dd096 100644 --- a/doc/_sources/getstarted/build_and_install/docker_install_en.rst.txt +++ b/doc/_sources/getstarted/build_and_install/docker_install_en.rst.txt @@ -1,270 +1,146 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0rc2 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0rc2 - - to start a container to run a Docker image, paddlepaddle/paddle in this example. + docker pull docker.paddlepaddle.org/paddle -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version images: .. code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. 
We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. If you are in -China, you can use our Docker image registry mirror to speed up the -download process. To use it, please replace all paddlepaddle/paddle in -the commands to docker.paddlepaddle.org/paddle. - -1. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash - - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. - - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - -2. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - - -Train Model Using Python API ----------------------------- - -Our official docker image provides a runtime for PaddlePaddle -programs. The typical workflow will be as follows: - -Create a directory as workspace: - -.. code-block:: bash - - mkdir ~/workspace - -Edit a PaddlePaddle python program using your favourite editor - -.. code-block:: bash - - emacs ~/workspace/example.py + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu -Run the program using docker: +Choose between different BLAS version: -.. code-block:: bash - - docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py - -Or if you are using GPU for training: - -.. code-block:: bash - - nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py - -Above commands will start a docker container by running :code:`python -/workspace/example.py`. It will stop once :code:`python -/workspace/example.py` finishes. 
- -Another way is to tell docker to start a :code:`/bin/bash` session and -run PaddlePaddle program interactively: - -.. code-block:: bash - - docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash - # now we are inside docker container - cd /workspace - python example.py - -Running with GPU is identical: - -.. code-block:: bash - - nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash - # now we are inside docker container - cd /workspace - python example.py - - -Develop PaddlePaddle or Train Model Using C++ API ---------------------------------------------------- - -We will be using PaddlePaddle development image since it contains all -compiling tools and dependencies. - -1. Build PaddlePaddle develop image + .. code-block:: bash - Use following command to build PaddlePaddle develop image: + # image using MKL by default + docker pull paddlepaddle/paddle + # image using OpenBLAS + docker pull paddlepaddle/paddle:latest-openblas - .. code-block:: bash - git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle - docker build -t paddle:dev . +If you want to use legacy versions, choose a tag from +`DockerHub `_ +and run: -2. Build PaddlePaddle production image + .. code-block:: bash - There are two steps for building production image, the first step is to run: + docker pull paddlepaddle/paddle:[tag] + # i.e. + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - .. code-block:: bash +.. _docker_run: - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev +Launch your training program in Docker +------------------------------ - The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated. +Assume that you have already written a PaddlePaddle program +named :code:`train.py` under directory :code:`/home/work` (refer to +`PaddlePaddleBook `_ +for more samples), then run the following command: - The second step is to run: + .. code-block:: bash - .. code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py - docker build -t paddle:prod -f build/Dockerfile ./build +In the above command, :code:`-it` means run the container interactively; +:code:`-v $PWD:/work` means mount the current directory ($PWD will expand +to current absolute path in Linux) under :code:`/work` in the container. +:code:`paddlepaddle/paddle` to specify image to use; finnally +:code:`/work/train.py` is the command to run inside docker. - The above command will generate the production image by copying the compiled PaddlePaddle program into the image. +Also, you can go into the container shell, run or debug your code +interactively: -3. Run unit test + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - Following command will run unit test: +**NOTE: We did not install vim in the default docker image to reduce the image size, you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.** - .. code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +.. 
_docker_run_book: PaddlePaddle Book ------------------ -The Jupyter Notebook is an open-source web application that allows -you to create and share documents that contain live code, equations, -visualizations and explanatory text in a single browser. - -PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. -We already exposed port 8888 for this book. If you want to +You can create a container serving PaddlePaddle Book using Jupyter Notebook in +one minute using Docker. PaddlePaddle Book is an interactive Jupyter Notebook +for users and developers.If you want to dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. We provide a packaged book image, simply issue the command: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book Then, you would back and paste the address into the local browser: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ That's all. Enjoy your journey! +.. _docker_run_gpu: -Documentation -------------- +Train with Docker with GPU +------------------------------ -Paddle Docker images include an HTML version of C++ source code -generated using `woboq code browser -`_. This makes it easy -for users to browse and understand the C++ source code. +We recommend using +`nvidia-docker `_ +to run GPU training jobs. Please ensure you have latest +GPU driver installed before move on. -As long as we give the Paddle Docker container a name, we can run an -additional Nginx Docker container to serve the volume from the Paddle -container: + .. code-block:: bash -.. code-block:: bash + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - docker run -d --name paddle-cpu-doc paddle: - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx +**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.** + .. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu + +**About AVX:** + +AVX is a kind of CPU instruction can accelerate PaddlePaddle's calculations. +The latest PaddlePaddle Docker image turns AVX on by default, so, if your +computer doesn't support AVX, you'll probably need to +`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`. -Then we can direct our Web browser to the HTML version of source code -at http://localhost:8088/paddle/ +The following command will tell you whether your computer supports AVX. + + .. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/_sources/getstarted/build_and_install/index_en.rst.txt b/doc/_sources/getstarted/build_and_install/index_en.rst.txt index 1bfd4f75c0b9b82d61d28a30f03181f7be159f24..32d66d63dd5b2a30d5de4a088dc80b680830cb84 100644 --- a/doc/_sources/getstarted/build_and_install/index_en.rst.txt +++ b/doc/_sources/getstarted/build_and_install/index_en.rst.txt @@ -1,23 +1,34 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps +++++++++ + +You can choose either pip or Docker to complete your install: + +.. 
toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst + ../../howto/dev/build_en.md - docker_install_en.rst - ubuntu_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/_sources/getstarted/build_and_install/pip_install_en.rst.txt b/doc/_sources/getstarted/build_and_install/pip_install_en.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..70f601a11c610e0a2b5dcc8b73d2c3ea19e195e1 --- /dev/null +++ b/doc/_sources/getstarted/build_and_install/pip_install_en.rst.txt @@ -0,0 +1,104 @@ +Install Using pip +================================ + +You can use current widely used Python package management +tool `pip `_ +to install PaddlePaddle. This method can be used in +most of current Linux systems or MacOS. + +.. _pip_install: + +Install Using pip +------------------------------ + +Run the following command to install PaddlePaddle on the current +machine, it will also download requirements. + + .. code-block:: bash + + pip install paddlepaddle + + +If you wish to install GPU version, just run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +If you wish to install the latest develop branch PaddlePaddle, +you can download the latest whl package from our CI system. Access +the below links, log in as guest, then click at the "Artifact" +tab, you'll find the download link of whl packages. + +If the links below shows up the login form, just click "Log in as guest" to start the download: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. csv-table:: whl package of each version + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" + :widths: 1, 3, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + +.. _pip_dependency: + +Runtime Dependency +------------------------------ + +PaddlePaddle installation packages (whl) does not only contain .py files, +but also binaries built from C++ code. We ensure that PaddlePaddle can +run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04 +and MacOS 10.12. + +PaddlePaddle whl packages are trying to satisfy +`manylinux1 `_ +standard, which uses CentOS 5 as default build environment. But CUDA libraries +seems only run on CentOS 6 at least, also, CentOS 5 is about to end its lifetime, +so we use CentOS 6 as default build environment. + +.. 
csv-table:: PaddlePaddle Runtime Deps + :header: "Dependency", "version", "description" + :widths: 10, 15, 30 + + "OS", "Linux, MacOS", "CentOS 6 or later,Ubuntu 14.04 or later,MacOS 10.12 or later" + "Python", "2.7.x", "Currently Python3 is not supported" + "libc.so", "GLIBC_2.7", "glibc at least include GLIBC_2.7 symbols" + "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols" + "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols" + +.. _pip_faq: + +FAQ +------------------------------ + +- paddlepaddle*.whl is not a supported wheel on this platform. + + The main cause of this issue is that your current platform is + not supported. Please check that you are using Python 2.7 series. + Besides, pypi only supports manylinux1 standard, you'll need to + upgrade your pip to >9.0.0. Then run the below command: + + .. code-block:: bash + + pip install --upgrade pip + + If the problem still exists, run the following command: + + .. code-block:: bash + + python -c "import pip; print(pip.pep425tags.get_supported())" + + Then you'll get supported package suffixes, then check if it matches + the file name of the whl package. You can find default whl package at + `here `_ + + If your system supports linux_x86_64 but the whl package is manylinux1_x86_64, + you'll need to update pip to the latest version; If your system supports + manylinux1_x86_64 but the whl package is linux_x86_64 you can rename the + file to manylinux1_x86_64 suffix and then install. diff --git a/doc/_sources/getstarted/build_and_install/ubuntu_install_en.rst.txt b/doc/_sources/getstarted/build_and_install/ubuntu_install_en.rst.txt deleted file mode 100644 index ea8042085bf458be96e71017d229d88ad867695b..0000000000000000000000000000000000000000 --- a/doc/_sources/getstarted/build_and_install/ubuntu_install_en.rst.txt +++ /dev/null @@ -1,25 +0,0 @@ -Debian Package installation guide -================================= - -PaddlePaddle supports :code:`deb` pacakge. The installation of this :code:`deb` package is tested in ubuntu 14.04, but it should be support other debian based linux, too. - -There are four versions of debian package, :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, :code:`gpu-noavx`. And :code:`noavx` version is used to support CPU which does not contain :code:`AVX` instructions. The download url of :code:`deb` package is \: https://github.com/baidu/Paddle/releases/ - - -After downloading PaddlePaddle deb packages, you can use :code:`gdebi` install. - -.. code-block:: bash - - gdebi paddle-*.deb - -If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it. - -Or you can use following commands to install PaddlePaddle. - -.. code-block:: bash - - dpkg -i paddle-*.deb - apt-get install -f - -And if you use GPU version deb package, you need to install CUDA toolkit and cuDNN, and set related environment variables(such as LD_LIBRARY_PATH) first. It is normal when `dpkg -i` get errors. `apt-get install -f` will continue install paddle, and install dependences. - diff --git a/doc/_sources/getstarted/index_en.rst.txt b/doc/_sources/getstarted/index_en.rst.txt index 9f771e93e8b63eb98e31ec12667bd1aa007af20e..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644 --- a/doc/_sources/getstarted/index_en.rst.txt +++ b/doc/_sources/getstarted/index_en.rst.txt @@ -1,9 +1,61 @@ GET STARTED ============ +.. 
_quick_install: + +Quick Install +---------------------- + +You can use pip to install PaddlePaddle with a single command. This supports +CentOS 6 or above, Ubuntu 14.04 or above, and MacOS 10.12, with Python 2.7 installed. +Simply run the following command to install: + + .. code-block:: bash + + pip install paddlepaddle + +If you need to install the GPU version, run: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +For more details about installation and build: + .. toctree:: :maxdepth: 1 build_and_install/index_en.rst -- `Deep Learning 101 `_ + +.. _quick_start: + +Quick Start +++++++++ + +Create a new file called housing.py, and paste this Python +code: + + + .. code-block:: python + + import paddle.v2 as paddle + + # Initialize PaddlePaddle. + paddle.init(use_gpu=False, trainer_count=1) + + # Configure the neural network. + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear()) + + # Infer using provided test data. + probs = paddle.infer( + output_layer=y_predict, + parameters=paddle.dataset.uci_housing.model(), + input=[item for item in paddle.dataset.uci_housing.test()()]) + + for i in xrange(len(probs)): + print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000) + +Run :code:`python housing.py` and voila! It should print out a list of predictions +for the test housing data. diff --git a/doc/_sources/howto/deep_model/rnn/index_en.rst.txt b/doc/_sources/howto/deep_model/rnn/index_en.rst.txt index 13a153b05c578e0af82ee29db5ea27fd4b6d6f59..7adc79873d699fdfd5a85034bcef964dd1f19132 100644 --- a/doc/_sources/howto/deep_model/rnn/index_en.rst.txt +++ b/doc/_sources/howto/deep_model/rnn/index_en.rst.txt @@ -1,2 +1,7 @@ RNN Models ========== + +.. toctree:: + :maxdepth: 1 + + rnn_config_en.rst diff --git a/doc/_sources/howto/deep_model/rnn/rnn_config_en.rst.txt b/doc/_sources/howto/deep_model/rnn/rnn_config_en.rst.txt index 73f5d5371fcd3ce95253cad47b0d8e738284441c..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3 100644 --- a/doc/_sources/howto/deep_model/rnn/rnn_config_en.rst.txt +++ b/doc/_sources/howto/deep_model/rnn/rnn_config_en.rst.txt @@ -3,34 +3,11 @@ RNN Configuration This tutorial will guide you through configuring recurrent neural networks in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to: -- prepare sequence data for learning recurrent neural networks. - configure recurrent neural network architecture. - generate sequence with learned recurrent neural network models. -We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`. - -===================== -Prepare Sequence Data -===================== - -PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`: - -.. code-block:: python - - settings.input_types = [ - integer_value_sequence(len(settings.src_dict)), - integer_value_sequence(len(settings.trg_dict)), - integer_value_sequence(len(settings.trg_dict))] - - -Then at the :code:`process` function, each :code:`yield` function will return three integer lists.
Each integer list is treated as a sequence of integers: - -.. code-block:: python - - yield src_ids, trg_ids, trg_ids_next - - -For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. +We will use a vanilla recurrent neural network and a sequence-to-sequence model to guide you through these steps. The code of the sequence-to-sequence model can be found at `book/08.machine_translation `_ . +The data preparation of this model can be found at `python/paddle/v2/dataset/wmt14.py `_ =============================================== Configure Recurrent Neural Network Architecture @@ -42,7 +19,7 @@ Simple Gated Recurrent Neural Network A recurrent neural network processes a sequence at each time step sequentially. An example of the architecture of LSTM is listed below. -.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg +.. image:: src/bi_lstm.jpg :align: center Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`. @@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output. act=None, rnn_layer_attr=None): def __rnn_step__(ipt): - out_mem = memory(name=name, size=size) - rnn_out = mixed_layer(input = [full_matrix_projection(ipt), - full_matrix_projection(out_mem)], - name = name, - bias_attr = rnn_bias_attr, - act = act, - layer_attr = rnn_layer_attr, - size = size) + out_mem = paddle.layer.memory(name=name, size=size) + rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt), + paddle.layer.full_matrix_projection(input=out_mem)], + name = name, + bias_attr = rnn_bias_attr, + act = act, + layer_attr = rnn_layer_attr, + size = size) return rnn_out - return recurrent_group(name='%s_recurrent_group' % name, - step=__rnn_step__, - reverse=reverse, - input=input) + return paddle.layer.recurrent_group(name='%s_recurrent_group' % name, + step=__rnn_step__, + reverse=reverse, + input=input) PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is utilized as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of the :code:`out_mem` memory. @@ -101,7 +78,7 @@ Sequence to Sequence Model with Attention ----------------------------------------- We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure. -.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png +.. image:: src/encoder-decoder-attention-model.png :align: center In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network.
The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`. @@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge .. code-block:: python # Define the data layer of the source sentence. - src_word_id = data_layer(name='source_language_word', size=source_dict_dim) + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) # Calculate the word embedding of each word. - src_embedding = embedding_layer( + src_embedding = paddle.layer.embedding( input=src_word_id, size=word_vector_dim, - param_attr=ParamAttr(name='_source_language_embedding')) + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) # Apply forward recurrent neural network. - src_forward = grumemory(input=src_embedding, size=encoder_size) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) # Apply backward recurrent neural network. reverse=True means backward recurrent neural network. - src_backward = grumemory(input=src_embedding, - size=encoder_size, - reverse=True) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) # Mix the forward and backward parts of the recurrent neural network together. - encoded_vector = concat_layer(input=[src_forward, src_backward]) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) # Project encoding vector to decoder_size. - encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)], - size = decoder_size) + encoded_proj = paddle.layer.mixed( + size=decoder_size, + input=paddle.layer.full_matrix_projection(encoded_vector)) # Compute the first instance of the backward RNN. - backward_first = first_seq(input=src_backward) + backward_first = paddle.layer.first_seq(input=src_backward) # Project the first instance of backward RNN to decoder size. - decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation()) + decoder_boot = paddle.layer.mixed( + size=decoder_size, + act=paddle.activation.Tanh(), + input=paddle.layer.full_matrix_projection(backward_first)) The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`: ..
code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] - trg_embedding = embedding_layer( - input=data_layer(name='target_language_word', - size=target_dict_dim), - size=word_vector_dim, - param_attr=ParamAttr(name='_target_language_embedding')) + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) group_inputs.append(trg_embedding) # For decoder equipped with attention mechanism, in training, @@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. # StaticInput means the same value is utilized at different time steps. # Otherwise, it is a sequence input. Inputs at different time steps are different. # All sequence inputs should have the same length. - decoder = recurrent_group(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs) + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function: @@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th # Defines the memory of the decoder. # The output of this memory is defined in gru_step. # Notice that the name of gru_step should be the same as the name of this memory. - decoder_mem = memory(name='gru_decoder', - size=decoder_size, - boot_layer=decoder_boot) + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) # Compute attention weighted encoder vector. - context = simple_attention(encoded_sequence=enc_vec, - encoded_proj=enc_proj, - decoder_state=decoder_mem) + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) # Mix the current word embedding and the attention weighted encoder vector. - decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context), - full_matrix_projection(current_word)], - size = decoder_size * 3) + decoder_inputs = paddle.layer.mixed( + size=decoder_size * 3, + input=[ + paddle.layer.full_matrix_projection(input=context), + paddle.layer.full_matrix_projection(input=current_word) + ]) # Define Gated recurrent unit recurrent neural network step function. - gru_step = gru_step_layer(name='gru_decoder', - input=decoder_inputs, - output_mem=decoder_mem, - size=decoder_size) + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) # Defines the output function. 
- out = mixed_layer(input=[full_matrix_projection(input=gru_step)], - size=target_dict_dim, - bias_attr=True, - act=SoftmaxActivation()) + out = paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax(), + input=paddle.layer.full_matrix_projection(input=gru_step)) return out @@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice - :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`beam_size`: the beam size used in beam search. - :code:`max_length`: the maximum length of the generated sentences. - -* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set: - - - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files. - - :code:`dict_file`: the dictionary file for converting word id to word. - - :code:`result_file`: the path of the generation result file. The code is listed below: .. code-block:: python - group_inputs=[StaticInput(input=encoded_vector,is_seq=True), - StaticInput(input=encoded_proj,is_seq=True)] + group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] # In generation, decoder predicts a next target word based on # the encoded source sequence and the last generated target word. # The encoded source sequence (encoder's output) must be specified by # StaticInput which is a read-only memory. # Here, GeneratedInputs automatically fetches the last generated word, # which is initialized by a start mark, such as . - trg_embedding = GeneratedInput( - size=target_dict_dim, - embedding_name='_target_language_embedding', - embedding_size=word_vector_dim) + trg_embedding = paddle.layer.GeneratedInput( + size=target_dict_dim, + embedding_name='_target_language_embedding', + embedding_size=word_vector_dim) group_inputs.append(trg_embedding) - beam_gen = beam_search(name=decoder_group_name, - step=gru_decoder_with_attention, - input=group_inputs, - bos_id=0, # Beginnning token. - eos_id=1, # End of sentence token. - beam_size=beam_size, - max_length=max_length) + beam_gen = paddle.layer.beam_search( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs, + bos_id=0, # Beginning token. + eos_id=1, # End of sentence token. + beam_size=beam_size, + max_length=max_length) - seqtext_printer_evaluator(input=beam_gen, - id_input=data_layer(name="sent_id", size=1), - dict_file=trg_dict_path, - result_file=gen_trans_file) - outputs(beam_gen) + return beam_gen -Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details. +Notice that this generation technique is only useful for a decoder-like generation process. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment `_ for more details. -The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. +The full configuration file is located at `book/08.machine_translation/train.py `_ .
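As a hedged illustration of how such a generation layer is typically driven (this sketch is not part of the original configuration; it assumes a trained :code:`parameters` object, a :code:`test_batch` of source word-id sequences, and the v2 :code:`paddle.infer` interface with :code:`field=['prob', 'id']` as used in the book examples):

.. code-block:: python

    import paddle.v2 as paddle

    # Hypothetical driver code: `beam_gen` is the layer returned by the
    # configuration above; `parameters` and `test_batch` are assumed to
    # exist and are not defined in this document.
    beam_result = paddle.infer(
        output_layer=beam_gen,
        parameters=parameters,
        input=test_batch,
        field=['prob', 'id'])

    # Assuming the requested fields come back in order, beam_result[0]
    # holds the log-probabilities of the candidate sequences, and
    # beam_result[1] holds the generated word ids, from which sentences
    # can be recovered with the target-side dictionary.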
diff --git a/doc/_sources/howto/dev/build_en.md.txt b/doc/_sources/howto/dev/build_en.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..91c41ef8ce3abdec5d69a9cbcebbc49b17d8f663 --- /dev/null +++ b/doc/_sources/howto/dev/build_en.md.txt @@ -0,0 +1,124 @@ +# Build using Docker + +## What Developers Need + +To contribute to PaddlePaddle, you need + +1. A computer -- Linux, BSD, Windows, MacOS, and +1. Docker. + +Nothing else. Not even Python and GCC, because you can install all build tools into a Docker image. We run all the tools by running this image. + +## General Process + +1. Retrieve source code. + + ```bash + git clone https://github.com/paddlepaddle/paddle + ``` + +2. Install build tools into a Docker image. + + ```bash + cd paddle; docker build -t paddle:dev . + ``` + + Please be aware of the `.` at the end of the command, which refers to the [`./Dockerfile` file](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). `docker build` follows instructions in this file to create a Docker image named `paddle:dev`, and installs building tools into it. + +3. Build from source. + + The following command starts a Docker container that executes the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile. `build.sh` invokes `cmake` and `make` to build PaddlePaddle source code, which has been mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to `build` in the current source directory on the computer. + + ```bash + docker run -v $PWD:/paddle paddle:dev + ``` + + The above command builds a CUDA-enabled version. If we want to build a CPU-only version, we can type + + ```bash + docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev + ``` + +4. Run unit tests. + + To run all unit tests using the first GPU of a node: + + ```bash + NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + If we used `WITH_GPU=OFF` at build time, it generates only CPU-based unit tests, and we don't need nvidia-docker to run them. We can just run + + ```bash + docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest" + ``` + + Sometimes we want to run a specific unit test, say `memory_test`; we can run + + ```bash + nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test" + ``` + +5. Clean Build. + + Sometimes, we might want to clean all third-party dependencies and built binaries. To do so, just + + ```bash + rm -rf build + ``` + +## Docker, Or Not? + +- What is Docker? + + If you haven't heard of it, consider it something like Python's virtualenv. + +- Docker or virtual machine? + + Some people compare Docker with VMs, but Docker doesn't virtualize any hardware nor run a guest OS, which means there is no compromise on performance. + +- Why Docker? + + Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help. + + Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want. + +- Can I choose not to use Docker? + + Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer. This document exists because Docker would make the development way easier.
+ +- How difficult is it to learn Docker? + + It takes about ten minutes to read [an introductory article](https://docs.docker.com/get-started), and it saves you more than an hour of installing and configuring all the required build tools, especially when new versions of PaddlePaddle require some new tools. Not to mention the time saved when other people try to reproduce your issue. + +- Can I use my favorite IDE? + + Yes, of course. The source code resides on your local computer, and you can edit it using whatever editor you like. + + Many PaddlePaddle developers are using Emacs. They add the following few lines into their `~/.emacs` configuration file: + + ```emacs + (global-set-key "\C-cc" 'compile) + (setq compile-command + "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev") + ``` + + so they can type `Ctrl-C` and `c` to build PaddlePaddle from source. + +- Does Docker do parallel building? + + Our building Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many processes as the number of your CPU cores. + +## Some Gotchas + +- Docker requires sudo + + An owner of a computer has the administrative privilege, a.k.a., sudo, and Docker requires this privilege to work properly. If you use a shared computer for development, please ask the administrator to install and configure Docker. We will do our best to support rkt, another container technology that doesn't require sudo. + +- Docker on Windows/MacOS builds slowly + + On Windows and MacOS, Docker containers run in a Linux VM. You might want to give this VM some more memory and CPUs to make the building efficient. Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details. + +- Not enough disk space + + Examples in this article use the option `--rm` with the `docker run` command. This option ensures that stopped containers do not remain on hard disks. We can use `docker ps -a` to list all containers, including stopped ones. Sometimes `docker build` generates some intermediate dangling images, which also take disk space. To clean them, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/). diff --git a/doc/_sources/howto/dev/contribute_to_paddle_en.md.txt b/doc/_sources/howto/dev/contribute_to_paddle_en.md.txt index 9b0d3e83c0dc264650eda73e6801c60a75439b4a..a60453ff4e3bba6e6cb3b3de915dd69afd3a1ec3 100644 --- a/doc/_sources/howto/dev/contribute_to_paddle_en.md.txt +++ b/doc/_sources/howto/dev/contribute_to_paddle_en.md.txt @@ -1,146 +1,157 @@ # Contribute Code -We sincerely appreciate your contributions. You can use fork and pull request -workflow to merge your code. +We sincerely appreciate your contribution. This document explains our workflow and work style. -## Code Requirements -- Your code must be fully documented by - [doxygen](http://www.stack.nl/~dimitri/doxygen/) style. -- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler - passes the code style check. -- All code must have unit test. -- Pass all unit tests. +## Workflow -The following tutorial guides you into submitting your contibution. +PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/). The following steps guide usual contributions. -## [Creating a Fork](https://help.github.com/articles/fork-a-repo/) +1. Fork -Just head over to the GitHub page and click the "Fork" button. -It's just that simple.
+ Our development community has been growing fast; it doesn't make sense for everyone to write into the official repo. So, please file Pull Requests from your fork. To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/). -## Clone +1. Clone -Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/). -The **develop** is the main branch, and other user's branches are feature branches. + To make a copy of your fork on your local computer, please run -Once you've created a fork, you can use your favorite git client to clone your -repo or just head straight to the command line: + ```bash + git clone https://github.com/your-github-account/paddle + cd paddle + ``` -```shell -# Clone your fork to your local machine -git clone --branch develop https://github.com/USERNAME/Paddle.git -``` -If your repository doesn't contain **develop** branch, just create it by your own. - -```shell -git clone https://github.com/USERNAME/Paddle.git Paddle -cd Paddle -git checkout -b develop # create develop branch. -git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle -git pull upstream develop # update to upstream -``` +1. Create the local feature branch -Then you can start to develop by making a local developement branch + For daily work like adding a new feature or fixing a bug, please open your feature branch before coding: -```shell -git checkout -b MY_COOL_STUFF_BRANCH -``` + ```bash + git checkout -b my-cool-stuff + ``` -## Using `pre-commit` hook +1. Commit -Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git -pre-commit hooks. It can help us format source codes (cpp, python), check some -basic thing before commit (only one EOL for each file, do not add a huge file -in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every -PR doesn't fit hook can not be merged into Paddle. + Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands: -To use [pre-commit](http://pre-commit.com/), you should install it by -`pip install pre-commit`, and currently, Paddle uses `clang-format` to format -c/cpp sources. Please make sure clang-format 3.8+ installed. + ```bash + pip install pre-commit + pre-commit install + ``` -Then just run `pre-commit install` in your Paddle clone directory. When you -commit your code, the pre-commit hook will check the local code if there is -anything not suitable to commit, and so on. + Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python. -## Commit + Once installed, `pre-commit` checks the style of code and documentation in every commit.
We will see something like the following when running `git commit`: -Commit your changes by following command lines: + ``` + ➜ git commit + CRLF end-lines remover...............................(no files to check)Skipped + yapf.................................................(no files to check)Skipped + Check for added large files..............................................Passed + Check for merge conflicts................................................Passed + Check for broken symlinks................................................Passed + Detect Private Key...................................(no files to check)Skipped + Fix End of Files.....................................(no files to check)Skipped + clang-formater.......................................(no files to check)Skipped + [my-cool-stuff c703c041] add test file + 1 file changed, 0 insertions(+), 0 deletions(-) + create mode 100644 233 + ``` -```shell -# show the working tree status -git status -# add modified files -git add xx -env EDITOR=vim git commit # You can write your comments by vim/nano/emacs. -``` -The first line of commit infomation is the title. The second and later lines -are the details if any. +1. Build and test -## Keeping Fork Up to Date + Users can build PaddlePaddle natively on Linux and Mac OS X. However, to unify the building environment and to make debugging easy, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md). -Before pull your request, you should sync your code from the latest PaddlePaddle. -To do this, you'll need to add a remote at first: +1. Keep pulling -```shell -# see the current configured remote repository -git remote -v -# add upstream repository -git remote add upstream https://github.com/PaddlePaddle/Paddle.git -# verify the new upstream -git remote -v -``` + An experienced Git user pulls from the official repo often -- daily or even hourly, so they notice conflicts with others' work early, and it's easier to resolve smaller conflicts. -Update your fork with the latest upstream changes: + ```bash + git remote add upstream https://github.com/PaddlePaddle/Paddle + git pull upstream develop + ``` -```shell -git pull --rebase upstream develop -``` +1. Push and file a pull request -If there are no unique commits locally, git will simply perform a fast-forward. -However, if you have been making changes (in the vast majority of cases you -probably shouldn't be), you may have to deal with conflicts. + You can "push" your local work into your forked repo: -Now, your local master branch is up-to-date with everything modified upstream. + ```bash + git push origin my-cool-stuff + ``` -## Push to GitHub + The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one. -```shell -# push to your repository in Github -git push -u origin MY_COOL_STUFF_BRANCH # create remote branch MY_COOL_STUFF_BRANCH to origin. -``` + To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/). -## Pull Request + If your change is for fixing an issue, please write ["Fixes "](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request. GitHub will close the issue when the owners merge your pull request. -Go to the page for your fork on GitHub, select your development branch, -and click the **pull request button**.
+ Please remember to specify some reviewers for your pull request. If you don't know who the right ones are, please follow GitHub's recommendation. -## Update your pull request with the lastest version -During the code review, your pull request may become stale because new commits in -baidu/Paddle. GitHub allows autmotic update if there is no conflict. You can do this -by clicking the "Update Branch" button in your pull request page. However, in the case -of conflict, you need to do the update manually. You need to do the following on -your local repository: -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop -# You may need to resolve the conflict according to the git prompt. -# Make and test your code. -git push origin MY_COOL_STUFF_BRANCH -``` -Now your Pull Request is updated with the latest version. +1. Delete local and remote branches + + To keep your local workspace and your fork clean, you might want to remove merged branches: + + ```bash + git push origin :my-cool-stuff + git checkout develop + git pull upstream develop + git branch -d my-cool-stuff + ``` + +### Code Review + +- Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email. Please do this after your pull request passes the CI. + +- Please answer reviewers' every comment. If you are to follow the comment, please write "Done"; please give a reason otherwise. + +- If you don't want your reviewers to get overwhelmed by email notifications, you might reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/). + +- Reduce unnecessary commits. Some developers commit often. It is recommended to append a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`. + + +## Coding Standard -## Revise your pull request ### Code Style -When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull requrest and the old pull request. +Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html). -The possible commands are +Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/). -```shell -git checkout MY_COOL_STUFF_BRANCH -git pull upstream develop # update local to newest code base. -# May be some conflicts will occured. -# And develop your cool stuff -env EDITOR=vim git commit # add your revise log -git push origin MY_COOL_STUFF_BRANCH +Our build process helps to check the code style. In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default. This flag is on by default. + +Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`. To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
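To make the unit-test guidance in the next section concrete, here is a minimal, self-contained sketch of a test written with Python's standard `unittest` package (the `clip` helper is hypothetical and exists only for this illustration):

```python
import unittest


def clip(value, low, high):
    """A tiny hypothetical helper, defined here only for illustration."""
    return max(low, min(high, value))


class TestClip(unittest.TestCase):
    def test_within_range(self):
        # Values inside [low, high] pass through unchanged.
        self.assertEqual(clip(5, 0, 10), 5)

    def test_clips_to_bounds(self):
        # Values outside the range are clamped to the nearest bound.
        self.assertEqual(clip(-1, 0, 10), 0)
        self.assertEqual(clip(42, 0, 10), 10)


if __name__ == '__main__':
    unittest.main()
```

Note the `test_` prefix on each method name; `unittest` only discovers methods named this way.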
+ +### Unit Tests + +Please remember to add related unit tests. + +- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). + +- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/). + + +### Writing Logs + +We use [glog](https://github.com/google/glog) for logging in our C/C++ code. + +For general information, please use `LOG`. For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose). The reason is explained [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ). + +`VLOG` requires a *verbose level* parameter. For example: + +```c++ +VLOG(3) << "Operator FC is taking " << num_inputs << " inputs."; +``` + +When we run a PaddlePaddle application or test, we can specify a verbose threshold. For example: + +```bash +GLOG_vmodule=buddy_allocator=2 \ +GLOG_v=10 \ +python \ +../python/paddle/v2/framework/tests/test_recurrent_op.py ``` + +This will enable VLOG messages generated by `buddy_allocator.{h,cc}` and in the verbose range of 0 to 3, so you will see the above example VLOG message, which is at level 3. This suggests that we output general, high-level messages at lower verbose levels, so they are displayed with higher probability. When coding C++, please follow this verbose level convention: + +- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework) +- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) +- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform) +- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math) diff --git a/doc/_sources/howto/dev/new_layer_en.rst.txt b/doc/_sources/howto/dev/new_layer_en.rst.txt index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644 --- a/doc/_sources/howto/dev/new_layer_en.rst.txt +++ b/doc/_sources/howto/dev/new_layer_en.rst.txt @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It where :math:`f(.)` is a nonlinear *activation* function, such as sigmoid, tanh, and Relu. -The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. +The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. Suppose our loss function is :math:`c(y)`, then @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -Suppose :math:`z = f(W^T x + b)`, then +Suppose :math:`z = W^T x + b`, then ..
math:: @@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class. Then, for fully connected layer, we need to compute: .. math:: - + \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1 where :math:`\mathbf 1` is an all-one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`. @@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes /* weight */ true); } } - + If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake. .. code-block:: bash diff --git a/doc/_sources/howto/dev/new_op_en.md.txt b/doc/_sources/howto/dev/new_op_en.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e88e1f5b4df710f1b69f0305d8d8a2921c4249a --- /dev/null +++ b/doc/_sources/howto/dev/new_op_en.md.txt @@ -0,0 +1,342 @@ +# How to write a new operator + + - [Background](#background) + - [Implementing C++ Types](#implementing-c++-types) + - [Defining ProtoMaker](#defining-protoMaker) + - [Defining Operator](#defining-operator) + - [Registering Operator](#registering-operator) + - [Compilation](#compilation) + - [Python Binding](#python-binding) + - [Unit Tests](#unit-tests) + - [Testing Forward Operators](#testing-forward-operators) + - [Testing Backward Operators](#testing-backward-operators) + - [Compiling and Running](#compiling-and-running) + - [Remarks](#remarks) +## Background + +Here are the base types needed. For details, please refer to the design docs. + +- `framework::OperatorBase`: Operator (Op) base class. +- `framework::OpKernel`: Base class for Op computation. +- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation. +- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with the Python API. + +An operator can be differentiated by whether it has kernel methods. An operator with a kernel inherits from `OperatorWithKernel` while the ones without inherit from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information: + + + Information | Where is it defined +-------------- | :---------------------- +OpProtoMake definition | `.cc` files. A backward Op does not need an OpProtoMake interface. +Op definition | `.cc` files +Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu` files. +Registering the Op | Ops are registered in `.cc` files; for kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation.
+ + +New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system will use the naming scheme to automatically build operators and their corresponding Python extensions.** + + +Let's take the matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce how to write an Operator with a Kernel. + + +## Implementing C++ Types + + +### 1. Defining Class ProtoMaker + +Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output. + +First, define `ProtoMaker` to describe the Operator's input, output, and additional comments: + +```cpp +class MulOpMaker : public framework::OpProtoAndCheckerMaker { + public: + MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor), 2D tensor of size (M x K)"); + AddInput("Y", "(Tensor), 2D tensor of size (K x N)"); + AddOutput("Out", "(Tensor), 2D tensor of size (M x N)"); + AddComment(R"DOC( +Two Element Mul Operator. +The equation is: Out = X * Y +)DOC"); + } +}; +``` + +[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) is inherited from `framework::OpProtoAndCheckerMaker`, whose constructor takes two parameters: + + - `framework::OpProto` stores the Operator's input, output and attribute information, used for generating Python API interfaces. + - `framework::OpAttrChecker` is used to validate variable attributes. + +The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information will be added to `OpProto`. + +The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions, in accordance with Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). + + +An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows: + +```cpp +template <typename AttrType> +class ScaleOpMaker : public framework::OpProtoAndCheckerMaker { + public: + ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "The input tensor of scale operator.").NotInGradient(); + AddOutput("Out", "The output tensor of scale operator.").NotInGradient(); + AddComment(R"DOC(Scale operator +The equation is: Out = scale*X +)DOC"); + AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0); + } +}; +``` + +There are two changes in this example: + +- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in `ScaleOp`'s corresponding computation. If an input to an operator is not participating in back-propagation, please explicitly set `.NotInGradient()`. + +- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds a `scale` constant as an attribute, and sets the default value to 1.0. + + +### 2.
Defining Operator + +The following code defines the interface for MulOp: + +```cpp +class MulOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(const framework::InferShapeContext &ctx) const override { + auto dim0 = ctx.Input("X")->dims(); + auto dim1 = ctx.Input("Y")->dims(); + PADDLE_ENFORCE_EQ(dim0.size(), 2, + "input X(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("X")); + PADDLE_ENFORCE_EQ(dim1.size(), 2, + "input Y(%s) should be a tensor with 2 dims, a matrix", + ctx.op_.Input("Y")); + PADDLE_ENFORCE_EQ( + dim0[1], dim1[0], + "First matrix's width must be equal with second matrix's height."); + ctx.Output("Out")->Resize({dim0[0], dim1[1]}); + } +}; +``` + +[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member + +```cpp +using framework::OperatorWithKernel::OperatorWithKernel; +``` + +expresses an operator constructor using base class `OperatorWithKernel`, alternatively written as + +```cpp +MulOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} +``` + +The `InferShape` interface needs to be re-written. `InferShape` is a constant method and cannot modify the Op's member variables; its constant argument `const framework::InferShapeContext &ctx` can be used to extract input, output, and attribute variables. It functions to + + - 1) validate and error out early: it checks input data dimensions and types. + - 2) configure the tensor shape of the output. + +Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. + +### 3. Defining OpKernel + +`MulKernel` inherits `framework::OpKernel`, which includes the following template parameters: + +- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +- `typename T` denotes data type, such as `float` or `double`. + +`MulKernel` types need to rewrite the `Compute` interface. +- `Compute` takes one input variable `const framework::ExecutionContext& context`. +- Compared with `InferShapeContext`, `ExecutionContext` includes device types, and can similarly extract input, output, and attribute variables. +- `Compute` implements the computation logic of an `OpKernel`. + +`MulKernel`'s implementation of `Compute` is as follows: + + ```cpp + template + class MulKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto* Y = context.Input("Y"); + auto* Z = context.Output("Out"); + Z->mutable_data(context.GetPlace()); + auto* device_context = + const_cast(context.device_context_); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + } + }; + ``` + +Note that **different devices (CPU, GPU) share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** + +`MulOp`'s CPU and GPU share the same `Kernel`.
A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). + +To ease the writing of an `OpKernel`'s `Compute` method, and to reuse code across devices, the [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement the `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see the [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). + + +This concludes the forward implementation of an operator. Next, its operator and kernel need to be registered in a `.cc` file. + +The definition of its corresponding backward operator, if applicable, is similar to that of a forward operator. **Note that a backward operator does not include a `ProtoMaker`**. + +### 4. Registering Operator + +- In `.cc` files, register forward and backward operator classes and the CPU kernel. + + ```cpp + namespace ops = paddle::operators; + REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + + In that code block, + + - `REGISTER_OP` registers the `ops::MulOp` class under the type name `mul`, with `ops::MulOpMaker` as its `ProtoMaker`, and registers `ops::MulOpGrad` as `mul_grad`. + - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient. + - `REGISTER_OP_CPU_KERNEL` registers the `ops::MulKernel` class with the specialized template types `paddle::platform::CPUPlace` and `float`, and similarly registers the `ops::MulGradKernel` class. + + +- Registering GPU Kernel in `.cu` files + - Note that if the GPU kernel is implemented using the `Eigen unsupported` module, then at the top of the `.cu` file the macro definition `#define EIGEN_USE_GPU` is needed, such as + + ```cpp + // if use Eigen unsupported module before include head files + #define EIGEN_USE_GPU + + namespace ops = paddle::operators; + REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_GPU_KERNEL(mul_grad, + ops::MulGradKernel); + ``` + +### 5. Compilation + +Run the following command to compile. + +``` +make mul_op +``` + +## Python Binding + +The system will automatically bind to Python and link it to a generated library. + +## Unit Tests + +Unit tests for an operator include + +1. comparing a forward operator's implementations on different devices, + +2. comparing a backward operator's implementation on different devices, and + +3. a scaling test for the backward operator. + +Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py). + +### Testing Forward Operators + +A forward operator unit test inherits `unittest.TestCase` and defines metaclass `__metaclass__ = OpTestMeta`. More concrete tests are performed in `OpTestMeta`. Testing a forward operator requires the following: + +1. Defining input, output and relevant attributes in the `setUp` method. + +2. Generating random input data. + +3.
Implementing the same computation logic in a Python script: + + ```python + import unittest + import numpy as np + from gradient_checker import GradientChecker, create_op + from op_test_util import OpTestMeta + + class TestMulOp(unittest.TestCase): + __metaclass__ = OpTestMeta + + def setUp(self): + self.type = "mul" + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + ``` +Get its output, and compare it with the forward operator's own output. + +The code above first loads required packages. In addition, we have + +- `self.type = "mul" ` defines the type, which must be identical to the operator's registered type. +- `self.inputs` defines the input, with type `numpy.array`, and initializes it. +- `self.outputs` defines the output, performs the same computation in the Python script, and returns the result. + +### Testing Backward Operators + +A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to have the prefix `test_`**. + +```python +class TestMulGradOp(GradientChecker): + def setUp(self): + self.op = create_op("mul") + self.inputs = { + 'X': np.random.random((32, 84)).astype("float32"), + 'Y': np.random.random((84, 100)).astype("float32") + } + + def test_check_grad_normal(self): + # mul op will enlarge the relative error + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) +``` + +Some key points in the code above include: + +- `create_op("mul")` creates the backward operator's corresponding forward operator. +- `test_check_grad_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. + - The first variable `["X", "Y"]` specifies `X` and `Y` to be scale tested. + - The second variable `"Out"` points to the network's final output target `Out`. + - The third variable `max_relative_error` points to the maximum relative tolerance error during scaling tests. +- The `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where there is only one scaling input. + +### Compiling and Running + + +Any new unit testing file of the format `test_*.py` added to the directory `python/paddle/v2/framework/tests` is automatically added to the project to compile. + +Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with the flag `WITH_TESTING` on, i.e., `cmake paddle_dir -DWITH_TESTING=ON`. + +After successfully compiling the project, run the following command to run unit tests: + +```bash +make test ARGS="-R test_mul_op -V" +``` + +Or, + +```bash +ctest -R test_mul_op +``` + +## Remarks + +- Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. +- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. +- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. diff --git a/doc/_sources/howto/dev/use_eigen_en.md.txt b/doc/_sources/howto/dev/use_eigen_en.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..e169106e12f5d62696f1f0e7163562793b32c18c --- /dev/null +++ b/doc/_sources/howto/dev/use_eigen_en.md.txt @@ -0,0 +1,146 @@ +## How to use Eigen in Paddle + +Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. + + +### Eigen Tensor Module + +The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. + +Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse. + +For details on the Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). + + +### paddle::framework::Tensor + +Paddle's Tensor is defined in the framework directory with the following interface: + +```cpp +class Tensor { + public: + /*! Return a pointer to mutable memory block. */ + template <typename T> + inline T* data(); + + /** + * @brief Return a pointer to mutable memory block. + * @note If not exist, then allocation. + */ + template <typename T> + inline T* mutable_data(platform::Place place); + + /** + * @brief Return a pointer to mutable memory block. + * + * @param[in] dims The dimensions of the memory block. + * @param[in] place The place of the memory block. + * + * @note If not exist, then allocation. + */ + template <typename T> + inline T* mutable_data(DDim dims, platform::Place place); + + /*! Resize the dimensions of the memory block. */ + inline Tensor& Resize(const DDim& dims); + + /*! Return the dimensions of the memory block. */ + inline const DDim& dims() const; + + private: + /*! holds the memory block if allocated. */ + std::shared_ptr<Placeholder> holder_; + + /*! points to dimensions of memory block. */ + DDim dim_; +}; +``` + +`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory. + +```cpp +paddle::framework::Tensor t; +paddle::platform::CPUPlace place; +// set size first +t.Resize({2, 3}); +// allocate memory on CPU later +t.mutable_data<float>(place); +``` + +### paddle::framework::Tensor Usage +`AddOp` demonstrates Tensor's usage. + +- InferShape + +When computing a neural network's compute graph, first call every `Operator`'s `InferShape` method, and use `Resize` to configure the size of the output tensor.
+ +```cpp +void InferShape(const framework::InferShapeContext &ctx) const override { + PADDLE_ENFORCE_EQ(ctx.Input("X")->dims(), + ctx.Input("Y")->dims(), + "Two input of Add Op's dimension must be same."); + ctx.Output("Out")->Resize(ctx.Input("X")->dims()); +} +``` + + +- Run + +```cpp +void Compute(const framework::ExecutionContext& context) const override { + auto* input0 = context.Input("X"); + auto* input1 = context.Input("Y"); + auto* output = context.Output("Out"); + + output->mutable_data(context.GetPlace()); + + auto x = EigenVector::Flatten(*input0); + auto y = EigenVector::Flatten(*input1); + auto z = EigenVector::Flatten(*output); + + auto place = context.GetEigenDevice(); + + z.device(place) = x + y; +} +``` + + +### Converting paddle::framework::Tensor to EigenTensor + +As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. + +Using EigenTensor as an example: + +```cpp +Tensor t; +float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace()); +for (int i = 0; i < 1 * 2 * 3; i++) { + p[i] = static_cast<float>(i); +} + +EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t); +``` + +`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation. + +In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor, while `EigenVector::Flatten` reshapes a Paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. + +For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file. + + + +### Implementing Computation + +When computing, the device interface is needed for the EigenTensors on the left-hand side of the assignments. Note that the computation between EigenTensors only changes the data originally in the Tensor and does not change the shape information associated with the Tensor. + +```cpp +auto x = EigenVector::Flatten(*input0); +auto y = EigenVector::Flatten(*input1); +auto z = EigenVector::Flatten(*output); +auto place = context.GetEigenDevice(); +z.device(place) = x + y; +``` + +In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We are calling Flatten from EigenVector, transforming a tensor of any dimension into a 1-dimensional EigenVector. After completing computation, input0/input1/output will retain the same shape information, and they can be resized using the `Resize` interface. + +Because the Eigen Tensor module is under-documented, please refer to `OpKernel`'s computation code in TensorFlow's [kernel module documentation](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels).
diff --git a/doc/_sources/howto/dev/write_docs_en.rst.txt b/doc/_sources/howto/dev/write_docs_en.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3ef07eb1d0012827df8e6a4f27c5fa643649492 --- /dev/null +++ b/doc/_sources/howto/dev/write_docs_en.rst.txt @@ -0,0 +1,80 @@ +######################## +Contribute Documentation +######################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_ ; the compiled documentation will be stored under the ``doc`` and ``doc_cn`` directories. +When using the PaddlePaddle.org tool to compile documentation, the compiled documentation will be stored under a consolidated directory: .ppo_workspace/content + +How to Build Documentation +========================== + +We recommend using the PaddlePaddle.org tool to build documentation. + + +Use the PaddlePaddle.org tool +----------------------------- +This is the recommended method to build documentation: it compiles the documentation and previews it in a web browser. + +The tool uses Docker, so please install Docker on your system first; check the official Docker website on how to install it. You may use the following commands to activate the tool: + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories. You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest + +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command. +Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation. +The compiled documentation will be stored in /.ppo_workspace/content + + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following commands to set it up: + +.. code-block:: bash + + mkdir paddlepaddle # Create paddlepaddle working directory + cd paddlepaddle + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation. +The compiled documentation will be stored in /.ppo_workspace/content + +If you want to learn more about PaddlePaddle.org, please `click here `_. + +How to Write Documentation +========================== + +PaddlePaddle uses `sphinx`_ to compile documentation; please check the sphinx official website for more detail. + + +How to update www.paddlepaddle.org +================================== + +Please create PRs and submit them to GitHub; please check `Contribute Code `_. +The PaddlePaddle develop branch will update the documentation once the PR is merged.
Users may check the latest `Chinese Docs `_ and +`English Docs `_. + +.. _cmake: https://cmake.org/ +.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/_sources/howto/index_en.rst.txt b/doc/_sources/howto/index_en.rst.txt index 1fbfcd260b912078f00ed5b720ed607db725c4e2..61bf25ccd12eeedffc747fdd4ce84fa4adde07ee 100644 --- a/doc/_sources/howto/index_en.rst.txt +++ b/doc/_sources/howto/index_en.rst.txt @@ -20,6 +20,7 @@ Development dev/new_layer_en.rst dev/contribute_to_paddle_en.md + dev/write_docs_en.rst Configuration ------------- diff --git a/doc/_sources/howto/optimization/cpu_profiling.md.txt b/doc/_sources/howto/optimization/cpu_profiling.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..1775374cf6e518586c28bbd8e04946c74df7e4c5 --- /dev/null +++ b/doc/_sources/howto/optimization/cpu_profiling.md.txt @@ -0,0 +1,197 @@ +This tutorial introduces techniques we use to profile and tune the +CPU performance of PaddlePaddle. We will use the Python packages +`cProfile` and `yep`, and Google's `perftools`. + +Profiling is the process that reveals the real performance bottlenecks, +which could be very different from what's in the developers' minds. +Performance tuning is done to fix these bottlenecks. Performance optimization +alternates between the steps of profiling and tuning. + +PaddlePaddle users program AI applications by calling the Python API, which calls +into `libpaddle.so`, written in C++. In this tutorial, we focus on +the profiling and tuning of + +1. the Python code and +1. the mixture of Python and C++ code. + +## Profiling the Python Code + +### Generate the Performance Profiling File + +We can use the standard Python +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate the Python profiling file. For example: + +```bash +python -m cProfile -o profile.out main.py +``` + +where `main.py` is the program we are going to profile, and `-o` specifies +the output file. Without `-o`, `cProfile` would output to standard +output. + +### Look into the Profiling File + +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file.
+ +Open a Web browser and point it to the local IP and the specified +port; we will see output like the following: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +where each line corresponds to a Python function, and the meaning of +each column is as follows: + +| column | meaning | +| --- | --- | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | + +### Identify Performance Bottlenecks + +Usually, `tottime` and the related `percall` time are what we want to +focus on. We can sort the above profiling file by `tottime`: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. For now, +let's look into `sync_with_cpp` further down the list, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +The list of callers of `sync_with_cpp` might help us understand +how to improve the function definition. + +## Profiling Python and C++ Code + +### Generate the Profiling File + +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code.
+ +On Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +Then we can run the following command + +```bash +python -m yep -v main.py +``` + +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By examining the +printed result, we can tell whether debug information was stripped +from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: + +1. Use the GCC command line option `-g` when building `libpaddle.so` so as to + include the debug information. The standard build system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. + +1. Use the GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would run slowly anyway. + +1. Profile the single-threaded binary before the + multi-threaded version, because the latter often generates tangled + profiling results. You might want to set the environment + variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically + starting multiple threads. + +### Examining the Profiling File + +The tool we use to examine the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: + +```bash +go get github.com/google/pprof +``` + +Then we can use it to examine `main.py.prof` generated in the previous +section: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: + +![result](./pprof_1.png) + +### Identifying the Performance Bottlenecks + +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. + +![kernel_perf](./pprof_2.png) + +We can see that the execution time of multiplication and the computation +of the gradient of multiplication take 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. + +`pprof` would mark performance-critical parts of the program in +red. It's a good idea to follow the hints.
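+ +If you prefer inspecting `profile.out` programmatically instead of through `cprofilev`, the standard `pstats` module can sort and print the same data; a minimal sketch, assuming `profile.out` was generated as in the earlier section: + +```python +import pstats + +# Load the file written by `python -m cProfile -o profile.out main.py` +# and print the 10 entries with the largest internal (tottime) cost. +stats = pstats.Stats("profile.out") +stats.sort_stats("tottime").print_stats(10) +```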
diff --git a/doc/_sources/howto/usage/cluster/cluster_train_en.md.txt b/doc/_sources/howto/usage/cluster/cluster_train_en.md.txt index c60876721cbf5565d6e48c8061811aacada748cd..baa97c0c02ae490fff8587071bd2d4adfb5325e3 100644 --- a/doc/_sources/howto/usage/cluster/cluster_train_en.md.txt +++ b/doc/_sources/howto/usage/cluster/cluster_train_en.md.txt @@ -1,129 +1,220 @@ -# Run Distributed Training +# PaddlePaddle Distributed Training + +* [Introduction](#introduction) +* [Preparations](#preparations) +* [Command-line arguments](#command-line-arguments) + * [Starting parameter server](#starting-parameter-server) + * [Starting trainer](#starting-trainer) + * [Prepare Training Dataset](#prepare-training-dataset) + * [Prepare Training program](#prepare-training-program) +* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools) + * [Cluster Training Using Fabric](#cluster-training-using-fabric) + * [Prepare a Linux cluster](#prepare-a-linux-cluster) + * [Launching Cluster Job](#launching-cluster-job) + * [Kill Cluster Job](#kill-cluster-job) + * [Check Cluster Training Result](#check-cluster-training-result) + * [Check Model Output](#check-model-output) + * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi) + * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster) + * [Launching Cluster Job](#launching-cluster-job-1) + * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes) + +## Introduction + +In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job: + + + +- Data shard: training data will be split into multiple partitions; each trainer uses its partitions of the whole dataset to do the training job. +- Trainer: each trainer reads its data shard and trains the neural network. The trainer then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues training. +- Parameter server: every parameter server stores part of the whole neural network model. The servers perform optimization calculations when gradients are uploaded from trainers, and then send the updated parameters back to the trainers. + +PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD. + +When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient upload and parameter download happen in strict order. Asynchronous SGD, on the other hand, won't wait for all trainers to finish uploading at a single step, which increases the parallelism of distributed training: parameter servers do not depend on each other and optimize parameters concurrently; parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD introduces more randomness and noise into the gradients. + +## Preparations +1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by a LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes". +2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install the proper driver and CUDA libraries.
To install PaddlePaddle, please read the [build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using the [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst). + +After installation, you can check the version by typing the command below (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`): + +```bash +$ paddle version +PaddlePaddle 0.10.0rc, compiled with + with_avx: ON + with_gpu: OFF + with_double: OFF + with_python: ON + with_rdma: OFF + with_timer: OFF +``` -In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). +We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using the PaddlePaddle v2 API. -[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s). +## Command-line arguments -## Prerequisite +### Starting parameter server -1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands. We can use `pip` to install fabric: +Type the command below to start a parameter server, which will wait for trainers to connect: - ```bash - pip install fabric - ``` +```bash +$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 +``` -1. We need to install PaddlePaddle on all nodes in the cluster. To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime. +If you wish to run parameter servers in the background and save a log file, you can type: +```bash +$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log +``` -1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes. For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`. In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password. +| param | required | default | description | +| ------------- | ------------- | ------------- | ------------- | +| port | required | 7164 | port which the parameter server will listen on. If ports_num is greater than 1, the parameter server will listen on multiple ports for more network throughput | +| ports_num | required | 1 | total number of ports to listen on | +| ports_num_for_sparse | required | 1 | number of ports which serve sparse parameter updates | +| num_gradient_servers | required | 1 | total number of gradient servers | -## Prepare Job Workspace +### Starting trainer +Type the command below to start the trainer (name the file whatever you want, like "train.py"): -We refer to the directory where we put dependent libraries, config files, etc., as *workspace*. +```bash +$ python train.py +``` -These `train/test` data should be prepared before launching cluster job.
To satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files, and all nodes in cluster job will handle files with same logical code in normal condition. +The trainers' network needs to be connected with the parameter servers' network to finish the job. Trainers need to know the ports and IPs to locate the parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or through the `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables. -Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used. +Use environment variables: -Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory. +```bash +export PADDLE_INIT_USE_GPU=False +export PADDLE_INIT_TRAINER_COUNT=1 +export PADDLE_INIT_PORT=7164 +export PADDLE_INIT_PORTS_NUM=1 +export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1 +export PADDLE_INIT_NUM_GRADIENT_SERVERS=1 +export PADDLE_INIT_TRAINER_ID=0 +export PADDLE_INIT_PSERVERS=127.0.0.1 +python train.py +``` -You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training. +Pass arguments: -At last your workspace should look like as follow: +```python +paddle.init( + use_gpu=False, + trainer_count=1, + port=7164, + ports_num=1, + ports_num_for_sparse=1, + num_gradient_servers=1, + trainer_id=0, + pservers="127.0.0.1") ``` -. -|-- common_utils.py -|-- data -| |-- config.json -| |-- config_generator.py -| |-- meta.bin -| |-- meta_config.json -| |-- meta_generator.py -| |-- ml-1m -| |-- ml_data.sh -| |-- ratings.dat.test -| |-- ratings.dat.train -| |-- split.py -| |-- test.list -| `-- train.list -|-- dataprovider.py -|-- evaluate.sh -|-- prediction.py -|-- preprocess.sh -|-- requirements.txt -|-- run.sh -`-- trainer_config.py + +| param | required | default | description | +| ------------- | ------------- | ------------- | ------------- | +| use_gpu | optional | False | set to "True" to enable GPU training | +| trainer_count | required | 1 | total count of trainers in the training job | +| port | required | 7164 | port to connect to the parameter server | +| ports_num | required | 1 | number of ports for communication | +| ports_num_for_sparse | required | 1 | number of ports for sparse type calculation | +| num_gradient_servers | required | 1 | total number of gradient servers | +| trainer_id | required | 0 | ID for every trainer, starting from 0 | +| pservers | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," | + +### Prepare Training Dataset + +Here's some example code, [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py): it will download the public `imikolov` dataset and split it into multiple files according to the job parallelism (trainer count).
Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files. + +In the real world, we often use the output of `MapReduce` jobs as training data, so there will be lots of files. You can use `mod` to assign training files to trainers: + +```python +import os +train_list = [] +flist = os.listdir("/train_data/") +for f in flist: + suffix = int(f.split("-")[1]) + if suffix % TRAINER_COUNT == TRAINER_ID: + train_list.append(f) ``` -Not all of these files are needed for cluster training, but it's not necessary to remove useless files. -`trainer_config.py` -Indicates the model config file. +Example code `prepare.py` will split training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001`, and `-00002`: -`train.list` and `test.list` -File index. It stores all relative or absolute file paths of all train/test data at current node. +``` +train.txt +train.txt-00000 +train.txt-00001 +train.txt-00002 +test.txt +test.txt-00000 +test.txt-00001 +test.txt-00002 +``` -`dataprovider.py` -used to read train/test samples. It's same as local training. +When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service will be provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node. -`data` -all files in data directory are refered by train.list/test.list which are refered by data provider. +Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their jobs (see the sketch at the end of this section). +### Prepare Training program -## Prepare Cluster Job Configuration +We'll create a *workspace* directory on each node, storing your training program, dependencies, and mounted or downloaded dataset directories. -The options below must be carefully set in cluster_train/conf.py -`HOSTS` all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090. +Your workspace may look like: +``` +. +|-- my_lib.py +|-- word_dict.pickle +|-- train.py +|-- train_data_dir/ +| |-- train.txt-00000 +| |-- train.txt-00001 +| |-- train.txt-00002 +`-- test_data_dir/ + |-- test.txt-00000 + |-- test.txt-00001 + `-- test.txt-00002 +``` -`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory +- `my_lib.py`: user-defined libraries, like PIL libs. This is optional. +- `word_dict.pickle`: dict file for training the word embedding. +- `train.py`: the training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py). ***NOTE:*** You may need to modify the head part of `train.py` when using a different cluster platform, to retrieve configuration environment variables: -`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband. + ```python + cluster_train_file = "./train_data_dir/train/train.txt" + cluster_test_file = "./test_data_dir/test/test.txt" + node_id = os.getenv("OMPI_COMM_WORLD_RANK") + if not node_id: + raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK") + ``` -`PADDLE_PORT` port number for cluster commnunication channel +- `train_data_dir`: contains the training data. Mount it from a storage service or copy the training data here. +- `test_data_dir`: contains the testing data.
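+ +As a hedged sketch of the shard-aware `reader()` idea above (the file layout and the one-sample-per-line integer format are assumptions for illustration, not part of the official example): + +```python +import os + +def shard_reader(data_dir, trainer_id, trainer_count): + # Build a reader over only this trainer's shard of the files, + # using the same suffix-mod assignment as the snippet above. + files = [f for f in sorted(os.listdir(data_dir)) + if "-" in f and int(f.split("-")[-1]) % trainer_count == trainer_id] + def reader(): + for name in files: + with open(os.path.join(data_dir, name)) as f: + for line in f: + # Assume one whitespace-separated integer sample per line. + yield [int(w) for w in line.split()] + return reader +```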
-`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance. +## Use cluster platforms or cluster management tools -`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM` +PaddlePaddle supports running jobs on several platforms including: +- [Kubernetes](http://kubernetes.io): an open-source system from Google for automating deployment, scaling, and management of containerized applications. +- [OpenMPI](https://www.open-mpi.org): a mature high-performance parallel computing framework. +- [Fabric](http://www.fabfile.org): a cluster management tool; write scripts to submit jobs or manage the cluster. -`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path. +We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2). -Default Configuration as follow: +These cluster platforms provide APIs or environment variables for training processes when the job is dispatched to different nodes, such as the node ID, IP address, or total number of nodes. -```python -HOSTS = [ - "root@192.168.100.17", - "root@192.168.100.18", - ] - -''' -workspace configuration -''' - -#root dir for workspace -ROOT_DIR = "/home/paddle" - -''' -network configuration -''' -#pserver nics -PADDLE_NIC = "eth0" -#pserver port -PADDLE_PORT = 7164 -#pserver ports num -PADDLE_PORTS_NUM = 2 -#pserver sparse ports num -PADDLE_PORTS_NUM_FOR_SPARSE = 2 - -#environments setting for all processes in cluster job -LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64" -``` +### Cluster Training Using Fabric -### Launching Cluster Job -`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes. +#### Prepare a Linux cluster + +Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes. + +#### Launching Cluster Job +`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass these options to PaddlePaddle's lower-level processes. `paddle.py` provides two distinguished command options for easy job launching. -`job_dispatch_package` set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy. -`job_workspace` set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy +- `job_dispatch_package` set it with a local `workspace` directory; it will be dispatched to all nodes set in `conf.py`. It could be helpful for frequently manipulating workspace files;
otherwise, frequent multi-node workspace deployment is very annoying. +- `job_workspace` set it with an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. It could help to reduce heavy dispatch latency. `cluster_train/run.sh` provides a command line sample to run the `demo/recommendation` cluster job; just modify `job_dispatch_package` and `job_workspace` with your defined directory, then: @@ -133,24 +224,70 @@ sh run.sh The cluster Job will start in several seconds. -### Kill Cluster Job -`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed. +#### Kill Cluster Job +`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill the cluster job. You should manually kill the job if the program crashed. -### Check Cluster Training Result +#### Check Cluster Training Result Check the log in $workspace/log for details; each node owns the same log structure. `paddle_trainer.INFO` -It provides almost all interal output log for training, same as local training. Check runtime model convergence here. +It provides almost all internal output logs for training, the same as local training. Check runtime model convergence here. `paddle_pserver2.INFO` -It provides pserver running log, which could help to diagnose distributed error. +It provides the parameter server running log, which could help to diagnose distributed errors. `server.log` -It provides stderr and stdout of pserver process. Check error log if training crashs. +It provides the stderr and stdout of the parameter server process. Check the error log if training crashes. `train.log` -It provides stderr and stdout of trainer process. Check error log if training crashs. +It provides the stderr and stdout of the trainer process. Check the error log if training crashes. -### Check Model Output -After one pass finished, model files will be writed in `output` directory in node 0. +#### Check Model Output +After one pass finishes, model files will be written to the `output` directory on node 0. `nodefile` in the workspace indicates the node id of the current cluster job. + +### Cluster Training Using OpenMPI + +#### Prepare an OpenMPI cluster + +Run the following command to start a 3-node MPI cluster and one "head" node. + +```bash +cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster +kubectl create -f head.yaml +kubectl create -f mpi-nodes.yaml +``` + +Then you can log in to every OpenMPI node using ssh without entering any password.
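+ +Before launching, it can be worth a quick sanity check that passwordless SSH really works; a minimal probe, using the same `[nodeIP]` placeholder convention as the commands below (the node IP is an assumption you fill in): + +```bash +# Should print "ok" immediately; BatchMode makes ssh fail instead of prompting. +ssh -o BatchMode=yes tutorial@[nodeIP] "echo ok" +```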
+ +#### Launching Cluster Job + +Follow the steps to launch a PaddlePaddle training job in OpenMPI cluster:\ + +```bash +# find out node IP addresses +kubectl get po -o wide +# generate a "machines" file containing node IP addresses +kubectl get po -o wide | grep nodes | awk '{print $6}' > machines +# copy necessary files onto "head" node +scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~ +# login to head node using ssh +ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP] +# --------------- in head node --------------- +# prepare training data +python prepare.py +# copy training data and dict file to MPI nodes +cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial +# creat a directory for storing log files +mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs +# copy training data to every node +scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial +scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial +scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial +# start the job +mpirun -hostfile machines -n 3 /home/tutorial/start_mpi_train.sh +``` + +### Cluster Training Using Kubernetes + +The details can be found [here](../k8s/k8s_cn.md) diff --git a/doc/_sources/index_en.rst.txt b/doc/_sources/index_en.rst.txt index 168c7667c61da677905585d6c4b5037ce80b3765..23b64b6cadf776d44c4d0aa5a550ffe24be13b18 100644 --- a/doc/_sources/index_en.rst.txt +++ b/doc/_sources/index_en.rst.txt @@ -7,4 +7,4 @@ PaddlePaddle Documentation getstarted/index_en.rst howto/index_en.rst api/index_en.rst - about/index_en.rst + mobile/index_en.rst diff --git a/doc/_sources/mobile/cross_compiling_for_android_en.md.txt b/doc/_sources/mobile/cross_compiling_for_android_en.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..26858581fc1d77a9391520ac0dfd80fbd98f508c --- /dev/null +++ b/doc/_sources/mobile/cross_compiling_for_android_en.md.txt @@ -0,0 +1,175 @@ +# Build PaddlePaddle for Android + +There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker. + +## Cross-Compiling Using Docker + +Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows. + +### Build the Docker Image + +The following steps pack all the tools that we need to build PaddlePaddle into a Docker image. + +```bash +$ git clone https://github.com/PaddlePaddle/Paddle.git +$ cd Paddle +$ docker build -t paddle:dev-android . -f Dockerfile.android +``` + +### Build the Inference Library + +We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below: + +```bash +$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android +``` + +The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: + + ++ + + + + + + + + + + + + + + + + + + + + + + +
+| Argument | Optional Values | Default | +| -------- | --------------- | ------- | +| `ANDROID_ABI` | `armeabi-v7a`, `arm64-v8a` | `armeabi-v7a` | +| `ANDROID_API` | `>= 21` | `21` |
+ +The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of the Android API. + +The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh), generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`. For information about other configuration arguments, please continue reading. + +The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`. + +## Cross-Compiling on Linux + +The Linux-based approach to cross-compile is to run the steps in `Dockerfile.android` manually on a Linux x64 computer. + +### Setup the Environment + +To build for Android, we need the [Android NDK]( +https://developer.android.com/ndk/downloads/index.html): + +```bash +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +``` + +The Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android. (We plan to remove the intermediate stage of building the standalone toolchain in the near future.) + +- To build the standalone toolchain for `armeabi-v7a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`. + +- To build the standalone toolchain for `arm64-v8a` and Android API level 21: + + ```bash + your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \ + --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain + ``` + + The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`. + +**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.** + +### Cross-Compiling Arguments + +CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version and, for versions newer than 3.7, uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling). + +Some other CMake arguments you need to know: + +- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`. +- `WITH_C_API` must be `ON`, to build the C-based inference library for Android. +- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support the SWIG-based API. + +Some Android-specific arguments: + +- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory.
PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument. +- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`. The default value is `clang`. + - For CMake >= 3.7, it should anyway be `clang`. For older versions, it could be `gcc`. + - Android's official `clang` requires `glibc` >= 2.15. +- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`. The default value is `armeabi-v7a`. +- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`. +- `ANDROID_ARM_MODE`: + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. +- `ANDROID_ARM_NEON`: indicates whether to use NEON instructions. + - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`; + - no need to specify when `ANDROID_ABI=arm64-v8a`. + +Other useful arguments: + +- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen. Could be `ON` or `OFF`, defaults to `OFF`. +- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`. + +Some frequent configurations for your reference: + +```bash +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DUSE_EIGEN_FOR_BLAS=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +``` +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \ + -DANDROID_ABI=arm64-v8a \ + -DUSE_EIGEN_FOR_BLAS=OFF \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + .. +``` + + +There are some other arguments you might want to configure. + +- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of the library. +- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance. + +Our own tips for performance optimization, using clang with Eigen or OpenBLAS: +- `CMAKE_BUILD_TYPE=Release` +- `ANDROID_TOOLCHAIN=clang` +- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`. + +### Build and Install + +After running `cmake`, we can run `make; make install` to build and install. + +Before building, you might want to remove the `third_party` and `build` directories containing pre-built libraries for other architectures. + +After building, in the directory `CMAKE_INSTALL_PREFIX`, you will find three sub-directories: + +- `include`: the header files of the inference library, +- `lib`: the inference library built for various Android ABIs, +- `third_party`: dependent third-party libraries built for Android. diff --git a/doc/_sources/mobile/cross_compiling_for_raspberry_en.md.txt b/doc/_sources/mobile/cross_compiling_for_raspberry_en.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c1a5950ff9553bb725d5a96e3fdf2e5e9f6f95c --- /dev/null +++ b/doc/_sources/mobile/cross_compiling_for_raspberry_en.md.txt @@ -0,0 +1,62 @@ +# Build PaddlePaddle for Raspberry Pi + +You may use either of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi: + +1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library.
The required development tools and third-party dependencies are listed here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile). + +1. Cross-compile: We talk about how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine, in more detail, in this article. + +## The Cross-Compiling Toolchain + +Step 1. Clone the GitHub repo by running the following command. + +```bash +git clone https://github.com/raspberrypi/tools.git +``` + +Step 2. Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`. To run it on a Linux computer, glibc version >= 2.14 is needed. + +## CMake Arguments + +CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake). + +Some important arguments that need to be set: + +- `CMAKE_SYSTEM_NAME`: The target platform. Must be `RPi`. + +- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain. + +- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument that defaults to `ON`. + +- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host. It is used to build the build tools that run on the host, for example, protoc. + +A commonly-used CMake configuration is as follows: + +``` +cmake -DCMAKE_SYSTEM_NAME=RPi \ + -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \ + -DRPI_ARM_NEON=ON \ + -DCMAKE_INSTALL_PREFIX=your/path/to/install \ + -DWITH_GPU=OFF \ + -DWITH_C_API=ON \ + -DWITH_PYTHON=OFF \ + -DWITH_SWIG_PY=OFF \ + .. +``` + +To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`. + +You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`. + +## Build and Install + +The following commands build the inference library of PaddlePaddle for Raspberry Pi and its third-party dependencies. + +```bash +make +make install +``` + +The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built for other platforms like Android or iOS, you may want to clear these directories by running the command `rm -rf build`. + +The inference library will be in `your/path/to/install/lib`, with related header files in `your/path/to/install/include`. diff --git a/doc/_sources/mobile/index_en.rst.txt b/doc/_sources/mobile/index_en.rst.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c08d736717cfe8d5fdf449dc58015086befbe60 --- /dev/null +++ b/doc/_sources/mobile/index_en.rst.txt @@ -0,0 +1,8 @@ +MOBILE +====== + +..
toctree:: + :maxdepth: 1 + + cross_compiling_for_android_en.md + cross_compiling_for_raspberry_en.md diff --git a/doc/_sources/survey/cluster_bootstrapping_tools.md.txt b/doc/_sources/survey/cluster_bootstrapping_tools.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cd9962700bb49866f1ed6987abc28b27888a23f --- /dev/null +++ b/doc/_sources/survey/cluster_bootstrapping_tools.md.txt @@ -0,0 +1,71 @@ +# Cluster bootstrapping tool survey +## Abstract +In order to bring up a cluster from bare metal machines to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer). + +## Basic assumptions +Here are some basic assumptions before we move on to the details: +1. You are an administrator of a bare metal machine cluster, which means: + * you have full control of each of the machines. + * you have full control of the network the machines are connected to. +2. Machines can be booted from the network with PXE or iPXE. +3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster). + +If your cluster can check off all of the above items, then keep reading. + +## Comparing Sextant and Tectonic installer +### Sextant +Sextant is an end-to-end solution for bringing up a bare metal cluster as a fully functional k8s cluster; it integrates DHCP, name service, PXE, cloud-config service, and docker registry services altogether. + +#### Pros +1. End-to-end: basically all the admin needs to do is configure cluster.yaml and power on the cluster. +2. Offline cluster configuration: Sextant has 2 phases, config time and deploy time. While the admin is configuring, the admin's machine needs internet connectivity to download some images, etc. But at deploy time, it's completely OK to go offline, since all dependencies are made ready during config time. +3. Docker registry integrated. +4. GPU machines are taken care of. + +#### Cons +1. The k8s API server is not deployed with high availability by default. +2. No grouping support. +3. No API interface; a one-off service. + + +### Tectonic installer +First of all, Tectonic is not free: it requires a coreos.com account as a step of installation, and free users can only create fewer than 10 nodes. + +Tectonic is a suite of software which wraps around k8s and provides more utility regarding dev ops. The Tectonic installer, as it's named, installs Tectonic to a bare metal cluster, which means it's not totally an equivalent of Sextant. For the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper. + +Matchbox's approach is similar to Sextant's. + +#### Pros +1. Supports grouping machines. +2. Supports running the provisioning service in rkt (not a big deal though). +3. Supports an http/gRPC API interface. +4. Supports multiple templates. + +#### Cons +1. Not an end-to-end solution for bringing up a cluster; it needs a lot of extra work and other software. +2. [Not fully supporting](https://github.com/coreos/matchbox/issues/550) CentOS deployment yet. + +## Conclusion +Sextant is the better solution overall for deploying Paddle Cloud to a bare metal cluster. It would be great if Sextant could also 1) deploy the k8s API server with high availability by default, and 2) not be designed as a one-off service.
+ + + +## Appendix: General procedure to bring up a cluster +It's physically impossible for a cluster admin to manually install the OS and applications on cluster nodes one by one, so here is what an admin would do in the cloud industry: +1. Set up a bootstrap machine with a static IP in the cluster, which has the following services: + * DHCP: assigns IP addresses for the rest of the nodes. + * Name service: maps node names to IPs. + * PXE-related services: booting-related info will be delivered to newly booted machines as their IP is assigned via the DHCP service; the PXE service will provide further booting and installation info and images over the TFTP and HTTP protocols. + * Cluster config service: provides cluster nodes with OS configs via HTTP. + * Optional docker registry: a built-in docker registry makes the whole cluster independent from internet connectivity, and speeds up software distribution. +2. A new node powers on, and it will: + * broadcast a request for an IP address; + * the DHCP server assigns the IP address and delivers the PXE booting-related info to the node; + * the cluster node requests config files with the booting info delivered via DHCP over the TFTP service, and in most cases the config file points to an HTTP service for the booting image; + * since PXE is configured with initrd, it will utilize the cloud config service and do further installations like coreOS or k8s installations; + * then the node restarts. + +For further understanding, the following 2 links from Matchbox are good reading: +* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md) +* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md) diff --git a/doc/_sources/tutorials/image_classification/index_en.md.txt b/doc/_sources/tutorials/image_classification/index_en.md.txt deleted file mode 100644 index 60c81a6a539944634773f38ec4c9a59709dd4afc..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/image_classification/index_en.md.txt +++ /dev/null @@ -1,221 +0,0 @@ -Image Classification Tutorial -============================== - -This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset. -As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result. -
![Image Classification](./image_classification.png)
- -## Data Preparation -First, download CIFAR-10 dataset. CIFAR-10 dataset can be downloaded from its official website. - - - -We have prepared a script to download and process CIFAR-10 dataset. The script will download CIFAR-10 dataset from the official dataset. -It will convert it to jpeg images and organize them into a directory with the required structure for the tutorial. Make sure that you have installed pillow and its dependents. -Consider the following commands: - -1. install pillow dependents - -```bash -sudo apt-get install libjpeg-dev -pip install pillow -``` - -2. download data and preparation - -```bash -cd demo/image_classification/data/ -sh download_cifar.sh -``` - -The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images. - -Here are the classes in the dataset, as well as 10 random images from each: -
![Image Classification](./cifar.png)
- - -After downloading and converting, we should find a directory (cifar-out) containing the dataset in the following format: - -``` -train ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -test ----airplane ----automobile ----bird ----cat ----deer ----dog ----frog ----horse ----ship ----truck -``` - -It has two directories:`train` and `test`. These two directories contain training data and testing data of CIFAR-10, respectively. Each of these two folders contains 10 sub-folders, ranging from `airplane` to `truck`. Each sub-folder contains images with the corresponding label. After the images are organized into this structure, we are ready to train an image classification model. - -## Preprocess -After the data has been downloaded, it needs to be pre-processed into the Paddle format. We can run the following command for preprocessing. - -``` -cd demo/image_classification/ -sh preprocess.sh -``` - -`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess image data. -```sh -export PYTHONPATH=$PYTHONPATH:../../ -data_dir=./data/cifar-out -python preprocess.py -i $data_dir -s 32 -c 1 -``` - -`./demo/image_classification/preprocess.py` has the following arguments - -- `-i` or `--input` specifes the input data directory. -- `-s` or `--size` specifies the processed size of images. -- `-c` or `--color` specifes whether images are color images or gray images. - - -## Model Training -We need to create a model config file before training the model. An example of the config file (vgg_16_cifar.py) is listed below. **Note**, it is slightly different from the `vgg_16_cifar.py` which also applies to the prediction. - -```python -from paddle.trainer_config_helpers import * -data_dir='data/cifar-out/batches/' -meta_path=data_dir+'batches.meta' -args = {'meta':meta_path, 'mean_img_size': 32, - 'img_size': 32, 'num_classes': 10, - 'use_jpeg': 1, 'color': "color"} -define_py_data_sources2(train_list=data_dir+"train.list", - test_list=data_dir+'test.list', - module='image_provider', - obj='processData', - args=args) -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128)) - -img = data_layer(name='image', size=3*32*32) -lbl = data_layer(name="label", size=10) -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -outputs(classification_cost(input=predict, label=lbl)) -``` - -The first line imports python functions for defining networks. -```python -from paddle.trainer_config_helpers import * -``` - -Then define an `define_py_data_sources2` which use python data provider -interface. The arguments in `args` are used in `image_provider.py` which -yeilds image data and transform them to Paddle. - - `meta`: the mean value of training set. - - `mean_img_size`: the size of mean feature map. - - `img_size`: the height and width of input image. - - `num_classes`: the number of classes. - - `use_jpeg`: the data storage type when preprocessing. - - `color`: specify color image. - -`settings` specifies the training algorithm. In the following example, -it specifies learning rate as 0.1, but divided by batch size, and the weight decay -is 0.0005 and multiplied by batch size. -```python -settings( - batch_size = 128, - learning_rate = 0.1 / 128.0, - learning_method = MomentumOptimizer(0.9), - regularization = L2Regularization(0.0005 * 128) -) -``` - -The `small_vgg` specifies the network. 
We use a small version of VGG convolutional network as our network -for classification. A description of VGG network can be found here [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/). -```python -# small_vgg is predined in trainer_config_helpers.network -predict = small_vgg(input_image=img, num_channels=3) -``` -After writing the config, we can train the model by running the script train.sh. - -```bash -config=vgg_16_cifar.py -output=./cifar_vgg_model -log=train.log - -paddle train \ ---config=$config \ ---dot_period=10 \ ---log_period=100 \ ---test_all_data_in_one_period=1 \ ---use_gpu=1 \ ---save_dir=$output \ -2>&1 | tee $log - -python -m paddle.utils.plotcurve -i $log > plot.png -``` - -- Here we use GPU mode to train. If you have no gpu environment, just set `use_gpu=0`. - -- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. The meaning of the other flags can be found in the documentation of the command line flags. - -- The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`. - - -After training finishes, the training and testing error curves will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below: - -
![Training and testing curves.](./plot.png)
- - -## Prediction -After we train the model, the model file as well as the model parameters are stored in path `./cifar_vgg_model/pass-%05d`. For example, the model of the 300-th pass is stored at `./cifar_vgg_model/pass-00299`. - -To make a prediction for an image, one can run `predict.sh` as follows. The script will output the label of the classfiication. - -``` -sh predict.sh -``` - -predict.sh: -``` -model=cifar_vgg_model/pass-00299/ -image=data/cifar-out/test/airplane/seaplane_s_000978.png -use_gpu=1 -python prediction.py $model $image $use_gpu -``` - -## Exercise -Train a image classification of birds using VGG model and CUB-200 dataset. The birds dataset can be downloaded here. It contains an image dataset with photos of 200 bird species (mostly North American). - - - - - - -## Delve into Details -### Convolutional Neural Network -A Convolutional Neural Network is a feedforward neural network that uses convolution layers. It is very suitable for building neural networks that process and understand images. A standard convolutional neural network is shown below: - -![Convolutional Neural Network](./lenet.png) - -Convolutional Neural Network contains the following layers: - -- Convolutional layer: It uses convolution operation to extract features from an image or a feature map. -- Pooling layer: It uses max-pooling to downsample feature maps. -- Fully Connected layer: It uses fully connected connections to transform features. - -Convolutional Neural Network achieves amazing performance for image classification because it exploits two important characteristics of images: *local correlation* and *spatial invariance*. By iteratively applying convolution and max-pooing operations, convolutional neural network can well represent these two characteristics of images. - - -For more details of how to define layers and their connections, please refer to the documentation of layers. diff --git a/doc/_sources/tutorials/index_en.md.txt b/doc/_sources/tutorials/index_en.md.txt deleted file mode 100644 index 77331a703b6f0fdf92921ebcc476325b7327e976..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/index_en.md.txt +++ /dev/null @@ -1,14 +0,0 @@ -# TUTORIALS -There are several examples and demos here. - -* [Quick Start](quick_start/index_en.md) -* [MovieLens Regression](rec/ml_regression_en.rst) -* [Image Classification](image_classification/index_en.md) -* [Sentiment Analysis](sentiment_analysis/index_en.md) -* [Semantic Role Labeling](semantic_role_labeling/index_en.md) -* [Text Generation](text_generation/index_en.md) -* [Image Auto-Generation](gan/index_en.md) - -## Model Zoo -* [ImageNet: ResNet](imagenet_model/resnet_model_en.md) -* [Embedding: Chinese Word](embedding_model/index_en.md) diff --git a/doc/_sources/tutorials/rec/ml_dataset_en.md.txt b/doc/_sources/tutorials/rec/ml_dataset_en.md.txt deleted file mode 100644 index 25dea5c4afbf1ce1c1ac6195cbd245b116459e2e..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/rec/ml_dataset_en.md.txt +++ /dev/null @@ -1,111 +0,0 @@ -```eval_rst -.. _demo_ml_dataset: -``` - -# MovieLens Dataset - -The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research. -The data set contains some user information, movie information, and many movie ratings from \[1-5\]. -The data sets have many version depending on the size of set. 
-We use the [MovieLens 1M Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as a demo dataset, which contains -1 million ratings from 6000 users on 4000 movies. Released 2/2003. - -## Dataset Features - -The [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) contains several feature fields. -The data files (which have the ".dat" extension) in the [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) -are essentially CSV files whose delimiter is "::". We quote the descriptions from the README here. - -### RATINGS FILE DESCRIPTION (ratings.dat) - - -All ratings are contained in the file "ratings.dat" and are in the -following format: - -UserID::MovieID::Rating::Timestamp - -- UserIDs range between 1 and 6040 -- MovieIDs range between 1 and 3952 -- Ratings are made on a 5-star scale (whole-star ratings only) -- Timestamp is represented in seconds since the epoch as returned by time(2) -- Each user has at least 20 ratings - -### USERS FILE DESCRIPTION (users.dat) - -User information is in the file "users.dat" and is in the following -format: - -UserID::Gender::Age::Occupation::Zip-code - -All demographic information is provided voluntarily by the users and is -not checked for accuracy. Only users who have provided some demographic -information are included in this data set. - -- Gender is denoted by a "M" for male and "F" for female -- Age is chosen from the following ranges: - - * 1: "Under 18" - * 18: "18-24" - * 25: "25-34" - * 35: "35-44" - * 45: "45-49" - * 50: "50-55" - * 56: "56+" - -- Occupation is chosen from the following choices: - - * 0: "other" or not specified - * 1: "academic/educator" - * 2: "artist" - * 3: "clerical/admin" - * 4: "college/grad student" - * 5: "customer service" - * 6: "doctor/health care" - * 7: "executive/managerial" - * 8: "farmer" - * 9: "homemaker" - * 10: "K-12 student" - * 11: "lawyer" - * 12: "programmer" - * 13: "retired" - * 14: "sales/marketing" - * 15: "scientist" - * 16: "self-employed" - * 17: "technician/engineer" - * 18: "tradesman/craftsman" - * 19: "unemployed" - * 20: "writer" - -### MOVIES FILE DESCRIPTION (movies.dat) - -Movie information is in the file "movies.dat" and is in the following -format: - -MovieID::Title::Genres - -- Titles are identical to titles provided by the IMDB (including -year of release) -- Genres are pipe-separated and are selected from the following genres: - - * Action - * Adventure - * Animation - * Children's - * Comedy - * Crime - * Documentary - * Drama - * Fantasy - * Film-Noir - * Horror - * Musical - * Mystery - * Romance - * Sci-Fi - * Thriller - * War - * Western - -- Some MovieIDs do not correspond to a movie due to accidental duplicate -entries and/or test entries -- Movies are mostly entered by hand, so errors and inconsistencies may exist diff --git a/doc/_sources/tutorials/rec/ml_regression_en.rst.txt b/doc/_sources/tutorials/rec/ml_regression_en.rst.txt deleted file mode 100644 index 993b9a516f134ff8b59e8755b721f76c8f32f0fd..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/rec/ml_regression_en.rst.txt +++ /dev/null @@ -1,348 +0,0 @@ -Regression MovieLens Rating -============================ - -Here we demonstrate a **Cosine Similarity Regression** job on the MovieLens dataset. -This demo shows how paddle does the (word) embedding job, -handles the similarity regression -and the character-level convolutional networks for text, and how paddle handles -multiple types of inputs. 
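Since the regression target here is a scaled cosine similarity between the learned user and movie feature vectors, the following plain-numpy sketch may help fix the idea. It is illustrative only: the vector names and the 5-star scale factor are assumptions for this sketch, not the demo's network code, which computes the cosine inside PaddlePaddle.

.. code-block:: python

    import numpy as np

    def scaled_cosine(user_vec, movie_vec, scale=5.0):
        # Cosine similarity between the two learned feature vectors,
        # rescaled toward the 1-5 star rating range (assumed scale).
        cos = np.dot(user_vec, movie_vec) / (
            np.linalg.norm(user_vec) * np.linalg.norm(movie_vec))
        return scale * cos

    # Toy usage with random vectors standing in for network outputs.
    rng = np.random.RandomState(0)
    print(scaled_cosine(rng.rand(8), rng.rand(8)))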
-Note that the model structure is not fine-tuned; it is just a demo to show how paddle works. - - -You are welcome to build a better demo -with PaddlePaddle, and to let us know how to make this demo better. - -Data Preparation -```````````````` -Download and extract dataset -'''''''''''''''''''''''''''' -We use :ref:`demo_ml_dataset` here. -To download and unzip the dataset, simply run the following commands. - -.. code-block:: bash - - cd demo/recommendation/data - ./ml_data.sh - -The directory structure of :code:`demo/recommendation/data/ml-1m` is: - -.. code-block:: text - - +--ml-1m - +--- movies.dat # movie features - +--- ratings.dat # ratings - +--- users.dat # user features - +--- README # dataset description - -Field config file -''''''''''''''''' -The **field config file** is used to specify the fields of the dataset and the file format, -i.e., **WHAT** type each field in each feature file is. - -The field config file of ml-1m is shown in :code:`demo/recommendation/data/config.json`. -It specifies the field types and file names: 1) there are four kinds of fields in the user file\: id, gender, age and occupation; -2) the file name is "users.dat", and the delimiter of the file is "::". - -.. include:: ../../../demo/recommendation/data/config.json - :code: json - :literal: - -Preprocess Data -``````````````` -You need to install some third-party Python libraries. -It is highly recommended to use virtualenv to make a clean Python environment. - -.. code-block:: bash - - pip install -r requirements.txt - -The general command for preprocessing the dataset is: - -.. code-block:: bash - - cd demo/recommendation - ./preprocess.sh - -The detailed steps are introduced as follows. - -Extract Movie/User features to python object -''''''''''''''''''''''''''''''''''''''''''''' - -There are many movie and user features in the MovieLens 1M dataset. -Each line of the rating file only provides a movie/user id to refer to each movie or user. -We process the movie/user feature files first, and pickle the feature (**Meta**) object as a file. - -Meta config file -................ - -The **meta config file** is used to specify **HOW** to parse each field in the dataset. -It can be translated from the field config file, or written by hand. -Its format can be either json or yaml. The parser automatically chooses the format by the extension name. - -To convert the field config file to a meta config file, just run: - -.. code-block:: bash - - cd demo/recommendation/data - python config_generator.py config.json > meta_config.json - -The meta config file is shown below: - -.. include:: ../../../demo/recommendation/data/meta_config.json - :code: json - :literal: - -There are two kinds of features in meta\: movie and user. - -* in the movie file, whose name is movies.dat - * we just split each line by "::" - * pos 0 is the id. - * pos 1 feature: - * the name is title. - * it uses a regex to parse this feature. - * it is a char-based word embedding feature. - * it is a sequence. - * pos 2 feature: - * the name is genres. - * the type is a one-hot dense vector. - * the dictionary is auto-generated during parsing; each key is split by '|' -* in the user file, whose name is users.dat - * we just split each line by "::" - * pos 0 is the id. - * pos 1 feature: - * the name is gender - * just a simple char-based embedding. - * pos 2 feature: - * the name is age - * just a whole-word embedding. - * embedding ids are sorted by word. - * pos 3 feature: - * the name is occupation. - * just a simple whole-word embedding. 
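The parsing rules above can be made concrete with a short, self-contained Python sketch. This is only an illustration of the described rules applied to one movies.dat line; the toy genre dictionary is hypothetical, and this is not the demo's generated parser.

.. code-block:: python

    import re

    # One raw line of movies.dat, delimited by "::".
    line = "1::Toy Story (1995)::Animation|Children's|Comedy"
    movie_id, title, genres = line.split('::')

    # Title: a char-based sequence feature; a regex strips the year.
    title_chars = list(re.match(r'(.*)\s\(\d{4}\)$', title).group(1))

    # Genres: a one-hot dense vector over a '|'-split dictionary.
    genre_dict = {'Animation': 0, "Children's": 1, 'Comedy': 2}  # toy dict
    one_hot = [0.0] * len(genre_dict)
    for g in genres.split('|'):
        one_hot[genre_dict[g]] = 1.0

    print(movie_id, ''.join(title_chars), one_hot)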
- - -Meta file -''''''''' - -After preparing the meta config file, we can generate the **Meta file**, a Python pickle object which stores the movie/user information. -Run the following command to generate it. - -.. code-block:: bash - - python meta_generator.py ml-1m meta.bin --config=meta_config.json - -The structure of the meta file :code:`meta.bin` is: - -.. code-block:: text - - +--+ movie - | +--+ __meta__ - | | +--+ raw_meta # each feature meta config. list - | | | + - | | | | # ID Field, we use id as key - | | | +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1} - | | | | - | | | | # Title field, the dictionary list of embedding. - | | | +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'} - | | | | - | | | | # Genres field, the genres dictionary - | | | +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'} - | | | - | | +--+ feature_map [1, 2] # a list of raw_meta indexes for the feature fields. - | | # it means there are 2 features for each key. - | | # * 0 offset of feature is raw_meta[1], Title. - | | # * 1 offset of feature is raw_meta[2], Genres. - | | - | +--+ 1 # movie 1 features - | | + - | | +---+ [[...], [...]] # title ids, genres dense vector - | | - | +--+ 2 - | | - | +--+ ... - | - +--- user - +--+ __meta__ - | + - | +--+ raw_meta - | | + - | | +--+ id field as user - | | | - | | +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'} - | | | - | | +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'} - | | - | +--+ feature_map [1, 2, 3] - | - +--+ 1 # user 1 features - | - +--+ 2 - +--+ ... - - -Split Training/Testing files -'''''''''''''''''''''''''''' - -We split :code:`ml-1m/ratings.dat` into training and testing files. For each user, we split their -ratings into two parts, so each user in the testing file also has some rating information in the training file. - -Use :code:`split.py` to separate the training and testing files. - -.. code-block:: bash - - python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1 - -Then two files will be generated\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/ratings.dat.test`. -Move them to the workspace :code:`data`, shuffle the train file, and prepare the file list for paddle train. - -.. code-block:: bash - - shuf ml-1m/ratings.dat.train > ratings.dat.train - cp ml-1m/ratings.dat.test . - echo "./data/ratings.dat.train" > train.list - echo "./data/ratings.dat.test" > test.list - - -Neural Network Configuration -```````````````````````````` - -Trainer Config File -''''''''''''''''''' - -The network structure is shown below. - -.. image:: rec_regression_network.png - :align: center - :alt: rec_regression_network - -The demo's neural network config file :code:`trainer_config.py` is shown below. - -.. literalinclude:: ../../../demo/recommendation/trainer_config.py - :language: python - :lines: 15- - -In this :code:`trainer_config.py`, we simply map each feature type to -a feature vector. The following shows how each feature is mapped to a vector. - -* :code:`id`\: Just a simple embedding, then added to a fully connected layer. -* :code:`embedding`\: - - if it is a sequence, get the embedding, apply a text convolution, and - take the average pooling result. - - if it is not a sequence, get the embedding and add it to a fully connected layer. 
-* :code:`one_hot_dense`\: - - just two fully connected layers. - -Then we combine the movie features into one movie feature vector with an -:code:`fc_layer` with multiple inputs, and do the same for the user features to -get one user feature vector. Finally, we calculate the cosine similarity of these two -features. - -In this network, we use several APIs from :ref:`api_trainer_config` . They are - -* Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer` -* Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer` -* Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer` -* Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection` -* Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer` -* Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim` -* Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool` -* Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`. - -Data Provider -''''''''''''' - -.. literalinclude:: ../../../demo/recommendation/dataprovider.py - :language: python - :lines: 15- - -The data provider just reads meta.bin and the rating file, and yields each sample for training. -In this :code:`dataprovider.py`, we should set\: - -* obj.slots\: The feature types and dimensions. -* use_seq\: Whether this :code:`dataprovider.py` is in sequence mode or not. -* process\: Yield each sample of data to :code:`paddle`. - -For details on the data provider, see :ref:`api_pydataprovider2`. - -Train -````` - -After preparing the data, configuring the network, and writing the data provider, we can run paddle training. - -The :code:`run.sh` is shown as follows: - -.. literalinclude:: ../../../demo/recommendation/run.sh - :language: bash - :lines: 16- - -It just starts a paddle training process, writes the log to :code:`log.txt`, -and then prints it on screen. - -For each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. A short description of these arguments is shown below. - -* config\: Tell paddle which file holds the neural network configuration. -* save_dir\: Tell paddle to save the model into :code:`./output`. -* use_gpu\: Use the GPU or not. The default is false. -* trainer_count\: The number of computing threads on one machine. -* test_all_data_in_one_period\: Test all data during one test period. Otherwise, - only :code:`batch_size` samples are tested in one test period. -* log_period\: Print a log after training :code:`log_period` batches. -* dot_period\: Print a :code:`.` after training :code:`dot_period` batches. -* num_passes\: Train for at most :code:`num_passes` passes. - -If the training process starts successfully, the output looks like the following: - -.. 
code-block:: text - - I0601 08:07:22.832059 10549 TrainerInternal.cpp:157] Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval: CurrentEval: - - I0601 08:07:50.672627 10549 TrainerInternal.cpp:157] Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval: CurrentEval: - - I0601 08:08:18.877369 10549 TrainerInternal.cpp:157] Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval: CurrentEval: - - I0601 08:08:46.863963 10549 TrainerInternal.cpp:157] Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval: CurrentEval: - - I0601 08:09:15.413025 10549 TrainerInternal.cpp:157] Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval: CurrentEval: - I0601 08:09:36.058670 10549 TrainerInternal.cpp:181] Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval: - I0601 08:09:46.215489 10549 Tester.cpp:101] Test samples=97383 cost=3.32155 Eval: - I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000 - I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000 - I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000 - I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py - -The model is saved in the :code:`output/` directory. You can use :code:`Ctrl-C` to stop training whenever you want. - -Evaluate and Predict -```````````````````` - -After training several passes, you can evaluate them and get the best pass. Just run - -.. code-block:: bash - - ./evaluate.sh - -You will see messages like this: - -.. code-block:: text - - Best pass is 00009, error is 3.06949, which means predict get error as 0.875998002281 - evaluating from pass output/pass-00009 - -Then you can predict how any user would rate any movie. Just run - -.. code-block:: bash - - python prediction.py 'output/pass-00009/' - -The predictor reads user input and predicts scores. It has a command-line user interface as follows: - -.. code-block:: text - - Input movie_id: 9 - Input user_id: 4 - Prediction Score is 2.56 - Input movie_id: 8 - Input user_id: 2 - Prediction Score is 3.13 diff --git a/doc/_sources/tutorials/semantic_role_labeling/index_en.md.txt b/doc/_sources/tutorials/semantic_role_labeling/index_en.md.txt deleted file mode 100644 index 92d7c634832119c718711a57c16f69492d405f28..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/semantic_role_labeling/index_en.md.txt +++ /dev/null @@ -1,204 +0,0 @@ -```eval_rst -.. _semantic_role_labeling: -``` - -# Semantic Role Labeling Tutorial # - -Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization, and question answering. An example is as follows [1]: - - [ A0 He ] [ AM-MOD would ] [ AM-NEG n’t ] [ V accept ] [ A1 anything of value ] from [ A2 those he was writing about ]. - -- V: verb -- A0: acceptor -- A1: thing accepted -- A2: accepted-from -- A3: attribute -- AM-MOD: modal -- AM-NEG: negation - -Given the verb "accept", the chunks in the sentence play certain semantic roles. Here, the label scheme is from the Penn Proposition Bank. - -To date, most successful SRL systems have been built on top of some form of parsing results, where pre-defined feature templates over the syntactic structure are used. 
This tutorial will present an end-to-end system using a deep bidirectional long short-term memory (DB-LSTM) network [2] to solve the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards the SRL task as a sequence labeling problem. - -## Data Description -The relevant paper [2] uses the CoNLL-2005 & 2012 Shared Task data sets for training and testing. According to the data license, this demo adopts the test data set of CoNLL-2005, which can be obtained from the task website. - -To download and process the original data, users just need to execute the following commands: - -```bash -cd data -./get_data.sh -``` -Several new files appear in the `data` directory, as follows. -```bash -conll05st-release: the test data set of the CoNLL-2005 shared task -test.wsj.words: the Wall Street Journal data sentences -test.wsj.props: the propositional arguments -feature: the features extracted from the data set -``` - -## Training -### DB-LSTM -Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. - -Unlike the bidirectional LSTM used in the Sentiment Analysis demo, the DB-LSTM adopts another way to stack LSTM layers. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken as input by the next LSTM layer, which processes them in the reverse direction. These two standard LSTM layers compose one LSTM pair. We then stack pairs of LSTM layers to obtain the deep LSTM model. - -The following figure shows a temporally expanded 2-layer DB-LSTM network. -
-![pic](./src/network_arch.png) -
- -### Features -Two input features play an essential role in this pipeline: the predicate (pred) and the argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted, because a single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark mr = 1 to denote an argument position if it is located in the predicate context region, and mr = 0 otherwise. These four simple features are all we need for our SRL system. The features of one sample, with the context size set to 1, are shown below [2]: -
-![pic](./src/feature.jpg) -
-In this sample, the corresponding labeled sentence is: - -[ A1 A record date ] has [ AM-NEG n't ] been [ V set ] . - -In the demo, we adopt the feature template above, which consists of `argument`, `predicate`, `ctx-p (p=-1,0,1)` and `mark`, and use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`. - -### Data Provider - -`dataprovider.py` is the Python file that wraps the data. The `hook()` function defines the data slots for the network. The eight features and the label are all index slots. -``` -def hook(settings, word_dict, label_dict, predicate_dict, **kwargs): - settings.word_dict = word_dict - settings.label_dict = label_dict - settings.predicate_dict = predicate_dict - # all inputs are integer and sequence types - settings.slots = [ - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(predicate_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(len(word_dict)), - integer_value_sequence(2), - integer_value_sequence(len(label_dict))] -``` -The corresponding data iterator is as follows: -``` -@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, - can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM) -def process(settings, file_name): - with open(file_name, 'r') as fdata: - for line in fdata: - sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, label = \ - line.strip().split('\t') - - words = sentence.split() - sen_len = len(words) - word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words] - - predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len - ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len - ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len - ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len - ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len - ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len - - marks = mark.split() - mark_slot = [int(w) for w in marks] - - label_list = label.split() - label_slot = [settings.label_dict.get(w) for w in label_list] - yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \ - ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot -``` -The `process` function yields 9 lists: 8 features and the label. - -### Neural Network Config -`db_lstm.py` is the neural network config file; it loads the dictionaries and defines the data provider module and network architecture for the training procedure. - -Nine `data_layer`s load instances from the data provider. The eight features are transformed into their own embeddings and mixed by a `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of the labels. - -### Run Training -The script for training is `train.sh`; users just need to execute: -```bash - ./train.sh -``` -The content of `train.sh`: -``` -paddle train \ - --config=./db_lstm.py \ - --use_gpu=0 \ - --log_period=5000 \ - --trainer_count=1 \ - --show_parameter_stats_period=5000 \ - --save_dir=./output \ - --num_passes=10000 \ - --average_test_period=10000000 \ - --init_model_path=./data \ - --load_missing_parameter_strategy=rand \ - --test_all_data_in_one_period=1 \ -2>&1 | tee 'train.log' -``` - -- \--config=./db_lstm.py: the network config file. 
-- \--use_gpu=false: use the CPU to train; set it to true if you have installed the GPU version of PaddlePaddle and want to train on the GPU. Note that, for now, crf_layer does not support the GPU. -- \--log_period=5000: print a log every 5000 batches. -- \--trainer_count=1: set the thread number (or GPU count). -- \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches. -- \--save_dir=./output: the output path to save models. -- \--num_passes=10000: set the pass number; one pass in PaddlePaddle means training all samples in the dataset one time. -- \--average_test_period=10000000: test on the averaged parameters every average_test_period batches. -- \--init_model_path=./data: the parameter initialization path. -- \--load_missing_parameter_strategy=rand: randomly initialize parameters that do not exist. -- \--test_all_data_in_one_period=1: test all data in one period. - - -After training, the models will be saved in the directory `output`. Our training curve is shown below: -
-![pic](./src/curve.jpg) -
- -### Run testing -The script for testing is `test.sh`; users just need to execute: -```bash - ./test.sh -``` -The main part of `test.sh` is: -``` -paddle train \ - --config=./db_lstm.py \ - --model_list=$model_list \ - --job=test \ - --config_args=is_test=1 \ -``` - - - \--config=./db_lstm.py: the network config file - - \--model_list=$model_list.list: the model list file - - \--job=test: indicate the test job - - \--config_args=is_test=1: flag to indicate testing - - \--test_all_data_in_one_period=1: test all data in one period - - -### Run prediction -The script for prediction is `predict.sh`; users just need to execute: -```bash - ./predict.sh - -``` -In `predict.sh`, users should provide the network config file, model path, label file, word dictionary file, and feature file: -``` -python predict.py - -c $config_file \ - -w $best_model_path \ - -l $label_file \ - -p $predicate_dict_file \ - -d $dict_file \ - -i $input_file \ - -o $output_file -``` - -`predict.py` is the main executable Python script, which includes functions to load the model, load the data, and run prediction. The network model outputs the probability distribution of the labels. In the demo, we take the label with the maximum probability as the result. Users can also implement beam search or Viterbi decoding on top of the probability distribution matrix. - -After prediction, the result is saved in `predict.res`. - -## Reference -[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles, Computational Linguistics, 31(1), 2005. - -[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015. diff --git a/doc/_sources/tutorials/sentiment_analysis/index_en.md.txt b/doc/_sources/tutorials/sentiment_analysis/index_en.md.txt deleted file mode 100644 index bb7681db44ca6f286ad6935ddfecb9becb429192..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/sentiment_analysis/index_en.md.txt +++ /dev/null @@ -1,328 +0,0 @@ -# Sentiment Analysis Tutorial - -Sentiment analysis has many applications. A basic task in sentiment analysis is classifying the polarity of a given text at the document, sentence, or feature/aspect level. One simple example is to classify customer reviews on shopping, tourism, and group-buying websites such as Amazon, Taobao, and Tmall. - -Sentiment analysis is also used to monitor social media based on large amounts of reviews or blogs. For example, researchers analyzed several surveys on consumer confidence and political opinion and found that they correlate with sentiment word frequencies in contemporaneous Twitter messages [1]. Another example is to forecast stock movements by analyzing the text content of a daily Twitter blog [2]. - -On the other hand, collecting user comments about products and analyzing their sentiment helps companies understand user preferences for their own products, and even for competing products. - -This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) network to classify the sentiment of sentences from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative, so randomly guessing yields 50% accuracy. 
- -## Data Preparation - -### IMDB Data Introduction - -Before training models, we need to preprocess the data and build a dictionary. First, you can use the following script to download the IMDB dataset and the [Moses](http://www.statmt.org/moses/) tool, a statistical machine translation system. We provide a data preprocessing script, which is capable of handling not only IMDB data, but also other user-defined data. To use the pre-written script, the labeled train and test samples need to be moved to another path, which `get_imdb.sh` has already done. - -``` -cd demo/sentiment/data -./get_imdb.sh -``` -If the data is obtained successfully, you will see the following files at ```./demo/sentiment/data```: - -``` -aclImdb get_imdb.sh imdb mosesdecoder-master -``` - -* aclImdb: the raw dataset downloaded from the website. -* imdb: contains only train and test data. -* mosesdecoder-master: the Moses tool. - -The IMDB dataset contains 25,000 highly polar movie reviews for training, and 25,000 for testing. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. After running `./get_imdb.sh`, we can find the dataset has the following structure in `aclImdb`. - -``` -imdbEr.txt imdb.vocab README test train -``` -* train: train sets. -* test: test sets. -* imdb.vocab: dictionary. -* imdbEr.txt: expected rating for each token in imdb.vocab. -* README: data documentation. - -The files in the train set directory are as follows. The test set contains the same files except `unsup` and `urls_unsup.txt`. - -``` -labeledBow.feat neg pos unsup unsupBow.feat urls_neg.txt urls_pos.txt urls_unsup.txt -``` - -* pos: positive samples, containing 12,500 txt files, each file being one movie review. -* neg: negative samples, containing 12,500 txt files, each file being one movie review. -* unsup: unlabeled samples, containing 50,000 txt files. -* urls_xx.txt: URLs of the reviews. -* xxBow.feat: already-tokenized bag-of-words (BoW) features. - -### IMDB Data Preparation - -In this demo, we only use the labeled train and test sets, and we do not use imdb.vocab as the dictionary. By default, the dictionary is built on the train set. The train set is shuffled while the test set is not. `tokenizer.perl` in the Moses tool is used to tokenize words and punctuation. Simply execute the following commands to preprocess the data. - -``` -cd demo/sentiment/ -./preprocess.sh -``` -preprocess.sh: - -``` -data_dir="./data/imdb" -python preprocess.py -i $data_dir -``` - -* data_dir: the input data directory. -* preprocess.py: the preprocessing script. - -If it runs successfully, you will see the `demo/sentiment/data/pre-imdb` directory as follows: - -``` -dict.txt labels.list test.list test_part_000 train.list train_part_000 -``` -* test\_part\_000 and train\_part\_000: all labeled test and train sets. The train set has been shuffled. -* train.list and test.list: the train and test file lists. -* dict.txt: the dictionary generated on the train set by default. -* labels.list: neg 0, pos 1; label 0 is a negative review, label 1 is a positive review. - -### User-defined Data Preparation - -If you are performing another sentiment classification task, you can prepare the data as follows. We have provided scripts to build the dictionary and preprocess the data, so just organize the data as shown. - -``` -dataset -|----train -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -|----test -| |----class1 -| | |----text_files -| |----class2 -| | |----text_files -| | ... -``` -* dataset: 1st directory. -* train, test: 2nd directory. -* class1,class2,...: 3rd directory. 
-* text_files: samples in text file format. - -All text-file samples under the same folder belong to the same category. Each text file contains one or more samples, and each line is one sample. In order to shuffle fully, the preprocessing is a little different for data with multiple lines in one text file: you need to set `-m True` in `preprocess.sh`. `tokenizer.perl` is used by default; if you don't need it, just set `-t False` in `preprocess.sh`. - -## Training - -In this task, we use a recurrent neural network (RNN) with the LSTM architecture to train a sentiment analysis model. The LSTM model was introduced primarily to overcome the problem of vanishing gradients. An LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate, and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it learns to memorize information over long time intervals without losing short-term memory. At each time step, as a new word comes in, the historical information stored in the memory block is updated to iteratively learn the sequence representation. -
![LSTM](./lstm.png)
-
Figure 1. LSTM [3]
- -Sentiment analysis is among the most typical problems in natural language understanding. It aims at predicting the attitude expressed in a sequence. Usually, only some key words, like adjectives and adverbs, play a major role in predicting the sentiment of sequences or paragraphs. However, some review or comment contexts are very long, as in the IMDB dataset. We use LSTM to perform this task for its improved design with the gate mechanism. First, it is able to summarize the representation from word level to context level with variable context lengths adapted by the gate values. Second, it can utilize the expanded context at the sentence level, while most methods are only good at utilizing n-gram level knowledge. Third, it learns the paragraph representation directly rather than by combining context-level information, which yields an end-to-end framework. - -In this demo we provide two networks, namely a bidirectional LSTM and a three-layer stacked LSTM. - -#### Bidirectional-LSTM - -The first is a bidirectional LSTM network, followed by a fully connected layer and softmax, as shown in Figure 2. -
![BiLSTM](./bi_lstm.jpg)
-
Figure 2. Bidirectional-LSTM
- -#### Stacked-LSTM -The second is the three-layer LSTM structure in Figure 3. The bottom of the figure is the word embedding. Next, three LSTM hidden layers are connected, with the second LSTM reversed. Then the maximum hidden vectors over all time steps of the hidden and LSTM layers are extracted as the representation of the entire sequence. Finally, a fully connected feed-forward layer with softmax activation performs the classification task. This network follows paper [5]. -
![StackedLSTM](./stacked_lstm.jpg)
-
Figure 3. Stacked-LSTM for sentiment analysis
- -**Config** - -Switch into the `demo/sentiment` directory; the `trainer_config.py` file is an example config, containing the algorithm and network configuration. Its first line imports predefined networks from `sentiment_net.py`. - -trainer_config.py: - -```python -from sentiment_net import * - -data_dir = "./data/pre-imdb" -# whether this config is used for test -is_test = get_config_arg('is_test', bool, False) -# whether this config is used for prediction -is_predict = get_config_arg('is_predict', bool, False) -dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict) - -################## Algorithm Config ##################### - -settings( - batch_size=128, - learning_rate=2e-3, - learning_method=AdamOptimizer(), - average_window=0.5, - regularization=L2Regularization(8e-4), - gradient_clipping_threshold=25 -) - -#################### Network Config ###################### -stacked_lstm_net(dict_dim, class_dim=class_dim, - stacked_num=3, is_predict=is_predict) -#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict) -``` - -* **Data Definition**: - * get\_config\_arg(): get arguments set by `--config_args=xx` on the command line. - * Define the data provider, here using the Python interface to load data. For details, you can refer to the PyDataProvider2 documentation. - -* **Algorithm Configuration**: - * set a batch size of 128. - * set the global learning rate. - * use Adam optimization. - * set the average SGD window. - * set L2 regularization. - * set the gradient clipping threshold. -* **Network Configuration**: - * dict_dim: the dictionary dimension. - * class_dim: the number of categories; IMDB has two labels, namely positive and negative. - * `stacked_lstm_net`: the predefined network shown in Figure 3, used by default. - * `bidirectional_lstm_net`: the predefined network shown in Figure 2. - -**Training** - -Install PaddlePaddle first if necessary. Then you can use the script `train.sh` as follows to launch local training. - -``` -cd demo/sentiment/ -./train.sh -``` - -train.sh: - -``` -config=trainer_config.py -output=./model_output -paddle train --config=$config \ - --save_dir=$output \ - --job=train \ - --use_gpu=false \ - --trainer_count=4 \ - --num_passes=10 \ - --log_period=20 \ - --dot_period=20 \ - --show_parameter_stats_period=100 \ - --test_all_data_in_one_period=1 \ - 2>&1 | tee 'train.log' -``` - -* \--config=$config: set the network config. -* \--save\_dir=$output: set the output path to save models. -* \--job=train: set the job mode to train. -* \--use\_gpu=false: use the CPU to train; set it to true if you have installed the GPU version of PaddlePaddle and want to train on the GPU. -* \--trainer\_count=4: set the thread number (or GPU count). -* \--num\_passes=10: set the pass number; one pass in PaddlePaddle means training all samples in the dataset one time. -* \--log\_period=20: print a log every 20 batches. -* \--show\_parameter\_stats\_period=100: show parameter statistics every 100 batches. -* \--test\_all_data\_in\_one\_period=1: test all data in every test period. - -If the run succeeds, the output log is saved at `demo/sentiment/train.log` and the models are saved under `demo/sentiment/model_output/`. The output log is explained as follows. - -``` -Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875 CurrentEval: classification_error_evaluator=0.36875 -... 
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922 -Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406 -``` -- Batch=xx: means xx batches have been passed. -- samples=xx: means xx samples have been passed. -- AvgCost=xx: the averaged cost from the 0-th batch to the current batch. -- CurrentCost=xx: the current cost of the latest log_period batches. -- Eval: classification\_error\_evaluator=xx: the classification error from the 0-th batch to the current batch. -- CurrentEval: classification\_error\_evaluator: the current classification error of the latest log_period batches. -- Pass=0: Going through the whole training set once is called one pass; 0 means going through the training set for the first time. - -By default, we use the `stacked_lstm_net` network, which converges at a faster rate than `bidirectional_lstm_net` for the same number of samples. If you want to use the bidirectional LSTM, just uncomment the last line and comment out `stacked_lstm_net`. - -## Testing - -Testing means evaluating the labeled validation set using the trained model. - -``` -cd demo/sentiment -./test.sh -``` - -test.sh: - -```bash -function get_best_pass() { - cat $1 | grep -Pzo 'Test .*\n.*pass-.*' | \ - sed -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \ - sort | head -n 1 -} - -log=train.log -LOG=`get_best_pass $log` -LOG=(${LOG}) -evaluate_pass="model_output/pass-${LOG[1]}" - -echo 'evaluating from pass '$evaluate_pass - -model_list=./model.list -touch $model_list | echo $evaluate_pass > $model_list -net_conf=trainer_config.py -paddle train --config=$net_conf \ - --model_list=$model_list \ - --job=test \ - --use_gpu=false \ - --trainer_count=4 \ - --config_args=is_test=1 \ - 2>&1 | tee 'test.log' -``` - -The function `get_best_pass` selects the best model by classification error rate for testing. In this example, we use the IMDB test dataset as validation by default. Unlike training, it needs to specify `--job=test` and the model path, namely `--model_list=$model_list` here. If it runs successfully, the log is saved at `demo/sentiment/test.log`. For example, in our test, the best model is `model_output/pass-00002`, and the classification error is 0.115645, as follows. - -``` -Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645 -``` - -## Prediction - -`predict.py` provides a prediction interface. You should install the Python API of PaddlePaddle before using it. One example that predicts an unlabeled IMDB review is as follows. Simply run: - -``` -cd demo/sentiment -./predict.sh -``` -predict.sh: - -``` -#Note the default model is pass-00002; you should make sure the model path -#exists or change the model path. -model=model_output/pass-00002/ -config=trainer_config.py -label=data/pre-imdb/labels.list -cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \ - --tconf=$config\ - --model=$model \ - --label=$label \ - --dict=./data/pre-imdb/dict.txt \ - --batch_size=1 -``` - -* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample. -* `predict.py` : the prediction interface. -* `--tconf=$config` : set the network configuration. -* ` --model=$model` : set the model path. -* `--label=$label` : set the file mapping integer labels to string labels. -* `--dict=data/pre-imdb/dict.txt` : set the dictionary. -* `--batch_size=1` : set the batch size. - -Note you should make sure the default model path `model_output/pass-00002` -exists or change the model path. 
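If that pass was not saved in your run, a small Python sketch like the following can locate the newest saved pass to substitute into `predict.sh`. This helper is hypothetical (not part of the demo) and only assumes the `model_output/pass-*` layout shown above.

```python
import glob

# Pick the highest-numbered pass directory under model_output/
# (assumed layout: model_output/pass-00000, pass-00001, ...).
passes = sorted(glob.glob('model_output/pass-*'))
model = passes[-1] + '/' if passes else None
print(model)  # e.g. model_output/pass-00002/
```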
- -Predicting result of this example: - -``` -Loading parameters from model_output/pass-00002/ -./data/aclImdb/test/pos/10014_7.txt: predicting label is pos -``` -We sincerely appreciate your interest and welcome your contributions. - -## Reference -[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010.
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.
-[3] Alex Graves, Marcus Liwicki, Santiago Fernandez, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019.
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015.
diff --git a/doc/_sources/tutorials/text_generation/index_en.md.txt b/doc/_sources/tutorials/text_generation/index_en.md.txt deleted file mode 100644 index 5d8e667c20bd1fda64a6e11a88517d52112b72fa..0000000000000000000000000000000000000000 --- a/doc/_sources/tutorials/text_generation/index_en.md.txt +++ /dev/null @@ -1,338 +0,0 @@ -# Text generation Tutorial # - -Sequence to sequence has been proven to be a powerful model for language generation. It can be used for machine translation, query rewriting, image captioning, etc. - -This tutorial guides you through training a sequence to sequence model for neural machine translation (NMT) network that translates French to English. - -We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) , which details the model architecture and training procedure for good performance on WMT-14 dataset. This tutorial reproduces this result in PaddlePaddle. - -We thank @caoying for the pull request that defines the model architecture and solver configurations. - -## Data Preparation ## -### Download and Extract ### -Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and divide Develop and Test data into separate folder. - -- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz) -- **Develop and Test data**: [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz) - -To do this, simply run the following commands in linux, otherwise, you need to download, extract, divide, and rename the file suffix respectively. - -```bash -cd demo/seqToseq/data -./wmt14_data.sh -``` - -We should find that the dataset `wmt14` has three folders as shown in the following table. - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
folder nameFrench-English parallel corpora filenumber of total filesize
train_dataccb2_pc30.src, ccb2_pc30.trg, etctwelve3.55G
test_datantst1213.src, ntst1213.trgtwo1636k
gen_datantst14.src, ntst14.trgtwo864k
-
- - -- Each folder contains French-English parallel corpora. -- **XXX.src** are source French files; **XXX.trg** are target English files. -- The number of lines of **XXX.src** and **XXX.trg** should be the same. -- Each line is a French/English sentence. -- There is a one-to-one correspondence between the sentence at the i-th line of **XXX.src** and **XXX.trg**. - -### User Defined Dataset ### - -If you need to do other sequence-to-sequence tasks, such as paraphrasing, you only need to organize the data as follows, and place them in `demo/seqToseq/data`: - - dataset - train - file1.src file1.trg - file2.src file2.trg - ...... - test - file1.src file1.trg - file2.src file2.trg - ...... - gen - file1.src file1.trg - file2.src file2.trg - ...... -- 1st directory: the dataset folder name -- 2nd directory: the folders for train, test, and gen. The names of these three folders are fixed. -- 3rd file: source-target parallel corpora files. - - **XXX.src** are source files, **XXX.trg** are target files. - - Each line of the file must be a sequence. - - There should be a one-to-one correspondence between the i-th sequence of **XXX.src** and **XXX.trg**. - -## Data Preprocess ## -### Preprocessing Workflow ### -- Concatenate each source-target parallel corpus into one file: - - concatenate each **XXX.src** and **XXX.trg** into **XXX**. - - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg** -- Build the source and target dictionaries from the train data; each dictionary has DICTSIZE words: - - the most frequent (DICTSIZE-3) words - - 3 special tokens: - - `<s>`: the start of a sequence - - `<e>`: the end of a sequence - - `<unk>`: a word not included in the dictionary - -### Preprocessing Command and Result -The general command for preprocessing the dataset is: - -```bash -cd demo/seqToseq/ -python preprocess.py -i INPUT [-d DICTSIZE] [-m] -``` - -- `-i INPUT`: the path of the input original dataset -- `-d DICTSIZE`: the specified word count of the dictionary; if not set, the dictionary will contain all the words in the input dataset -- `-m --mergeDict`: merge the source and target dictionaries, so that the two dictionaries have the same content - -And you will see messages like this: - - concat parallel corpora for dataset - build source dictionary for train data - build target dictionary for train data - dictionary size is XXX - -Here, you can simply run the command: - -```bash -python preprocess.py -i data/wmt14 -d 30000 -``` - -It will take several minutes, and store the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`; the directory has the following structure. - - train test gen train.list test.list gen.list src.dict trg.dict - -- **train, test, gen**: folders containing the French-English parallel corpora of the train, test, and gen data respectively. Each line of the files in these folders has two parts: the former is the French sequence and the latter is the corresponding English sequence. -- **train.list, test.list, gen.list**: text files listing the files in the train, test, and gen folders respectively -- **src.dict, trg.dict**: source (French) / target (English) dictionaries; each dictionary has 30000 words: the most frequent 29997 words and the 3 special tokens - -## Model Training ## -### Introduction ### - -Neural machine translation (NMT) aims at building a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to a family of encoder–decoder models. 
Encoder-decoder models encode a source sentence into a fixed-length vector from which a decoder generates a target sentence. - -In this task, we use an extension of the encoder–decoder model which learns to align and translate jointly. Each time the model generates a word in a translation, it searches for the set of positions in the source sentence where the most relevant information is located. The decoder predicts the target word based on the context vectors associated with these source positions and all the previously generated target words. For a more detailed explanation, readers can refer to the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473). - -The most distinguishing feature of this model is that it doesn't encode an input sentence into a single fixed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where one vector corresponds to an input element. A subset of these vectors is chosen adaptively while decoding the translated sentence. This frees an NMT model from having to squash all the information of a source sentence, regardless of its length, into a fixed-length vector. The improvement of this model is more apparent for longer sentences, but it can be observed for sentences of any length. -
![](./encoder-decoder-attention-model.png)
-
Figure 1. Encoder-Decoder-Attention-Model
- -### Training Model in PaddlePaddle ### -We need to create a model config file before training. Here is an example, `demo/seqToseq/translation/train.conf`. The first three lines import Python functions for defining the network, and define the job_mode and attention_mode. - -```python -from seqToseq_net import * -is_generating = False - -### Data Definition -train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating) - -### Algorithm Configuration -settings( - learning_method = AdamOptimizer(), - batch_size = 50, - learning_rate = 5e-4) - -### Network Architecture -gru_encoder_decoder(train_conf, is_generating) -``` - -1. **Data Definition**: We define SeqToSeq train and test data in our example. It returns train_conf as the configuration; the following are its input arguments: - - data_dir: the directory of the train data and test data - - is\_generating: whether this config is used for generating; here it is false -2. **Algorithm Configuration**: We use the SGD training algorithm (default) with the Adam learning method in our example, specifying a batch_size of 50 and a learning rate of 5e-4. -3. **Network Architecture**: We use the attention version of the GRU encoder-decoder network in our example. It consists of a bidirectional GRU as the encoder, and a decoder that emulates searching through the source sentence while decoding a translation. - -### Training Command and Result ### -After writing the model config, we can train the model by running the command: - -```bash -cd demo/seqToseq/translation -./train.sh -``` - -The `train.sh` is shown as follows: - -```bash -paddle train \ ---config='translation/train.conf' \ ---save_dir='translation/model' \ ---use_gpu=false \ ---num_passes=16 \ ---show_parameter_stats_period=100 \ ---trainer_count=4 \ ---log_period=10 \ ---dot_period=5 \ -2>&1 | tee 'translation/train.log' -``` -- config: set the config of the neural network -- save_dir: set the output path to save the models -- use_gpu: whether to use the GPU to train; here we use the CPU -- num_passes: set the number of passes; one pass in paddle means training all samples in the dataset one time -- show_parameter_stats_period: here, show parameter statistics every 100 batches -- trainer_count: set the number of CPU threads or GPU devices -- log_period: here, print a log every 10 batches -- dot_period: here, print '.' every 5 batches - -The training loss is printed every 10 batches by default, and you will see messages like this: - - I0719 19:16:45.952062 15563 TrainerInternal.cpp:160] Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155 CurrentEval: classification_error_evaluator=0.737155 - I0719 19:17:56.707319 15563 TrainerInternal.cpp:160] Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392 CurrentEval: classification_error_evaluator=0.659065 - ..... -- AvgCost: the average cost from the 0-th batch to the current batch -- CurrentCost: the cost of the current batch -- classification\_error\_evaluator(Eval): the false prediction rate for each word from the 0-th evaluation to the current evaluation -- classification\_error\_evaluator(CurrentEval): the false prediction rate for each word in the current evaluation - -When the classification\_error\_evaluator drops below 0.35, the model has been trained successfully. - -## Text Generation ## -### Introduction ### - -Generally speaking, the NMT model is conditioned on the encodings of the source sentence, and predicts the next target word given the current target word. 
In the training process, the current word is always known from the ground truth. In the generating process, by contrast, the current word is the output of the decoder at the previous time step, which is accessed through a memory in PaddlePaddle. - -Besides, we use beam search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size). - -### Pretrained model ### -We trained the model on a cluster with 50 nodes, each node having two 6-core CPUs. We trained 16 passes in 5 days, where each pass took 7 hours. The model directory has 16 sub-folders, each of which contains the whole set of model parameters, about 202MB in size. We find that the pass-00012 model has the highest BLEU score, 27.77 (see the paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands in Linux. - -```bash -cd demo/seqToseq/data -./wmt14_model.sh -``` - -### Generating Model in PaddlePaddle ### -We need to create a model config file before translating French sequences. Here is an example, `demo/seqToseq/translation/gen.conf`; the first three lines import Python functions for defining the network, and define the job\_mode and attention\_mode. - -```python -from seqToseq_net import * -is_generating = True - -################## Data Definition ###################### -gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14", - is_generating = is_generating, - gen_result = "./translation/gen_result") - -############## Algorithm Configuration ################## -settings( - learning_method = AdamOptimizer(), - batch_size = 1, - learning_rate = 0) - -################ Network Configuration ################## -gru_encoder_decoder(gen_conf, is_generating) -``` - -1. **Data Definition**: We define SeqToSeq gen data in our example. It returns gen_conf as the configuration; the following are its input arguments: - - data\_dir: the directory of the gen data -   - is\_generating: whether this config is used for generating; here it is true -   - gen\_result: the file to store the generation result -2. **Algorithm Configuration**: We use the SGD training algorithm for generation, specifying a batch_size of 1 (generating one sequence at a time) and a learning rate of 0. -3. **Network Architecture**: Essentially the same as the training model. 
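To make the beam-search procedure described above concrete, here is a minimal, self-contained Python sketch of the expand-and-prune loop. It is illustrative only and is not PaddlePaddle's implementation; `next_word_probs` is an assumed callback standing in for one decoder step, returning a word-to-probability map for the sequence generated so far.

```python
import heapq
import math

def beam_search(next_word_probs, start, end, beam_size=3, max_len=20):
    # Each beam entry is (log-probability, sequence-so-far).
    beams = [(0.0, [start])]
    for _ in range(max_len):
        candidates = []
        for logp, seq in beams:
            if seq[-1] == end:
                # Finished hypotheses carry their score unchanged.
                candidates.append((logp, seq))
            else:
                for word, p in next_word_probs(seq).items():
                    candidates.append((logp + math.log(p), seq + [word]))
        # Prune: keep only the beam_size best (partial) sequences.
        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
        if all(seq[-1] == end for _, seq in beams):
            break
    return beams

# Toy decoder step: always proposes the same two continuations.
toy = lambda seq: {'world': 0.6, '<e>': 0.4}
print(beam_search(toy, '<s>', '<e>'))
```

With a beam size of 3, this mirrors the three scored candidates per source sentence that appear in `gen_result` below.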
-### Generating Command and Result ### -After writing the model config, we can do text translation from French to English by running the command: - -```bash -cd demo/seqToseq/translation -./gen.sh -``` - -The `gen.sh` is shown as follows; unlike training, there are some different arguments to specify: - -```bash -paddle train \ ---job=test \ ---config='translation/gen.conf' \ ---save_dir='data/wmt14_model' \ ---use_gpu=true \ ---num_passes=13 \ ---test_pass=12 \ ---trainer_count=1 \ -2>&1 | tee 'translation/gen.log' -``` -- job: set the job mode to test -- save_dir: the path of the saved models -- num_passes and test_pass: load model parameters from test_pass to (num_passes - 1); here only `data/wmt14_model/pass-00012` is loaded - -You will see messages like this: - - I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012 - I0706 14:48:40.012039 31441 Tester.cpp:125] Batch=100 samples=100 AvgCost=0 - I0706 14:48:48.898632 31441 Tester.cpp:125] Batch=200 samples=200 AvgCost=0 - ... - -And the generation result in `demo/seqToseq/translation/gen_result` looks like: - - 0 - 0 -11.1314 The about the width of the seats while large controls are at stake - 1 -11.1519 The on the width of the seats while large controls are at stake - 2 -11.5988 The about the width of the seats while large controls are at stake . - - 1 - 0 -24.4149 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of the Dubai . - 1 -26.9524 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s . - 2 -27.9574 The dispute is between the major aircraft manufacturers about the width of the tourist seats on the flights , paving the way for a confrontation during the month of Dubai ' s Dubai . - ... - -- This is the beam search result, where the beam size is 3 -- The '0' on the 1st line and the '1' on the 6th line are the sequence ids in the gen data -- The other six lines list the beam search results - - The 2nd column is the beam search score (from large to small) - - The 3rd column is the generated English sequence -- There are 2 special tokens: - - `<e>`: the end of a sequence - - `<unk>`: a word not included in the dictionary - -### BLEU Evaluation ### -Human evaluations of machine translation are extensive but expensive. The paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents a method to serve as an automated understudy to skilled human judges, substituting for them when quick or frequent evaluations are needed. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use its [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) to do BLEU evaluation. 
To download this script, simply run the following command: - -```bash -cd demo/seqToseq/translation -./moses_bleu.sh -``` - -Since the standard translation is already downloaded as `data/wmt14/gen/ntst14.trg`, we can do the BLEU evaluation by running the command: - -```bash -cd demo/seqToseq/translation -./eval_bleu.sh FILE BEAMSIZE -``` - -- FILE: the generation result file -- BEAMSIZE: the expansion width of the beam search diff --git a/doc/_sources/v1_api_tutorials/README.md.txt b/doc/_sources/v1_api_tutorials/README.md.txt new file mode 100644 index 0000000000000000000000000000000000000000..071b8da61fbcab3e88819273008b4526546202ad --- /dev/null +++ b/doc/_sources/v1_api_tutorials/README.md.txt @@ -0,0 +1,5 @@ +The tutorials in v1_api_tutorials currently use the v1 API, and will be upgraded to the v2 API later. +Thus, v1_api_tutorials is a temporary directory. We have decided not to maintain it and will delete it in the future. + +Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and +[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle. diff --git a/doc/_sources/tutorials/embedding_model/index_en.md.txt b/doc/_sources/v1_api_tutorials/embedding_model/index_en.md.txt similarity index 100% rename from doc/_sources/tutorials/embedding_model/index_en.md.txt rename to doc/_sources/v1_api_tutorials/embedding_model/index_en.md.txt diff --git a/doc/_sources/tutorials/gan/index_en.md.txt b/doc/_sources/v1_api_tutorials/gan/index_en.md.txt similarity index 100% rename from doc/_sources/tutorials/gan/index_en.md.txt rename to doc/_sources/v1_api_tutorials/gan/index_en.md.txt diff --git a/doc/_sources/tutorials/imagenet_model/resnet_model_en.md.txt b/doc/_sources/v1_api_tutorials/imagenet_model/resnet_model_en.md.txt similarity index 100% rename from doc/_sources/tutorials/imagenet_model/resnet_model_en.md.txt rename to doc/_sources/v1_api_tutorials/imagenet_model/resnet_model_en.md.txt diff --git a/doc/_sources/tutorials/quick_start/index_en.md.txt b/doc/_sources/v1_api_tutorials/quick_start/index_en.md.txt similarity index 100% rename from doc/_sources/tutorials/quick_start/index_en.md.txt rename to doc/_sources/v1_api_tutorials/quick_start/index_en.md.txt diff --git a/doc/api/index_en.html b/doc/api/index_en.html index 1738caea7d0edbecdd781f0bb75c576d78bbdd28..61396b4c43ea688893ce79ca5e1924fdb7e3765c 100644 --- a/doc/api/index_en.html +++ b/doc/api/index_en.html @@ -64,7 +64,7 @@