diff --git a/doc/fluid/api/detection.rst b/doc/fluid/api/detection.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/doc/fluid/api/gen_doc.sh b/doc/fluid/api/gen_doc.sh index 0f0539355559446fd91f659d61b636db214b5a40..3ee299f5aceb32465574277c25d962360183f1cc 100755 --- a/doc/fluid/api/gen_doc.sh +++ b/doc/fluid/api/gen_doc.sh @@ -1,5 +1,5 @@ #!/bin/bash -python gen_doc.py layers --submodules control_flow device io nn ops tensor > layers.rst +python gen_doc.py layers --submodules control_flow device io nn ops tensor detection > layers.rst for module in data_feeder clip metrics executor initializer io nets optimizer param_attr profiler regularizer do diff --git a/doc/fluid/api/io.rst b/doc/fluid/api/io.rst index 3e956f8302d261b52f9f76ff8eb4a01f9c6381f8..dd9d88b669957c22cd0a07fa4b7e219e2d6e5d61 100644 --- a/doc/fluid/api/io.rst +++ b/doc/fluid/api/io.rst @@ -59,21 +59,3 @@ get_inference_program .. autofunction:: paddle.fluid.io.get_inference_program :noindex: -save_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.save_checkpoint - :noindex: - -load_checkpoint ---------------- - -.. autofunction:: paddle.fluid.io.load_checkpoint - :noindex: - -clean_checkpoint ----------------- - -.. autofunction:: paddle.fluid.io.clean_checkpoint - :noindex: - diff --git a/doc/fluid/api/layers.rst b/doc/fluid/api/layers.rst index f78e6db3268e44d5f30d83508f07c4ed68106e48..ba33c0d7d650e76711040c40ab9e5fdcf11c3a6c 100644 --- a/doc/fluid/api/layers.rst +++ b/doc/fluid/api/layers.rst @@ -181,12 +181,6 @@ Print .. autofunction:: paddle.fluid.layers.Print :noindex: -is_empty --------- - -.. autofunction:: paddle.fluid.layers.is_empty - :noindex: - device ====== @@ -261,19 +255,6 @@ double_buffer .. autofunction:: paddle.fluid.layers.double_buffer :noindex: -random_data_generator ---------------------- - -.. autofunction:: paddle.fluid.layers.random_data_generator - :noindex: - -Preprocessor ------------- - -.. autoclass:: paddle.fluid.layers.Preprocessor - :members: - :noindex: - nn == @@ -613,30 +594,6 @@ roi_pool .. autofunction:: paddle.fluid.layers.roi_pool :noindex: -dice_loss ---------- - -.. autofunction:: paddle.fluid.layers.dice_loss - :noindex: - -resize_bilinear ---------------- - -.. autofunction:: paddle.fluid.layers.resize_bilinear - :noindex: - -gather ------- - -.. autofunction:: paddle.fluid.layers.gather - :noindex: - -random_crop ------------ - -.. autofunction:: paddle.fluid.layers.random_crop - :noindex: - ops === @@ -784,12 +741,6 @@ sum .. autofunction:: paddle.fluid.layers.sum :noindex: -shape ------ - -.. autofunction:: paddle.fluid.layers.shape - :noindex: - sigmoid ------- @@ -1039,3 +990,54 @@ zeros .. autofunction:: paddle.fluid.layers.zeros :noindex: +detection +========= + +multi_box_head +-------------- + +.. autofunction:: paddle.fluid.layers.multi_box_head + :noindex: + +bipartite_match +--------------- + +.. autofunction:: paddle.fluid.layers.bipartite_match + :noindex: + +target_assign +------------- + +.. autofunction:: paddle.fluid.layers.target_assign + :noindex: + +detection_output +---------------- + +.. autofunction:: paddle.fluid.layers.detection_output + :noindex: + +ssd_loss +-------- + +.. autofunction:: paddle.fluid.layers.ssd_loss + :noindex: + +detection_map +------------- + +.. autofunction:: paddle.fluid.layers.detection_map + :noindex: + +iou_similarity +-------------- + +.. autofunction:: paddle.fluid.layers.iou_similarity + :noindex: + +box_coder +--------- + +.. autofunction:: paddle.fluid.layers.box_coder + :noindex: + diff --git a/doc/fluid/api/optimizer.rst b/doc/fluid/api/optimizer.rst index 6ad44bb6905b6e3f2b6e4aeb3701ced5d18e2005..79a0995fce303518d989693976c4e92e05795ca2 100644 --- a/doc/fluid/api/optimizer.rst +++ b/doc/fluid/api/optimizer.rst @@ -89,13 +89,6 @@ DecayedAdagradOptimizer :members: :noindex: -RMSPropOptimizer ----------------- - -.. autoclass:: paddle.fluid.optimizer.RMSPropOptimizer - :members: - :noindex: - Adadelta -------- diff --git a/doc/fluid/api/profiler.rst b/doc/fluid/api/profiler.rst index 39fda65863471a78895503184848a754828b71a1..74d102dcb0db35766c34e3d14939a8aa5861686b 100644 --- a/doc/fluid/api/profiler.rst +++ b/doc/fluid/api/profiler.rst @@ -23,15 +23,3 @@ profiler .. autofunction:: paddle.fluid.profiler.profiler :noindex: -start_profiler --------------- - -.. autofunction:: paddle.fluid.profiler.start_profiler - :noindex: - -stop_profiler -------------- - -.. autofunction:: paddle.fluid.profiler.stop_profiler - :noindex: - diff --git a/doc/v2/api/config/evaluators.rst b/doc/v2/api/config/evaluators.rst index 9ac972fb193a2fb525edc507f7ba1303d2c8eabe..458d892e825a7a9bbe7843ad5c508bd5a31f5f0f 100644 --- a/doc/v2/api/config/evaluators.rst +++ b/doc/v2/api/config/evaluators.rst @@ -101,7 +101,7 @@ value_printer :noindex: Detection -===== +========== detection_map ------------- diff --git a/doc/v2/api/config/layer.rst b/doc/v2/api/config/layer.rst index 1a6496968cae1fef88142ba9ca3f9e63a81b196d..5a0cfadfce84df41defdf518b7c3a6222d5b30a1 100644 --- a/doc/v2/api/config/layer.rst +++ b/doc/v2/api/config/layer.rst @@ -11,7 +11,7 @@ Data layer data ---- -.. autoclass:: paddle.v2.layer.data +.. autofunction:: paddle.v2.layer.data :noindex: Fully Connected Layers @@ -21,12 +21,12 @@ Fully Connected Layers fc -- -.. autoclass:: paddle.v2.layer.fc +.. autofunction:: paddle.v2.layer.fc :noindex: selective_fc ------------ -.. autoclass:: paddle.v2.layer.selective_fc +.. autofunction:: paddle.v2.layer.selective_fc :noindex: Conv Layers @@ -34,34 +34,34 @@ Conv Layers conv_operator ------------- -.. autoclass:: paddle.v2.layer.conv_operator +.. autofunction:: paddle.v2.layer.conv_operator :noindex: conv_projection --------------- -.. autoclass:: paddle.v2.layer.conv_projection +.. autofunction:: paddle.v2.layer.conv_projection :noindex: conv_shift ---------- -.. autoclass:: paddle.v2.layer.conv_shift +.. autofunction:: paddle.v2.layer.conv_shift :noindex: img_conv -------- -.. autoclass:: paddle.v2.layer.img_conv +.. autofunction:: paddle.v2.layer.img_conv :noindex: .. _api_v2.layer_context_projection: context_projection ------------------ -.. autoclass:: paddle.v2.layer.context_projection +.. autofunction:: paddle.v2.layer.context_projection :noindex: row_conv -------- -.. autoclass:: paddle.v2.layer.row_conv +.. autofunction:: paddle.v2.layer.row_conv :noindex: Image Pooling Layer @@ -69,27 +69,27 @@ Image Pooling Layer img_pool -------- -.. autoclass:: paddle.v2.layer.img_pool +.. autofunction:: paddle.v2.layer.img_pool :noindex: spp --- -.. autoclass:: paddle.v2.layer.spp +.. autofunction:: paddle.v2.layer.spp :noindex: maxout ------ -.. autoclass:: paddle.v2.layer.maxout +.. autofunction:: paddle.v2.layer.maxout :noindex: roi_pool -------- -.. autoclass:: paddle.v2.layer.roi_pool +.. autofunction:: paddle.v2.layer.roi_pool :noindex: pad ---- -.. autoclass:: paddle.v2.layer.pad +.. autofunction:: paddle.v2.layer.pad :noindex: Norm Layer @@ -97,27 +97,27 @@ Norm Layer img_cmrnorm ----------- -.. autoclass:: paddle.v2.layer.img_cmrnorm +.. autofunction:: paddle.v2.layer.img_cmrnorm :noindex: batch_norm ---------- -.. autoclass:: paddle.v2.layer.batch_norm +.. autofunction:: paddle.v2.layer.batch_norm :noindex: sum_to_one_norm --------------- -.. autoclass:: paddle.v2.layer.sum_to_one_norm +.. autofunction:: paddle.v2.layer.sum_to_one_norm :noindex: cross_channel_norm ------------------ -.. autoclass:: paddle.v2.layer.cross_channel_norm +.. autofunction:: paddle.v2.layer.cross_channel_norm :noindex: row_l2_norm ----------- -.. autoclass:: paddle.v2.layer.row_l2_norm +.. autofunction:: paddle.v2.layer.row_l2_norm :noindex: Recurrent Layers @@ -125,22 +125,22 @@ Recurrent Layers recurrent --------- -.. autoclass:: paddle.v2.layer.recurrent +.. autofunction:: paddle.v2.layer.recurrent :noindex: lstmemory --------- -.. autoclass:: paddle.v2.layer.lstmemory +.. autofunction:: paddle.v2.layer.lstmemory :noindex: grumemory --------- -.. autoclass:: paddle.v2.layer.grumemory +.. autofunction:: paddle.v2.layer.grumemory :noindex: gated_unit ----------- -.. autoclass:: paddle.v2.layer.gated_unit +.. autofunction:: paddle.v2.layer.gated_unit :noindex: Recurrent Layer Group @@ -148,32 +148,32 @@ Recurrent Layer Group memory ------ -.. autoclass:: paddle.v2.layer.memory +.. autofunction:: paddle.v2.layer.memory :noindex: recurrent_group --------------- -.. autoclass:: paddle.v2.layer.recurrent_group +.. autofunction:: paddle.v2.layer.recurrent_group :noindex: lstm_step --------- -.. autoclass:: paddle.v2.layer.lstm_step +.. autofunction:: paddle.v2.layer.lstm_step :noindex: gru_step -------- -.. autoclass:: paddle.v2.layer.gru_step +.. autofunction:: paddle.v2.layer.gru_step :noindex: beam_search ------------ -.. autoclass:: paddle.v2.layer.beam_search +.. autofunction:: paddle.v2.layer.beam_search :noindex: get_output ---------- -.. autoclass:: paddle.v2.layer.get_output +.. autofunction:: paddle.v2.layer.get_output :noindex: Mixed Layer @@ -183,54 +183,54 @@ Mixed Layer mixed ----- -.. autoclass:: paddle.v2.layer.mixed +.. autofunction:: paddle.v2.layer.mixed :noindex: .. _api_v2.layer_embedding: embedding --------- -.. autoclass:: paddle.v2.layer.embedding +.. autofunction:: paddle.v2.layer.embedding :noindex: scaling_projection ------------------ -.. autoclass:: paddle.v2.layer.scaling_projection +.. autofunction:: paddle.v2.layer.scaling_projection :noindex: dotmul_projection ----------------- -.. autoclass:: paddle.v2.layer.dotmul_projection +.. autofunction:: paddle.v2.layer.dotmul_projection :noindex: dotmul_operator --------------- -.. autoclass:: paddle.v2.layer.dotmul_operator +.. autofunction:: paddle.v2.layer.dotmul_operator :noindex: full_matrix_projection ---------------------- -.. autoclass:: paddle.v2.layer.full_matrix_projection +.. autofunction:: paddle.v2.layer.full_matrix_projection :noindex: identity_projection ------------------- -.. autoclass:: paddle.v2.layer.identity_projection +.. autofunction:: paddle.v2.layer.identity_projection :noindex: slice_projection ------------------- -.. autoclass:: paddle.v2.layer.slice_projection +.. autofunction:: paddle.v2.layer.slice_projection :noindex: table_projection ---------------- -.. autoclass:: paddle.v2.layer.table_projection +.. autofunction:: paddle.v2.layer.table_projection :noindex: trans_full_matrix_projection ---------------------------- -.. autoclass:: paddle.v2.layer.trans_full_matrix_projection +.. autofunction:: paddle.v2.layer.trans_full_matrix_projection :noindex: Aggregate Layers @@ -245,51 +245,46 @@ AggregateLevel pooling ------- -.. autoclass:: paddle.v2.layer.pooling +.. autofunction:: paddle.v2.layer.pooling :noindex: .. _api_v2.layer_last_seq: last_seq -------- -.. autoclass:: paddle.v2.layer.last_seq +.. autofunction:: paddle.v2.layer.last_seq :noindex: .. _api_v2.layer_first_seq: first_seq --------- -.. autoclass:: paddle.v2.layer.first_seq +.. autofunction:: paddle.v2.layer.first_seq :noindex: sub_seq --------- -.. autoclass:: paddle.v2.layer.sub_seq +.. autofunction:: paddle.v2.layer.sub_seq :noindex: concat ------ -.. autoclass:: paddle.v2.layer.concat +.. autofunction:: paddle.v2.layer.concat :noindex: seq_concat ---------- -.. autoclass:: paddle.v2.layer.seq_concat +.. autofunction:: paddle.v2.layer.seq_concat :noindex: seq_slice --------- -.. autoclass:: paddle.v2.layer.seq_slice - :noindex: - -kmax_sequence_score -------------------- -.. autoclass:: paddle.v2.layer.kmax_sequence_score +.. autofunction:: paddle.v2.layer.seq_slice :noindex: sub_nested_seq -------------- -.. autoclass:: paddle.v2.layer.sub_nested_seq +.. autofunction:: paddle.v2.layer.sub_nested_seq :noindex: Reshaping Layers @@ -297,7 +292,7 @@ Reshaping Layers block_expand ------------ -.. autoclass:: paddle.v2.layer.block_expand +.. autofunction:: paddle.v2.layer.block_expand :noindex: .. _api_v2.layer_expand: @@ -309,22 +304,22 @@ ExpandLevel expand ------ -.. autoclass:: paddle.v2.layer.expand +.. autofunction:: paddle.v2.layer.expand :noindex: repeat ------ -.. autoclass:: paddle.v2.layer.repeat +.. autofunction:: paddle.v2.layer.repeat :noindex: rotate ------ -.. autoclass:: paddle.v2.layer.rotate +.. autofunction:: paddle.v2.layer.rotate :noindex: seq_reshape ----------- -.. autoclass:: paddle.v2.layer.seq_reshape +.. autofunction:: paddle.v2.layer.seq_reshape :noindex: Math Layers @@ -332,94 +327,94 @@ Math Layers addto ----- -.. autoclass:: paddle.v2.layer.addto +.. autofunction:: paddle.v2.layer.addto :noindex: linear_comb ----------- -.. autoclass:: paddle.v2.layer.linear_comb +.. autofunction:: paddle.v2.layer.linear_comb :noindex: interpolation ------------- -.. autoclass:: paddle.v2.layer.interpolation +.. autofunction:: paddle.v2.layer.interpolation :noindex: bilinear_interp --------------- -.. autoclass:: paddle.v2.layer.bilinear_interp +.. autofunction:: paddle.v2.layer.bilinear_interp :noindex: dropout -------- -.. autoclass:: paddle.v2.layer.dropout +.. autofunction:: paddle.v2.layer.dropout :noindex: dot_prod --------- -.. autoclass:: paddle.v2.layer.dot_prod +.. autofunction:: paddle.v2.layer.dot_prod :noindex: out_prod -------- -.. autoclass:: paddle.v2.layer.out_prod +.. autofunction:: paddle.v2.layer.out_prod :noindex: power ----- -.. autoclass:: paddle.v2.layer.power +.. autofunction:: paddle.v2.layer.power :noindex: scaling ------- -.. autoclass:: paddle.v2.layer.scaling +.. autofunction:: paddle.v2.layer.scaling :noindex: clip ---- -.. autoclass:: paddle.v2.layer.clip +.. autofunction:: paddle.v2.layer.clip :noindex: resize ------ -.. autoclass:: paddle.v2.layer.resize +.. autofunction:: paddle.v2.layer.resize :noindex: slope_intercept --------------- -.. autoclass:: paddle.v2.layer.slope_intercept +.. autofunction:: paddle.v2.layer.slope_intercept :noindex: tensor ------ -.. autoclass:: paddle.v2.layer.tensor +.. autofunction:: paddle.v2.layer.tensor :noindex: .. _api_v2.layer_cos_sim: cos_sim ------- -.. autoclass:: paddle.v2.layer.cos_sim +.. autofunction:: paddle.v2.layer.cos_sim :noindex: l2_distance ----------- -.. autoclass:: paddle.v2.layer.l2_distance +.. autofunction:: paddle.v2.layer.l2_distance :noindex: trans ----- -.. autoclass:: paddle.v2.layer.trans +.. autofunction:: paddle.v2.layer.trans :noindex: scale_shift ----------- -.. autoclass:: paddle.v2.layer.scale_shift +.. autofunction:: paddle.v2.layer.scale_shift :noindex: factorization_machine --------------------- -.. autoclass:: paddle.v2.layer.factorization_machine +.. autofunction:: paddle.v2.layer.factorization_machine :noindex: Sampling Layers @@ -427,17 +422,17 @@ Sampling Layers maxid ----- -.. autoclass:: paddle.v2.layer.max_id +.. autofunction:: paddle.v2.layer.max_id :noindex: sampling_id ----------- -.. autoclass:: paddle.v2.layer.sampling_id +.. autofunction:: paddle.v2.layer.sampling_id :noindex: multiplex --------- -.. autoclass:: paddle.v2.layer.multiplex +.. autofunction:: paddle.v2.layer.multiplex :noindex: .. _api_v2.layer_costs: @@ -447,97 +442,97 @@ Cost Layers cross_entropy_cost ------------------ -.. autoclass:: paddle.v2.layer.cross_entropy_cost +.. autofunction:: paddle.v2.layer.cross_entropy_cost :noindex: cross_entropy_with_selfnorm_cost -------------------------------- -.. autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost +.. autofunction:: paddle.v2.layer.cross_entropy_with_selfnorm_cost :noindex: multi_binary_label_cross_entropy_cost ------------------------------------- -.. autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost +.. autofunction:: paddle.v2.layer.multi_binary_label_cross_entropy_cost :noindex: classification_cost ------------------- -.. autoclass:: paddle.v2.layer.classification_cost +.. autofunction:: paddle.v2.layer.classification_cost :noindex: huber_regression_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_regression_cost +.. autofunction:: paddle.v2.layer.huber_regression_cost :noindex: huber_classification_cost ------------------------- -.. autoclass:: paddle.v2.layer.huber_classification_cost +.. autofunction:: paddle.v2.layer.huber_classification_cost :noindex: lambda_cost ----------- -.. autoclass:: paddle.v2.layer.lambda_cost +.. autofunction:: paddle.v2.layer.lambda_cost :noindex: square_error_cost ----------------- -.. autoclass:: paddle.v2.layer.square_error_cost +.. autofunction:: paddle.v2.layer.square_error_cost :noindex: rank_cost --------- -.. autoclass:: paddle.v2.layer.rank_cost +.. autofunction:: paddle.v2.layer.rank_cost :noindex: sum_cost --------- -.. autoclass:: paddle.v2.layer.sum_cost +.. autofunction:: paddle.v2.layer.sum_cost :noindex: crf --- -.. autoclass:: paddle.v2.layer.crf +.. autofunction:: paddle.v2.layer.crf :noindex: crf_decoding ------------ -.. autoclass:: paddle.v2.layer.crf_decoding +.. autofunction:: paddle.v2.layer.crf_decoding :noindex: ctc --- -.. autoclass:: paddle.v2.layer.ctc +.. autofunction:: paddle.v2.layer.ctc :noindex: warp_ctc -------- -.. autoclass:: paddle.v2.layer.warp_ctc +.. autofunction:: paddle.v2.layer.warp_ctc :noindex: nce --- -.. autoclass:: paddle.v2.layer.nce +.. autofunction:: paddle.v2.layer.nce :noindex: hsigmoid --------- -.. autoclass:: paddle.v2.layer.hsigmoid +.. autofunction:: paddle.v2.layer.hsigmoid :noindex: smooth_l1_cost -------------- -.. autoclass:: paddle.v2.layer.smooth_l1_cost +.. autofunction:: paddle.v2.layer.smooth_l1_cost :noindex: multibox_loss -------------- -.. autoclass:: paddle.v2.layer.multibox_loss +.. autofunction:: paddle.v2.layer.multibox_loss :noindex: detection_output ---------------- -.. autoclass:: paddle.v2.layer.detection_output +.. autofunction:: paddle.v2.layer.detection_output :noindex: Check Layer @@ -545,7 +540,7 @@ Check Layer eos --- -.. autoclass:: paddle.v2.layer.eos +.. autofunction:: paddle.v2.layer.eos :noindex: Activation @@ -553,5 +548,5 @@ Activation prelu -------- -.. autoclass:: paddle.v2.layer.prelu +.. autofunction:: paddle.v2.layer.prelu :noindex: diff --git a/doc/v2/api/index_en.rst b/doc/v2/api/index_en.rst index b11cd449affd1dcd9d3f42492961469331350942..70c5c524aaf0a9ae003bf4340c3f268c225d4419 100644 --- a/doc/v2/api/index_en.rst +++ b/doc/v2/api/index_en.rst @@ -8,4 +8,3 @@ API model_configs.rst data.rst run_logic.rst - fluid/index.rst diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 853bdb21bbcf07ae1742d2196dbcfe4668828b7b..095da19cd41d29bfa72ab23abd24bec45f925a86 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -60,6 +60,7 @@ paddlepaddle-gpu==0.11.0 使用CUDA 7.5和cuDNN 5编译的0.11.0版 "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index fecf6d3712feac3265100a6121901ba784f7d5cc..8406e4aa1fbb953c3b615b10d1bcb2c45974dde0 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -63,6 +63,7 @@ If the links below shows up the login form, just click "Log in as guest" to star "cpu_noavx_openblas", "`paddlepaddle-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" + "cuda9.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-latest-cp27-cp27mu-linux_x86_64.whl `__", "`paddlepaddle_gpu-latest-cp27-cp27m-linux_x86_64.whl `__" .. _pip_dependency: diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6bc770580640f242cfce6a9838f00210f785010a..6286dda4a54991b7a1042aed9886fdcb694198ba 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -84,7 +84,7 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor) cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 4a6f53cba1f46214dbff3058b221f878ecf46613..e15232a77bb9c3e325b55737ea7abc55e3121708 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -330,8 +330,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } for (auto& op : ctx->ops_) { - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + // NOTE! Please do not delete this line, it's usefull because the debug + // string before and after op.run are different, after run the output + // will have right shape which is usefull for debug. + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index c633a2f847683debce08c40b0c2ed6e58c0a7ad1..d22ac66c5c5f5c395849286f3f42e74ebd8f1f13 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -69,6 +69,19 @@ static DDim GetDims(const Scope& scope, const std::string& name, } } +static int GetRowSize(const Scope& scope, const std::string& name) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + return -1; + } + + if (var->IsType()) { + return var->Get().rows().size(); + } + + return -1; +} + static LoD GetLoD(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -153,6 +166,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < input.second.size(); ++i) { ss << input.second[i]; if (scope) { + int row_size = GetRowSize(*scope, input.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, input.second[i], true) << "]"; ss << "(" << GetLoD(*scope, input.second[i]) << ")"; } @@ -173,6 +190,10 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { for (size_t i = 0; i < output.second.size(); ++i) { ss << output.second[i]; if (scope) { + int row_size = GetRowSize(*scope, output.second[i]); + if (row_size >= 0) { + ss << "[row_size=" << row_size << "]"; + } ss << "[" << GetDims(*scope, output.second[i], true) << "]"; ss << "(" << GetLoD(*scope, output.second[i]) << ")"; } diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 0e4a56d4a45a732cfcf43b09228bc0c44df5924c..8206cc9890160da756efb13c991020f09b20126a 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -19,10 +19,17 @@ limitations under the License. */ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using batch_norm_bwd = mkldnn::batch_normalization_backward; +using batch_norm_fwd = mkldnn::batch_normalization_forward; +using framework::DataLayout; +using framework::Tensor; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; using paddle::platform::MKLDNNDeviceContext; using paddle::platform::MKLDNNMemDesc; -using mkldnn::memory; +using platform::to_void_cast; template using EigenArrayMap = @@ -64,21 +71,12 @@ void run_batch_norm_op(Args &&... args) { mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } -template -inline void *cast_const_to_void(const T *t) { - return static_cast(const_cast(t)); -} } // namespace template class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - const float epsilon = ctx.Attr("epsilon"); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -99,41 +97,53 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { const auto *scale = ctx.Input("Scale"); const auto *shift = ctx.Input("Bias"); - y->mutable_data(ctx.GetPlace()); - mean_out->mutable_data(ctx.GetPlace()); - variance_out->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && + x->format() != memory::format::format_undef, + "Wrong layout/format set for Input x tensor"); + + const T *x_data = x->data(); + const T *mean_data = mean->data(); + const T *variance_data = variance->data(); + T *y_data = y->mutable_data(ctx.GetPlace()); + T *mean_out_data = mean_out->mutable_data(ctx.GetPlace()); + T *variance_out_data = variance_out->mutable_data(ctx.GetPlace()); + T *batch_mean_data = nullptr; + T *batch_variance_data = nullptr; if (!is_test) { - batch_mean->mutable_data(ctx.GetPlace()); - batch_variance->mutable_data(ctx.GetPlace()); + batch_mean_data = batch_mean->mutable_data(ctx.GetPlace()); + batch_variance_data = batch_variance->mutable_data(ctx.GetPlace()); } auto propagation = is_test == true ? mkldnn::prop_kind::forward_scoring : mkldnn::prop_kind::forward_training; - auto dims = paddle::framework::vectorize2int(x->dims()); - - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - - auto src_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto dst_pd = mkldnn::memory::primitive_desc{dst_md, mkldnn_engine}; - - auto src = mkldnn::memory{src_pd, cast_const_to_void(x->data())}; - auto dst = mkldnn::memory{dst_pd, y->data()}; + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + const unsigned int ic = scale_tz[0]; unsigned flags = mkldnn::use_scale_shift; if (is_test) flags |= mkldnn::use_global_stats; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); + + // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; - auto batch_norm_fwd_desc = - bn_fwd_types::op_desc{propagation, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ + propagation, src_memory.get_primitive_desc().desc(), epsilon, flags}; + std::shared_ptr batch_norm_fwd_pd = + std::shared_ptr( + new batch_norm_fwd::primitive_desc(batch_norm_fwd_desc, + mkldnn_engine)); - const unsigned int ic = dims[1]; + // Save the pd to be used in backward pass + const std::string key = ctx.op().Output("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -143,73 +153,58 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { copy_to_weights(scale->data(), scale->data() + ic, shift->data(), shift->data() + ic, &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_fwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // crate mkldnn memory for weights(scale/shift) + auto scaleshift_memory = memory(batch_norm_fwd_pd->weights_primitive_desc(), + scaleshift_data.data()); - if (is_test) { - auto mean_memory = mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(mean->data())}; + // create mkldnn memory for output y tensor + auto dst_memory = memory(batch_norm_fwd_pd->dst_primitive_desc(), y_data); + if (is_test) { + // create mkldnn memory for stats (as input) + auto mean_memory = memory(batch_norm_fwd_pd->mean_primitive_desc(), + to_void_cast(mean_data)); auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(variance->data())}; + memory(batch_norm_fwd_pd->variance_primitive_desc(), + to_void_cast(variance_data)); run_batch_norm_op( - batch_norm_fwd_pd, src, (const mkldnn::primitive::at &)mean_memory, + *batch_norm_fwd_pd, src_memory, + (const mkldnn::primitive::at &)mean_memory, (const mkldnn::primitive::at &)variance_memory, scaleshift_memory, - dst); + dst_memory); } else { + // create mkldnn memory for stats (as output) auto mean_memory = - mkldnn::memory{batch_norm_fwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance_memory = - mkldnn::memory{batch_norm_fwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; + memory(batch_norm_fwd_pd->mean_primitive_desc(), batch_mean_data); + auto variance_memory = memory( + batch_norm_fwd_pd->variance_primitive_desc(), batch_variance_data); - run_batch_norm_op(batch_norm_fwd_pd, src, - scaleshift_memory, dst, + run_batch_norm_op(*batch_norm_fwd_pd, src_memory, + scaleshift_memory, dst_memory, mean_memory, variance_memory); } if (!is_test) { - const unsigned int in = dims[0]; - const unsigned int sample_size = x->numel() / in / ic; - - // saved_xx is use just in this batch of data - EigenVectorArrayMap saved_mean_e( - batch_mean->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap saved_variance_e( - batch_variance->mutable_data(ctx.GetPlace()), ic); - saved_mean_e.setZero(); - saved_variance_e.setZero(); - - const unsigned int x_arr_size = in * ic; - ConstEigenArrayMap x_arr(x->data(), sample_size, x_arr_size); - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_mean_e(nc % ic) += x_arr.col(nc).sum(); - } - saved_mean_e /= in * sample_size; - for (unsigned int nc = 0; nc < x_arr_size; ++nc) { - saved_variance_e(nc % ic) += - (x_arr.col(nc) - saved_mean_e(nc % ic)).matrix().squaredNorm(); - } - saved_variance_e /= in * sample_size; - - ConstEigenVectorArrayMap mean_arr{mean->data(), ic}; - ConstEigenVectorArrayMap variance_arr{variance->data(), ic}; - - EigenVectorArrayMap running_mean_arr( - mean_out->mutable_data(ctx.GetPlace()), ic); - EigenVectorArrayMap running_var_arr( - variance_out->mutable_data(ctx.GetPlace()), ic); + // mkldnn only compute stats for current batch + // so we need compute momentum stats via Eigen lib + EigenVectorArrayMap batch_mean_e(batch_mean_data, ic); + EigenVectorArrayMap batch_variance_e(batch_variance_data, ic); + ConstEigenVectorArrayMap mean_e(mean_data, ic); + ConstEigenVectorArrayMap variance_e{variance_data, ic}; + + EigenVectorArrayMap running_mean_e(mean_out_data, ic); + EigenVectorArrayMap running_variance_e(variance_out_data, ic); auto one_minus_momentum = 1. - momentum; - running_mean_arr = - mean_arr * momentum + saved_mean_e * one_minus_momentum; - running_var_arr = - variance_arr * momentum + saved_variance_e * one_minus_momentum; + running_mean_e = mean_e * momentum + batch_mean_e * one_minus_momentum; + running_variance_e = + variance_e * momentum + batch_variance_e * one_minus_momentum; } + + y->set_layout(DataLayout::kMKLDNN); + y->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } }; @@ -217,11 +212,6 @@ template class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext &ctx) const override { - auto data_layout_str = ctx.Attr("data_layout"); - auto data_layout = framework::StringToDataLayout(data_layout_str); - PADDLE_ENFORCE(data_layout == framework::DataLayout::kNCHW, - "MKLDNN batch normalization handles only NCHW data layout"); - auto &dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); @@ -238,88 +228,132 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto *diff_scale = ctx.Output(framework::GradVarName("Scale")); auto *diff_shift = ctx.Output(framework::GradVarName("Bias")); - diff_x->mutable_data(ctx.GetPlace()); - diff_scale->mutable_data(ctx.GetPlace()); - diff_shift->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE(diff_y->layout() == DataLayout::kMKLDNN && + diff_y->format() != memory::format::format_undef, + "Wrong layout/format set for Input diff_y tensor"); + + const T *x_data = x->data(); + const T *diff_y_data = diff_y->data(); + const T *batch_mean_data = batch_mean->data(); + const T *batch_variance_data = batch_variance->data(); + const T *scale_data = scale->data(); + const T *shift_data = shift->data(); + T *diff_x_data = diff_x->mutable_data(ctx.GetPlace()); + T *diff_scale_data = diff_scale->mutable_data(ctx.GetPlace()); + T *diff_shift_data = diff_shift->mutable_data(ctx.GetPlace()); + + auto src_tz = paddle::framework::vectorize2int(x->dims()); + auto diff_src_tz = src_tz; + auto dst_tz = src_tz; + auto diff_dst_tz = dst_tz; + auto scale_tz = paddle::framework::vectorize2int(scale->dims()); + PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + + const unsigned int ic = scale_tz[0]; + + // Retrieve bn_fwd_pd from device context + const std::string key = ctx.op().Input("SavedMean"); + const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; + auto batch_norm_fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_batch_norm_fwd_pd)); + PADDLE_ENFORCE(batch_norm_fwd_pd != nullptr, + "Fail to find batch_norm_fwd_pd in device context"); - auto dims = paddle::framework::vectorize2int(x->dims()); - unsigned flags = mkldnn::use_scale_shift | !mkldnn::use_global_stats; + using bn_bwd_types = bn_type_traits; - auto src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_src_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); - auto diff_dst_md = - MKLDNNMemDesc(dims, memory::data_type::f32, memory::format::nchw); + // create mkldnn memory from input diff_y tensor + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, + mkldnn_engine}, + to_void_cast(diff_y_data)); - using bn_bwd_types = bn_type_traits; - using bn_fwd_types = bn_type_traits; + // create mkldnn memory from input x tensor + auto src_memory = + memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, + to_void_cast(x_data)); - auto batch_norm_fwd_desc = bn_fwd_types::op_desc{ - mkldnn::prop_kind::forward_training, src_md, epsilon, flags}; - auto batch_norm_fwd_pd = - bn_fwd_types::op_prim{batch_norm_fwd_desc, mkldnn_engine}; + // for diff_dst, try to use same format as dst in forward pass + auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); + auto diff_dst_md = diff_dst_pd.desc(); + // create primitive descriptor for batch norm backward + unsigned flags = mkldnn::use_scale_shift; auto batch_norm_bwd_desc = bn_bwd_types::op_desc{ - mkldnn::prop_kind::backward, diff_dst_md, dst_md, epsilon, flags}; + mkldnn::prop_kind::backward, diff_dst_md, + src_memory.get_primitive_desc().desc(), epsilon, flags}; auto batch_norm_bwd_pd = bn_bwd_types::op_prim{ - batch_norm_bwd_desc, mkldnn_engine, batch_norm_fwd_pd}; - - auto src = mkldnn::memory{{src_md, mkldnn_engine}, - cast_const_to_void(x->data())}; - - auto mean = mkldnn::memory{batch_norm_bwd_pd.mean_primitive_desc(), - cast_const_to_void(batch_mean->data())}; - - auto variance = - mkldnn::memory{batch_norm_bwd_pd.variance_primitive_desc(), - cast_const_to_void(batch_variance->data())}; - - auto diff_dst = mkldnn::memory{{diff_dst_md, mkldnn_engine}, - cast_const_to_void(diff_y->data())}; + batch_norm_bwd_desc, mkldnn_engine, *batch_norm_fwd_pd}; + + // reorder user_diff_dst if it's not in preferred format + auto diff_dst_memory = user_diff_dst_memory; + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; + if (diff_dst_pd != user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = memory(diff_dst_pd); + reorder_diff_dst = reorder(user_diff_dst_memory, diff_dst_memory); + is_diff_dst_reordered = true; + } - const unsigned int ic = dims[1]; + // create mkldnn memory for input tensors (src/mean/variance) + auto mean_memory = memory(batch_norm_bwd_pd.mean_primitive_desc(), + to_void_cast(batch_mean_data)); + auto variance_memory = memory(batch_norm_bwd_pd.variance_primitive_desc(), + to_void_cast(batch_variance_data)); + // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; std::vector scaleshift_data; scaleshift_data.reserve(scaleshift_size); - copy_to_weights(scale->data(), scale->data() + ic, shift->data(), - shift->data() + ic, &scaleshift_data); + copy_to_weights(scale_data, scale_data + ic, shift_data, shift_data + ic, + &scaleshift_data); - auto scaleshift_memory = mkldnn::memory{ - batch_norm_bwd_pd.weights_primitive_desc(), scaleshift_data.data()}; + // create mkldnn memory for input tensors (scale/shift) + auto scaleshift_memory = memory(batch_norm_bwd_pd.weights_primitive_desc(), + scaleshift_data.data()); + // create mkldnn memory for output diff weights (combined scale/shift) std::vector diff_scaleshift_data; diff_scaleshift_data.reserve(scaleshift_size); - copy_to_weights(diff_scale->data(), diff_scale->data() + ic, - diff_shift->data(), diff_shift->data() + ic, - &diff_scaleshift_data); - auto diff_scaleshift_memory = - mkldnn::memory{batch_norm_bwd_pd.diff_weights_primitive_desc(), - diff_scaleshift_data.data()}; - - auto diff_src = mkldnn::memory{{diff_src_md, mkldnn_engine}, - static_cast(diff_x->data())}; - - run_batch_norm_op( - batch_norm_bwd_pd, src, mean, variance, diff_dst, scaleshift_memory, - diff_src, diff_scaleshift_memory); - + memory(batch_norm_bwd_pd.diff_weights_primitive_desc(), + diff_scaleshift_data.data()); + + // here assume diff_src is in the same format of src + auto diff_src_memory = memory(src_memory.get_primitive_desc(), diff_x_data); + + // finally create batch_norm backward primitive + auto batch_norm_bwd_prim = + batch_norm_bwd(batch_norm_bwd_pd, src_memory, mean_memory, + variance_memory, diff_dst_memory, scaleshift_memory, + diff_src_memory, diff_scaleshift_memory); + + // execute optional reorder and batch_norm backward primitive + std::vector pipeline; + if (is_diff_dst_reordered) pipeline.push_back(reorder_diff_dst); + pipeline.push_back(batch_norm_bwd_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // copy back diff sacle/shift to output tensors (diff scale/shift) + diff_scaleshift_data.resize(scaleshift_size); auto it = std::begin(diff_scaleshift_data); - std::copy(it, std::next(it, ic), diff_scale->data()); + std::copy(it, std::next(it, ic), diff_scale_data); std::copy(std::next(it, ic), std::end(diff_scaleshift_data), - diff_shift->data()); + diff_shift_data); + + // set layout/format of output tensors + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory.get_primitive_desc() + .desc() + .data.format); } }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_KERNEL(batch_norm, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNOpKernel); -REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, paddle::platform::CPUPlace, +REGISTER_OP_KERNEL(batch_norm_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::BatchNormMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 92fbb9adaf6a6a335abee3c9443d4b1d6097021b..625ca2d7c4c70d1098b0fb28380d8d1eb24cb338 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -110,19 +110,19 @@ class BatchNormOp : public framework::OperatorWithKernel { ctx.Input("Variance")->type()), "Variance input should be of float type"); - framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; - #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; + library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; } #endif + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library_); + library); } }; @@ -370,19 +370,21 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } - framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && + if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - layout_ = framework::DataLayout::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + layout = framework::DataLayout::kMKLDNN; } #endif + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + layout, library); } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 63d371310d2a26a1460e527fc51923dfd6e0b8bc..6b06913d1c83f4534238ac3dd22ac4035c0f0fbf 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -18,6 +18,17 @@ namespace paddle { namespace operators { +using conv_bwd_data = mkldnn::convolution_backward_data; +using conv_bwd_weights = mkldnn::convolution_backward_weights; +using conv_fwd = mkldnn::convolution_forward; +using framework::DataLayout; +using mkldnn::memory; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; +using platform::GetMKLDNNFormat; + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -25,6 +36,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + // Get unique name for index + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -33,10 +48,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); - // Get an unique name from "argument" name of "Output" variable - // This name will be used as key when saving info into device context - const std::string key = ctx.op().Output("Output"); - const std::string key_conv_pd = key + "@conv_pd"; + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); @@ -63,60 +80,86 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); - auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); - - std::shared_ptr conv_pd = - ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, - mkldnn_engine); - - // save conv_pd into global device context to be referred in backward path - dev_ctx.SetBlob(key_conv_pd, conv_pd); + // create mkldnn memory from input tensors (data/weights) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + + // create a conv primitive descriptor and save it for usage in backward + std::shared_ptr conv_pd = ConvFwdPrimitiveDesc( + src_md, weights_md, dst_md, strides, paddings, mkldnn_engine); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_pd->src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_pd->src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_pd->weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_pd->weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + // create memory primitive for conv dst + auto dst_memory = memory(conv_pd->dst_primitive_desc(), output_data); // create convolution op primitive - auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, - weights_memory, dst_memory); + auto conv_prim = conv_fwd(*conv_pd, src_memory, weights_memory, dst_memory); // push primitive to stream and wait until it's executed - std::vector pipeline{conv_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_weights_reordered) pipeline.push_back(reorder_weights); + pipeline.push_back(conv_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(dst_memory)); } private: - std::unique_ptr - ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, - const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& dst, - const std::vector& strides, - const std::vector& paddings, - const mkldnn::engine& engine) const { - mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; - mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; - - auto conv_desc = mkldnn::convolution_forward::desc( - mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, - dst, stride_dims, padding_dims, padding_dims, - mkldnn::padding_kind::zero); - - auto p_conv_pd = - new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); - - return std::unique_ptr( - p_conv_pd); + std::unique_ptr ConvFwdPrimitiveDesc( + const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, const mkldnn::engine& engine) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = + conv_fwd::desc(mkldnn::prop_kind::forward, mkldnn::convolution_direct, + src, weights, dst, stride_dims, padding_dims, + padding_dims, mkldnn::padding_kind::zero); + + auto p_conv_pd = new conv_fwd::primitive_desc(conv_desc, engine); + + return std::unique_ptr(p_conv_pd); } }; @@ -139,6 +182,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(output->layout() == DataLayout::kMKLDNN && + output->format() != memory::format::format_undef, + "Wrong layout/format set for Output tensor"); + PADDLE_ENFORCE(output_grad->layout() == DataLayout::kMKLDNN && + output_grad->format() != memory::format::format_undef, + "Wrong layout/format set for output_grad tensor"); + if (!input_grad && !filter_grad) return; // Get an unique name from "argument" name of "Output" variable @@ -167,108 +223,147 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { paddle::framework::vectorize2int(filter->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // TODO(pzelazko-intel): support more formats - auto src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto diff_src_md = platform::MKLDNNMemDesc( - src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_weights_md = - platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, - mkldnn::memory::format::oihw); - auto diff_dst_md = platform::MKLDNNMemDesc( - dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - - // create memory - auto diff_dst_memory = mkldnn::memory( - {diff_weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(output_grad_data))); + // create mkldnn memory from input tensors (input/weights/output_grad) + auto user_src_memory = memory( + {{{src_tz}, memory::data_type::f32, input->format()}, mkldnn_engine}, + to_void_cast(input_data)); + auto user_weights_memory = + memory({{{weights_tz}, memory::data_type::f32, filter->format()}, + mkldnn_engine}, + to_void_cast(filter_data)); + auto user_diff_dst_memory = + memory({{{dst_tz}, memory::data_type::f32, output_grad->format()}, + mkldnn_engine}, + to_void_cast(output_grad_data)); + + /* create memory descriptor for conv backward without specified format + * ('any') which lets a primitive (conv backward in this case) choose + * the memory format preferred for best performance + */ + auto src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto diff_src_md = platform::MKLDNNMemDesc(src_tz, memory::data_type::f32, + memory::format::any); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::f32, memory::format::any); + auto diff_dst_md = platform::MKLDNNMemDesc(dst_tz, memory::data_type::f32, + memory::format::any); + // Retrieve conv_pd from device context - auto conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); + auto conv_pd = std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); PADDLE_ENFORCE(conv_pd != nullptr, "Fail to find conv_pd in device context"); // create backward conv primitive for weights if (filter_grad) { - // create primitive descriptor - mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = - ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, - strides, paddings, *conv_pd, - mkldnn_engine); - - // create memory + // create backward convolution primitive descriptor + auto conv_bwd_weights_desc = conv_bwd_weights::desc( + mkldnn::convolution_direct, src_md, diff_weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_weights_pd = conv_bwd_weights::primitive_desc( + conv_bwd_weights_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto src_memory = user_src_memory; + primitive reorder_src; + bool is_src_reordered = false; + if (memory::primitive_desc(conv_bwd_weights_pd.src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + src_memory = memory(conv_bwd_weights_pd.src_primitive_desc()); + reorder_src = reorder(user_src_memory, src_memory); + is_src_reordered = true; + } + + auto diff_dst_memory_4filter = user_diff_dst_memory; + primitive reorder_diff_dst_4filter; + bool is_diff_dst_reordered_4filter = false; + if (memory::primitive_desc( + conv_bwd_weights_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4filter = + memory(conv_bwd_weights_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4filter = + reorder(user_diff_dst_memory, diff_dst_memory_4filter); + is_diff_dst_reordered_4filter = true; + } + + // create mkldnn memory for output (i.e. diff weights) auto diff_weights_memory = - mkldnn::memory({diff_weights_md, mkldnn_engine}, - reinterpret_cast(filter_grad_data)); - auto src_memory = - mkldnn::memory({src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_data))); + memory(conv_bwd_weights_pd.diff_weights_primitive_desc(), + reinterpret_cast(filter_grad_data)); // create backward conv primitive for weights - auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( - conv_bwd_weights_pd, src_memory, diff_dst_memory, - diff_weights_memory); + auto conv_bwd_weights_prim = + conv_bwd_weights(conv_bwd_weights_pd, src_memory, + diff_dst_memory_4filter, diff_weights_memory); // push primitive and execute it - std::vector pipeline{conv_bwd_weights_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + std::vector pipeline; + if (is_src_reordered) pipeline.push_back(reorder_src); + if (is_diff_dst_reordered_4filter) + pipeline.push_back(reorder_diff_dst_4filter); + pipeline.push_back(conv_bwd_weights_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(diff_weights_memory)); } if (input_grad) { - // create primitive descriptor - mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = - ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, - strides, paddings, *conv_pd, mkldnn_engine); - - // create memory - auto diff_src_memory = mkldnn::memory( - {diff_src_md, mkldnn_engine}, - reinterpret_cast(const_cast(input_grad_data))); - auto weights_memory = - mkldnn::memory({weights_md, mkldnn_engine}, - reinterpret_cast(const_cast(filter_data))); + // create backward convolution primitive descriptor + auto conv_bwd_data_desc = conv_bwd_data::desc( + mkldnn::convolution_direct, diff_src_md, weights_md, diff_dst_md, + strides, paddings, paddings, mkldnn::padding_kind::zero); + auto conv_bwd_data_pd = conv_bwd_data::primitive_desc( + conv_bwd_data_desc, mkldnn_engine, *conv_pd); + + // create reorder primitive if the input format is not the preferred one + auto weights_memory = user_weights_memory; + primitive reorder_weights; + bool is_weights_reordered = false; + if (memory::primitive_desc(conv_bwd_data_pd.weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + weights_memory = memory(conv_bwd_data_pd.weights_primitive_desc()); + reorder_weights = reorder(user_weights_memory, weights_memory); + is_weights_reordered = true; + } + + auto diff_dst_memory_4data = user_diff_dst_memory; + primitive reorder_diff_dst_4data; + bool is_diff_dst_reordered_4data = false; + if (memory::primitive_desc(conv_bwd_data_pd.diff_dst_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory_4data = + memory(conv_bwd_data_pd.diff_dst_primitive_desc()); + reorder_diff_dst_4data = + reorder(user_diff_dst_memory, diff_dst_memory_4data); + is_diff_dst_reordered_4data = true; + } + + // create mkldnn memory for output (i.e. diff src) + auto diff_src_memory = memory(conv_bwd_data_pd.diff_src_primitive_desc(), + reinterpret_cast(input_grad_data)); // create backward conv primitive for data - auto conv_bwd_data_prim = mkldnn::convolution_backward_data( - conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); + auto conv_bwd_data_prim = + conv_bwd_data(conv_bwd_data_pd, diff_dst_memory_4data, weights_memory, + diff_src_memory); - // push primitive to stream and wait until it's executed - std::vector pipeline{conv_bwd_data_prim}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + // push primitive and execute it + std::vector pipeline; + if (is_weights_reordered) pipeline.push_back(reorder_weights); + if (is_diff_dst_reordered_4data) + pipeline.push_back(reorder_diff_dst_4data); + pipeline.push_back(conv_bwd_data_prim); + stream(stream::kind::eager).submit(pipeline).wait(); + + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(diff_src_memory)); } } // Compute() - - private: - mkldnn::convolution_backward_weights::primitive_desc - ConvBwdWeightsPrimitiveDesc( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( - mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc( - conv_bwd_weights_desc, engine, conv_pd); - } - - mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( - const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, - const mkldnn::memory::desc& diff_dst, const std::vector& strides, - const std::vector& paddings, - const mkldnn::convolution_forward::primitive_desc& conv_pd, - const mkldnn::engine& engine) const { - auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( - mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, - paddings, paddings, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, - engine, conv_pd); - } }; } // namespace operators diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 27f1313116aad99d34fa8f1d3d6a1e7aced4d394..37153d58439a90190eb2ad82d5dcc145e22dfa48 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -75,9 +75,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library{framework::LibraryType::kPlain}; - - std::string data_format = ctx.Attr("data_format"); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout = framework::StringToDataLayout(data_format); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 111e58844c83806af4ebe0aa9e2126a9ddec1d8a..f824eee4e7d1ef19c9a38fd5d3369265f9c549a0 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -67,6 +67,10 @@ class GenNCCLIdOp : public framework::OperatorBase { client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); } client->Wait(); + for (auto& ep : endpoint_list) { + client->AsyncSendBatchBarrier(ep); + } + client->Wait(); VLOG(3) << "sending completed..."; } diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c6ec4ab047d5e91625e646fd26108d2e477cdce5 --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.cc @@ -0,0 +1,128 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_ids_op.h" + +namespace paddle { +namespace operators { + +class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); + AddInput( + "X", + "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the " + "size of embedding table") + .AsDuplicable(); + AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors."); + + AddComment(R"DOC( +Merge multi LoDTensor's into one according to Ids's shard num. + + +split_ids_op -> prefetch_op -> merge_ids_op + + +merge_ids_op should be used after split_ids_op and prefetch_op, split_ids_op + will split input Ids into multiple tensors according to Id's shard number. +prefetch_op will send them to parameter server to prefetch embedding value +back. During split, the order of ids is disordered. In merge_ids_op we use +the original Ids to restore the order of the fetched embedding value and + also pass the lod information to the merged output. + + +Example: + + Ids = [1,2,3,4,5,6] # 3 shared + +split_ids_op -> + + Id0 = [3, 6] # id % 3 == 0 + Id1 = [1, 4] # id % 3 == 1 + Id2 = [2, 5] # id % 3 == 2 + +prefetch_op -> + + X0 = [[0.3 0.3] # 3 + [0.6 0.6]] # 6 + X1 = [[0.1 0.1] # 1 + [0.4 0.4]] # 4 + X2 = [[0.2 0.2] # 2 + [0.5 0.5]] # 5 + +merge_ids_op -> + + Out = [[0.1 0.1] # 1 + [0.2 0.2] # 2 + [0.3 0.3] # 3 + [0.4 0.4] # 4 + [0.5 0.5] # 5 + [0.6 0.6]] # 6 +)DOC"); + } +}; + +class MergeIdsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out."); + + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + auto ids_dims = ctx->GetInputDim("Ids"); + if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } + auto x_var_type = ctx->GetInputsVarType("X"); + for (auto &var_type : x_var_type) { + PADDLE_ENFORCE_EQ(var_type, framework::proto::VarType::LOD_TENSOR, + "input X only support lod tensors"); + } + ctx->ShareLoD("Ids", "Out"); + } + + private: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.MultiInput("X").front()->type()), + ctx.GetPlace()); + } +}; + +class MergeIdsOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + auto *input_var = block->Var(op_desc.Input("Ids")[0]); + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(input_var->GetType()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(merge_ids, ops::MergeIdsOp, ops::MergeIdsOpMaker, + ops::MergeIdsOpInferVarType); +REGISTER_OP_CPU_KERNEL( + merge_ids, ops::MergeIdsOpKernel); diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h new file mode 100644 index 0000000000000000000000000000000000000000..83712a8519c6817151e1922c606c0fdd4682a2db --- /dev/null +++ b/paddle/fluid/operators/merge_ids_op.h @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeIdsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto place = ctx.GetPlace(); + if (!platform::is_cpu_place(place)) { + PADDLE_THROW("MergeIds do not support GPU kernel"); + } + VLOG(3) << "run in MergeIdsOpKernel"; + + const auto *ids_var = ctx.InputVar("Ids"); + PADDLE_ENFORCE(ids_var->IsType(), + "only support to merge Ids of LoDTensor"); + + const auto &ids_tensor = ids_var->Get(); + const auto &ids_dims = ids_tensor.dims(); + const int64_t *ids = ids_tensor.data(); + + auto x_tensors = ctx.MultiInput("X"); + + auto *out = ctx.Output("Out"); + + int batch_size = 0; + int embedding_size = 0; + for (auto &input : x_tensors) { + if (framework::product(input->dims()) != 0) { + if (embedding_size == 0) { + embedding_size = input->dims()[1]; + } + PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1], + "embedding size of all input should be the same"); + batch_size += input->dims()[0]; + } + } + PADDLE_ENFORCE_EQ( + batch_size, ids_dims[0], + "the batch size of ids and merged embedding value should be the same"); + + const size_t shard_num = x_tensors.size(); + + if (shard_num == 1) { + VLOG(3) << "only one shard, we can copy the data directly"; + TensorCopy(*x_tensors[0], place, out); + } else { + std::vector in_indexs(shard_num, 0); + auto *out_data = out->mutable_data( + framework::make_ddim({batch_size, embedding_size}), place); + // copy data from ins[shard_num] to out. + for (int i = 0; i < ids_dims[0]; ++i) { + int64_t id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + int index = in_indexs[shard_id]; + memcpy(out_data + embedding_size * i, + x_tensors[shard_id]->data() + index * embedding_size, + sizeof(T) * embedding_size); + in_indexs[shard_id] += 1; + } + + for (size_t i = 0; i < shard_num; ++i) { + PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0], + "after merge, all data in x_tensor should be used"); + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index c202eed354c5f1a91e93e1c3919d1bfebc1bc401..40dc7c9a0b6a40f2419ace3ce7e0e5e82bc95c1a 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -59,9 +59,10 @@ inline size_t CpuTotalPhysicalMemory() { size_t CpuMaxAllocSize() { // For distributed systems, it requires configuring and limiting // the fraction of memory to use. - return std::min(static_cast(FLAGS_fraction_of_cpu_memory_to_use * - CpuTotalPhysicalMemory()), - FLAGS_initial_cpu_memory_in_mb * 1 << 20); + return std::min( + static_cast(FLAGS_fraction_of_cpu_memory_to_use * + CpuTotalPhysicalMemory()), + static_cast(FLAGS_initial_cpu_memory_in_mb * 1 << 20)); } size_t CpuMinChunkSize() { diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader. diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bbd35aaecba27ea9fd66b9be585a972690980ab8..f6438c82ac207d0e38d8be5e9d6252b28e72826e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -382,7 +382,7 @@ class Operator(object): 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine', 'ncclInit', 'channel_create', 'channel_close', 'channel_send', - 'channel_recv', 'select' + 'channel_recv', 'select', 'gen_nccl_id' } def __init__(self, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d5db75ebea4202bdc9afba251e3b24f760051123..80452a1e8b26cb9ca4bc1176349e5daf1fe09e2f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -261,9 +261,10 @@ def embedding(input, return tmp -# TODO(qijun): expose H0 and C0 def dynamic_lstm(input, size, + h_0=None, + c_0=None, param_attr=None, bias_attr=None, use_peepholes=True, @@ -324,6 +325,13 @@ def dynamic_lstm(input, (T X 4D), where T is the total time steps in this mini-batch, D is the hidden size. size(int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable hidden-hidden weights. @@ -387,12 +395,20 @@ def dynamic_lstm(input, cell = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) batch_cell_pre_act = helper.create_tmp_variable(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 helper.append_op( type='lstm', - inputs={'Input': input, - 'Weight': weight, - 'Bias': bias}, + inputs=inputs, outputs={ 'Hidden': hidden, 'Cell': cell, @@ -677,11 +693,13 @@ def dynamic_gru(input, attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) bias = helper.create_parameter( attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True) + batch_size = input.shape[0] inputs = {'Input': input, 'Weight': weight, 'Bias': bias} if h_0 != None: assert h_0.shape == ( - size, size), 'The shape of h0 should be(%d, %d)' % (size, size) - inputs['h0'] = h_0 + batch_size, size + ), 'The shape of h0 should be(batch_size, %d)' % size + inputs['H0'] = h_0 hidden = helper.create_tmp_variable(dtype) batch_gate = helper.create_tmp_variable(dtype) diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index 2df3da9cca7042222317de626460909f018cb107..8e222d26907e8fe697b596a67e62cc9df84afe0e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -96,10 +96,11 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index 224cca417e717bbcc54b58be6ac0219be207dea3..dbc7bc06c93157f271c79e85b6925468e861e57f 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -73,10 +73,11 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) test_reader = paddle.batch( - paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) + paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): if isinstance(event, fluid.EndStepEvent): diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 113dda88ca974c9e6241f127091bd96fb2af4a70..8c74be0f08855c20f5aa3ecd75622a51e94a0304 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -87,7 +87,9 @@ def train(use_cuda, train_program, params_dirname): def event_handler(event): if isinstance(event, fluid.EndEpochEvent): test_reader = paddle.batch( - paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) + paddle.dataset.imdb.test(word_dict), + batch_size=BATCH_SIZE, + drop_last=False) avg_cost, acc = trainer.test( reader=test_reader, feed_order=['words', 'label']) @@ -113,7 +115,8 @@ def train(use_cuda, train_program, params_dirname): train_reader = paddle.batch( paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=25000), - batch_size=BATCH_SIZE) + batch_size=BATCH_SIZE, + drop_last=False) trainer.train( num_epochs=1, diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index 8818cf96fa8f08036f9e23aae786f67b5614b2b9..be347cd5315668dde0454d7959dbf9bcfa465b5f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -56,7 +56,7 @@ BATCH_SIZE = 200 # fix the order of training data train_reader = paddle.batch( - paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE) + paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE, drop_last=False) # train_reader = paddle.batch( # paddle.reader.shuffle( diff --git a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f209bdf30faffc0b2c7932b7b10f384d6d61a831 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py @@ -0,0 +1,38 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestMergeIdsOp(OpTest): + def setUp(self): + self.op_type = "merge_ids" + ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') + x0 = np.array([[0.1, 0.2], [0.2, 0.3], [0.3, 0.4]]).astype('float32') + x1 = np.array([]).astype('float32') + x2 = np.array([[0.4, 0.5], [0.4, 0.5], [0.5, 0.6], + [0.5, 0.6]]).astype('float32') + out = np.array([[0.1, 0.2], [0.4, 0.5], [0.4, 0.5], [0.2, 0.3], + [0.5, 0.6], [0.5, 0.6], [0.3, 0.4]]).astype('float32') + self.inputs = {'Ids': ids, "X": [('x0', x0), ('x1', x1), ('x2', x2)]} + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 2480d4e76a1b5fd76b7dc8299c2f8fcae967145e..9c604170b8b53c9cbcf39b4978ae60ccad84648c 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -629,7 +629,7 @@ class DistributeTranspiler: if op.type == LOOKUP_TABLE_TYPE: continue_search_lookup_table_op = True - op_index = list(all_ops).index(op) + lookup_table_op_index = list(all_ops).index(op) ids_name = op.input("Ids") out_name = op.output("Out") @@ -649,7 +649,7 @@ class DistributeTranspiler: # insert split_ids_op program.global_block().insert_op( - index=op_index, + index=lookup_table_op_index, type="split_ids", inputs={ 'Ids': [ @@ -661,7 +661,7 @@ class DistributeTranspiler: # insert prefetch_op program.global_block().insert_op( - index=op_index + 1, + index=lookup_table_op_index + 1, type="prefetch", inputs={'X': prefetch_input_vars}, outputs={"Out": prefetch_output_vars}, @@ -672,16 +672,21 @@ class DistributeTranspiler: # insert concat_op program.global_block().insert_op( - index=op_index + 2, - type="concat", - inputs={'X': prefetch_output_vars}, + index=lookup_table_op_index + 2, + type="merge_ids", + inputs={ + 'Ids': [ + program.global_block().vars[varname] + for varname in ids_name + ], + 'X': prefetch_output_vars + }, outputs={ "Out": [ program.global_block().vars[varname] for varname in out_name ] - }, - attrs={"axis": 0}) + }) # delete lookup_table_op delete_ops(program.global_block(), [op]) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index e6f87ce61b1d16d4f98f111626776aa52c2ec35b..4e3beaf639bad9fed2862a5477095b66ef4b9aee 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -240,14 +240,15 @@ class ExtraLayerAttribute(object): :type error_clipping_threshold: float :param drop_rate: Dropout rate. Dropout will create a mask on layer output. The dropout rate is the zero rate of this mask. The - details of what dropout is please refer to `here - `_. + details of what dropout is please refer to `JMLRdropout + `_. :type drop_rate: float :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU. - The details allocation in parallel_nn please refer to `here - `_. + The details allocation in parallel_nn please refer to `use_case + `_. :type device: int """ diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index ebc31b23e0f5504b4bebccabe996b054c7fbce3b..e6a03759ef431086390e217eabcdff47e610346c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2556,7 +2556,7 @@ def img_conv_layer(input, the output will be obtained by concatenating the two results. The details of grouped convolution, please refer to: - `ImageNet Classification with Deep Convolutional Neural Networks + `ImageNet Classification With Deep Convolutional Neural Networks `_ The example usage is: @@ -5678,8 +5678,8 @@ def warp_ctc_layer(input, `_ library, which is used in `Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin `_, to compute Connectionist Temporal - Classification (CTC) loss. Besides, another `warp-ctc - `_ repository, which is forked from + Classification (CTC) loss. Besides, another `warp-ctc repository + `_ , which is forked from the official one, is maintained to enable more compiling options. During the building process, PaddlePaddle will clone the source codes, build and install it to :code:`third_party/install/warpctc` directory. diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py index d48c54fcbb66487617b1946bc69724870c8f879c..3c6a53db3c2287e8ef5931a06ca5dad455665ee0 100644 --- a/python/paddle/v2/minibatch.py +++ b/python/paddle/v2/minibatch.py @@ -15,7 +15,7 @@ __all__ = ['batch'] -def batch(reader, batch_size, drop_last=False): +def batch(reader, batch_size, drop_last=True): """ Create a batched reader.