diff --git a/.travis.yml b/.travis.yml index a406841f6abf01f15826f34fe4c63b4c24486ccd..361136ac2c8d899a0d7a4d7945083fcc489551b5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -27,15 +27,6 @@ script: # 43min timeout paddle/scripts/paddle_docker_build.sh ${JOB} if [ $? -eq 0 ] || [ $? -eq 142 ]; then true; else exit 1; fi; - - | - if [[ "$JOB" != "doc" ]]; then exit 0; fi; - # For document only - if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; - if [[ "$TRAVIS_BRANCH" != "develop" && ! "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi; - export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh - export DOCS_DIR=`pwd` - cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/CMakeLists.txt b/CMakeLists.txt index 231224f9249848b6e4981a98e0538794bf5d3c08..bdd48565edeca051f54e8fe4eb51cd1dbd5e836a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,14 @@ include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) +if(WITH_GPU) + include(cuda) + include(tensorrt) + include(external/anakin) +else() + set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." FORCE) +endif() + include(cudnn) # set cudnn libraries, must before configure include(cupti) include(configure) # add paddle env configuration @@ -228,14 +236,6 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) -if(WITH_GPU) - include(cuda) - include(tensorrt) - include(external/anakin) -else() - set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when GPU is set." 
FORCE) -endif() - if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index 2c84061ff572de4687b4d496f8ded6deee8d1011..9eebea816cbfc91052c95ecf99ecc4b0bea4e4c2 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -21,6 +21,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDNN_ROOT}/lib64 ${CUDNN_ROOT}/lib ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu + ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/ $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 07bab994d354df834d0667c69f307b2d7684fb22..82c958073cba92f00a341121e36ba45531b22aec 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -264,7 +264,10 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + # A plain set_property() call replaces ENVIRONMENT; the extra flags below are APPENDed so all three are exported. set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} APPEND PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} APPEND PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(cc_test) @@ -329,7 +332,10 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) + # A plain set_property() call replaces ENVIRONMENT; the extra flags below are APPENDed so all three are exported. set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) + set_property(TEST ${TARGET_NAME} APPEND PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true) + set_property(TEST ${TARGET_NAME} APPEND PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) endif() endif() endfunction(nv_test) @@ -577,7 +583,9 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env 
FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + FLAGS_cpu_deterministic=true + PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/doc/fluid/howto/optimization/timeline_cn.md b/doc/fluid/howto/optimization/timeline_cn.md index 5d061e1c00d2ca0194153730a39486b8357fa5b0..faf39f276dbddcd4961407ba2d082c9826051cbe 100644 --- a/doc/fluid/howto/optimization/timeline_cn.md +++ b/doc/fluid/howto/optimization/timeline_cn.md @@ -1,21 +1,27 @@ # 如何使用timeline工具做性能分析 -1. 在训练的主循环外加上`with profiler.profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 +1. 在训练的主循环中加上`profiler.start_profiler(...)`和`profiler.stop_profiler(...)`。运行之后,代码会在`/tmp/profile`目录下生成一个profile的记录文件。 **提示:** 请不要在timeline记录信息时运行太多次迭代,因为timeline中的记录数量和迭代次数是成正比的。 ```python - with profiler.profiler('All', 'total', '/tmp/profile') as prof: - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[]) + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + if pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile") + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) ... ``` 1. 运行`python paddle/tools/timeline.py`来处理`/tmp/profile`,这个程序默认会生成一个`/tmp/timeline`文件,你也可以用命令行参数来修改这个路径,请参考[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py)。 +```bash +python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline +``` 1. 
打开chrome浏览器,访问<chrome://tracing/>,用`load`按钮来加载生成的`timeline`文件。 diff --git a/doc/fluid/howto/optimization/timeline_en.md b/doc/fluid/howto/optimization/timeline_en.md index 96481ae2a6e4442d40803f8d5361e5f942502df3..6f963c6b4da6967fb2f493ada917a4b08917fa4c 100644 --- a/doc/fluid/howto/optimization/timeline_en.md +++ b/doc/fluid/howto/optimization/timeline_en.md @@ -1,15 +1,17 @@ # how to use timeline tool to do profile -1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. +1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. ```python - with profiler.profiler('All', 'total', '/tmp/profile') as prof: - for pass_id in range(pass_num): - for batch_id, data in enumerate(train_reader()): - exe.run(fluid.default_main_program(), - feed=feeder.feed(data), - fetch_list=[], - use_program_cache=True) + for pass_id in range(pass_num): + for batch_id, data in enumerate(train_reader()): + if pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile") + exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[]) ... ``` @@ -17,6 +19,10 @@ file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details. +```bash +python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline +``` + 1. 
Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file. ![chrome tracing](./tracing.jpeg) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 700c73c745bad72637d77385f5cd38c494501c86..bf493a3fa44e48deec734250d04b2a413c3ed9da 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { + platform::RecordEvent r("all_reduce", nullptr); if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index b2e5399e2376a86c1cd310b29c768832665af87f..8714a42162bda3d5ad12e7925fe8cc4e693f51b1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -21,6 +21,26 @@ namespace framework { namespace details { struct BuildStrategy { + // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and + // kReduce, for CPU and GPU. If you use kAllReduce, different threads + // optimize their parameters separately. If you use kReduce, the optimizations + // of parameters are distributed to different threads. + // For example, a model has 100 parameters and is running with four threads, + // if you choose kAllReduce, every thread is to optimize 100 parameters + // separately, if you choose kReduce, every thread is to optimize 25 + // parameters. 
+ // Note in particular that, if you use kReduce for CPU training, + // all the parameters are shared between different threads. This feature will + // save memory. + // FIXME(zcd): The results of the two modes (kAllReduce and kReduce) may not + // be equal for GPU, because summing in a different order may give a + // different result; for example, the result of `a+b+c+d` may differ from the + // result of `c+a+b+d`. + // For GPU, both kAllReduce and kReduce are implemented with NCCL, + // so the results of kAllReduce and kReduce may not be equal. + // For CPU, if you want to fix the order of summing so that the results + // of kAllReduce and kReduce are identical, you can add + // `FLAGS_cpu_deterministic=true` to env. enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 }; enum class GradientScaleStrategy { diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7160e346dad0615e2fd32b70c096880af0359e1a..6c7e5c1fb06620b1c071b00fcfcc1b4a29bf8d62 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,12 +16,18 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +DEFINE_bool( + cpu_deterministic, false, + "Whether to make the result of computation deterministic in CPU side."); namespace paddle { namespace framework { namespace details { void ReduceOpHandle::RunImpl() { + platform::RecordEvent r("reduce", nullptr); if (places_.size() == 1) return; // the input and output may have dummy var. 
auto in_var_handles = DynamicCast(inputs_); @@ -89,11 +95,33 @@ void ReduceOpHandle::RunImpl() { } else { std::vector lod_tensors = GetInputValues(in_var_handles, var_scopes); + if (paddle::platform::is_cpu_place(lod_tensors[0]->place())) { this->RunAndRecordEvent([&] { - ReduceLoDTensor func(lod_tensors, - out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + // FIXME(zcd): The order of summing is important, + // especially when the type of data is float or double. + // For example, the result of `a+b+c+d` may be different + // with the result of `c+a+b+d`, so the summing order should be fixed. + if (!FLAGS_cpu_deterministic) { + ReduceLoDTensor func(lod_tensors, + out_var->GetMutable()); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + } else { + // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 + // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. + auto &reduce_sum_trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handle->name_) + ->GetMutable(); + ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); + VisitDataType(ToDataType(lod_tensors[0]->type()), func); + + auto trg = out_var->GetMutable(); + if (reduce_sum_trg.data() != trg->data()) { + TensorCopy(reduce_sum_trg, platform::CPUPlace(), trg); + } + } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 1d80bab90f513139f807b57258177c6b2ac53ac0..5bd974d6b789a2f085c0a69de5e133187342f587 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace 
framework { @@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( eptr = std::current_exception(); } + platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index e556c84b0219eba2b92c456c205e03947171626b..0eaf9a9c951991a5775604eb8d0e7535f81a4ae2 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { + std::unique_ptr event( + new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); std::unordered_map pending_ops; std::unordered_set pending_vars; BlockingQueue ready_vars; @@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); exception_holder_.Clear(); + event.reset(nullptr); // Step 3. Execution while (!pending_vars.empty()) { diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 38c4297380f779fff4d4203a6c51f12b48800162..0c8acf71bfa0814e66560258ad6131c743ebc81b 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" @@ -57,7 +58,11 @@ static DDim GetDims(const Scope& scope, const std::string& name, } if (var->IsType()) { - return var->Get().dims(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } + return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { return var->Get().value().dims(); @@ -74,8 +79,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { if (var == nullptr) { return ""; } + if (var->IsType()) { - return DataTypeToString(ToDataType(var->Get().type())); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return ""; + } + return DataTypeToString(ToDataType(tensor.type())); } else if (var->IsType()) { return DataTypeToString( ToDataType(var->Get().value().type())); @@ -106,7 +116,11 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } if (var->IsType()) { - return var->Get().lod(); + const LoDTensor& tensor = var->Get(); + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } + return tensor.lod(); } else { return default_lod; } @@ -122,6 +136,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { platform::SetDeviceId(dev_id); #endif } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); RunImpl(scope, place); VLOG(10) << "+ " << DebugStringEx(&scope); } @@ -625,9 +641,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); - // For profiling, don't move out of this function because that will 
result - // in the failure of multi-GPU profiling. - platform::RecordEvent record_event(Type(), dev_ctx); // check if op[type] has kernel registered. auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index ef224d68f1fc561f45e9d7a81425e62655457648..0bbfd66148e9bc9080654bf1b0b34477115a0e6b 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -82,7 +82,7 @@ class Tensor { template const T* data() const; - bool IsInitialized() const; + inline bool IsInitialized() const; /** * @brief Return a pointer to mutable memory block. diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 259d79bedbf664f52b1189ca71567665a6d91180..08d0f493ab30d92a121d089d9003bc575429b4dd 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -74,9 +74,10 @@ if (WITH_ANAKIN) # only needed in CI target_link_libraries(inference_anakin_api anakin anakin_saber_common) target_link_libraries(inference_anakin_api_shared anakin anakin_saber_common) if (WITH_TESTING) - cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc - ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin - DEPS inference_anakin_api_shared) - target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) + # this test is unstable, disable it first. 
+ #cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc + #ARGS --model=${ANAKIN_INSTALL_DIR}/mobilenet_v2.anakin.bin + #DEPS inference_anakin_api_shared) + #target_compile_options(inference_anakin_test BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS}) endif(WITH_TESTING) endif() diff --git a/paddle/fluid/operators/feed_op.cc b/paddle/fluid/operators/feed_op.cc index bcb3e63ed7dbc775c1de6c4522f0548ea48a6cf0..dc7ef664958238ddbd48745bd59cc7db28e49f5b 100644 --- a/paddle/fluid/operators/feed_op.cc +++ b/paddle/fluid/operators/feed_op.cc @@ -31,7 +31,6 @@ class FeedOp : public framework::OperatorBase { const platform::Place &place) const override { // get device context from pool auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); auto feed_var_name = Input("X"); auto *feed_var = scope.FindVar(feed_var_name); diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index 680fde19eefe57475b7526ebc29d4ff977a16977..d9cd956dfdff3d009d38ee5088f5396080580483 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -36,12 +36,6 @@ class FetchBarrierOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { std::vector eps = Attr>("endpoints"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/fetch_op.cc b/paddle/fluid/operators/fetch_op.cc index 1640a2a22c69a0e3ab81a2889d6105b2cf4162b7..c197b45e8196a47def6465128e8ca39d8daefed6 100644 --- a/paddle/fluid/operators/fetch_op.cc +++ b/paddle/fluid/operators/fetch_op.cc @@ -30,9 +30,6 @@ class FetchOp : public framework::OperatorBase { private: void 
RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - auto fetch_var_name = Input("X"); auto *fetch_var = scope.FindVar(fetch_var_name); PADDLE_ENFORCE(fetch_var != nullptr, diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index ac35cf0b89bfaa0c0f8e64445f18a3bbd478e70a..27e26cb1b5c1e831f05dac299489628b92eaa58c 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -31,9 +31,6 @@ class LoadOp : public framework::OperatorBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - platform::RecordEvent record_event(Type(), dev_ctx); - // FIXME(yuyang18): We save variable to local file now, but we should change // it to save an output stream. 
auto filename = Attr("file_path"); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 3e8f3ec5c5cd683343bcbdfc2388bd37c25e00f9..d77b095c5d783a2a9fab87eb8b458117a6a3d225 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); - ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); if (ctx->GetOutputsVarType("Out")[0] == framework::proto::VarType::LOD_TENSOR) { @@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Ids", "An input with type int32 or int64 " "contains the ids to be looked up in W. " - "Ids must be a column vector with rank = 2. 
" - "The 2nd dimension size must be 1."); + "The last dimension size must be 1."); AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 27483372b93a850d313445386c7973838c4a0710..74823dab09cac358f647c074ac2f2ee2fed17e55 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); + int64_t ids_num = ids->numel(); auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; - new_rows.resize(ids_dim[0]); + new_rows.resize(ids_num); auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. memory::Copy(platform::CPUPlace(), new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_dim[0] * sizeof(int64_t), stream); + ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table->dims()[1]}); + d_table_value->Resize({ids_num, table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); auto *d_table_data = d_table_value->data(); auto *d_output_data = d_output->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 
c9f074ca0e8dafb374dc9368165df5af5053a6b8..f5c10ced8305b64c6386c5051804f8c9a8f71802 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -109,17 +109,17 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); + int64_t ids_num = ids->numel(); framework::Vector new_rows; - new_rows.reserve(ids_dim[0]); - for (int64_t i = 0; i < ids_dim[0]; i++) { + new_rows.reserve(ids_num); + for (int64_t i = 0; i < ids_num; i++) { new_rows.push_back(ids_data[i]); } d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); - d_table_value->Resize({ids_dim[0], table_dim[1]}); + d_table_value->Resize({ids_num, table_dim[1]}); d_table_value->mutable_data(context.GetPlace()); d_table->set_height(table_dim[0]); @@ -127,7 +127,10 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_output_data = d_output->data(); auto *d_table_data = d_table_value->data(); - PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); } else { auto *ids = context.Input("Ids"); @@ -135,10 +138,9 @@ class LookupTableGradKernel : public framework::OpKernel { auto *d_table = context.Output(framework::GradVarName("W")); auto *ids_data = ids->data(); - auto ids_dim = ids->dims(); int N = table_dim[0]; - int D = d_output->dims()[1]; + int D = table_dim[1]; auto *d_output_data = d_output->data(); auto *d_table_data = d_table->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index c9744db3d0654ef63357963d9a9a3cb946f56e2d..916cdad3fd288d1f3ffb19bc769ab827dd1e9103 100644 --- 
a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -18,7 +18,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/safe_ref.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase { workers.emplace_back( framework::Async([program, cur_scope, place, block, place_idx] { - // Give the thread an id to distinguish parallel block with same id. - platform::RecordThread rt(static_cast(place_idx) + 1); framework::Executor executor(place); executor.Run(*program, cur_scope, block->ID(), false /*create_local_scope*/); @@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase { // execute workers.emplace_back( framework::Async([program, cur_scope, place, block, i] { - // Give the thread an id to distinguish parallel block with same id. 
- platform::RecordThread rt(static_cast(i) + 1); framework::Executor executor(place); executor.Run(*program, cur_scope, block->ID(), false /*create_local_scope*/); diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 65fcce8bb019965a805ad09d50be0aba64e4f24e..a0d640b2020958af53a4405ae886eadb2a1e117e 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -65,6 +66,12 @@ class ReadOp : public framework::OperatorBase { .GetMutable(); std::vector out_arg_names = Outputs("Out"); std::vector ins; + + // For profiling + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(dev_place); + platform::RecordEvent record_event(Type(), &ctx); + reader->ReadNext(&ins); if (ins.empty()) { if (Attr("throw_eof_exp")) { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ba684014904e61a86bebacd7d29d7e10d313092..4a6ce938a5f337d035b21f562d46daf606236db0 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -40,8 +40,6 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index d7f8e994afd7e656bd5a9dd7c5ab45f0d52fe88b..1866a86048acbefadcb4d82cd6309cd16f0352d6 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -39,11 +39,6 @@ class SendBarrierOp : public 
framework::OperatorBase { std::vector eps = Attr>("endpoints"); bool sync_mode = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 829f310d4233c01a7fbb9ccf7427f6e47ce8d384..3cd42f2d059532b7090e66ce21de8e5cb014adf1 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -42,9 +42,6 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - distributed::RPCClient* rpc_client = distributed::RPCClient::GetInstance(); diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 5596fa0648ccc151bc0d11de9c556599428a8d71..2bdb23e999621b10799b5163f326bc4b66a437e6 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -30,8 +30,16 @@ class SoftmaxCUDNNKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + math::SoftmaxCUDNNFunctor()( - context.template device_context(), X, Out); + context.template device_context(), + &flattened_x, &flattened_out); } }; @@ -46,9 +54,18 @@ class SoftmaxGradCUDNNKernel : public framework::OpKernel { // allocate memory on device. 
dX->mutable_data(context.GetPlace()); + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + math::SoftmaxGradCUDNNFunctor()( - context.template device_context(), Out, - dOut, dX); + context.template device_context(), + &flattened_out, &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/operators/softmax_mkldnn_op.cc b/paddle/fluid/operators/softmax_mkldnn_op.cc index 6668e6b9e917eea7ba4a80ac78917b73eb827208..01819f53e3ab0973f6140c5a81f18f954b6a0376 100644 --- a/paddle/fluid/operators/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/softmax_mkldnn_op.cc @@ -26,9 +26,9 @@ using paddle::platform::MKLDNNMemDesc; using mkldnn::memory; // Note: paddle has also "memory" namespace using mkldnn::primitive; -using mkldnn::softmax_forward; -using mkldnn::softmax_backward; using mkldnn::prop_kind; +using mkldnn::softmax_backward; +using mkldnn::softmax_forward; using mkldnn::stream; using platform::to_void_cast; @@ -113,17 +113,27 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - PADDLE_ENFORCE(input->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - const T* input_data = input->data(); - // allocate memory for output - T* output_data = output->mutable_data(ctx.GetPlace()); - std::vector src_tz = paddle::framework::vectorize2int(input->dims()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg. 
axis: 1 - PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), - "Softmax input and output dimensions should match"); + PADDLE_ENFORCE_EQ( + input->dims(), output->dims(), + "The shape of softmax's input and output must be identical."); + + // make sure 'output' holds memory, which will be shared by + // 'flattened_output' later. + output->mutable_data(ctx.GetPlace()); + + // flatten input and output to 2-D matrixs + auto dims = input->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_input; + framework::Tensor flattened_output; + flattened_input.ShareDataWith(*input).Resize(flattened_dims); + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + + const T* input_data = flattened_input.data(); + T* output_data = flattened_output.mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(flattened_dims); + std::vector dst_tz = src_tz; // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Generate keys for storing/retriving primitives for this operator @@ -174,23 +184,34 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); auto mkldnn_engine = dev_ctx.GetEngine(); const Tensor* output = ctx.Input("Out"); - const T* dst_data = output->data(); - auto* dout = ctx.template Input(framework::GradVarName("Out")); - const auto* diff_dst_ptr = dout->template data(); - auto* dx = ctx.template Output(framework::GradVarName("X")); - T* diff_src_ptr = dx->template mutable_data(ctx.GetPlace()); - std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + PADDLE_ENFORCE_EQ( + dout->dims(), dx->dims(), + "The shape of softmax_grad's input and output must be identical."); + + // make sure 'dx' holds memory, which will be shared by 'flattened_dx' + // later. 
+ dx->template mutable_data(ctx.GetPlace()); + + auto dims = dout->dims(); // input and output share the same shape + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::Tensor flattened_output; + framework::Tensor flattened_dout; + framework::Tensor flattened_dx; + flattened_output.ShareDataWith(*output).Resize(flattened_dims); + flattened_dout.ShareDataWith(*dout).Resize(flattened_dims); + flattened_dx.ShareDataWith(*dx).Resize(flattened_dims); + + const T* dst_data = flattened_output.data(); + const T* diff_dst_ptr = flattened_dout.template data(); + T* diff_src_ptr = flattened_dx.template mutable_data(ctx.GetPlace()); + + std::vector dst_tz = paddle::framework::vectorize2int(flattened_dims); std::vector src_tz(dst_tz); - PADDLE_ENFORCE(output->dims().size() == 2UL, - "The input of softmax op must be a 2D matrix."); - // MKL-DNN does support softmax over selected axis. Having 2D Tensor, - // we will make normalization after final eg. axis: 1 - PADDLE_ENFORCE(((src_tz[0] == dst_tz[0]) && (src_tz[1] == dst_tz[1])), - "Softmax input and output dimensions should match"); + // Same memory descriptor to be used for input and output memory::dims softmax_tz = {src_tz[0], src_tz[1]}; // Currently only supports NC data format diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index fefc7125b4de7274589670d29be4511469d5064a..bb081238820b9ee3ae095442d21cfce11f7b41e5 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -37,10 +37,7 @@ class SoftmaxOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of SoftmaxOp should not be null."); - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE(x_dims.size() == 2UL, - "The input of softmax op must be a matrix."); - ctx->SetOutputDim("Out", x_dims); + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -81,8 +78,8 @@ class SoftmaxOpMaker : 
public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of softmax. " - "2-D with shape [batch_size, input_feature_dimensions]."); + "The input tensor of softmax, " + "whose last dimension is the input_feature_dimensions."); AddOutput("Out", "The normalized values with the same shape as X.") .Reuse("X"); AddAttr( @@ -105,20 +102,23 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Softmax Operator. -The input of the softmax operator is a 2-D tensor with shape N x K (N is the -batch_size, K is the dimension of input feature). The output tensor has the -same shape as the input tensor. +The input of the softmax operator is a tensor of any rank. The output tensor +has the same shape as the input. -For each row of the input tensor, the softmax operator squashes the -K-dimensional vector of arbitrary real values to a K-dimensional vector of real -values in the range [0, 1] that add up to 1. +The input tensor will first be logically flattened to a 2-D matrix. The matrix's +second dimension(row length) is as same as the last dimension of the input +tensor, and the first dimension(column length) is the product of all other +dimensions of the input tensor. For each row of the matrix, the softmax operator +squashes the K-dimensional(K is the width of the matrix, which is also the size +of the input tensor's last dimension) vector of arbitrary real values to a +K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. Then the ratio of the exponential of the given dimension and the sum of exponential values of all the other dimensions is the output of the softmax operator. 
-For each row $i$ and each column $j$ in Input(X), we have: +For each row $i$ and each column $j$ in the matrix, we have: $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_j(exp(X[i, j])}$$ )DOC"); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h index 600da45a0bbb69b76d59c981e195fc03a49b0504..1205bd0587f32caae04c27ecea581fc17988507f 100644 --- a/paddle/fluid/operators/softmax_op.h +++ b/paddle/fluid/operators/softmax_op.h @@ -31,8 +31,16 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); + auto dims = X->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_x; + framework::LoDTensor flattened_out; + flattened_x.ShareDataWith(*X).Resize(flattened_dims); + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + math::SoftmaxFunctor()( - context.template device_context(), X, Out); + context.template device_context(), &flattened_x, + &flattened_out); } }; @@ -47,8 +55,18 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. 
dX->mutable_data(context.GetPlace()); + auto dims = Out->dims(); + auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1); + framework::LoDTensor flattened_out; + framework::LoDTensor flattened_d_out; + framework::LoDTensor flattened_d_x; + flattened_out.ShareDataWith(*Out).Resize(flattened_dims); + flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims); + flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims); + math::SoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX); + context.template device_context(), &flattened_out, + &flattened_d_out, &flattened_d_x); } }; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index d9e2afadaf8ec439d158e57c94d3e6e684bce116..8fa8dbd67c936439840cffa073b6fa6693dd31a1 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -30,9 +30,6 @@ limitations under the License. */ namespace paddle { namespace platform { namespace { -// Current thread's id. Note, we don't distinguish nested threads -// for now. -thread_local int cur_thread_id = 0; // Tracking the nested block stacks of each thread. thread_local std::deque block_id_stack; // Tracking the nested event stacks. 
@@ -413,12 +410,5 @@ void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void ClearCurBlock() { block_id_stack.pop_back(); } int BlockDepth() { return block_id_stack.size(); } - -void SetCurThread(int thread_id) { cur_thread_id = thread_id; } - -void ClearCurThread() { cur_thread_id = 0; } - -int CurThread() { return cur_thread_id; } - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 0375c7439c29d4122e8ff6b58734dad4f504b7a2..d2a571f4345b544ad5e74f4629c3967593d6d628 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -99,9 +99,5 @@ std::string CurAnnotation(); void SetCurBlock(int block_id); void ClearCurBlock(); int BlockDepth(); - -void SetCurThread(int thread_id); -void ClearCurThread(); -int CurThread(); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 01de9d7041bf3eb40884e2a6295027cccfaebd2a..7c8d8a5964fa5258bebaf2c8522886ae5886ab2c 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -110,6 +110,8 @@ Event::Event(EventType type, std::string name, uint32_t thread_id, has_cuda_ = dev_ctx ? 
platform::is_gpu_place(dev_ctx->GetPlace()) : false; if (has_cuda_) { auto* cuda_dev_ctx = static_cast(dev_ctx); + PADDLE_ENFORCE(cudaSetDevice( + boost::get(cuda_dev_ctx->GetPlace()).device)); PADDLE_ENFORCE(cudaGetDevice(&device_)); PADDLE_ENFORCE(cudaEventCreate(&event_)); auto stream = cuda_dev_ctx->stream(); @@ -176,6 +178,7 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; dev_ctx_ = dev_ctx; @@ -186,11 +189,12 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), - BlockDepth(), CurThread()); + BlockDepth(), g_thread_id); } ClearCurAnnotation(); PopEvent(name_, dev_ctx_); @@ -198,6 +202,7 @@ RecordEvent::~RecordEvent() { RecordBlock::RecordBlock(int block_id) : is_enabled_(false), start_ns_(PosixInNsec()) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; is_enabled_ = true; SetCurBlock(block_id); @@ -205,27 +210,18 @@ RecordBlock::RecordBlock(int block_id) } RecordBlock::~RecordBlock() { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { // We try to put all blocks at the same nested depth in the // same timeline lane. and distinguish the using thread_id. 
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), - CurThread()); + g_thread_id); } ClearCurBlock(); } -RecordThread::RecordThread(int thread_id) { - if (g_state == ProfilerState::kDisabled) return; - SetCurThread(thread_id); -} - -RecordThread::~RecordThread() { - if (g_state == ProfilerState::kDisabled) return; - ClearCurThread(); -} - void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index bf43925373a12cd9ff2155d68c42d0266ba4df60..c99d9c807d1bfb45d1ce0725b84b9fff09049511 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -95,11 +95,6 @@ struct RecordBlock { uint64_t start_ns_; }; -struct RecordThread { - explicit RecordThread(int thread_id); - ~RecordThread(); -}; - // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index a8bc16f1b5b9b624e88e355d8ce4741fcec34bc3..8460f93b841fe136db138e0dc7576f3aacdbeb5f 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -419,6 +419,25 @@ EOF linkchecker doc/v2/en/html/index.html linkchecker doc/v2/cn/html/index.html linkchecker doc/v2/api/en/html/index.html + + if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; + + # Deploy to the content server if it's a "develop" or "release/version" branch + # The "develop_doc" branch is reserved to test full deploy process without impacting the real content.
+ if [ "$TRAVIS_BRANCH" == "develop_doc" ]; then + PPO_SCRIPT_BRANCH=develop + elif [[ "$TRAVIS_BRANCH" == "develop" || "$TRAVIS_BRANCH" =~ ^v|release/[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then + PPO_SCRIPT_BRANCH=master + else + # Early exit, this branch doesn't require documentation build + return 0; + fi + # Fetch the paddlepaddle.org deploy_docs.sh from the appropriate branch + export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/$PPO_SCRIPT_BRANCH/scripts/deploy/deploy_docs.sh + export PYTHONPATH=$PYTHONPATH:${PADDLE_ROOT}/build/python:/paddle/build/python + cd .. + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH ${PADDLE_ROOT} ${PADDLE_ROOT}/build/doc/ ${PPO_SCRIPT_BRANCH} + cd - } function gen_html() { diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index 3462deb9c2f88b6da643d6aa833449ed5f4a9b34..174c2a12f007b282a5182c0aec9b0a6bec9e55fa 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -52,6 +52,9 @@ EOL ${DOCKER_CMD} run -it \ ${DOCKER_ENV} \ -e SCRIPT_NAME=$0 \ + -e CONTENT_DEC_PASSWD=$CONTENT_DEC_PASSWD \ + -e TRAVIS_BRANCH=$TRAVIS_BRANCH \ + -e TRAVIS_PULL_REQUEST=$TRAVIS_PULL_REQUEST \ -v $PADDLE_ROOT:/paddle \ -v ${HOME}/.ccache:/root/.ccache \ -w /paddle \ diff --git a/python/paddle/batch.py b/python/paddle/batch.py index d48c54fcbb66487617b1946bc69724870c8f879c..008509660739d61245526278735064472b8b06dd 100644 --- a/python/paddle/batch.py +++ b/python/paddle/batch.py @@ -40,4 +40,10 @@ def batch(reader, batch_size, drop_last=False): if drop_last == False and len(b) != 0: yield b + # Batch size check + batch_size = int(batch_size) + if batch_size <= 0: + raise ValueError("batch_size should be a positive integeral value, " + "but got batch_size={}".format(batch_size)) + return batch_reader diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index
956e3c43485b36aaeb2d366d6145edd3d4535122..3b38c42801e0a4b503d929ca422b354f4c51bb0c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -123,7 +123,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', + 'cpu_deterministic' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 812f68bdd849544456b2e0ebf0b739f4f92b09ea..3fb7019a450da5903952c59ee483d88fde42701a 100644 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -572,8 +572,6 @@ def append_backward(loss, parameter_list=None, no_grad_set=None, program.current_block_idx = current_block_idx program._sync_with_cpp() - # FIXME(zcd): prevent loss.grad optimized by mem_opt. 
- loss.block.var(_append_grad_suffix_(loss.name)).persistable = True if parameter_list is not None: parameters = parameter_list diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e10f8325e46ee52e98f9d31caddaf9ec7d188d67..10b318cf547b5bfb0d70aeb9fb25b17793297afb 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1038,7 +1038,26 @@ class Block(object): global_block = self.program.global_block() param = Parameter(global_block, *args, **kwargs) if 'initializer' in kwargs: - kwargs['initializer'](param, self) + + def _is_inited_by(block, var): + init_ops = [] + for op in block.ops: + if var.name in op.output_arg_names: + init_ops.append(op) + return init_ops + + initializer = kwargs['initializer'] + init_ops = _is_inited_by(global_block, param) + init_ops_len = len(init_ops) + if init_ops_len > 1: + raise RuntimeError("param " + param.name + + " is inited by multiple init ops " + str( + init_ops)) + elif init_ops_len == 1: + #TODO already inited, do nothing, should log a warning + pass + else: + initializer(param, self) return param def append_op(self, *args, **kwargs): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 058acd4a50ef54cea724a742d40eaca8f569a21c..94933d1489c356eb6e9efd6d98bd1cba5ddfcd23 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -949,6 +949,10 @@ def dropout(x, dropout_prob, is_test=False, seed=None, name=None): helper = LayerHelper('dropout', **locals()) out = helper.create_tmp_variable(dtype=x.dtype) mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True) + + if (seed is None or seed == 0) and helper.main_program.random_seed != 0: + seed = helper.main_program.random_seed + helper.append_op( type='dropout', inputs={'X': [x]}, @@ -1313,13 +1317,16 @@ def sequence_softmax(input, param_attr=None, bias_attr=None, use_cudnn=True): def softmax(input, param_attr=None, bias_attr=None, 
use_cudnn=True, name=None): """ - The input of the softmax layer is a 2-D tensor with shape N x K (N is the - batch_size, K is the dimension of input feature). The output tensor has the - same shape as the input tensor. + The input of the softmax operator is a tensor of any rank. The output tensor + has the same shape as the input. - For each row of the input tensor, the softmax operator squashes the - K-dimensional vector of arbitrary real values to a K-dimensional vector of real - values in the range [0, 1] that add up to 1. + The input tensor will first be logically flattened to a 2-D matrix. The matrix's + second dimension(row length) is the same as the last dimension of the input + tensor, and the first dimension(column length) is the product of all other + dimensions of the input tensor. For each row of the matrix, the softmax operator + squashes the K-dimensional(K is the width of the matrix, which is also the size + of the input tensor's last dimension) vector of arbitrary real values to a + K-dimensional vector of real values in the range [0, 1] that add up to 1. It computes the exponential of the given dimension and the sum of exponential values of all the other dimensions in the K-dimensional vector input. @@ -1327,7 +1334,7 @@ def softmax(input, param_attr=None, bias_attr=None, use_cudnn=True, name=None): exponential values of all the other dimensions is the output of the softmax operator. - For each row :math:`i` and each column :math:`j` in Input(X), we have: + For each row :math:`i` and each column :math:`j` in the matrix, we have: ..
math:: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index c8e881a672ad25654bd28604abfafc2c569af7ca..a6a911721dfa31e5fb8d57645071af42adc968be 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -50,6 +50,8 @@ list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) list(REMOVE_ITEM TEST_OPS test_dist_transformer) +list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) +list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -64,3 +66,5 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) py_test_modules(test_dist_transformer MODULES test_dist_transformer SERIAL) py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) +py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL) +py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index bf7816b2466edd7db836c738da90f5f97b631843..f1f35d96f67ad5ef79ec9cb20f070a8352f0e97e 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -174,6 +174,9 @@ class SE_ResNeXt(): padding=(filter_size - 1) / 2, groups=groups, act=None, + # avoid pserver CPU init differs from GPU + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant()), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -194,10 +197,8 @@ class 
SE_ResNeXt(): def get_model(batch_size): # Input data - image = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) + image = fluid.layers.data(name="data", shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name="int64", shape=[1], dtype='int64') # Train program model = SE_ResNeXt(layers=50) @@ -222,8 +223,10 @@ def get_model(batch_size): lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - learning_rate=fluid.layers.piecewise_decay( - boundaries=bd, values=lr), + # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. + #learning_rate=fluid.layers.piecewise_decay( + # boundaries=bd, values=lr), + learning_rate=base_lr, momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) @@ -232,7 +235,7 @@ def get_model(batch_size): train_reader = paddle.batch( paddle.dataset.flowers.train(), batch_size=batch_size) test_reader = paddle.batch( - paddle.dataset.flowers.test(), batch_size=batch_size) + paddle.dataset.flowers.test(use_xmap=False), batch_size=batch_size) return test_program, avg_cost, train_reader, test_reader, acc_top1, out @@ -256,7 +259,6 @@ class DistSeResneXt2x2: trainers) pserver_prog = t.get_pserver_program(current_endpoint) startup_prog = t.get_startup_program(current_endpoint, pserver_prog) - place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) @@ -302,12 +304,19 @@ class DistSeResneXt2x2: ] feeder = fluid.DataFeeder(feed_var_list, place) - reader_generator = train_reader() - first_loss, = exe.run(fetch_list=[avg_cost.name]) + reader_generator = test_reader() + + data = next(reader_generator) + first_loss, = exe.run(fetch_list=[avg_cost.name], + feed=feeder.feed(data)) print(first_loss) + for i in xrange(5): - loss, = exe.run(fetch_list=[avg_cost.name]) - last_loss, = exe.run(fetch_list=[avg_cost.name]) + 
data = next(reader_generator) + loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) + + data = next(reader_generator) + last_loss, = exe.run(fetch_list=[avg_cost.name], feed=feeder.feed(data)) print(last_loss) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 1aaab6f906ef6482bc515bb3c42d82431902e1d8..58cfd4e1fd958d8d59e49c87fbbabd0182975add 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -63,7 +63,8 @@ class TestDistBase(unittest.TestCase): "PATH": os.getenv("PATH"), "PYTHONPATH": os.getenv("PYTHONPATH"), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), - "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_cudnn_deterministic": "1" } # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 04671d079731ce414561b0ede6bc2b195b07d82a..f3a5fd6985bab1d04f6e1484534367548f383dfb 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -17,8 +17,7 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): def test_se_resnext(self): - # TODO(paddle-dev): Is the delta too large? 
- self.check_with_place("dist_se_resnext.py", delta=0.2) + self.check_with_place("dist_se_resnext.py") if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 9dbef0693bb129186dfc50f6efdd0896deedda81..b24036326d51aa56220d46cba202a0d4b93cdd7c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -73,9 +73,18 @@ class TranspilerTest(unittest.TestCase): return self.transpiler + def transpiler_test_impl(self): + pass -class TestBasicModel(TranspilerTest): def test_transpiler(self): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + self.transpiler_test_impl() + + +class TestBasicModel(TranspilerTest): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) pserver2, startup2 = self.get_pserver(self.pserver2_ep) @@ -123,7 +132,7 @@ class TestBasicModel(TranspilerTest): class TestBasicModelWithLargeBlockSize(TranspilerTest): - def test_transpiler(self): + def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() config.min_block_size = 1048576 @@ -148,7 +157,7 @@ class TestBasicModelWithLargeBlockSize(TranspilerTest): ["sum", "scale", "sgd"]) # confirm startup program self.assertEqual([op.type for op in startup.global_block().ops], - ["fill_constant", "fill_constant", "fill_constant"]) + ["fill_constant", "fill_constant"]) # the variable #fc_w will be split into two blocks fc_w_var = startup2.global_block().var("fc_w") self.assertEqual(fc_w_var.shape, (1000L, 1000L)) @@ -177,7 +186,7 @@ class TestNoSliceVar(TranspilerTest): def setUp(self): super(TestNoSliceVar, self).setUp() - def test_transpiler(self): + def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() config.slice_var_up = False @@ -212,7 +221,7 @@ class TestLRDecay(TranspilerTest): 
sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -242,7 +251,7 @@ class TestLRDecayConditional(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -291,7 +300,7 @@ class TestL2Decay(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -326,7 +335,7 @@ class TestL2DecayWithPiecewise(TranspilerTest): sgd_optimizer.minimize(avg_cost) return - def test_transpiler(self): + def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) trainer = self.get_trainer() @@ -350,5 +359,110 @@ class TestL2DecayWithPiecewise(TranspilerTest): ["sum", "scale", "scale", "elementwise_add", "momentum"]) +class TestDistLookupTableBase(TranspilerTest): + def network_with_table(self, is_sparse, is_distributed): + def emb_pool(ids): + table_size = 1000 + emb_size = 64 + emb = fluid.layers.embedding( + input=ids, + size=[table_size, emb_size], + dtype='float32', + param_attr='shared_w', # share parameter + is_sparse=is_sparse, + is_distributed=is_distributed) + pool = fluid.layers.sequence_pool(input=emb, pool_type='average') + return pool + + title_ids = fluid.layers.data( + name='title_ids', shape=[1], dtype='int64', lod_level=1) + brand_ids = fluid.layers.data( + name='brand_ids', shape=[1], dtype='int64', lod_level=1) + title_emb = emb_pool(title_ids) + brand_emb = emb_pool(brand_ids) + fc0 = fluid.layers.concat(input=[title_emb, brand_emb], axis=1) + predict = fluid.layers.fc(input=fc0, + size=2, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + + label = 
fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + +class TestLocalLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 3) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table adam + # NOTE: if param is not selected rows, the grad will scaled to grad / trainer_num + self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "adam", "scale", "scale"]) + + trainer = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'lookup_table', 'sequence_pool', 'lookup_table', 'sequence_pool', + 'concat', 'mul', 'elementwise_add', 'cross_entropy', 'mean', + 'fill_constant', 'mean_grad', 'cross_entropy_grad', + 'elementwise_add_grad', 'send', 'mul_grad', 'send', 'concat_grad', + 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', + 'send_barrier', 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + +class TestDistLookupTable(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + pserver1, startup1 = self.get_pserver(self.pserver1_ep) + + self.assertEqual(len(pserver1.blocks), 6) + # 0 listen_and_serv + # 1 optimize for fc_w or fc_b adam + self.assertEqual([op.type for op in pserver1.blocks[1].ops], + ["sum", "scale", "adam", "scale", "scale"]) + # 2 optimize for table sgd + 
self.assertEqual([op.type for op in pserver1.blocks[2].ops], + ["sum", "sgd"]) + # 3 prefetch -> lookup_sparse_table for data0 + self.assertEqual([op.type for op in pserver1.blocks[3].ops], + ["lookup_sparse_table"]) + # 4 prefetch -> lookup_sparse_table for data1 + self.assertEqual([op.type for op in pserver1.blocks[4].ops], + ["lookup_sparse_table"]) + # 5 save table + self.assertEqual([op.type for op in pserver1.blocks[5].ops], ["save"]) + + trainer = self.get_trainer() + self.assertEqual(len(trainer.blocks), 1) + ops = [ + 'split_ids', 'prefetch', 'merge_ids', 'sequence_pool', 'split_ids', + 'prefetch', 'merge_ids', 'sequence_pool', 'concat', 'mul', + 'elementwise_add', 'cross_entropy', 'mean', 'fill_constant', + 'mean_grad', 'cross_entropy_grad', 'elementwise_add_grad', 'send', + 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', + 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', + 'sum', 'split_ids', 'send', 'send_barrier', 'recv', 'recv', + 'fetch_barrier' + ] + self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 15a72cb605911dfe957fb927763174521a30a085..b215e379864e919af03591ab2566c08dddbb5743 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -27,12 +27,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 
'fill_constant') @@ -43,12 +44,13 @@ class TestConstantInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.ConstantInitializer(2.3)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.ConstantInitializer(2.3)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'fill_constant') @@ -61,12 +63,13 @@ class TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -80,18 +83,19 @@ class TestUniformInitializer(unittest.TestCase): program = framework.Program() program.random_seed = 123 block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer()) - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(seed=456)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param1", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param2", + initializer=initializer.UniformInitializer(seed=456)) init_op = block.ops[1] self.assertEqual(init_op.attr("seed"), 123) init_op1 = block.ops[0] @@ -102,12 +106,13 @@ class 
TestUniformInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(-4.2, 3.1, 123)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -115,6 +120,25 @@ class TestUniformInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA) self.assertEqual(init_op.attr('seed'), 123) + def test_uniform_initializer_two_op(self): + """Test uniform initializer with supplied attributes + """ + program = framework.Program() + block = program.global_block() + for i in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(-4.2, float(i), 123)) + self.assertEqual(len(block.ops), 1) + init_op0 = block.ops[0] + self.assertEqual(init_op0.type, 'uniform_random') + self.assertAlmostEqual(init_op0.attr('min'), -4.2, delta=DELTA) + self.assertAlmostEqual(init_op0.attr('max'), 0.0, delta=DELTA) + self.assertEqual(init_op0.attr('seed'), 123) + class TestNormalInitializer(unittest.TestCase): def test_normal_initializer_default_value(self): @@ -122,12 +146,13 @@ class TestNormalInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.NormalInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.NormalInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 
'gaussian_random') @@ -140,12 +165,13 @@ class TestNormalInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.NormalInitializer(2.3, 1.9, 123)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.NormalInitializer(2.3, 1.9, 123)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -161,12 +187,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -181,12 +208,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -203,12 +231,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - 
initializer=initializer.XavierInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -223,12 +252,13 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -244,13 +274,14 @@ class TestXavierInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.XavierInitializer( - fan_in=12, fan_out=23, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.XavierInitializer( + fan_in=12, fan_out=23, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -267,12 +298,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + 
initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -287,12 +319,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer()) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -308,12 +341,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -328,12 +362,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - param = block.create_parameter( - dtype="float32", - shape=[5, 10, 15, 20], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer(uniform=False)) + for _ in range(2): + param = block.create_parameter( + dtype="float32", + shape=[5, 10, 15, 20], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer(uniform=False)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -348,13 +383,14 @@ class TestMSRAInitializer(unittest.TestCase): """ program = 
framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[5, 10], - lod_level=0, - name="param", - initializer=initializer.MSRAInitializer( - fan_in=12, seed=134)) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.MSRAInitializer( + fan_in=12, seed=134)) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'uniform_random') @@ -370,12 +406,13 @@ class TestMSRAInitializer(unittest.TestCase): """ program = framework.Program() block = program.global_block() - block.create_parameter( - dtype="float32", - shape=[8, 1, 3, 3], - lod_level=0, - name="param", - initializer=initializer.BilinearInitializer()) + for _ in range(2): + block.create_parameter( + dtype="float32", + shape=[8, 1, 3, 3], + lod_level=0, + name="param", + initializer=initializer.BilinearInitializer()) self.assertEqual(len(block.ops), 1) init_op = block.ops[0] self.assertEqual(init_op.type, 'assign_value') diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index e16ab1d15f165bd0efa1b7d51add36c3020a1910..ad0d555198c36c12fd1cc39c41d39b24b40f64c3 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -35,6 +35,22 @@ class TestLookupTableOp(OpTest): self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) +class TestLookupTableOpWithTensorIds(OpTest): + def setUp(self): + self.op_type = "lookup_table" + table = np.random.random((17, 31)).astype("float32") + ids = np.random.randint( + low=0, high=17, size=(2, 4, 5, 1)).astype("int64") + self.inputs = {'W': table, 'Ids': ids} + self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['W'], 'Out', 
no_grad_set=set('Ids')) + + class TestLookupTableOpWithPadding(TestLookupTableOp): def test_check_output(self): ids = np.squeeze(self.inputs['Ids']) @@ -44,21 +60,34 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): self.check_output() def test_check_grad(self): - # Since paddings are not trainable and fixed in forward, the gradient of + # Since paddings are not trainable and fixed in forward, the gradient of # paddings makes no sense and we don't test the gradient here. pass -class TestLookupTableWIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() +class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): + def test_check_output(self): + ids = self.inputs['Ids'] + flatten_idx = ids.flatten() + padding_idx = np.random.choice(flatten_idx, 1)[0] + self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) + self.attrs = {'padding_idx': long(padding_idx)} + self.check_output() + + def test_check_grad(self): + # Since paddings are not trainable and fixed in forward, the gradient of + # paddings makes no sense and we don't test the gradient here. 
+ pass - # create and initialize Id Variable + +class TestLookupTableWIsSelectedRows(OpTest): + def prepare_ids(self, scope, place): ids_tensor = scope.var('Ids').get_tensor() ids_array = np.array([[0], [4], [3], [5]]).astype("int64") ids_tensor.set(ids_array, place) + return ids_array - # create and initialize W Variable + def prepare_w(self, scope, place): rows = [0, 1, 2, 3, 4, 5, 6] row_numel = 12 @@ -71,8 +100,22 @@ class TestLookupTableWIsSelectedRows(OpTest): w_tensor = w_selected_rows.get_tensor() w_tensor.set(w_array, place) - # create Out Variable - out_tensor = scope.var('Out').get_tensor() + def create_out_tensor(self, scope, place): + return scope.var('Out').get_tensor() + + def check_result(self, ids_array, result_array): + # all(): return True if all elements of the iterable are true (or if the iterable is empty) + for idx, row in enumerate(ids_array): + assert (row[0] == result_array[idx]).all() + + def check_with_place(self, place): + scope = core.Scope() + + ids_array = self.prepare_ids(scope, place) + + self.prepare_w(scope, place) + + out_tensor = self.create_out_tensor(scope, place) # create and run lookup_table operator lookup_table = Operator("lookup_table", W='W', Ids='Ids', Out='Out') @@ -80,9 +123,8 @@ class TestLookupTableWIsSelectedRows(OpTest): # get result from Out result_array = np.array(out_tensor) - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(ids_array): - assert (row[0] == result_array[idx]).all() + + self.check_result(ids_array, result_array) def test_w_is_selected_rows(self): places = [core.CPUPlace()] @@ -91,5 +133,19 @@ class TestLookupTableWIsSelectedRows(OpTest): self.check_with_place(place) +class TestLookupTableWithTensorIdsWIsSelectedRows( + TestLookupTableWIsSelectedRows): + def prepare_ids(self, scope, place): + ids_tensor = scope.var('Ids').get_tensor() + ids_array = np.random.randint( + low=0, high=6, size=(2, 4, 3, 1)).astype("int64") + 
ids_tensor.set(ids_array, place) + return ids_array + + def check_result(self, ids_array, result_array): + for idx, row in np.ndenumerate(ids_array): + assert (row == result_array[idx]).all() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index c7a039d2589ef67bd1d3771a2f11084698ba909f..3a314f49ebe5091aa35299ea32ec593026a57c75 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -98,16 +98,13 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def _init_data(self, random=True): + def _init_data(self): np.random.seed(5) - if random: - img = np.random.random(size=[32, 784]).astype(np.float32) - else: - img = np.ones(shape=[32, 784], dtype='float32') + img = np.random.random(size=[32, 784]).astype(np.float32) label = np.ones(shape=[32, 1], dtype='int64') return img, label - def _compare_reduce_and_allreduce(self, model, use_cuda, random_data=True): + def _compare_reduce_and_allreduce(self, model, use_cuda): if use_cuda and not core.is_compiled_with_cuda(): return self.check_network_convergence( @@ -115,7 +112,7 @@ class TestMNIST(TestParallelExecutorBase): self.check_network_convergence( model, use_cuda=use_cuda, allow_op_delay=True, use_reduce=True) - img, label = self._init_data(random_data) + img, label = self._init_data() all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, @@ -166,27 +163,27 @@ class TestMNIST(TestParallelExecutorBase): if use_cuda and not core.is_compiled_with_cuda(): return - img, label = self._init_data(random=False) + img, label = self._init_data() single_first_loss, single_last_loss = self.check_network_convergence( method=simple_fc_net, - seed=1000, + seed=1, 
feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, - seed=1000, + seed=1, feed_dict={"image": img, "label": label}, use_cuda=use_cuda, use_parallel_executor=True) - for p_f in parallel_first_loss: - self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) - for p_l in parallel_last_loss: - self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_first_loss), single_first_loss, delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): self.check_simple_fc_parallel_accuracy(True) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 834e920845f29b153909a971eb5afc4f8a33346e..a28428d8dee201ba105e18684c15d4b4582d989f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -21,6 +21,19 @@ from parallel_executor_test_base import TestParallelExecutorBase import unittest import math import os +import numpy as np + +# FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor +# and Executor is different. Because, for ParallelExecutor, the dropout_op of +# the neural net will be copied N copies(N is the number of device). This will +# lead to the random numbers generated by ParallelExecutor and Executor are different. +# So, if we compare the loss of ParallelExecutor and Executor, we should remove the +# dropout_op. +remove_dropout = False + +# FIXME(zcd): If the neural net has batch_norm, the output of ParallelExecutor +# and Executor is different. 
+remove_bn = False def squeeze_excitation(input, num_channels, reduction_ratio): @@ -53,7 +66,8 @@ def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, groups=groups, act=None, bias_attr=False) - return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + return conv if remove_bn else fluid.layers.batch_norm( + input=conv, act=act, momentum=0.1) def shortcut(input, ch_out, stride): @@ -92,13 +106,14 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt50Small(batch_size=2, use_feed=False): - assert not use_feed, "SE_ResNeXt doesn't support feed yet" +batch_size = 12 +img_shape = [3, 224, 224] + - img = fluid.layers.fill_constant( - shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) - label = fluid.layers.fill_constant( - shape=[batch_size, 1], dtype='int64', value=0.0) +def SE_ResNeXt50Small(use_feed): + + img = fluid.layers.data(name='image', shape=img_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') conv = conv_bn_layer( input=img, num_filters=16, filter_size=3, stride=2, act='relu') @@ -127,7 +142,8 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): reshape = fluid.layers.reshape( x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) pool = fluid.layers.reduce_mean(input=reshape, dim=2) - dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) + dropout = pool if remove_dropout else fluid.layers.dropout( + x=pool, dropout_prob=0.2, seed=1) # Classifier layer: prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) @@ -135,75 +151,135 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): return loss -class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence_with_learning_rate_decay(self, - use_cuda=True, - use_reduce=False, - iter=20): +def cosine_decay(learning_rate, 
step_each_epoch, epochs=120): + """ + Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() - if use_cuda and not core.is_compiled_with_cuda(): - return + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr - os.environ['CPU_NUM'] = str(4) - def _cosine_decay(learning_rate, step_each_epoch, epochs=120): - """ - Applies cosine decay to the learning rate. - lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) - """ - global_step = _decay_step_counter() +def optimizer(learning_rate=0.01): + optimizer = fluid.optimizer.Momentum( + learning_rate=cosine_decay( + learning_rate=learning_rate, step_each_epoch=2, epochs=1), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + return optimizer - with init_on_cpu(): - epoch = ops.floor(global_step / step_each_epoch) - decayed_lr = learning_rate * \ - (ops.cos(epoch * (math.pi / epochs)) + 1)/2 - return decayed_lr - def _optimizer(learning_rate=0.01): - optimizer = fluid.optimizer.Momentum( - learning_rate=_cosine_decay( - learning_rate=learning_rate, step_each_epoch=2, epochs=1), - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - return optimizer +class TestResnet(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + global remove_dropout + global remove_bn + remove_dropout = False + remove_bn = False + + def _init_data(self, batch_size=2, random=True): + np.random.seed(5) + if random: + img = np.random.random( + size=[batch_size] + img_shape).astype(np.float32) + else: + img = np.ones(shape=[batch_size] + img_shape, dtype='float32') + label = [np.random.randint(0, 999) for _ in range(batch_size)] + label = np.array(label).astype(np.int64).reshape(-1, 1) + return img, label + + def _compare_reduce_and_allreduce(self, + model, + use_cuda, + iter=20, + 
delta2=1e-6): + if use_cuda and not core.is_compiled_with_cuda(): + return - import functools + global remove_bn + remove_bn = True - batch_size = 2 + img, label = self._init_data(batch_size=batch_size) + all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=False, + optimizer=optimizer) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + iter=iter, + batch_size=batch_size, + use_cuda=use_cuda, + use_reduce=True, + optimizer=optimizer) + + for loss in zip(all_reduce_first_loss, reduce_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(all_reduce_last_loss, reduce_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=delta2) + + def _check_resnet_convergence(self, + model, + use_cuda=True, + use_reduce=False, + iter=20, + delta2=1e-6): + if use_cuda and not core.is_compiled_with_cuda(): + return + global remove_dropout + global remove_bn + remove_dropout = True + remove_bn = True + + img, label = self._init_data(batch_size=batch_size) single_first_loss, single_last_loss = self.check_network_convergence( - functools.partial( - SE_ResNeXt50Small, batch_size=batch_size), + model, + feed_dict={"image": img, + "label": label}, iter=iter, batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=_optimizer, + optimizer=optimizer, use_parallel_executor=False) - parallel_first_loss, parallel_last_loss = self.check_network_convergence( - functools.partial( - SE_ResNeXt50Small, batch_size=batch_size), + model, + feed_dict={"image": img, + "label": label}, iter=iter, batch_size=batch_size, use_cuda=use_cuda, use_reduce=use_reduce, - optimizer=_optimizer) + optimizer=optimizer) - for p_f in parallel_first_loss: - self.assertAlmostEquals(p_f, single_first_loss[0], delta=1e-6) - for p_l in 
parallel_last_loss: - self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_first_loss), single_first_loss[0], delta=1e-6) + self.assertAlmostEquals( + np.mean(parallel_last_loss), single_last_loss[0], delta=delta2) def test_seresnext_with_learning_rate_decay(self): - self.check_resnet_convergence_with_learning_rate_decay(True, False) - self.check_resnet_convergence_with_learning_rate_decay( - False, False, iter=5) - - def test_seresnext_with_new_strategy_with_learning_rate_decay(self): - self.check_resnet_convergence_with_learning_rate_decay(True, True) - self.check_resnet_convergence_with_learning_rate_decay( - False, True, iter=5) + self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True) + self._check_resnet_convergence( + model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) + + def test_seresnext_with_new_strategy(self): + self._compare_reduce_and_allreduce( + model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) + self._compare_reduce_and_allreduce( + model=SE_ResNeXt50Small, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 0ab581cfb0ea0ff2205450b8e62edb8bf3c51707..70ad05597c4a160cf6a25aeb3c379320cef69c63 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -26,15 +26,22 @@ def stable_softmax(x): class TestSoftmaxOp(OpTest): + def get_x_shape(self): + return [10, 10] + def setUp(self): self.op_type = "softmax" self.use_cudnn = False self.use_mkldnn = False self.dtype = np.float32 self.init_kernel_type() + self.shape = self.get_x_shape() + + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.apply_along_axis(stable_softmax, 1, + x.reshape([-1, self.shape[-1]])) + out = out.reshape(self.shape) - x = np.random.uniform(0.1, 1, [10, 10]).astype(self.dtype) - 
out = np.apply_along_axis(stable_softmax, 1, x) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} self.outputs = {'Out': out} self.attrs = { @@ -63,6 +70,11 @@ class TestSoftmaxOp(OpTest): self.check_grad(["X"], "Out", max_relative_error=0.01) +class TestSoftmaxOp2(TestSoftmaxOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxCUDNNOp(TestSoftmaxOp): @@ -70,6 +82,13 @@ class TestSoftmaxCUDNNOp(TestSoftmaxOp): self.use_cudnn = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxCUDNNOp2(TestSoftmaxCUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): @@ -83,6 +102,13 @@ class TestSoftmaxFP16Op(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16Op2(TestSoftmaxFP16Op): + def get_x_shape(self): + return [2, 3, 4, 5] + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): @@ -97,10 +123,22 @@ class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + class TestSoftmaxMKLDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_mkldnn = True +class TestSoftmaxMKLDNNOp2(TestSoftmaxMKLDNNOp): + def get_x_shape(self): + return [2, 3, 4, 5] + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 
d4d19799fdb291545117f327d2b9b2c25fbfe5f5..b0a100e1db34ad2971eadabff09fa5d0ce3f51dc 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -896,8 +896,6 @@ class DistributeTranspiler(object): self.table_name ][0] table_opt_block = pserver_program.create_block(pre_block_idx) - # only support sgd now - assert table_opt_op.type == "sgd" if self.sync_mode: # create grad vars in pserver program @@ -937,11 +935,12 @@ class DistributeTranspiler(object): "LearningRate": [lr_var] } outputs = {"ParamOut": [param_var]} - table_opt_block.append_op( - type=table_opt_op.type, - inputs=inputs, - outputs=outputs, - attrs=table_opt_op.attrs) + # only support sgd now + import logging + logging.warn( + "distribute lookup table only support sgd optimizer, change it's optimizer to sgd instead of " + + table_opt_op.type) + table_opt_block.append_op(type="sgd", inputs=inputs, outputs=outputs) # add table parameter gradient and it's block id to grad_to_block_id grad_to_block_id.append(grad_var.name + ":" + str(table_opt_block.idx))