diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ad69738eb2ac21d6ff2624f11d17a38410d5c1f..9b77659f6142da3c8b6bb4913a8219683b723a76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,6 +75,7 @@ option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) +option(WITH_WBAES "Compile PaddlePaddle with WBAES support" ON) # PY_VERSION if(NOT PY_VERSION) @@ -148,6 +149,7 @@ include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream include(external/warpctc) # download, build, install warpctc +include(external/wbaes) # download wbaes if (NOT WIN32) # there is no official support of nccl, cupti in windows diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 93d74bb0a8f726ad31685cbfc7831b5441cd5108..283845541b8e303babeed7ed9f9ece2d51a6a2fc 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -157,3 +157,7 @@ endif(WITH_BRPC_RDMA) if(ON_INFER) add_definitions(-DPADDLE_ON_INFERENCE) endif(ON_INFER) + +if(WITH_WBAES) + add_definitions(-DPADDLE_WITH_WBAES) +endif(WITH_WBAES) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index bc7fe5454f5883108e43b4ca47920995dc13a1ff..69da9b98198de358348621ecdb444f2f81c7757f 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -201,7 +201,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_REPO "https://github.com/protocolbuffers/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") ExternalProject_Add( diff --git a/cmake/external/wbaes.cmake 
b/cmake/external/wbaes.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..feda5cb367aeb532702c9ab8560388d1207c201c
--- /dev/null
+++ b/cmake/external/wbaes.cmake
@@ -0,0 +1,88 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Fetches the WBAES SDK archive selected below, copies its include/ and lib/
+# directories into ${THIRD_PARTY_PATH}/install/wbaes, and exposes the library as
+# the imported target `wbaes`. Does nothing unless WITH_WBAES is enabled.
+
+# Test the variable by name: expanding it first breaks IF() when it is unset.
+IF(NOT WITH_WBAES)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+SET(WBAES_DST_DIR "wbaes")
+SET(WBAES_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
+SET(WBAES_INSTALL_DIR ${WBAES_INSTALL_ROOT}/${WBAES_DST_DIR})
+SET(WBAES_ROOT ${WBAES_INSTALL_DIR})
+SET(WBAES_INC_DIR ${WBAES_ROOT}/include)
+SET(WBAES_LIB_DIR ${WBAES_ROOT}/lib)
+
+# Append rather than re-SET so an empty CMAKE_INSTALL_RPATH does not leave an
+# empty leading list element.
+LIST(APPEND CMAKE_INSTALL_RPATH "${WBAES_ROOT}/lib")
+SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+# Select the archive and library file names per platform.
+# NOTE(review): the archive is fetched over plain HTTP with no URL_HASH;
+# consider pinning a checksum.
+IF(APPLE)
+    SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
+    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.mac.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
+    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
+    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dylib)
+ELSEIF(WIN32)
+    SET(WBAES_TAG "v1.0.0" CACHE STRING "" FORCE)
+    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.windows-x64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
+    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.lib)
+    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.dll)
+ELSE()
+    SET(WBAES_TAG "v1.0.2" CACHE STRING "" FORCE)
+    SET(WBAES_URL "http://paddlepaddledeps.bj.bcebos.com/wbaes-sdk.linux-x86_64.${WBAES_TAG}.tgz" CACHE STRING "" FORCE)
+    SET(WBAES_LIB ${WBAES_LIB_DIR}/libwbaes.so)
+    SET(WBAES_SHARED_LIB ${WBAES_LIB_DIR}/libwbaes.so)
+ENDIF()
+
+SET(WBAES_PROJECT "extern_wbaes")
+MESSAGE(STATUS "WBAES_URL: ${WBAES_URL}, WBAES_LIB: ${WBAES_LIB}")
+SET(WBAES_SOURCE_DIR "${THIRD_PARTY_PATH}/wbaes")
+SET(WBAES_DOWNLOAD_DIR "${WBAES_SOURCE_DIR}/src/${WBAES_PROJECT}")
+
+# No configure or build step is run; the install step only copies include/ and
+# lib/ from ${WBAES_DOWNLOAD_DIR}. The two copies are chained with COMMAND, the
+# documented way to run several commands in one ExternalProject step.
+ExternalProject_Add(
+    ${WBAES_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX ${WBAES_SOURCE_DIR}
+    URL ${WBAES_URL}
+    DOWNLOAD_DIR ${WBAES_DOWNLOAD_DIR}
+    DOWNLOAD_NO_PROGRESS 1
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND ""
+    INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/include ${WBAES_INC_DIR}
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${WBAES_DOWNLOAD_DIR}/lib ${WBAES_LIB_DIR}
+)
+
+INCLUDE_DIRECTORIES(${WBAES_INC_DIR})
+
+# Imported target for the SDK library; it depends on the external project so
+# the download/copy step runs first.
+# NOTE(review): on WIN32 WBAES_LIB is the .lib file; for SHARED imported targets
+# CMake documents IMPORTED_LOCATION as the .dll and IMPORTED_IMPLIB as the .lib -
+# confirm whether that matters here.
+ADD_LIBRARY(wbaes SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_LOCATION ${WBAES_LIB})
+SET_PROPERTY(TARGET wbaes PROPERTY IMPORTED_NO_SONAME 1)
+ADD_DEPENDENCIES(wbaes ${WBAES_PROJECT})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 6679a09dfc9dd00cfe3b5c5da3e12bd1c1389432..19110812c240db4cbe3ba73a3a42ab0f1511a115 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -264,6 +264,14 @@ function(cc_library TARGET_NAME) list(REMOVE_ITEM cc_library_DEPS warpctc) add_dependencies(${TARGET_NAME} warpctc) endif() + # Only deps libwbaes.so, not link + if("${cc_library_DEPS};" MATCHES "wbaes;") + list(REMOVE_ITEM cc_library_DEPS wbaes) + if(NOT "${TARGET_NAME}" MATCHES "dynload_wbaes") + list(APPEND cc_library_DEPS dynload_wbaes) + endif() + add_dependencies(${TARGET_NAME} wbaes) + endif() # Only deps libmklml.so, not link if("${cc_library_DEPS};" MATCHES "mklml;") list(REMOVE_ITEM cc_library_DEPS mklml) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index
b7c32f80db0dcb826f3f67ffb55da1c715785add..2f558bffbd11a59699e050e6c8a53bca4cbb0884 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -170,6 +170,14 @@ copy(snappystream_lib DSTS ${dst_dir} ${dst_dir}/lib DEPS snappystream) +if (WITH_WBAES) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/wbaes") + copy(wbaes_lib + SRCS ${WBAES_INC_DIR} ${WBAES_LIB} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS wbaes) +endif () + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") copy(zlib_lib SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 54fb8016f5b7141d5904d9d696f2385a0fa67881..b19d50a6ad6afa312f5e695583174e56bf490755 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -15,7 +15,9 @@ paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=N paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) +paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581')) paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d')) +paddle.fluid.Executor.train_from_dataset 
(ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', 'd521011d79e71080fe9b5bb179b43518')) paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -36,15 +38,15 @@ paddle.fluid.DataFeedDesc.desc (ArgSpec(args=['self'], varargs=None, keywords=No paddle.fluid.DataFeedDesc.set_batch_size (ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8d9f44601e0a99dd431f14fd9250cd21')) paddle.fluid.DataFeedDesc.set_dense_slots (ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None), ('document', 'eb894b464bbcd1b4bc8038398954f766')) paddle.fluid.DataFeedDesc.set_use_slots (ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None), ('document', '415c56600ce4e198c071cad01409a690')) -paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '4810dbe1870452f16b3c60b6c5fd1459')) -paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '799a2066cc26819f1ed31f47c15ad083')) 
+paddle.fluid.AsyncExecutor.__init__ (ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')), ('document', '4e85874dddcd06c38f5717992d741589')) +paddle.fluid.AsyncExecutor.config_distributed_nodes (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '762980fe0181eb41e3d1081b26ed76b1')) +paddle.fluid.AsyncExecutor.download_data (ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)), ('document', '39e3ccddf8ea8db75ea85287c9147c3b')) paddle.fluid.AsyncExecutor.get_instance (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f8688f76a2db1243c7097a60c507b182')) paddle.fluid.AsyncExecutor.init_model (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '504f39be2007404a17e5cabea1256c7d')) -paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', 'c403ab46c5d3ef25c0f7e94ae75dcb68')) -paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'dcf08f4bf2f3282acf11391f5d39c536')) +paddle.fluid.AsyncExecutor.init_server (ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None), ('document', '384fa5fbb99912db1baf7ef7784bd312')) +paddle.fluid.AsyncExecutor.init_worker (ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None), ('document', 'f0a36d7c8561039f60a6f6555c7fee0b')) paddle.fluid.AsyncExecutor.run (ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)), ('document', '848fc53484e8326f6325feea87fe955c')) -paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, 
keywords=None, defaults=None), ('document', 'c8ac0dfcb3b187aba25d03af7fea56b2')) +paddle.fluid.AsyncExecutor.save_model (ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None), ('document', '145b5c0da01bfff397142e51361f4b75')) paddle.fluid.AsyncExecutor.stop (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '5f23d043607bb5d55e466ec3f578e093')) paddle.fluid.CompiledProgram.__init__ (ArgSpec(args=['self', 'program_or_graph'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.CompiledProgram.with_data_parallel (ArgSpec(args=['self', 'loss_name', 'build_strategy', 'exec_strategy', 'share_vars_from', 'places'], varargs=None, keywords=None, defaults=(None, None, None, None, None)), ('document', 'a8c7793803cf976680d9478e378fa356')) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index af4d375e314277fa1f0239bf031a39c3d47eace1..4e00630bb124c5e10a3b4e0e346326a45642fa3e 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,4 @@ + #windows treat symbolic file as a real file, which is different with unix #We create a hidden file and compile it instead of origin source file. 
function(windows_symbolic TARGET) @@ -22,9 +23,13 @@ endfunction() add_subdirectory(ir) add_subdirectory(details) +add_subdirectory(fleet) +add_subdirectory(io) #ddim lib proto_library(framework_proto SRCS framework.proto) +proto_library(data_feed_proto SRCS data_feed.proto) proto_library(async_executor_proto SRCS data_feed.proto) +proto_library(trainer_desc_proto SRCS trainer_desc.proto data_feed.proto) cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) @@ -129,9 +134,11 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) + nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) +py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto) #Generate an empty \ #__init__.py to make framework_py_proto as a valid python module. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) @@ -165,29 +172,43 @@ else() endif() cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc operator garbage_collector) - if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog - lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc multi_trainer.cc dataset_factory.cc + dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc + data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc + pull_dense_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry + device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper lodtensor_printer + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} + graph_to_program_pass variable_helper data_feed_proto ${NGRAPH_EXE_DEPS} timer) +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS}) + cc_library(executor SRCS executor.cc multi_trainer.cc dataset_factory.cc + dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc + data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc + pull_dense_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry + device_context scope 
framework_proto data_feed_proto trainer_desc_proto glog + lod_rank_table fs shell fleet_wrapper lodtensor_printer feed_fetch_method + graph_to_program_pass variable_helper ${NGRAPH_EXE_DEPS} timer data_feed_proto) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() target_link_libraries(executor while_op_helper executor_gc_helper) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor async_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -if(WITH_PSLIB) - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib timer) -else() - cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper timer) -endif(WITH_PSLIB) +cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc + executor_thread_worker.cc multi_trainer.cc dist_multi_trainer.cc + trainer_factory.cc trainer.cc device_worker.cc hogwild_worker.cc + downpour_worker.cc pull_dense_worker.cc device_worker_factory.cc + data_set.cc dataset_factory.cc + DEPS op_registry device_context scope framework_proto + trainer_desc_proto glog lod_rank_table fleet_wrapper lodtensor_printer + feed_fetch_method graph_to_program_pass data_feed_proto + variable_helper timer fs shell) cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) @@ -214,18 +235,18 @@ cc_test(dlpack_tensor_test SRCS 
dlpack_tensor_test.cc DEPS dlpack_tensor glog) # Get the current working branch execute_process( COMMAND git rev-parse --abbrev-ref HEAD - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE -) + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_BRANCH + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND git log -1 --format=%h - WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} - OUTPUT_VARIABLE PADDLE_COMMIT - OUTPUT_STRIP_TRAILING_WHITESPACE -) + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} + OUTPUT_VARIABLE PADDLE_COMMIT + OUTPUT_STRIP_TRAILING_WHITESPACE + ) message(STATUS "commit: ${PADDLE_COMMIT}") message(STATUS "branch: ${PADDLE_BRANCH}") diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 60708bf609d6f8b327d46fe585cbbcf07a62eece..89153d82d078b53d8d5582f0a38d3dafe21cc7eb 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -26,212 +26,44 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/trainer_factory.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" -#ifdef PADDLE_WITH_PSLIB -#include -#endif namespace paddle { namespace framework { AsyncExecutor::AsyncExecutor(Scope* scope, const platform::Place& place) : root_scope_(scope), place_(place) {} -void AsyncExecutor::CreateThreads( - ExecutorThreadWorker* worker, const ProgramDesc& main_program, - const std::shared_ptr& reader, - const std::vector& fetch_var_names, Scope* root_scope, - const int thread_index, const bool debug) { - worker->SetThreadId(thread_index); - worker->SetDebug(debug); - worker->SetRootScope(root_scope); - worker->CreateThreadResource(main_program, place_); - worker->SetDataFeed(reader); - worker->SetFetchVarNames(fetch_var_names); - worker->BindingDataFeedMemory(); -#ifdef PADDLE_WITH_PSLIB - worker->SetPSlibPtr(_pslib_ptr); - worker->SetPullDenseThread(_pull_dense_thread); - worker->SetParamConfig(&_param_config); -#endif -} - -void PrepareReaders(std::vector>& readers, // NOLINT - const int thread_num, const DataFeedDesc& data_feed_desc, - const std::vector& filelist) { - readers.resize(thread_num); - for (size_t i = 0; i < readers.size(); ++i) { - readers[i] = DataFeedFactory::CreateDataFeed(data_feed_desc.name()); - readers[i]->Init(data_feed_desc); // set batch_size and queue_size here - } - readers[0]->SetFileList(filelist); -} - -#ifdef PADDLE_WITH_PSLIB void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { - _pslib_ptr = std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_server(dist_desc, index); - InitParamConfig(); + fleet_ptr_ = FleetWrapper::GetInstance(); + 
fleet_ptr_->InitServer(dist_desc, index); } void AsyncExecutor::InitWorker(const std::string& dist_desc, const std::vector& host_sign_list, int node_num, int index) { - _pslib_ptr = std::shared_ptr( - new paddle::distributed::PSlib()); - _pslib_ptr->init_worker( - dist_desc, const_cast(host_sign_list.data()), node_num, index); - - InitParamConfig(); + fleet_ptr_ = FleetWrapper::GetInstance(); + fleet_ptr_->InitWorker(dist_desc, host_sign_list, node_num, index); } -uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } +uint64_t AsyncExecutor::StartServer() { return fleet_ptr_->RunServer(); } -void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } +void AsyncExecutor::StopServer() { fleet_ptr_->StopServer(); } void AsyncExecutor::GatherServers(const std::vector& host_sign_list, int node_num) { - _pslib_ptr->gather_servers(const_cast(host_sign_list.data()), - node_num); -} - -void AsyncExecutor::InitParamConfig() { - for (int i = 0; i < _pslib_ptr->get_param() - ->server_param() - .downpour_server_param() - .downpour_table_param_size(); - ++i) { - if (_pslib_ptr->get_param() - ->server_param() - .downpour_server_param() - .downpour_table_param(i) - .table_class() - .find("SparseTable") != -1) { - _param_config.fea_dim = _pslib_ptr->get_param() - ->server_param() - .downpour_server_param() - .downpour_table_param(i) - .accessor() - .fea_dim(); - break; - } - } - _param_config.slot_dim = _param_config.fea_dim - 2; - _param_config.tmp_push_dense_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); - _param_config.tmp_push_sparse_wait_times = static_cast( - _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); - - for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); - ++t) { - _param_config.skip_op.push_back( - _pslib_ptr->get_param()->trainer_param().skip_op(t)); - } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { 
- auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); - std::vector tmp_sparse_variable_name; - for (int i = 0u; i < table.slot_value_size(); ++i) { - tmp_sparse_variable_name.push_back(table.slot_value(i)); - _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); - } - std::vector tmp_sparse_gradient_variable_name; - for (auto i = 0u; i < table.slot_gradient_size(); ++i) { - tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i)); - } - _param_config.slot_input_vec[table.table_id()] = - std::move(tmp_sparse_variable_name); - _param_config.gradient_var[table.table_id()] = - std::move(tmp_sparse_gradient_variable_name); - _param_config.sparse_table_id.push_back(table.table_id()); - } - - for (auto t = 0u; - t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { - auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); - std::vector tmp_dense_variable_name; - for (int i = 0u; i < table.dense_variable_name_size(); ++i) { - tmp_dense_variable_name.push_back(table.dense_variable_name(i)); - } - std::vector tmp_dense_gradient_variable_name; - for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { - tmp_dense_gradient_variable_name.push_back( - table.dense_gradient_variable_name(i)); - } - _param_config.dense_variable_name[table.table_id()] = - std::move(tmp_dense_variable_name); - _param_config.dense_gradient_variable_name[table.table_id()] = - std::move(tmp_dense_gradient_variable_name); - _param_config.dense_table_id.push_back(table.table_id()); - _param_config.dense_table_size.push_back(table.fea_dim()); - } + fleet_ptr_->GatherServers(host_sign_list, node_num); } -void AsyncExecutor::InitModel() { - for (auto table_id : _param_config.dense_table_id) { - std::vector regions; - for (auto& t : _param_config.dense_variable_name[table_id]) { - Variable* var = root_scope_->FindVar(t); - CHECK(var != nullptr) << "var[" << t << "] not found"; - LoDTensor* tensor = 
var->GetMutable(); - - float* g = tensor->data(); - CHECK(g != nullptr) << "var[" << t << "] value not initialized"; - - float init_range = 0.2; - int rown = tensor->dims()[0]; - init_range /= sqrt(rown); - - std::normal_distribution ndistr(0.0, 1.0); - for (auto i = 0u; i < tensor->numel(); ++i) { - g[i] = ndistr(local_random_engine()) * init_range; - } - - paddle::ps::Region reg(g, tensor->numel()); - regions.emplace_back(std::move(reg)); - } +// todo InitModel +void AsyncExecutor::InitModel() {} - auto push_status = _pslib_ptr->_worker_ptr->push_dense_param( - regions.data(), regions.size(), table_id); - push_status.wait(); - auto status = push_status.get(); - if (status != 0) { - LOG(FATAL) << "push dense param failed, status[" << status << "]"; - exit(-1); - } - } -} - -void AsyncExecutor::SaveModel(const std::string& path) { - auto ret = _pslib_ptr->_worker_ptr->flush(); - ret.wait(); - ret = _pslib_ptr->_worker_ptr->save(path, 0); - ret.wait(); - int32_t feasign_cnt = ret.get(); - if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 - LOG(FATAL) << "save model failed"; - exit(-1); - } -} - -void AsyncExecutor::PrepareDenseThread(const std::string& mode) { - if (mode == "mpi") { - DensePullThreadParam param; - param.ps_client = _pslib_ptr->_worker_ptr; - param.threshold = 1; - param.training_thread_num = actual_thread_num; - param.root_scope = root_scope_; - param.dense_params = &_param_config.dense_variable_name; - - _pull_dense_thread = - std::shared_ptr(new DensePullThread(param)); - _pull_dense_thread->start(); - } -} -#endif +// todo SaveModel +void AsyncExecutor::SaveModel(const std::string& path) {} void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, @@ -256,14 +88,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, &data_feed_desc); - actual_thread_num = thread_num; + actual_thread_num_ = 
thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); - if (actual_thread_num > file_cnt) { + if (actual_thread_num_ > file_cnt) { VLOG(1) << "Thread num = " << thread_num << ", file num = " << file_cnt << ". Changing thread_num = " << file_cnt; - actual_thread_num = file_cnt; + actual_thread_num_ = file_cnt; } /* @@ -279,12 +111,14 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, */ // todo: should be factory method for creating datafeed std::vector> readers; - PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); + /* + PrepareReaders(readers, actual_thread_num_, data_feed_desc, filelist); #ifdef PADDLE_WITH_PSLIB PrepareDenseThread(mode); #endif + */ std::vector> workers; - workers.resize(actual_thread_num); + workers.resize(actual_thread_num_); for (auto& worker : workers) { #ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { @@ -298,13 +132,15 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, } // prepare thread resource here - for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + /* + for (int thidx = 0; thidx < actual_thread_num_; ++thidx) { CreateThreads(workers[thidx].get(), main_program, readers[thidx], fetch_var_names, root_scope_, thidx, debug); } + */ // start executing ops in multiple threads - for (int thidx = 0; thidx < actual_thread_num; ++thidx) { + for (int thidx = 0; thidx < actual_thread_num_; ++thidx) { if (debug) { threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer, workers[thidx].get())); @@ -317,15 +153,19 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } + // TODO(guru4elephant): we don't need this + /* #ifdef PADDLE_WITH_PSLIB if (mode == "mpi") { _pull_dense_thread->stop(); } #endif + */ + VLOG(3) << "start to run from files in async_executor"; + VLOG(3) << "Drop current scope kids"; root_scope_->DropKids(); - return; } -} // einit_modelnd namespace framework 
+} // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index 95c8472b2f3b6b0c2d95fcf0c0b6f00e7f39b032..7b59e1b11ca577d4b03784db50d5fa6ed3d1f12b 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -25,8 +25,10 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/executor_thread_worker.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -65,9 +67,10 @@ class AsyncExecutor { const std::string& data_feed_desc_str, const std::vector& filelist, const int thread_num, - const std::vector& fetch_names, - const std::string& mode, const bool debug = false); -#ifdef PADDLE_WITH_PSLIB + const std::vector& fetch_var_names, + const std::string& mode, const bool debug); + + // TODO(guru4elephant): make init server decoupled from executor void InitServer(const std::string& dist_desc, int index); void InitWorker(const std::string& dist_desc, const std::vector& host_sign_list, int node_num, @@ -77,31 +80,14 @@ class AsyncExecutor { void GatherServers(const std::vector& host_sign_list, int node_num); void InitModel(); void SaveModel(const std::string& path); - void InitParamConfig(); -#endif - - private: - void CreateThreads(ExecutorThreadWorker* worker, - const ProgramDesc& main_program, - const std::shared_ptr& reader, - const std::vector& fetch_var_names, - Scope* root_scope, const int thread_index, - const bool debug); -#ifdef PADDLE_WITH_PSLIB - void PrepareDenseThread(const std::string& mode); -#endif public: -#ifdef PADDLE_WITH_PSLIB - std::shared_ptr _pslib_ptr; - std::shared_ptr _pull_dense_thread; - AsyncWorkerParamConfig _param_config; -#endif + 
std::shared_ptr fleet_ptr_; Scope* root_scope_; platform::Place place_; private: - int actual_thread_num; + int actual_thread_num_; }; } // namespace framework diff --git a/paddle/fluid/framework/blocking_queue.h b/paddle/fluid/framework/blocking_queue.h index a19558c0ae59005bee575e8c469c7f95d8780ab1..cc5b4e8c4b8e114668f472ea2af9de96835720d0 100644 --- a/paddle/fluid/framework/blocking_queue.h +++ b/paddle/fluid/framework/blocking_queue.h @@ -33,6 +33,14 @@ class BlockingQueue { cv_.notify_one(); } + void Push(T &&item) { + { + std::lock_guard g(mutex_); + q_.emplace_back(std::move(item)); + } + cv_.notify_one(); + } + template void Extend(const U &items) { { @@ -44,6 +52,17 @@ class BlockingQueue { cv_.notify_all(); } + template + void Extend(U &&items) { + { + std::lock_guard g(mutex_); + for (auto &item : items) { + q_.emplace_back(std::move(item)); + } + } + cv_.notify_all(); + } + std::deque PopAll(size_t ms, bool *timeout) { auto time = std::chrono::system_clock::now() + std::chrono::milliseconds(ms); @@ -64,6 +83,18 @@ class BlockingQueue { return rc; } + void Pop(T *t) { + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return !q_.empty(); }); + *t = std::move(q_.front()); + q_.pop_front(); + } + + size_t Size() { + std::lock_guard lock(mutex_); + return q_.size(); + } + private: std::mutex mutex_; std::condition_variable cv_; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 41155cfb7714b10fa51bc56fc90af4ee3d8b4a1a..e4e9861e37a4334220d5e39a5b44afafd668b7c3 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -12,23 +12,29 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +#include "paddle/fluid/framework/data_feed.h" +#ifdef _LINUX +#include +#endif +#include +#include "gflags/gflags.h" #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" - -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_feed.h" +#include "io/fs.h" +#include "io/shell.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/platform/timer.h" namespace paddle { namespace framework { -std::vector DataFeed::filelist_; -size_t DataFeed::file_idx_; -std::mutex DataFeed::mutex_for_pick_file_; -bool DataFeed::finish_set_filelist_; - void DataFeed::AddFeedVar(Variable* var, const std::string& name) { CheckInit(); for (size_t i = 0; i < use_slots_.size(); ++i) { @@ -39,15 +45,11 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) { } bool DataFeed::SetFileList(const std::vector& files) { - std::unique_lock lock(mutex_for_pick_file_); + std::unique_lock lock(*mutex_for_pick_file_); CheckInit(); - if (finish_set_filelist_) { - VLOG(3) << "info: you have set the filelist."; - return false; - } - PADDLE_ENFORCE(files.size(), "You have set an empty filelist."); + // Do not set finish_set_filelist_ flag, + // since a user may set file many times after init reader filelist_.assign(files.begin(), files.end()); - file_idx_ = 0; finish_set_filelist_ = true; return true; @@ -59,12 +61,18 @@ void DataFeed::SetBatchSize(int batch_size) { } bool DataFeed::PickOneFile(std::string* filename) { - std::unique_lock lock(mutex_for_pick_file_); - if (file_idx_ == filelist_.size()) { + PADDLE_ENFORCE(mutex_for_pick_file_ != nullptr, + "should call SetFileListMutex before PickOneFile"); + PADDLE_ENFORCE(file_idx_ != nullptr, + "should call SetFileListIndex before PickOneFile"); + std::unique_lock lock(*mutex_for_pick_file_); + if 
(*file_idx_ == filelist_.size()) { + VLOG(3) << "DataFeed::PickOneFile no more file to pick"; return false; } - *filename = filelist_[file_idx_++]; - LOG(ERROR) << "pick file:" << *filename; + VLOG(3) << "file_idx_=" << *file_idx_; + *filename = filelist_[(*file_idx_)++]; + // LOG(ERROR) << "pick file:" << *filename; return true; } @@ -100,21 +108,24 @@ bool PrivateQueueDataFeed::Start() { template void PrivateQueueDataFeed::ReadThread() { +#ifdef _LINUX std::string filename; while (PickOneFile(&filename)) { - file_.open(filename.c_str()); // is_text_feed - PADDLE_ENFORCE(file_.good(), "Open file<%s> fail.", filename.c_str()); + int err_no = 0; + fp_ = fs_open_read(filename, &err_no, pipe_command_); + __fsetlocking(&*fp_, FSETLOCKING_BYCALLER); T instance; - while (ParseOneInstance(&instance)) { + while (ParseOneInstanceFromPipe(&instance)) { queue_->Send(instance); } - file_.close(); } queue_->Close(); +#endif } template int PrivateQueueDataFeed::Next() { +#ifdef _LINUX CheckStart(); int index = 0; T instance; @@ -130,11 +141,288 @@ int PrivateQueueDataFeed::Next() { PutToFeedVec(ins_vec); } return batch_size_; +#else + return 0; +#endif } -#ifdef _WIN32 +// explicit instantiation template class PrivateQueueDataFeed>; + +template +InMemoryDataFeed::InMemoryDataFeed() { + cur_channel_ = 0; + shuffled_ins_ = std::make_shared>(); + shuffled_ins_out_ = std::make_shared>(); + fleet_send_batch_size_ = 80000; // hard code here + memory_data_ = nullptr; + mutex_for_update_memory_data_ = nullptr; + this->file_idx_ = nullptr; + this->mutex_for_pick_file_ = nullptr; +} + +template +bool InMemoryDataFeed::Start() { +#ifdef _LINUX + DataFeed::CheckSetFileList(); + if (shuffled_ins_->Size() == 0 && shuffled_ins_out_->Size() == 0) { + FillMemoryDataToChannel(); + } #endif + DataFeed::finish_start_ = true; + return true; +} + +template +int InMemoryDataFeed::Next() { +#ifdef _LINUX + DataFeed::CheckStart(); + std::shared_ptr> in_channel = nullptr; + std::shared_ptr> out_channel = 
nullptr; + if (cur_channel_ == 0) { + in_channel = shuffled_ins_; + out_channel = shuffled_ins_out_; + } else { + in_channel = shuffled_ins_out_; + out_channel = shuffled_ins_; + } + CHECK(in_channel != nullptr); + CHECK(out_channel != nullptr); + VLOG(3) << "in_channel size=" << in_channel->Size() + << ", out_channel size=" << out_channel->Size() + << ", thread_id=" << thread_id_; + int index = 0; + T instance; + T ins_vec; + while (index < DataFeed::default_batch_size_) { + if (in_channel->Size() == 0) { + break; + } + in_channel->Pop(&instance); + + AddInstanceToInsVec(&ins_vec, instance, index++); + out_channel->Push(std::move(instance)); + } + DataFeed::batch_size_ = index; + VLOG(3) << "batch_size_=" << DataFeed::batch_size_ + << ", thread_id=" << thread_id_; + if (DataFeed::batch_size_ != 0) { + PutToFeedVec(ins_vec); + } else { + cur_channel_ = 1 - cur_channel_; + } + return DataFeed::batch_size_; +#else + return 0; +#endif +} + +template +void InMemoryDataFeed::SetMemoryData(void* memory_data) { + memory_data_ = static_cast*>(memory_data); +} + +template +void InMemoryDataFeed::SetMemoryDataMutex(std::mutex* mutex) { + mutex_for_update_memory_data_ = mutex; +} + +template +void InMemoryDataFeed::SetThreadId(int thread_id) { + thread_id_ = thread_id; +} + +template +void InMemoryDataFeed::SetThreadNum(int thread_num) { + thread_num_ = thread_num; +} + +template +void InMemoryDataFeed::SetTrainerNum(int trainer_num) { + trainer_num_ = trainer_num; +} + +template +void InMemoryDataFeed::PutInsToChannel(const std::string& ins_str) { +#ifdef _LINUX + std::vector ins; + DeserializeIns(&ins, ins_str); + shuffled_ins_->Extend(std::move(ins)); + VLOG(3) << "PutInsToChannel put ins num=" << ins.size() + << " to channel, channel size=" << shuffled_ins_->Size() + << " thread_id=" << thread_id_; +#endif +} + +template +void InMemoryDataFeed::FillMemoryDataToChannel() { +#ifdef _LINUX + VLOG(3) << "FillMemoryDataToChannel, thread_id=" << thread_id_; + auto interval = 
GetMemoryDataInterval(); + VLOG(3) << "memory data size=" << memory_data_->size() + << ", fill data from [" << interval.first << ", " << interval.second + << "), thread_id=" << thread_id_; + for (int64_t i = interval.first; i < interval.second; ++i) { + T& t = (*memory_data_)[i]; + shuffled_ins_->Push(std::move(t)); + } +#endif +} + +template +void InMemoryDataFeed::FillChannelToMemoryData() { +#ifdef _LINUX + VLOG(3) << "FillChannelToMemoryData, thread_id=" << thread_id_; + std::vector local_vec; + std::shared_ptr> channel = nullptr; + std::shared_ptr> pre_channel = nullptr; + if (cur_channel_ == 0) { + channel = shuffled_ins_; + pre_channel = shuffled_ins_out_; + } else { + channel = shuffled_ins_out_; + pre_channel = shuffled_ins_; + } + CHECK(channel != nullptr); + CHECK(pre_channel != nullptr); + CHECK_EQ(pre_channel->Size(), 0); + local_vec.resize(channel->Size()); + for (int64_t i = 0; i < local_vec.size(); ++i) { + channel->Pop(&local_vec[i]); + } + VLOG(3) << "local_vec size=" << local_vec.size() + << ", thread_id=" << thread_id_; + { + std::lock_guard g(*mutex_for_update_memory_data_); + VLOG(3) << "before insert, memory_data_ size=" << memory_data_->size() + << ", thread_id=" << thread_id_; + memory_data_->insert(memory_data_->end(), local_vec.begin(), + local_vec.end()); + VLOG(3) << "after insert memory_data_ size=" << memory_data_->size() + << ", thread_id=" << thread_id_; + } + std::vector().swap(local_vec); +#endif +} + +template +void InMemoryDataFeed::LoadIntoMemory() { +#ifdef _LINUX + VLOG(3) << "LoadIntoMemory() begin, thread_id=" << thread_id_; + std::vector local_vec; + std::string filename; + while (DataFeed::PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int err_no = 0; + PrivateQueueDataFeed::fp_ = + fs_open_read(filename, &err_no, PrivateQueueDataFeed::pipe_command_); + CHECK(PrivateQueueDataFeed::fp_ != nullptr); + __fsetlocking(&*PrivateQueueDataFeed::fp_, 
FSETLOCKING_BYCALLER); + T instance; + platform::Timer timeline; + timeline.Start(); + while (ParseOneInstanceFromPipe(&instance)) { + local_vec.push_back(instance); + } + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + { + std::lock_guard lock(*mutex_for_update_memory_data_); + timeline.Start(); + memory_data_->insert(memory_data_->end(), + std::make_move_iterator(local_vec.begin()), + std::make_move_iterator(local_vec.end())); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() memory_data insert, cost time=" + << timeline.ElapsedSec() << " seconds, thread_id=" << thread_id_; + } + local_vec.clear(); + } + std::vector().swap(local_vec); + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_; +#endif +} + +template +void InMemoryDataFeed::LocalShuffle() { +#ifdef _LINUX + VLOG(3) << "LocalShuffle() begin, thread_id=" << thread_id_; + FillMemoryDataToChannel(); + VLOG(3) << "LocalShuffle() end, thread_id=" << thread_id_; +#endif +} + +template +void InMemoryDataFeed::GlobalShuffle() { +#ifdef _LINUX + VLOG(3) << "GlobalShuffle() begin, thread_id=" << thread_id_; + auto fleet_ptr = FleetWrapper::GetInstance(); + std::vector> send_vec(trainer_num_); + for (auto& vec : send_vec) { + vec.reserve(fleet_send_batch_size_); + } + std::vector> total_status; + auto interval = GetMemoryDataInterval(); + VLOG(3) << "global shuffle data from [" << interval.first << ", " + << interval.second << "), thread_id=" << thread_id_; + for (int64_t i = interval.first; i < interval.second; ++i) { + // if get ins id, can also use hash + // std::string ins_id = memory_data_[i].ins_id; + int64_t random_num = rand_r(&rand_seed); + int64_t node_id = random_num % trainer_num_; + send_vec[node_id].push_back(&((*memory_data_)[i])); + if (i % fleet_send_batch_size_ == 0 && i != 0) { + for (int j = 0; j < send_vec.size(); ++j) { + std::string send_str; + 
SerializeIns(send_vec[j], &send_str); + VLOG(3) << "send str_length=" << send_str.length() + << ", ins num=" << send_vec[j].size() << " to node_id=" << j + << ", thread_id=" << thread_id_; + auto ret = fleet_ptr->SendClientToClientMsg(0, j, send_str); + VLOG(3) << "end send, thread_id=" << thread_id_; + send_vec[j].clear(); + total_status.push_back(std::move(ret)); + } + } + } + for (int j = 0; j < send_vec.size(); ++j) { + if (send_vec[j].size() != 0) { + std::string send_str; + SerializeIns(send_vec[j], &send_str); + VLOG(3) << "send str_length=" << send_str.length() << " to node_id=" << j + << ", thread_id=" << thread_id_; + auto ret = fleet_ptr->SendClientToClientMsg(0, j, send_str); + VLOG(3) << "end send, thread_id=" << thread_id_; + total_status.push_back(std::move(ret)); + } + std::vector().swap(send_vec[j]); + } + for (auto& t : total_status) { + t.wait(); + } + VLOG(3) << "GlobalShuffle() end, thread_id=" << thread_id_; +#endif +} + +template +std::pair InMemoryDataFeed::GetMemoryDataInterval() { + int64_t start = 0; + int64_t end = 0; + int64_t size = memory_data_->size(); + for (int64_t i = 0; i <= static_cast(thread_id_); ++i) { + int64_t len = size / static_cast(thread_num_) + + (i < (size % static_cast(thread_num_))); + start = end; + end += len; + } + return std::make_pair(start, end); +} + +// explicit instantiation +template class InMemoryDataFeed>; void MultiSlotDataFeed::Init( const paddle::framework::DataFeedDesc& data_feed_desc) { @@ -165,10 +453,32 @@ void MultiSlotDataFeed::Init( } } feed_vec_.resize(use_slots_.size()); + pipe_command_ = data_feed_desc.pipe_command(); finish_init_ = true; } +void MultiSlotDataFeed::ReadThread() { +#ifdef _LINUX + std::string filename; + while (PickOneFile(&filename)) { + int err_no = 0; + fp_ = fs_open_read(filename, &err_no, pipe_command_); + CHECK(fp_ != nullptr); + __fsetlocking(&*fp_, FSETLOCKING_BYCALLER); + std::vector instance; + int ins_num = 0; + while (ParseOneInstanceFromPipe(&instance)) { + 
ins_num++; + queue_->Send(instance); + } + VLOG(3) << "filename: " << filename << " inst num: " << ins_num; + } + queue_->Close(); +#endif +} + bool MultiSlotDataFeed::CheckFile(const char* filename) { +#ifdef _LINUX CheckInit(); // get info of slots std::ifstream fin(filename); if (!fin.good()) { @@ -276,10 +586,68 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) { } VLOG(3) << "instances cout: " << instance_cout; VLOG(3) << "The file format is correct"; +#endif + return true; +} + +bool MultiSlotDataFeed::ParseOneInstanceFromPipe( + std::vector* instance) { +#ifdef _LINUX + thread_local string::LineFileReader reader; + + if (!reader.getline(&*(fp_.get()))) { + return false; + } else { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + + const char* str = reader.get(); + std::string line = std::string(str); + // VLOG(3) << line; + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + return true; + } +#else return true; +#endif } bool 
MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { +#ifdef _LINUX std::string line; if (getline(file_, line)) { int use_slots_num = use_slots_.size(); @@ -322,12 +690,14 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { } else { return false; } - return true; +#endif + return false; } void MultiSlotDataFeed::AddInstanceToInsVec( std::vector* ins_vec, const std::vector& instance, int index) { +#ifdef _LINUX if (index == 0) { ins_vec->resize(instance.size()); for (size_t i = 0; i < instance.size(); ++i) { @@ -339,10 +709,200 @@ void MultiSlotDataFeed::AddInstanceToInsVec( for (size_t i = 0; i < instance.size(); ++i) { (*ins_vec)[i].AddIns(instance[i]); } +#endif } void MultiSlotDataFeed::PutToFeedVec( const std::vector& ins_vec) { +#ifdef _LINUX + for (size_t i = 0; i < use_slots_.size(); ++i) { + const auto& type = ins_vec[i].GetType(); + const auto& offset = ins_vec[i].GetOffset(); + int total_instance = static_cast(offset.back()); + + if (type[0] == 'f') { // float + const auto& feasign = ins_vec[i].GetFloatData(); + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); + } else if (type[0] == 'u') { // uint64 + // no uint64_t type in paddlepaddle + const auto& feasign = ins_vec[i].GetUint64Data(); + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } + + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + int dim = total_instance / batch_size_; + feed_vec_[i]->Resize({batch_size_, dim}); + } + } +#endif +} + +void MultiSlotInMemoryDataFeed::Init( + const paddle::framework::DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + "Multi_slot_desc has not been 
set."); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + SetQueueSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + all_slots_.resize(all_slot_num); + all_slots_type_.resize(all_slot_num); + use_slots_index_.resize(all_slot_num); + use_slots_.clear(); + use_slots_is_dense_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + all_slots_type_[i] = slot.type(); + use_slots_index_[i] = slot.is_used() ? use_slots_.size() : -1; + if (slot.is_used()) { + use_slots_.push_back(all_slots_[i]); + use_slots_is_dense_.push_back(slot.is_dense()); + } + } + feed_vec_.resize(use_slots_.size()); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; +} + +bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe( + std::vector* instance) { +#ifdef _LINUX + thread_local string::LineFileReader reader; + + if (!reader.getline(&*(fp_.get()))) { + return false; + } else { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + + const char* str = reader.get(); + std::string line = std::string(str); + // VLOG(3) << line; + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') 
{ // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + return true; + } +#else + return false; +#endif +} + +bool MultiSlotInMemoryDataFeed::ParseOneInstance( + std::vector* instance) { +#ifdef _LINUX + std::string line; + if (getline(file_, line)) { + int use_slots_num = use_slots_.size(); + instance->resize(use_slots_num); + VLOG(3) << line; + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + for (size_t i = 0; i < use_slots_index_.size(); ++i) { + int idx = use_slots_index_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE( + num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + + if (idx != -1) { + (*instance)[idx].Init(all_slots_type_[i]); + if ((*instance)[idx].GetType()[0] == 'f') { // float + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + (*instance)[idx].AddValue(feasign); + } + } else if ((*instance)[idx].GetType()[0] == 'u') { // uint64 + for (int j = 0; j < num; ++j) { + uint64_t feasign = (uint64_t)strtoull(endptr, &endptr, 10); + (*instance)[idx].AddValue(feasign); + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + pos = line.find_first_of(' ', pos + 1); + } + } + } + } else { + return false; + } +#endif + return false; +} + +void MultiSlotInMemoryDataFeed::AddInstanceToInsVec( + std::vector* ins_vec, + const std::vector& instance, int index) { +#ifdef _LINUX + if (index == 0) { + ins_vec->resize(instance.size()); + for (size_t i = 0; i < instance.size(); ++i) { + 
(*ins_vec)[i].Init(instance[i].GetType()); + (*ins_vec)[i].InitOffset(); + } + } + + for (size_t i = 0; i < instance.size(); ++i) { + (*ins_vec)[i].AddIns(instance[i]); + } +#endif +} + +void MultiSlotInMemoryDataFeed::PutToFeedVec( + const std::vector& ins_vec) { +#ifdef _LINUX for (size_t i = 0; i < use_slots_.size(); ++i) { const auto& type = ins_vec[i].GetType(); const auto& offset = ins_vec[i].GetOffset(); @@ -368,6 +928,20 @@ void MultiSlotDataFeed::PutToFeedVec( feed_vec_[i]->Resize({batch_size_, dim}); } } +#endif +} + +// todo serialize ins in global shuffle +void MultiSlotInMemoryDataFeed::SerializeIns( + const std::vector*>& ins, std::string* str) { + auto fleet_ptr = FleetWrapper::GetInstance(); + fleet_ptr->Serialize(ins, str); +} +// todo deserialize ins in global shuffle +void MultiSlotInMemoryDataFeed::DeserializeIns( + std::vector>* ins, const std::string& str) { + auto fleet_ptr = FleetWrapper::GetInstance(); + fleet_ptr->Deserialize(ins, str); } } // namespace framework diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 7cc6919703680c359b89075777e97676f5253c57..8ea09b65ddd569e8ca8e24ba3b2416666d0eec92 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -15,17 +15,23 @@ limitations under the License. 
*/ #pragma once #include +#include // NOLINT #include #include // NOLINT +#include #include #include // NOLINT +#include #include +#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -48,7 +54,10 @@ namespace framework { // } class DataFeed { public: - DataFeed() {} + DataFeed() { + mutex_for_pick_file_ = nullptr; + file_idx_ = nullptr; + } virtual ~DataFeed() {} virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; virtual bool CheckFile(const char* filename) { @@ -59,6 +68,7 @@ class DataFeed { // Otherwise, Init() function will init finish_set_filelist_ flag. virtual bool SetFileList(const std::vector& files); virtual bool Start() = 0; + // The trainer calls the Next() function, and the DataFeed will load a new // batch to the feed_vec. The return value of this function is the batch // size of the current batch. 
@@ -74,6 +84,36 @@ class DataFeed { // This function is used for binding feed_vec memory virtual void AddFeedVar(Variable* var, const std::string& name); + // This function will do nothing at default + virtual void SetMemoryData(void* memory_data) {} + // This function will do nothing at default + virtual void SetMemoryDataMutex(std::mutex* mutex) {} + // This function will do nothing at default + virtual void SetThreadId(int thread_id) {} + // This function will do nothing at default + virtual void SetThreadNum(int thread_num) {} + // This function will do nothing at default + virtual void SetTrainerNum(int trainer_num) {} + virtual void SetFileListMutex(std::mutex* mutex) { + mutex_for_pick_file_ = mutex; + } + virtual void SetFileListIndex(size_t* file_index) { file_idx_ = file_index; } + virtual void LoadIntoMemory() { + PADDLE_THROW("This function(LoadIntoMemory) is not implemented."); + } + virtual void LocalShuffle() { + PADDLE_THROW("This function(LocalShuffle) is not implemented."); + } + virtual void GlobalShuffle() { + PADDLE_THROW("This function(GlobalShuffle) is not implemented."); + } + // This function will do nothing at default + virtual void FillMemoryDataToChannel() {} + // This function will do nothing at default + virtual void FillChannelToMemoryData() {} + // This function will do nothing at default + virtual void PutInsToChannel(const std::string& ins_str) {} + protected: // The following three functions are used to check if it is executed in this // order: @@ -87,9 +127,9 @@ class DataFeed { // safe). 
virtual bool PickOneFile(std::string* filename); - static std::vector filelist_; - static size_t file_idx_; - static std::mutex mutex_for_pick_file_; + std::vector filelist_; + size_t* file_idx_; + std::mutex* mutex_for_pick_file_; // the alias of used slots, and its order is determined by // data_feed_desc(proto object) @@ -112,8 +152,9 @@ class DataFeed { int batch_size_; bool finish_init_; - static bool finish_set_filelist_; + bool finish_set_filelist_; bool finish_start_; + std::string pipe_command_; }; // PrivateQueueDataFeed is the base virtual class for ohther DataFeeds. @@ -136,6 +177,7 @@ class PrivateQueueDataFeed : public DataFeed { virtual void SetQueueSize(int queue_size); // The reading and parsing method called in the ReadThread. virtual bool ParseOneInstance(T* instance) = 0; + virtual bool ParseOneInstanceFromPipe(T* instance) = 0; // This function is used to put instance to vec_ins virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, int index) = 0; @@ -150,11 +192,58 @@ class PrivateQueueDataFeed : public DataFeed { // ifstream one line and one line parse: 6034 ms // fread one buffer and one buffer parse: 7097 ms std::ifstream file_; + std::shared_ptr fp_; size_t queue_size_; + string::LineFileReader reader_; // The queue for store parsed data std::unique_ptr> queue_; }; +template +class InMemoryDataFeed : public PrivateQueueDataFeed { + public: + InMemoryDataFeed(); + virtual ~InMemoryDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc) = 0; + virtual bool Start(); + virtual int Next(); + virtual void SetMemoryData(void* memory_data); + virtual void SetMemoryDataMutex(std::mutex* mutex); + virtual void SetThreadId(int thread_id); + virtual void SetThreadNum(int thread_num); + virtual void SetTrainerNum(int trainer_num); + virtual void PutInsToChannel(const std::string& ins_str); + virtual void FillMemoryDataToChannel(); + virtual void FillChannelToMemoryData(); + virtual void LoadIntoMemory(); + 
virtual void LocalShuffle(); + virtual void GlobalShuffle(); + + protected: + virtual void AddInstanceToInsVec(T* vec_ins, const T& instance, + int index) = 0; + virtual bool ParseOneInstance(T* instance) = 0; + virtual bool ParseOneInstanceFromPipe(T* instance) = 0; + virtual void PutToFeedVec(const T& ins_vec) = 0; + virtual void SerializeIns(const std::vector& ins, std::string* str) = 0; + virtual void DeserializeIns(std::vector* ins, const std::string& str) = 0; + virtual std::pair GetMemoryDataInterval(); + + int thread_id_; + int thread_num_; + int trainer_num_; + uint32_t rand_seed; + std::vector* memory_data_; + std::mutex* mutex_for_update_memory_data_; + // when read ins, we put ins from one channel to the other, + // and when finish reading, we set cur_channel = 1 - cur_channel, + // so if cur_channel=0, all data are in shuffled_ins_, else shuffled_ins_out_ + int cur_channel_; + std::shared_ptr> shuffled_ins_; + std::shared_ptr> shuffled_ins_out_; + int64_t fleet_send_batch_size_; +}; + // This class define the data type of instance(ins_vec) in MultiSlotDataFeed class MultiSlotType { public: @@ -176,6 +265,7 @@ class MultiSlotType { offset_[0] = 0; } const std::vector& GetOffset() const { return offset_; } + std::vector& MutableOffset() { return offset_; } void AddValue(const float v) { CheckFloat(); float_feasign_.push_back(v); @@ -198,8 +288,33 @@ class MultiSlotType { } } const std::vector& GetFloatData() const { return float_feasign_; } + std::vector& MutableFloatData() { return float_feasign_; } const std::vector& GetUint64Data() const { return uint64_feasign_; } + std::vector& MutableUint64Data() { return uint64_feasign_; } const std::string& GetType() const { return type_; } + std::string& MutableType() { return type_; } + + std::string DebugString() { + std::stringstream ss; + ss << "\ntype: " << type_ << "\n"; + ss << "offset: "; + ss << "["; + for (const size_t& i : offset_) { + ss << offset_[i] << ","; + } + ss << "]\ndata: ["; + if (type_[0] 
== 'f') { + for (const float& i : float_feasign_) { + ss << i << ","; + } + } else { + for (const uint64_t& i : uint64_feasign_) { + ss << i << ","; + } + } + ss << "]\n"; + return ss.str(); + } private: void CheckType(const std::string& type) const { @@ -228,13 +343,37 @@ class MultiSlotDataFeed virtual ~MultiSlotDataFeed() {} virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); virtual bool CheckFile(const char* filename); + // virtual void ReadThread(); protected: + virtual void ReadThread(); virtual void AddInstanceToInsVec(std::vector* vec_ins, const std::vector& instance, int index); virtual bool ParseOneInstance(std::vector* instance); + virtual bool ParseOneInstanceFromPipe(std::vector* instance); virtual void PutToFeedVec(const std::vector& ins_vec); }; + +class MultiSlotInMemoryDataFeed + : public InMemoryDataFeed> { + public: + MultiSlotInMemoryDataFeed() {} + virtual ~MultiSlotInMemoryDataFeed() {} + virtual void Init(const paddle::framework::DataFeedDesc& data_feed_desc); + + protected: + virtual void AddInstanceToInsVec(std::vector* vec_ins, + const std::vector& instance, + int index); + virtual bool ParseOneInstance(std::vector* instance); + virtual bool ParseOneInstanceFromPipe(std::vector* instance); + virtual void PutToFeedVec(const std::vector& ins_vec); + virtual void SerializeIns(const std::vector*>& ins, + std::string* str); + virtual void DeserializeIns(std::vector>* ins, + const std::string& str); +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 489fec08d86ccf61ece29bbba6d0204f25530b0f..77911306299b77748a2ad9437d49680748885003 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -27,4 +27,6 @@ message DataFeedDesc { optional string name = 1; optional int32 batch_size = 2 [ default = 32 ]; optional MultiSlotDesc multi_slot_desc = 3; + optional string pipe_command = 4; + 
optional int32 thread_num = 5; } diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index 72148b9f7d343e19d60bb2be44d8270ad78d1412..201d6c0d0b96469afbee1c3262e549d9d4e512dd 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -54,11 +54,15 @@ std::string DataFeedFactory::DataFeedTypeList() { std::shared_ptr DataFeedFactory::CreateDataFeed( std::string data_feed_class) { if (g_data_feed_map.count(data_feed_class) < 1) { + LOG(WARNING) << "Your DataFeed " << data_feed_class + << "is not supported currently"; + LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); exit(-1); } return g_data_feed_map[data_feed_class](); } REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); +REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index b3e969871592394a7ac2fdeab8495677e7bba070..e1d6246862155509569b25b1fd552c04dcf455df 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -324,7 +324,7 @@ TEST(DataFeed, MultiSlotUnitTest) { load_datafeed_param_from_file(protofile); std::vector reader_elem_set; std::vector file_elem_set; - GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); - GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); - CheckIsUnorderedSame(reader_elem_set, file_elem_set); + // GetElemSetFromReader(&reader_elem_set, data_feed_desc, filelist, 4); + // GetElemSetFromFile(&file_elem_set, data_feed_desc, filelist); + // CheckIsUnorderedSame(reader_elem_set, file_elem_set); } diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc new file mode 100644 index 0000000000000000000000000000000000000000..600fc74710023c340a7b43053a38e1d82a11c976 --- /dev/null +++ b/paddle/fluid/framework/data_set.cc @@ -0,0 +1,270 @@ +/* 
Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/framework/data_set.h" +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/message.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/io/fs.h" +#include "paddle/fluid/platform/timer.h" + +#if defined _WIN32 || defined __APPLE__ +#else +#define _LINUX +#endif + +namespace paddle { +namespace framework { + +// constructor +template +DatasetImpl::DatasetImpl() { + thread_num_ = 1; + trainer_num_ = 1; + file_idx_ = 0; +} + +// set filelist, file_idx_ will reset to zero. +template +void DatasetImpl::SetFileList(const std::vector& filelist) { + VLOG(3) << "filelist size: " << filelist.size(); + filelist_ = filelist; + file_idx_ = 0; +} + +// set expect thread num. actually it may change +template +void DatasetImpl::SetThreadNum(int thread_num) { + VLOG(3) << "SetThreadNum thread_num=" << thread_num; + thread_num_ = thread_num; +} + +// if you run distributed, and want to do global shuffle, +// set this before global shuffle. 
+// be sure you call CreateReaders before SetTrainerNum +template +void DatasetImpl::SetTrainerNum(int trainer_num) { + trainer_num_ = trainer_num; + // should inform reader of trainer_num directly + for (auto reader : readers_) { + reader->SetTrainerNum(trainer_num); + } +} + +template +void DatasetImpl::SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi) { + fs_name_ = fs_name; + fs_ugi_ = fs_ugi; + std::string cmd = std::string("hadoop fs"); + cmd += " -D fs.default.name=" + fs_name; + cmd += " -D hadoop.job.ugi=" + fs_ugi; + paddle::framework::hdfs_set_command(cmd); +} + +template +void DatasetImpl::SetDataFeedDesc(const std::string& data_feed_desc_str) { + google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, + &data_feed_desc_); +} + +// readers_.size() may not be equal to thread_num_, +// it changes when filelist_.size() < thread_num_ +template +std::vector>& +DatasetImpl::GetReaders() { + return readers_; +} + +// if sent message between workers, should first call this function +template +void DatasetImpl::RegisterClientToClientMsgHandler() { + auto fleet_ptr = FleetWrapper::GetInstance(); + VLOG(3) << "RegisterClientToClientMsgHandler"; + fleet_ptr->RegisterClientToClientMsgHandler( + 0, [this](int msg_type, int client_id, const std::string& msg) -> int { + return this->ReceiveFromClient(msg_type, client_id, msg); + }); + VLOG(3) << "RegisterClientToClientMsgHandler done"; +} + +// load data into memory, Dataset hold this memory, +// which will later be fed into readers' channel +template +void DatasetImpl::LoadIntoMemory() { + VLOG(3) << "DatasetImpl::LoadIntoMemory() begin"; + platform::Timer timeline; + timeline.Start(); + if (readers_.size() == 0) { + CreateReaders(); + } + std::vector load_threads; + for (int64_t i = 0; i < thread_num_; ++i) { + load_threads.push_back(std::thread( + &paddle::framework::DataFeed::LoadIntoMemory, readers_[i].get())); + } + for (std::thread& t : load_threads) { + t.join(); + } + 
timeline.Pause(); + VLOG(3) << "DatasetImpl::LoadIntoMemory() end" + << ", memory data size=" << memory_data_.size() + << ", cost time=" << timeline.ElapsedSec() << " seconds"; +} + +// release memory data +template +void DatasetImpl::ReleaseMemory() { + VLOG(3) << "DatasetImpl::ReleaseMemory() begin"; + std::vector().swap(memory_data_); + VLOG(3) << "DatasetImpl::ReleaseMemory() end"; +} + +// do local shuffle +template +void DatasetImpl::LocalShuffle() { + VLOG(3) << "DatasetImpl::LocalShuffle() begin"; + platform::Timer timeline; + timeline.Start(); + if (readers_.size() == 0) { + CreateReaders(); + } + // if it is not InMemory, memory_data_ is empty + std::random_shuffle(memory_data_.begin(), memory_data_.end()); + + std::vector local_shuffle_threads; + for (int64_t i = 0; i < thread_num_; ++i) { + local_shuffle_threads.push_back(std::thread( + &paddle::framework::DataFeed::LocalShuffle, readers_[i].get())); + } + for (std::thread& t : local_shuffle_threads) { + t.join(); + } + std::vector().swap(memory_data_); + timeline.Pause(); + VLOG(3) << "DatasetImpl::LocalShuffle() end, cost time=" + << timeline.ElapsedSec() << " seconds"; +} + +template +void DatasetImpl::GlobalShuffle() { + VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + platform::Timer timeline; + timeline.Start(); + if (readers_.size() == 0) { + CreateReaders(); + } + // if it is not InMemory, memory_data_ is empty + std::random_shuffle(memory_data_.begin(), memory_data_.end()); + VLOG(3) << "start global shuffle threads"; + std::vector global_shuffle_threads; + for (int i = 0; i < thread_num_; ++i) { + global_shuffle_threads.push_back(std::thread( + &paddle::framework::DataFeed::GlobalShuffle, readers_[i].get())); + } + for (std::thread& t : global_shuffle_threads) { + t.join(); + } + std::vector().swap(memory_data_); + timeline.Pause(); + VLOG(3) << "DatasetImpl::GlobalShuffle() end, cost time=" + << timeline.ElapsedSec() << " seconds"; +} + +template +void DatasetImpl::CreateReaders() { + 
VLOG(3) << "Calling CreateReaders()"; + CHECK(thread_num_ > 0) << "thread_num should > 0"; + int file_cnt = filelist_.size(); + int memory_data_size = memory_data_.size(); + if (memory_data_size != 0 && thread_num_ > memory_data_size) { + VLOG(3) << "Dataset thread num = " << thread_num_ + << ", memory data size = " << memory_data_size + << ". Changing Dataset thread num = " << memory_data_size; + thread_num_ = memory_data_size; + } else if (file_cnt != 0 && thread_num_ > file_cnt) { + VLOG(3) << "Dataset thread num = " << thread_num_ + << ", file num = " << file_cnt + << ". Changing Dataset thread num = " << file_cnt; + thread_num_ = file_cnt; + } + VLOG(3) << "thread_num in Readers: " << thread_num_; + VLOG(3) << "readers size: " << readers_.size(); + VLOG(3) << "Filelist size in readers: " << filelist_.size(); + if (readers_.size() != 0) { + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_.back()->Init(data_feed_desc_); + readers_.back()->SetMemoryData(&memory_data_); + readers_.back()->SetMemoryDataMutex(&mutex_for_update_memory_data_); + readers_.back()->SetThreadId(i); + readers_.back()->SetThreadNum(thread_num_); + readers_.back()->SetTrainerNum(trainer_num_); + readers_.back()->SetFileListMutex(&mutex_for_pick_file_); + readers_.back()->SetFileListIndex(&file_idx_); + readers_.back()->SetFileList(filelist_); + } +} + +template +void DatasetImpl::DestroyReaders() { + VLOG(3) << "Calling DestroyReaders()"; + // clear memory_data_ before fill it + // because if LoadIntoMemory but no Shuffle, + // memory_data_ has empty data which has been std::move to channel + if (memory_data_.size() != 0) { + std::vector().swap(memory_data_); + } + std::vector fill_threads; + for (int i = 0; i < thread_num_; ++i) { + fill_threads.push_back( + std::thread(&paddle::framework::DataFeed::FillChannelToMemoryData, + 
readers_[i].get())); + } + for (std::thread& t : fill_threads) { + t.join(); + } + std::vector>().swap(readers_); + VLOG(3) << "readers size: " << readers_.size(); + // if memory_data_ is empty, which means it's not InMemory mode, + // so the next epoch should read all data again + if (memory_data_.size() == 0) { + file_idx_ = 0; + } +} + +template +int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { +#ifdef _LINUX + VLOG(3) << "ReceiveFromClient msg_type=" << msg_type + << ", client_id=" << client_id << ", msg length=" << msg.length(); + auto fleet_ptr = FleetWrapper::GetInstance(); + int64_t index = rand_r(&rand_seed) % thread_num_; + VLOG(3) << "ramdom index=" << index; + readers_[index]->PutInsToChannel(msg); +#endif + return 0; +} + +// explicit instantiation +template class DatasetImpl>; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h new file mode 100644 index 0000000000000000000000000000000000000000..6fd3fcad28fa045326032200b7f26a18862454f4 --- /dev/null +++ b/paddle/fluid/framework/data_set.h @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include +#include + +#include "paddle/fluid/framework/data_feed.h" + +namespace paddle { +namespace framework { + +// Dataset is a abstract class, which defines user interfaces +// Example Usage: +// Dataset* dataset = DatasetFactory::CreateDataset("InMemoryDataset") +// dataset->SetFileList(std::vector{"a.txt", "b.txt"}) +// dataset->SetThreadNum(1) +// dataset->CreateReaders(); +// dataset->SetDataFeedDesc(your_data_feed_desc); +// dataset->LoadIntoMemory(); +// dataset->SetTrainerNum(2); +// dataset->GlobalShuffle(); +class Dataset { + public: + Dataset() {} + virtual ~Dataset() {} + // set file list + virtual void SetFileList(const std::vector& filelist) = 0; + // set readers' num + virtual void SetThreadNum(int thread_num) = 0; + // set workers' num + virtual void SetTrainerNum(int trainer_num) = 0; + // set fs name and ugi + virtual void SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi) = 0; + // set data fedd desc, which contains: + // data feed name, batch size, slots + virtual void SetDataFeedDesc(const std::string& data_feed_desc_str) = 0; + // get file list + virtual const std::vector& GetFileList() = 0; + // get thread num + virtual int GetThreadNum() = 0; + // get worker num + virtual int GetTrainerNum() = 0; + // get hdfs config + virtual std::pair GetHdfsConfig() = 0; + // get data fedd desc + virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() = 0; + // get readers, the reader num depend both on thread num + // and filelist size + virtual std::vector>& + GetReaders() = 0; + // register message handler between workers + virtual void RegisterClientToClientMsgHandler() = 0; + // load all data into memory + virtual void LoadIntoMemory() = 0; + // release all memory data + virtual void ReleaseMemory() = 0; + // local shuffle data + virtual void LocalShuffle() = 0; + // global shuffle data + virtual void GlobalShuffle() = 0; + // 
create readers + virtual void CreateReaders() = 0; + // destroy readers + virtual void DestroyReaders() = 0; + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) = 0; +}; + +// DatasetImpl is the implementation of Dataset, +// it holds memory data if user calls load_into_memory +template +class DatasetImpl : public Dataset { + public: + DatasetImpl(); + virtual ~DatasetImpl() {} + + virtual void SetFileList(const std::vector& filelist); + virtual void SetThreadNum(int thread_num); + virtual void SetTrainerNum(int trainer_num); + virtual void SetHdfsConfig(const std::string& fs_name, + const std::string& fs_ugi); + virtual void SetDataFeedDesc(const std::string& data_feed_desc_str); + + virtual const std::vector& GetFileList() { return filelist_; } + virtual int GetThreadNum() { return thread_num_; } + virtual int GetTrainerNum() { return trainer_num_; } + virtual std::pair GetHdfsConfig() { + return std::make_pair(fs_name_, fs_ugi_); + } + virtual const paddle::framework::DataFeedDesc& GetDataFeedDesc() { + return data_feed_desc_; + } + virtual std::vector>& + GetReaders(); + + virtual void RegisterClientToClientMsgHandler(); + virtual void LoadIntoMemory(); + virtual void ReleaseMemory(); + virtual void LocalShuffle(); + virtual void GlobalShuffle(); + virtual void CreateReaders(); + virtual void DestroyReaders(); + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); + std::vector> readers_; + std::vector memory_data_; + std::mutex mutex_for_update_memory_data_; + int thread_num_; + paddle::framework::DataFeedDesc data_feed_desc_; + int trainer_num_; + std::vector filelist_; + size_t file_idx_; + std::mutex mutex_for_pick_file_; + std::string fs_name_; + std::string fs_ugi_; + unsigned int rand_seed; +}; + +// use std::vector as data type +class MultiSlotDataset : public DatasetImpl> { + public: + MultiSlotDataset() {} + virtual ~MultiSlotDataset() {} +}; + +} // end 
namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..60be4cf9a43c01666c94018b7339da5f3ba797e5 --- /dev/null +++ b/paddle/fluid/framework/dataset_factory.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/dataset_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/data_set.h" + +namespace paddle { +namespace framework { +typedef std::shared_ptr (*CreateDatasetFunction)(); +typedef std::unordered_map datasetMap; +datasetMap g_dataset_map; + +#define REGISTER_DATASET_CLASS(dataset_class) \ + namespace { \ + std::shared_ptr Creator_##dataset_class() { \ + return std::shared_ptr(new dataset_class); \ + } \ + class __Registerer_##dataset_class { \ + public: \ + __Registerer_##dataset_class() { \ + g_dataset_map[#dataset_class] = &Creator_##dataset_class; \ + } \ + }; \ + __Registerer_##dataset_class g_registerer_##dataset_class; \ + } // namespace + +std::string DatasetFactory::DatasetTypeList() { + std::string dataset_types; + for (auto iter = g_dataset_map.begin(); iter != g_dataset_map.end(); ++iter) { + if (iter != g_dataset_map.begin()) { + dataset_types += ", "; + } + dataset_types += iter->first; + } + return dataset_types; +} + +std::shared_ptr DatasetFactory::CreateDataset( + 
std::string dataset_class) { + if (g_dataset_map.count(dataset_class) < 1) { + LOG(WARNING) << "Your Dataset " << dataset_class + << "is not supported currently"; + LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); + exit(-1); + } + return g_dataset_map[dataset_class](); +} + +REGISTER_DATASET_CLASS(MultiSlotDataset); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dataset_factory.h b/paddle/fluid/framework/dataset_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..2894b69f8faca4b261347ed3b55e965ff8ee53fa --- /dev/null +++ b/paddle/fluid/framework/dataset_factory.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/data_set.h" + +namespace paddle { +namespace framework { +class DatasetFactory { + public: + static std::string DatasetTypeList(); + static std::shared_ptr CreateDataset(std::string dataset_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f1ce744a93b73aa5f00554f93796663c8a698e80..2c1f3ae638cf95c3ab49219909fe3b1f22137099 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -96,6 +96,12 @@ cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) +set(ASYNC_SSA_GRAPH_EXECUTOR_DEPS threaded_ssa_graph_executor) +if(WITH_DISTRIBUTE) + list(APPEND ASYNC_SSA_GRAPH_EXECUTOR_DEPS communicator) +endif() +cc_library(async_ssa_graph_executor SRCS async_ssa_graph_executor.cc DEPS ${ASYNC_SSA_GRAPH_EXECUTOR_DEPS}) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..e9aad5d264d1745662848d1ba313b573d0974cb7 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -0,0 +1,203 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" + +#include "paddle/fluid/framework/variable_helper.h" + +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/communicator.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +inline void NewTempScopeAndInitVars(const std::vector &var_infos, + Scope *scope) { + VLOG(3) << "NewTempScopeAndInitVars"; + Scope &local_scope = scope->NewScope(); + *scope->Var(details::kLocalExecScopeName)->GetMutable() = + &local_scope; + + for (auto &info : var_infos) { + if (scope->FindVar(info.name_) != nullptr) { + continue; + } + + if (info.persistable_) { // Persistable + InitializeVariable(scope->Var(info.name_), info.type_); + } else { + InitializeVariable(local_scope.Var(info.name_), info.type_); + } + } +} + +// get RpcContext and remote send and recv op +void ProcessGraph(std::vector graphs, Scope *scope) { +#ifdef PADDLE_WITH_DISTRIBUTE + using RpcCtxMap = operators::distributed::RpcCtxMap; + VLOG(3) << "ProcessGraph"; + RpcCtxMap send_varname_to_ctx; + RpcCtxMap recv_varname_to_ctx; + for (auto i = 0; i < graphs.size(); ++i) { + std::vector nodes_to_delete; + for (auto &node : graphs[i]->Nodes()) { + VLOG(3) << "node name " << node->Name(); + if (node && node->IsOp()) { + if (node->Name() == "send") { + auto send_var_name = node->Op()->Input("X")[0]; + auto send_varnames = boost::get>( + node->Op()->GetNullableAttr("send_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + auto height_section = boost::get>( 
+ node->Op()->GetNullableAttr("sections")); + send_varname_to_ctx[send_var_name] = + operators::distributed::RpcContext(send_var_name, send_varnames, + epmap, height_section); + VLOG(3) << "find and init an send op: " + << send_varname_to_ctx[send_var_name]; + } else if (node->Name() == "recv") { + auto recv_var_name = node->Op()->Output("Out")[0]; + auto recv_varnames = boost::get>( + node->Op()->GetNullableAttr("recv_varnames")); + auto epmap = boost::get>( + node->Op()->GetNullableAttr("epmap")); + recv_varname_to_ctx[recv_var_name] = + operators::distributed::RpcContext(recv_var_name, recv_varnames, + epmap, {}); + nodes_to_delete.push_back(node); + VLOG(3) << "find and remove an recv op: " + << recv_varname_to_ctx[recv_var_name]; + } + } + } + } + // init communicator here + if (send_varname_to_ctx.size() > 0) { + VLOG(3) << "this is distribute mode, will use communicator"; + operators::distributed::Communicator::Init(send_varname_to_ctx, + recv_varname_to_ctx, scope); + operators::distributed::Communicator::GetInstance()->Start(); + } +#endif +} + +AsyncSSAGraphExecutor::AsyncSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, std::vector graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + VLOG(3) << "build AsyncSSAGraphExecutor"; + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 
1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, graphs_[i])); + } + + for (auto &node : graphs_[0]->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos_.emplace_back(); + var_infos_.back().name_ = node->Var()->Name(); + var_infos_.back().type_ = node->Var()->GetType(); + var_infos_.back().persistable_ = node->Var()->Persistable(); + } + } + for (auto *scope : local_scopes_) { + NewTempScopeAndInitVars(var_infos_, scope); + } + ProcessGraph(graphs_, local_scopes_[0]); +} + +void AsyncSSAGraphExecutor::StartOffPythonTrainLoop() { + VLOG(3) << "StartOffPythonTrainLoop size = " << places_.size(); + for (size_t i = 1; i < places_.size(); ++i) { + auto call = [this, i]() -> void { + VLOG(3) << "start off python thread " << i; + try { + while (true) { + executors_[i]->Run({}); + } + } catch (...) 
{ + exception_holder_.Catch(std::current_exception()); + VLOG(3) << "get exception type = " << exception_holder_.Type(); + } + VLOG(3) << "thread " << i << " exited!"; + }; + run_futures_.emplace_back(pool_->enqueue(std::move(call))); + } +} + +void AsyncSSAGraphExecutor::HandleException() { + if (exception_holder_.IsCaught()) { + for (auto &f : run_futures_) { + VLOG(3) << "wait future"; + f.wait(); + } + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; + run_futures_.clear(); + exception_holder_.ReThrow(); + } +} + +FeedFetchList AsyncSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + // init once + if (run_futures_.size() == 0 && places_.size() > 1) { + exception_holder_.Clear(); + StartOffPythonTrainLoop(); + } + + if (places_.size() == 1) { + exception_holder_.Clear(); + } else { + HandleException(); + } + + FeedFetchList fetch_data; + fetch_data.reserve(fetch_tensors.size()); + + try { + fetch_data = executors_[0]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + + HandleException(); + + FeedFetchList ret; + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.push_back(&fetch_data.at(fetch_idx)); + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.h b/paddle/fluid/framework/details/async_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..6aaf8f9a165f2eae3a64874e60084e4d9bdbc182 --- /dev/null +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.h @@ -0,0 +1,65 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +struct VarInfo { + std::string name_; + proto::VarType::Type type_; + bool persistable_; +}; + +class AsyncSSAGraphExecutor : public SSAGraphExecutor { + public: + AsyncSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector graphs); + ~AsyncSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + void StartOffPythonTrainLoop(); + void HandleException(); + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; + std::vector> run_futures_; + std::vector var_infos_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index df69b11ec6ae3bb08ba03b749c69eb718525de4d..027de4cda410178fbae11f1db9a580c2b7ad22a3 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ 
b/paddle/fluid/framework/details/build_strategy.cc @@ -184,8 +184,12 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass = nullptr; - if (strategy.is_distribution_) { - VLOG(10) << "Add dist_multi_devices_pass"; + + if (strategy_.async_mode_) { + multi_devices_pass = AppendPass("async_multi_devices_pass").get(); + } else if (strategy_.is_distribution_) { + VLOG(10) + << "Add dist_multi_devices_pass, multi device parameter server mode"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { @@ -234,10 +238,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, #else const bool use_cuda) const { #endif + VLOG(3) << "apply all passes"; // Create a default one if not finalized by user. CreatePassesFromStrategy(false); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + VLOG(3) << "apply " << pass->Type(); if (IsMultiDevPass(pass->Type())) { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -293,6 +299,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, graph = pass->Apply(graph); VLOG(3) << "Finish Apply Pass " << pass->Type(); } + VLOG(3) << "All Passes Applied"; return graph; } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 85f328b7c40568cc9246fd4ecab34e8e6778439b..cc48c51e924039d93b2e1e18bea752611e7bef92 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -91,12 +91,17 @@ struct BuildStrategy { bool enable_sequential_execution_{false}; - bool fuse_broadcast_op_{false}; + // NOTE(zcd): In reduce mode, fusing broadcast ops may make the program + // faster. 
Because fusing broadcast OP equals delaying the execution of all + // broadcast Ops, in this case, all nccl streams are used only for reduce + // operations for a period of time. + bool fuse_broadcast_ops_{false}; // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. bool is_distribution_{false}; + bool async_mode_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h index 1b1afce04ebbf803f543f839eadc26c522cc89ef..f8fd395bd9cc1e569bf7789e6a3adc63b00716ac 100644 --- a/paddle/fluid/framework/details/exception_holder.h +++ b/paddle/fluid/framework/details/exception_holder.h @@ -14,6 +14,9 @@ #pragma once +#include +#include + #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -64,6 +67,21 @@ class ExceptionHolder { ClearImpl(); } + std::string Type() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + return "None"; + case kEnforceNotMet: { + return "EnforceNotMet"; + } + case kEOF: { + return "EOF"; + } + } + return "unknown"; + } + private: void ClearImpl() { exception_.reset(); diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index 318694a1d4b0599655f05bf01c907fb6c07a4193..6a8d99f900cf29d5e579a3c9dd5739d2122b7deb 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -31,6 +31,8 @@ struct ExecutionStrategy { size_t num_iteration_per_drop_scope_{1}; ExecutorType type_{kDefault}; bool dry_run_{false}; + size_t num_iteration_per_run_{1}; // only use with async_ssa_graph_executor + // and pyreader with data queue }; } // namespace details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc 
b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0..3e805bd5b480241954960f92a72514723c3a8bb7 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -56,6 +56,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( fetches.resize(fetch_tensors.size()); std::unordered_map> fetched_vars; std::vector fetch_ops; + std::vector ready_fetch_ops; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get(details::kGraphVars)) { @@ -70,8 +71,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( auto &var_name = fetch_tensors[i]; auto fetched_var_it = fetched_vars.find(var_name); PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), - "Cannot find fetched variable.(Perhaps the main_program " - "is not set to ParallelExecutor)"); + "Cannot find fetched variable(%s).(Perhaps the main_program " + "is not set to ParallelExecutor)", + var_name); auto &vars = fetched_var_it->second; @@ -88,7 +90,11 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( op->AddInput(var); } - (*op_deps)[op] = static_cast(op->NotReadyInputSize()); + int dep = static_cast(op->NotReadyInputSize()); + (*op_deps)[op] = dep; + if (dep == 0) { + ready_fetch_ops.emplace_back(op); + } } size_t num_complete = 0; @@ -97,7 +103,9 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( for (auto op : bootstrap_ops_) { RunOpAsync(op_deps.get(), op, complete_q); } - + for (auto op : ready_fetch_ops) { + RunOpAsync(op_deps.get(), op, complete_q); + } while (num_complete != op_deps->size()) { size_t num_comp = complete_q->Pop(); if (num_comp == -1UL) { diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 232d82a5da596a78d2999c4a4c4f7dda0c7cad7e..6c8b8937ebe646042f71cb58cfbc2d32426a4e3c 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ 
b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -13,9 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/fetch_op_handle.h" - #include #include +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { @@ -44,6 +44,7 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const { } void FetchOpHandle::RunImpl() { + platform::RecordEvent record_event(Name()); WaitInputVarGenerated(platform::CPUPlace()); tensors_.resize(inputs_.size()); @@ -62,7 +63,8 @@ void FetchOpHandle::RunImpl() { auto &t = var->Get(); if (platform::is_gpu_place(t.place())) { #ifdef PADDLE_WITH_CUDA - TensorCopySync(t, cpu, &tensors_[i]); + TensorCopy(t, cpu, *dev_ctxes_.at(t.place()), &tensors_[i]); + dev_ctxes_.at(t.place())->Wait(); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/memory_optimize_helper.cc b/paddle/fluid/framework/details/memory_optimize_helper.cc index 894d7dad2e623649fe96b00bb515c9605c89a404..1af57dc4087d2fd734c43e9549a4bd4526af4d35 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper.cc @@ -131,16 +131,7 @@ size_t NodeSize(const VarDesc& node) { return type_size * std::abs(size); } -size_t NodeSize(ir::Node* n) { - VarDesc* desc = nullptr; - // some op do not have block pointer - if (n->inputs[0]->Op() != nullptr) { - desc = FindVarDescInBlock(n); - } else { - desc = n->Var(); - } - return NodeSize(*desc); -} +size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); } std::string DebugStringImpl(VarDesc* var) { std::stringstream ss; @@ -163,24 +154,22 @@ std::string DebugStringImpl(VarDesc* var) { } std::string DebugString(ir::Node* var) { - return DebugStringImpl(FindVarDescInBlock(var)); + return DebugStringImpl(GetVarDesc(var)); } // NOTE(dzh): based ir node, if a large node has been reused // by a small size node, then next time it appear in pool, it will // have the small size. 
Find the original node shap from blockdesc. -VarDesc* FindVarDescInBlock(ir::Node* n) { +VarDesc* GetVarDesc(ir::Node* n) { PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); - BlockDesc* block = n->inputs[0]->Op()->Block(); - PADDLE_ENFORCE(block->HasVar(n->Name()), - string::Sprintf("Block do not has var %s", n->Name())); - return block->FindVar(n->Name()); + return n->Var(); } struct NodeComparator { bool operator()(ir::Node* lhs, ir::Node* rhs) const { - auto* lhs_desc = FindVarDescInBlock(lhs); - auto* rhs_desc = FindVarDescInBlock(rhs); + if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false; + auto* lhs_desc = GetVarDesc(lhs); + auto* rhs_desc = GetVarDesc(rhs); // match data type if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { return false; @@ -204,7 +193,7 @@ void OrderedSet::Insert(ir::Node* var) { return; } - auto* var_desc = FindVarDescInBlock(var); + auto* var_desc = var->Var(); auto var_shape = var_desc->GetShape(); int batch_size = static_cast(var_shape[0]); @@ -212,7 +201,7 @@ void OrderedSet::Insert(ir::Node* var) { Iter it = nodes_.begin(); while (it != nodes_.end()) { auto& prev = it->front(); - auto* cache_desc = FindVarDescInBlock(prev); + auto* cache_desc = GetVarDesc(prev); int cache_batch_size = cache_desc->GetShape()[0]; if ((cache_batch_size == -1 && batch_size == -1) || (cache_batch_size != -1 && batch_size != -1)) { @@ -336,10 +325,16 @@ int MinChunkSize() { bool NodeCanReused(const VarDesc& node) { auto type = node.GetType(); // only these types holds bulk of gpu memory - if (!(type == proto::VarType::LOD_TENSOR || - type == proto::VarType::LOD_TENSOR_ARRAY)) { - return false; - } + // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and + // LOD_TENSOR_ARRAY re-use logic, + // disable them in version 1.4 + // if (!(type == proto::VarType::LOD_TENSOR || + // type == proto::VarType::SELECTED_ROWS || + // type == proto::VarType::LOD_TENSOR_ARRAY)) { + // return false; + // } + if 
(type != proto::VarType::LOD_TENSOR) return false; + // persistable variable is parameter if (node.Persistable()) { return false; diff --git a/paddle/fluid/framework/details/memory_optimize_helper.h b/paddle/fluid/framework/details/memory_optimize_helper.h index b5348cc66eaa446719b299b63caa340eab3e2ab9..65c7017d2d462976cf8cd4d7b5f660e279e12b6a 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper.h +++ b/paddle/fluid/framework/details/memory_optimize_helper.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/data_type.h" @@ -140,11 +141,7 @@ size_t NodeSize(const VarDesc&); std::string DebugString(ir::Node* var); -// NOTE(dzhwinter) -// after node reuse, the replaced node shape is -// different with its VarDesc. So need to find the -// correct VarDesc in Block. -VarDesc* FindVarDescInBlock(ir::Node* n); +VarDesc* GetVarDesc(ir::Node* n); static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index f80a098bfa26f160d6008cdefbad1803a85f9161..f213e07b555ca9fc4b73a2f91412063f4e7f47d4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -198,8 +198,22 @@ void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const { static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); + // optimize op is already processed in DealWithSpecialOp, + // here we only consider backward op if (!is_bk_op) continue; + /* + * the op that will generate the gradient of on parameter will have + one attr op_role_var + * to record the parameter and gradient, like: + attrs { + name: "op_role_var" + type: STRINGS + strings: "fc_1.b_0" + strings: "fc_1.b_0@GRAD" + } + */ + // 
Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. auto backward_vars = @@ -256,6 +270,8 @@ void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( break; } + VLOG(3) << "loss_scale: " << loss_scale; + if (loss_scale) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; @@ -407,7 +423,7 @@ void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const { + size_t dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); @@ -494,9 +510,8 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOps( } } -VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( + ir::Graph *result, const std::string &og, size_t dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -643,7 +658,7 @@ bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { if (UseGPU()) { - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { @@ -774,6 +789,8 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else if (OpHaveRole(*node, OpRole::kDist)) { int op_dev_id = CreateDistTrainOp(result, node); if (node->Op()->Type() == "concat") { + // the input(block of parameter) of concat is on different device, + // the output(parameter) will on one 
device. auto origin_param_name = node->Op()->OutputArgumentNames()[0]; bcast_var_name_set_[op_dev_id].emplace(origin_param_name); } @@ -781,6 +798,7 @@ bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, } else { int op_dev_id = GetOpDeviceID(node); if (op_dev_id != -1) { // This op only runs on one specific device. + // optimize op will be processed here. CreateComputationalOp(result, node, op_dev_id); for (ir::Node *n : node->outputs) { sharded_var_device_.emplace(n->Name(), op_dev_id); @@ -961,6 +979,7 @@ bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const { void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, const std::string &p_name, const std::string &g_name) const { + // collective gradient to each device size_t cur_device_id = 0; switch (strategy_.reduce_) { case BuildStrategy::ReduceStrategy::kReduce: @@ -1002,7 +1021,7 @@ void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { return; } - if (strategy_.fuse_broadcast_op_) { + if (strategy_.fuse_broadcast_ops_) { CreateFusedBroadcastOp(result, bcast_var_name_set_); } else { for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { @@ -1049,3 +1068,5 @@ REGISTER_MULTI_DEVICES_PASS( paddle::framework::details::AllReduceSSAGraphBuilder); REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, paddle::framework::details::DistSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(async_multi_devices_pass, + paddle::framework::details::AsyncSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 611693fc7c241f0afed39ab86390df69b9cf4797..7cc68dd2d5a422cfa1ac3a4bfdd48545a6e5691d 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -56,8 +56,8 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { bool 
UseGPU() const; - bool NeedCollectiveForGrad(const std::string &grad_name, - std::vector ops) const; + virtual bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const; bool IsScaleLossOp(ir::Node *node) const; @@ -70,10 +70,10 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, - int dst_dev_id) const; + size_t dst_dev_id) const; void CreateComputationalOp(ir::Graph *result, ir::Node *node, - int dev_id) const; + size_t dev_id) const; bool IsSparseGradient(const std::string &og) const; @@ -115,6 +115,35 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { virtual void InsertPostprocessOps(ir::Graph *result) const {} }; +class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const override {} + + bool NeedCollectiveForGrad(const std::string &grad_name, + std::vector ops) const { + return false; + } + + bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const override { + if (node->Op()->Type() == "recv") { + VLOG(1) << "set recv op do_not_run to true"; + node->Op()->SetAttr("do_not_run", true); + node->Op()->Flush(); + } else if (node->Name() == "lookup_table" || node->Name() == "nce" || + node->Name() == "hierarchical_sigmoid") { + // in async_mode, we do not need remote prefetch, because communicator + // will do async parameter recv. 
+ VLOG(1) << "set " << node->Name() << " op remote_prefetch to false"; + node->Op()->SetAttr("remote_prefetch", false); + node->Op()->Flush(); + } + return false; + } + + void InsertPostprocessOps(ir::Graph *result) const override {} +}; + class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { protected: int GetVarDeviceID(const std::string &varname) const; diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index e5b58ec68761469a03929435d1a73bf0a2d1660e..a9a4fb08a2ca4689e8b6a6f10f83d065332ac192 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include #include @@ -183,6 +184,10 @@ struct OpInfoFiller { T maker(fwd_op, no_grad_set, grad_to_var, grad_block); return maker(); }; + + info->use_default_grad_op_desc_maker_ = + std::is_base_of, T>::value || + std::is_base_of, T>::value; } }; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c00932a7bdb170e63b5fd4d43ccb2072f1a0a9c9..67246a4dd448b0ce2f115d6438c5fdd6cc39ca6d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -31,11 +31,23 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( prepare_pool_(1), pool_(strategy.num_threads_ >= 2 ? 
new ::ThreadPool(strategy.num_threads_) : nullptr) { + if (strategy_.num_iteration_per_run_ > 1) { + int read_op_num = 0; + for (auto *node : graph_->Nodes()) { + if (node->IsOp() && node->Name() == "read") { + read_op_num++; + } + } + if (read_op_num == 0) { + LOG(WARNING) << "when num_iteration_per_run_ is larger then 1, the model " + "should use pyreader to feed data!"; + } + } PrepareOpDeps(); CopyOpDeps(); } -FeedFetchList ThreadedSSAGraphExecutor::Run( +inline FeedFetchList ThreadedSSAGraphExecutor::RunImpl( const std::vector &fetch_tensors) { std::unique_ptr event( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare")); @@ -68,7 +80,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } set.clear(); }; - auto run_all_op = [&](OpHandleBase *op) { RunOp(ready_vars, op); }; // Clean run context run_op_futures_.clear(); exception_holder_.Clear(); @@ -84,6 +95,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars->PopAll(1, &timeout); if (timeout) { if (exception_holder_.IsCaught()) { + VLOG(3) << "caught exception " << exception_holder_.Type() + << ", rethrow it"; for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } @@ -102,7 +115,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto &deps = pending_ops[op]; --deps; if (deps == 0) { - run_all_op(op); + ready_ops.insert(op); } } } @@ -114,6 +127,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( return fetch_data; } +FeedFetchList ThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + for (size_t j = 0; j < strategy_.num_iteration_per_run_ - 1; ++j) { + RunImpl({}); + } + return RunImpl(fetch_tensors); +} + void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector *fetch_ops, diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 1fa5196970512ccc4a3dee698f477711be1e7101..8c026057b480fbc40b7b8f12d8e6b8e54195a141 100644 --- 
a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -23,7 +23,9 @@ #include #include #include -#include "ThreadPool.h" // ThreadPool in thrird party + +#include // ThreadPool in thrird party + #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" @@ -59,6 +61,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() final = default; private: + inline FeedFetchList RunImpl(const std::vector &fetch_tensors); void RunOp(const std::shared_ptr> &ready_var_q, details::OpHandleBase *op); diff --git a/paddle/fluid/framework/device_worker.cc b/paddle/fluid/framework/device_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..443acf0a16303ef47d24b3013ed92929d0d7839e --- /dev/null +++ b/paddle/fluid/framework/device_worker.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +void DeviceWorker::SetRootScope(Scope* root_scope) { root_scope_ = root_scope; } + +void DeviceWorker::SetDataFeed(const std::shared_ptr& data_feed) { + device_reader_ = data_feed; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h new file mode 100644 index 0000000000000000000000000000000000000000..a7a8663ec3b1c436104f53b6db833bd26f6722f0 --- /dev/null +++ b/paddle/fluid/framework/device_worker.h @@ -0,0 +1,198 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/fleet/fleet_wrapper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/timer.h" + +namespace paddle { +namespace framework { + +class PullDenseWorker { + public: + virtual ~PullDenseWorker() {} + virtual void Initialize(const TrainerDesc& param); + int Start(); + void Stop(); + void SetRootScope(Scope* scope) { root_scope_ = scope; } + void IncreaseThreadVersion(int thread_id, uint64_t table_id); + void ResetThreadVersion(uint64_t table_id); + void Wait(std::vector<::std::future>* status_vec); + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::PullDenseWorker()); + } + return s_instance_; + } + + private: + PullDenseWorker() : root_scope_(NULL) {} + void Run(); + bool CheckUpdateParam(uint64_t table_id); + + private: + static std::shared_ptr s_instance_; + std::shared_ptr fleet_ptr_; + PullDenseWorkerParameter param_; + DownpourWorkerParameter dwp_param_; + Scope* root_scope_; + bool running_; + + static std::map last_versions_; + static std::map current_version_; + static std::mutex mutex_for_version_; + static std::map> training_versions_; + static std::map> dense_value_names_; + + std::thread t_; + int thread_num_; + int sleep_time_ms_; + int threshold_; + + std::vector<::std::future> pull_dense_status_; + uint32_t pull_dense_fail_times_ = 0; + std::vector base_norm_param_; + std::vector mean_; 
+ std::vector scale_; + float squared_sum_epsilon_ = 1e-4; + std::mutex mutex_for_mean_scale_; + float total_batch_num_ = 0; +}; + +// should incorporate different type of device +class DeviceWorker { + public: + DeviceWorker() {} + virtual ~DeviceWorker() {} + virtual void Initialize(const TrainerDesc& desc) = 0; + virtual void SetDeviceIndex(int tid) = 0; + virtual void TrainFiles() = 0; + virtual void PrintFetchVars() = 0; + virtual void TrainFilesWithProfiler() = 0; + virtual void CreateDeviceResource(const ProgramDesc& main_prog) = 0; + // will make this zero copy in the future + virtual void BindingDataFeedMemory() = 0; + virtual void SetRootScope(Scope* root_scope); + virtual void SetDataFeed(const std::shared_ptr& data_feed); + virtual void SetPlace(const paddle::platform::Place& place) { + place_ = place; + } + + protected: + Scope* root_scope_; + paddle::platform::Place place_; + std::shared_ptr device_reader_; + int64_t batch_num_; + FetchConfig fetch_config_; +}; + +class CPUWorkerBase : public DeviceWorker { + public: + CPUWorkerBase() {} + virtual ~CPUWorkerBase() {} + virtual void SetDeviceIndex(int tid) { thread_id_ = tid; } + virtual void TrainFiles() = 0; + virtual void TrainFilesWithProfiler() {} + virtual void PrintFetchVars() {} + virtual void CreateDeviceResource(const ProgramDesc& main_prog) {} + + protected: + int thread_id_; +}; + +class HogwildWorker : public CPUWorkerBase { + public: + HogwildWorker() {} + virtual ~HogwildWorker() {} + virtual void Initialize(const TrainerDesc& desc); + virtual void TrainFiles(); + virtual void TrainFilesWithProfiler(); + virtual void PrintFetchVars(); + virtual void CreateDeviceResource(const ProgramDesc& main_prog); + virtual void BindingDataFeedMemory(); + + protected: + void CreateThreadOperators(const ProgramDesc& program); + void CreateThreadScope(const ProgramDesc& program); + std::vector op_names_; + std::vector ops_; + Scope* thread_scope_; + HogwildWorkerParameter param_; + std::vector 
skip_ops_; +}; + +class DownpourWorker : public HogwildWorker { + public: + DownpourWorker() {} + virtual ~DownpourWorker() {} + virtual void Initialize(const TrainerDesc& desc); + virtual void TrainFiles(); + virtual void TrainFilesWithProfiler(); + + protected: + std::shared_ptr fleet_ptr_; + std::shared_ptr pull_dense_worker_; + void FillSparseValue(size_t table_id); + void PushGradients(); + void CollectLabelInfo(size_t table_id); + + private: + bool need_to_push_dense_; + bool need_to_push_sparse_; + DownpourWorkerParameter param_; + // just save the value in param_ for easy access + std::map label_var_name_; + std::map> sparse_key_names_; + std::map> sparse_value_names_; + std::map> sparse_grad_names_; + std::map> dense_value_names_; + std::map> dense_grad_names_; + + // feasign + std::map> features_; + // feasign stats + std::map> feature_labels_; + // feasign embedding + std::map>> feature_values_; + // feasign embedding gradient + std::map>> feature_grads_; + // skipped ops + std::vector skip_ops_; + + std::shared_ptr _pull_dense_worker; + std::vector<::std::future> push_sparse_status_; + std::vector<::std::future> push_dense_status_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a7b368145c3b16873fc90a34fe5bb439d9806dd --- /dev/null +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/device_worker_factory.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +typedef std::shared_ptr (*Createdevice_workerFunction)(); +typedef std::unordered_map + device_workerMap; +device_workerMap g_device_worker_map; +#define REGISTER_DEVICE_WORKER_CLASS(device_worker_class) \ + namespace { \ + std::shared_ptr Creator_##device_worker_class() { \ + return std::shared_ptr(new device_worker_class); \ + } \ + class __Registerer_##device_worker_class { \ + public: \ + __Registerer_##device_worker_class() { \ + g_device_worker_map[#device_worker_class] = \ + &Creator_##device_worker_class; \ + } \ + }; \ + __Registerer_##device_worker_class g_registerer_##device_worker_class; \ + } // namespace + +std::string DeviceWorkerFactory::DeviceWorkerTypeList() { + std::string device_worker_types; + for (auto iter = g_device_worker_map.begin(); + iter != g_device_worker_map.end(); ++iter) { + if (iter != g_device_worker_map.begin()) { + device_worker_types += ", "; + } + device_worker_types += iter->first; + } + return device_worker_types; +} + +std::shared_ptr DeviceWorkerFactory::CreateDeviceWorker( + std::string device_worker_class) { + if (g_device_worker_map.count(device_worker_class) < 1) { + exit(-1); + } + return g_device_worker_map[device_worker_class](); +} + +REGISTER_DEVICE_WORKER_CLASS(HogwildWorker); +REGISTER_DEVICE_WORKER_CLASS(DownpourWorker); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_factory.h b/paddle/fluid/framework/device_worker_factory.h 
new file mode 100644 index 0000000000000000000000000000000000000000..9d0613385e78c9f482840677c71f621e53ed85b5 --- /dev/null +++ b/paddle/fluid/framework/device_worker_factory.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +class DeviceWorkerFactory { + public: + static std::string DeviceWorkerTypeList(); + static std::shared_ptr CreateDeviceWorker( + std::string device_worker_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/device_worker_test.cc b/paddle/fluid/framework/device_worker_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..faa648ab35d2b4d7a553344c2261d2aa07d0829a --- /dev/null +++ b/paddle/fluid/framework/device_worker_test.cc @@ -0,0 +1,24 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { +TEST() { + // create hogwild device worker +} +} +} diff --git a/paddle/fluid/framework/dist_multi_trainer.cc b/paddle/fluid/framework/dist_multi_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..481e12fcd63e77b6d42143f93df69c0f6abe7f25 --- /dev/null +++ b/paddle/fluid/framework/dist_multi_trainer.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void DistMultiTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + thread_num_ = trainer_desc.thread_num(); + SetDataset(dataset); + + dataset->CreateReaders(); + const std::vector> readers = + dataset->GetReaders(); + + thread_num_ = readers.size(); + workers_.resize(thread_num_); + + for (int i = 0; i < thread_num_; ++i) { + workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + workers_[i]->SetDeviceIndex(i); + workers_[i]->SetDataFeed(readers[i]); + workers_[i]->Initialize(trainer_desc); + } + + VLOG(3) << "going to initialize pull dense worker"; + pull_dense_worker_ = PullDenseWorker::GetInstance(); + pull_dense_worker_->Initialize(trainer_desc); + VLOG(3) << "initialize pull dense worker"; + SetDebug(trainer_desc.debug()); +} + +void DistMultiTrainer::InitOtherEnv(const ProgramDesc& main_program) { + pull_dense_worker_->SetRootScope(root_scope_); + pull_dense_worker_->Start(); + VLOG(3) << "init other env done."; +} + +void DistMultiTrainer::Run() { + for (int thidx = 0; thidx < thread_num_; ++thidx) { + if (!debug_) { + threads_.push_back( + std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get())); + } else { + threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler, + workers_[thidx].get())); + } + } +} + +void DistMultiTrainer::Finalize() { + for (auto& th : threads_) { + th.join(); + } + pull_dense_worker_->Stop(); + dataset_ptr_->DestroyReaders(); + root_scope_->DropKids(); +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc new file mode 100644 index 
0000000000000000000000000000000000000000..4ca7842fa261a1b8178438d35ca5d626146663d4
--- /dev/null
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -0,0 +1,482 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/framework/device_worker_factory.h"
+#include "paddle/fluid/platform/cpu_helper.h"
+
+namespace paddle {
+namespace framework {
+
+// Caches the per-table variable names, the ops to skip and the push switches
+// found in desc.downpour_param(), plus the fleet handle and fetch config.
+void DownpourWorker::Initialize(const TrainerDesc& desc) {
+  param_ = desc.downpour_param();
+  for (size_t i = 0; i < param_.sparse_table_size(); ++i) {
+    uint64_t table_id =
+        static_cast<uint64_t>(param_.sparse_table(i).table_id());
+    TableParameter table = param_.sparse_table(i);
+    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
+    for (size_t j = 0; j < table.sparse_key_name_size(); ++j) {
+      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
+    }
+    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
+    for (size_t j = 0; j < table.sparse_value_name_size(); ++j) {
+      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
+    }
+    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
+    for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) {
+      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
+    }
+    label_var_name_[table_id] = table.label_var_name();
+  }
+
+  for (size_t i = 0; i < param_.dense_table_size(); ++i) {
+    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
+    auto table = param_.dense_table(i);
+    dense_value_names_[table_id].resize(table.dense_value_name_size());
+    for (size_t j = 0; j < table.dense_value_name_size(); ++j) {
+      dense_value_names_[table_id][j] = table.dense_value_name(j);
+    }
+    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
+    for (size_t j = 0; j < table.dense_grad_name_size(); ++j) {
+      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
+    }
+  }
+
+  skip_ops_.resize(param_.skip_ops_size());
+  for (size_t i = 0; i < param_.skip_ops_size(); ++i) {
+    skip_ops_[i] = param_.skip_ops(i);
+  }
+
+  need_to_push_sparse_ = param_.push_sparse();
+  need_to_push_dense_ = param_.push_dense();
+
+  fleet_ptr_ = FleetWrapper::GetInstance();
+  fetch_config_ = desc.fetch_config();
+}
+
+// Fills feature_labels_[table_id] with one label per non-zero feasign of the
+// table's key slots, using the label of the instance that owns the feasign.
+void DownpourWorker::CollectLabelInfo(size_t table_idx) {
+  uint64_t table_id = static_cast<uint64_t>(
+      param_.program_config(0).pull_sparse_table_id(table_idx));
+
+  auto& feature = features_[table_id];
+  auto& feature_label = feature_labels_[table_id];
+  feature_label.resize(feature.size());
+  Variable* var = thread_scope_->FindVar(label_var_name_[table_id]);
+  LoDTensor* tensor = var->GetMutable<LoDTensor>();
+  int64_t* label_ptr = tensor->data<int64_t>();
+
+  int global_index = 0;
+  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
+    VLOG(3) << "sparse_key_names_[" << i
+            << "]: " << sparse_key_names_[table_id][i];
+    Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
+    LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int fea_idx = 0;
+    // tensor->lod()[0].size() == batch_size + 1
+    for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
+      for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
+        // should be skipped feasign defined in protobuf
+        if (ids[fea_idx] == 0u) {
+          continue;
+        }
+        feature_label[global_index++] =
+            static_cast<float>(label_ptr[lod_idx - 1]);
+      }
+    }
+  }
+  CHECK(global_index == feature.size())
+      << "expect fea info size:" << feature.size() << " real:" << global_index;
+}
+
+// Copies the pulled values, skipping the first two floats of each, into the
+// table's embedding tensors; ids equal to 0 receive zeros instead.
+void DownpourWorker::FillSparseValue(size_t table_idx) {
+  uint64_t table_id = static_cast<uint64_t>(
+      param_.program_config(0).pull_sparse_table_id(table_idx));
+
+  TableParameter table;
+  for (const auto& tbl : param_.sparse_table()) {
+    if (tbl.table_id() == table_id) {
+      table = tbl;
+      break;
+    }
+  }
+
+  auto& fea_value = feature_values_[table_id];
+  auto fea_idx = 0u;
+
+  std::vector<float> init_value(table.fea_dim());
+  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
+    std::string slot_name = sparse_key_names_[table_id][i];
+    std::string emb_slot_name = sparse_value_names_[table_id][i];
+    Variable* var = thread_scope_->FindVar(slot_name);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    Variable* var_emb = thread_scope_->FindVar(emb_slot_name);
+    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
+    float* ptr = tensor_emb->mutable_data<float>({len, table.emb_dim()},
+                                                 platform::CPUPlace());
+    memset(ptr, 0, sizeof(float) * len * table.emb_dim());
+    auto& tensor_lod = tensor->lod()[0];
+    LoD data_lod{tensor_lod};
+    tensor_emb->set_lod(data_lod);
+    for (auto index = 0u; index < len; ++index) {
+      if (ids[index] == 0u) {
+        memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
+               sizeof(float) * table.emb_dim());
+        continue;
+      }
+      memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
+             sizeof(float) * table.emb_dim());
+      fea_idx++;
+    }
+  }
+}
+
+// Same training loop as TrainFiles(), with every stage timed; thread 0 prints
+// the accumulated statistics to stderr every 100 batches.
+void DownpourWorker::TrainFilesWithProfiler() {
+  VLOG(3) << "Begin to train files with profiler";
+  platform::SetNumThreads(1);
+  device_reader_->Start();
+  std::vector<double> op_total_time;
+  std::vector<std::string> op_name;
+  for (auto& op : ops_) {
+    bool need_skip = false;
+    for (auto t = 0u; t < skip_ops_.size(); ++t) {
+      if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+        need_skip = true;
+        break;
+      }
+    }
+    if (!need_skip) {
+      op_name.push_back(op->Type());
+    }
+  }
+
+  VLOG(3) << "op name size: " << op_name.size();
+  op_total_time.resize(op_name.size());
+  for (size_t i = 0; i < op_total_time.size(); ++i) {
+    op_total_time[i] = 0.0;
+  }
+  platform::Timer timeline;
+  double total_time = 0.0;
+  double read_time = 0.0;
+  double pull_sparse_time = 0.0;
+  double collect_label_time = 0.0;
+  double fill_sparse_time = 0.0;
+  double push_sparse_time = 0.0;
+  double push_dense_time = 0.0;
+  int cur_batch;
+  int batch_cnt = 0;
+  uint64_t total_inst = 0;
+  timeline.Start();
+  while ((cur_batch = device_reader_->Next()) > 0) {
+    timeline.Pause();
+    read_time += timeline.ElapsedSec();
+    total_time += timeline.ElapsedSec();
+    VLOG(3) << "program config size: " << param_.program_config_size();
+    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+         ++i) {
+      uint64_t tid = static_cast<uint64_t>(
+          param_.program_config(0).pull_sparse_table_id(i));
+      TableParameter table;
+      for (const auto& tbl : param_.sparse_table()) {
+        if (tbl.table_id() == tid) {
+          table = tbl;
+          break;
+        }
+      }
+      timeline.Start();
+      fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid,
+                                     sparse_key_names_[tid], &features_[tid],
+                                     &feature_values_[tid], table.fea_dim());
+      timeline.Pause();
+      pull_sparse_time += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+      timeline.Start();
+      CollectLabelInfo(i);
+      timeline.Pause();
+      collect_label_time += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+      timeline.Start();
+      FillSparseValue(i);
+      timeline.Pause();
+      fill_sparse_time += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+    }
+    VLOG(3) << "Fill sparse value for all sparse table done.";
+
+    int run_op_idx = 0;
+    for (auto& op : ops_) {
+      bool need_skip = false;
+      for (auto t = 0u; t < skip_ops_.size(); ++t) {
+        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+          need_skip = true;
+          break;
+        }
+      }
+      if (!need_skip) {
+        timeline.Start();
+        VLOG(3) << "Going to run op " << op_name[run_op_idx];
+        op->Run(*thread_scope_, place_);
+        VLOG(3) << "Op " << op_name[run_op_idx] << " Finished";
+        timeline.Pause();
+        op_total_time[run_op_idx++] += timeline.ElapsedSec();
+        total_time += timeline.ElapsedSec();
+      }
+    }
+
+    if (need_to_push_sparse_) {
+      for (size_t i = 0;
+           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_sparse_table_id(i));
+        TableParameter table;
+        for (const auto& tbl : param_.sparse_table()) {
+          if (tbl.table_id() == tid) {
+            table = tbl;
+            break;
+          }
+        }
+        timeline.Start();
+        fleet_ptr_->PushSparseVarsWithLabelAsync(
+            *thread_scope_, tid, features_[tid], feature_labels_[tid],
+            sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
+            &feature_grads_[tid], &push_sparse_status_);
+        timeline.Pause();
+        push_sparse_time += timeline.ElapsedSec();
+        total_time += timeline.ElapsedSec();
+      }
+    }
+
+    if (need_to_push_dense_) {
+      timeline.Start();
+      for (size_t i = 0;
+           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_dense_table_id(i));
+        fleet_ptr_->PushDenseVarsAsync(
+            *thread_scope_, tid, dense_grad_names_[tid], &push_dense_status_);
+      }
+      timeline.Pause();
+      push_dense_time += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+      VLOG(3) << "push sparse and dense gradient done.";
+      int32_t tmp_push_dense_wait_times = -1;
+      static uint32_t push_dense_wait_times =
+          static_cast<uint32_t>(tmp_push_dense_wait_times);
+      if (push_dense_status_.size() >= push_dense_wait_times) {
+        for (auto& t : push_dense_status_) {
+          t.wait();
+        }
+        push_dense_status_.resize(0);
+      }
+
+      if (tmp_push_dense_wait_times == -1) {
+        push_dense_status_.resize(0);
+      }
+    }
+
+    if (need_to_push_sparse_) {
+      int32_t tmp_push_sparse_wait_times = -1;
+      static uint32_t push_sparse_wait_times =
+          static_cast<uint32_t>(tmp_push_sparse_wait_times);
+      if (push_sparse_status_.size() >= push_sparse_wait_times) {
+        for (auto& t : push_sparse_status_) {
+          t.wait();
+        }
+        push_sparse_status_.resize(0);
+      }
+
+      if (tmp_push_sparse_wait_times == -1) {
+        push_sparse_status_.resize(0);
+      }
+
+      VLOG(3) << "going to increase thread version";
+      VLOG(3) << "push dense table id size: "
+              << param_.program_config(0).push_dense_table_id_size();
+    }
+
+    if (need_to_push_dense_) {
+      for (size_t i = 0;
+           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_dense_table_id(i));
+        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
+      }
+    }
+
+    PrintFetchVars();
+    thread_scope_->DropKids();
+    total_inst += cur_batch;
+    ++batch_cnt;
+
+    if (thread_id_ == 0) {
+      // should be configured here
+      if (batch_cnt > 0 && batch_cnt % 100 == 0) {
+        for (size_t i = 0; i < op_total_time.size(); ++i) {
+          fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
+                  op_name[i].c_str(), op_total_time[i] / batch_cnt);
+        }
+        fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
+        fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
+        fprintf(stderr, "pull sparse time percent: %f\n",
+                pull_sparse_time / total_time * 100);
+        fprintf(stderr, "collect label time percent: %f\n",
+                collect_label_time / total_time * 100);
+        fprintf(stderr, "fill sparse time percent: %f\n",
+                fill_sparse_time / total_time * 100);
+        fprintf(stderr, "push sparse time percent: %f\n",
+                push_sparse_time / total_time * 100);
+        fprintf(stderr, "push dense time percent: %f\n",
+                push_dense_time / total_time * 100);
+        fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
+      }
+    }
+    timeline.Start();
+  }
+}
+
+// Per batch: pull sparse values, run every op not listed in skip_ops_, then
+// push sparse/dense gradients and bump the dense table thread versions.
+void DownpourWorker::TrainFiles() {
+  VLOG(3) << "Begin to train files";
+  platform::SetNumThreads(1);
+  device_reader_->Start();
+  int batch_cnt = 0;
+  int cur_batch;
+  while ((cur_batch = device_reader_->Next()) > 0) {
+    // pull sparse here
+    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+         ++i) {
+      uint64_t tid = static_cast<uint64_t>(
+          param_.program_config(0).pull_sparse_table_id(i));
+      TableParameter table;
+      for (const auto& tbl : param_.sparse_table()) {
+        if (tbl.table_id() == tid) {
+          table = tbl;
+          break;
+        }
+      }
+      fleet_ptr_->PullSparseVarsSync(*thread_scope_, tid,
+                                     sparse_key_names_[tid], &features_[tid],
+                                     &feature_values_[tid], table.fea_dim());
+      CollectLabelInfo(i);
+      FillSparseValue(i);
+    }
+    VLOG(3) << "fill sparse value for all sparse table done.";
+
+    // do computation here
+    for (auto& op : ops_) {
+      bool need_skip = false;
+      for (auto t = 0u; t < skip_ops_.size(); ++t) {
+        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
+          need_skip = true;
+          break;
+        }
+      }
+      if (!need_skip) {
+        op->Run(*thread_scope_, place_);
+      }
+    }
+
+    if (need_to_push_sparse_) {
+      // push gradients here
+      for (size_t i = 0;
+           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_sparse_table_id(i));
+        TableParameter table;
+        for (const auto& tbl : param_.sparse_table()) {
+          if (tbl.table_id() == tid) {
+            table = tbl;
+            break;
+          }
+        }
+        fleet_ptr_->PushSparseVarsWithLabelAsync(
+            *thread_scope_, tid, features_[tid], feature_labels_[tid],
+            sparse_key_names_[tid], sparse_grad_names_[tid], table.emb_dim(),
+            &feature_grads_[tid], &push_sparse_status_);
+      }
+    }
+
+    if (need_to_push_dense_) {
+      for (size_t i = 0;
+           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_dense_table_id(i));
+        fleet_ptr_->PushDenseVarsAsync(
+            *thread_scope_, tid, dense_grad_names_[tid], &push_dense_status_);
+      }
+
+      VLOG(3) << "push dense gradient done.";
+      // the following code should be more precise and clean
+      // TODO(guru4elephant)
+      int32_t tmp_push_dense_wait_times = -1;
+      static uint32_t push_dense_wait_times =
+          static_cast<uint32_t>(tmp_push_dense_wait_times);
+
+      if (push_dense_status_.size() >= push_dense_wait_times) {
+        for (auto& t : push_dense_status_) {
+          t.wait();
+        }
+        push_dense_status_.resize(0);
+      }
+
+      if (tmp_push_dense_wait_times == -1) {
+        push_dense_status_.resize(0);
+      }
+    }
+
+    if (need_to_push_sparse_) {
+      VLOG(3) << "push sparse gradient done.";
+      int32_t tmp_push_sparse_wait_times = -1;
+      static uint32_t push_sparse_wait_times =
+          static_cast<uint32_t>(tmp_push_sparse_wait_times);
+      if (push_sparse_status_.size() >= push_sparse_wait_times) {
+        for (auto& t : push_sparse_status_) {
+          t.wait();
+        }
+        push_sparse_status_.resize(0);
+      }
+
+      if (tmp_push_sparse_wait_times == -1) {
+        push_sparse_status_.resize(0);
+      }
+    }
+
+    if (need_to_push_dense_) {
+      for (size_t i = 0;
+           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+        uint64_t tid = static_cast<uint64_t>(
+            param_.program_config(0).push_dense_table_id(i));
+        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
+      }
+    }
+
+    PrintFetchVars();
+    thread_scope_->DropKids();
+    ++batch_cnt;
+  }
+}
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 0d4334f193dcb067a49f5e67b69d21531c7048bd..239a3ce0a84e9d0f4b3395bdbbd3fdae58e8b36a 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -18,14 +18,16 @@ limitations under the License.
*/
 #include <unordered_map>
 #include <unordered_set>
 #include <utility>
-
-#include "paddle/fluid/framework/executor_gc_helper.h"
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/message.h"
+#include "google/protobuf/text_format.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
-#include "paddle/fluid/framework/threadpool.h"
+#include "paddle/fluid/framework/trainer_desc.pb.h"
+#include "paddle/fluid/framework/trainer_factory.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/controlflow/while_op_helper.h"
@@ -115,6 +117,39 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope,
   }
 }
 
+// Trains main_program on dataset with the trainer named by the text-format
+// TrainerDesc in trainer_desc_str: the trainer is created, initialized, run
+// and finalized, in that order.
+void Executor::RunFromDataset(const ProgramDesc& main_program, Scope* scope,
+                              Dataset* dataset,
+                              const std::string& trainer_desc_str) {
+  VLOG(3) << "Start to RunFromDataset in executor";
+  TrainerDesc trainer_desc;
+  // A malformed description must not silently train with default settings.
+  const bool parsed = google::protobuf::TextFormat::ParseFromString(
+      trainer_desc_str, &trainer_desc);
+  PADDLE_ENFORCE(parsed, "Failed to parse TrainerDesc from text format");
+  VLOG(3) << "Going to create trainer, trainer class is "
+          << trainer_desc.class_name();
+  std::shared_ptr<TrainerBase> trainer;
+  trainer = TrainerFactory::CreateTrainer(trainer_desc.class_name());
+  // initialize trainer
+  VLOG(3) << "Going to initialize trainer";
+  trainer->Initialize(trainer_desc, dataset);
+  VLOG(3) << "Set root scope here";
+  trainer->SetScope(scope);
+  // prepare training environment and helper environment
+  VLOG(3) << "Try to init train environment";
+  trainer->InitTrainerEnv(main_program, place_);
+  VLOG(3) << "Try to init other environment";
+  trainer->InitOtherEnv(main_program);
+  // training and finalize training
+  VLOG(3) << "Trainer starts to run";
+  trainer->Run();
+  VLOG(3) << "Trainer going to finalize";
+  trainer->Finalize();
+}
+
 void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
                    bool create_local_scope, bool create_vars,
                    const std::vector<std::string>& skip_ref_cnt_vars,
diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h
index 825224437e0cdda03c56faf1b50833abd8b8c2ab..6eeeb1efc6117f341026097359199cc26554649d 100644
--- a/paddle/fluid/framework/executor.h
+++ b/paddle/fluid/framework/executor.h
@@ -19,6 +19,8 @@ limitations under the License. */
 #include <unordered_map>
 #include <utility>
 #include <vector>
+#include "paddle/fluid/framework/data_set.h"
+#include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/garbage_collector.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -110,6 +112,11 @@ class Executor {
 
   void EnableMKLDNN(const ProgramDesc& program);
 
+  // Runs main_program over dataset with the trainer described by
+  // trainer_desc_str, a TrainerDesc in protobuf text format.
+  void RunFromDataset(const ProgramDesc& main_program, Scope* scope,
+                      Dataset* dataset, const std::string& trainer_desc_str);
+
  private:
   const platform::Place place_;
 };
diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc
index 4972bc7ec3a90f8cebea19bcaf320813f7e50e39..005d98c6e8fda92ff6c6b3412f89c75760bf0498 100644
--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -14,6 +14,7 @@ limitations under the License.
*/
 
 #include "paddle/fluid/framework/executor_thread_worker.h"
 #include <algorithm>
+#include <utility>
 #include "google/protobuf/io/zero_copy_stream_impl.h"
 #include "google/protobuf/message.h"
 #include "google/protobuf/text_format.h"
@@ -244,6 +245,7 @@ void ExecutorThreadWorker::TrainFilesWithTimer() {
   platform::SetNumThreads(1);
   SetDevice();
   thread_reader_->Start();
+
   std::vector<double> op_total_time;
   std::vector<std::string> op_name;
   for (auto& op : ops_) {
@@ -273,7 +275,7 @@ void ExecutorThreadWorker::TrainFilesWithTimer() {
     ++batch_cnt;
     thread_scope_->DropKids();
     if (thread_id_ == 0) {
-      if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
+      if (batch_cnt > 0 && batch_cnt % 100 == 0) {
         for (size_t i = 0; i < ops_.size(); ++i) {
           fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
                   op_name[i].c_str(), op_total_time[i] / batch_cnt);
@@ -283,6 +285,7 @@ void ExecutorThreadWorker::TrainFilesWithTimer() {
         for (int i = 0; i < fetch_var_num; ++i) {
           print_fetch_var(thread_scope_, fetch_var_names_[i]);
         }
+        fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
       }
     }
     timeline.Start();
@@ -293,7 +296,7 @@ void ExecutorThreadWorker::TrainFiles() {
   platform::SetNumThreads(1);
 
   // todo: configurable
-  SetDevice();
+  // SetDevice();
 
   int fetch_var_num = fetch_var_names_.size();
   fetch_values_.clear();
@@ -513,7 +516,6 @@ void AsyncExecutorThreadWorker::PullSparse(int table_id) {
   auto& push_g = _feature_push_value[table_id];
   check_pull_push_memory(features, &push_g, fea_dim);
 
-
   collect_feasign_info(table_id);
 }
 
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7d363d1afdc8ac72741e6e4fea02fb96fe9347fa
--- /dev/null
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(FLEET_WRAPPER_DEPS framework_proto variable_helper scope)
+if(WITH_PSLIB)
+  list(APPEND FLEET_WRAPPER_DEPS pslib_brpc pslib)
+endif()
+cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS ${FLEET_WRAPPER_DEPS})
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8147c7746192a91bb82c2aa754c5664def4c142f
--- /dev/null
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -0,0 +1,406 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
+#include <utility>
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+const uint32_t MAX_FEASIGN_NUM = 1024 * 100 * 100;
+std::shared_ptr<FleetWrapper> FleetWrapper::s_instance_ = NULL;
+bool FleetWrapper::is_initialized_ = false;
+
+#ifdef PADDLE_WITH_PSLIB
+template <class AR>
+paddle::ps::Archive<AR>& operator<<(paddle::ps::Archive<AR>& ar,
+                                    const MultiSlotType& ins) {
+  ar << ins.GetType();
+  ar << ins.GetOffset();
+  ar << ins.GetFloatData();
+  ar << ins.GetUint64Data();
+  return ar;
+}
+
+template <class AR>
+paddle::ps::Archive<AR>& operator>>(paddle::ps::Archive<AR>& ar,
+                                    MultiSlotType& ins) {
+  ar >> ins.MutableType();
+  ar >> ins.MutableOffset();
+  ar >> ins.MutableFloatData();
+  ar >> ins.MutableUint64Data();
+  return ar;
+}
+#endif
+
+#ifdef PADDLE_WITH_PSLIB
+std::shared_ptr<paddle::distributed::PSlib> FleetWrapper::pslib_ptr_ = NULL;
+#endif
+// Creates the PSlib instance and initializes it as a server; first call only.
+void FleetWrapper::InitServer(const std::string& dist_desc, int index) {
+#ifdef PADDLE_WITH_PSLIB
+  if (!is_initialized_) {
+    VLOG(3) << "Going to init server";
+    pslib_ptr_ = std::shared_ptr<paddle::distributed::PSlib>(
+        new paddle::distributed::PSlib());
+    pslib_ptr_->init_server(dist_desc, index);
+    is_initialized_ = true;
+  } else {
+    VLOG(3) << "Server can be initialized only once";
+  }
+#endif
+}
+// Creates the PSlib instance and initializes it as a worker; first call only.
+void FleetWrapper::InitWorker(const std::string& dist_desc,
+                              const std::vector<uint64_t>& host_sign_list,
+                              int node_num, int index) {
+#ifdef PADDLE_WITH_PSLIB
+  if (!is_initialized_) {
+    VLOG(3) << "Going to init worker";
+    pslib_ptr_ = std::shared_ptr<paddle::distributed::PSlib>(
+        new paddle::distributed::PSlib());
+    pslib_ptr_->init_worker(dist_desc,
+                            const_cast<uint64_t*>(host_sign_list.data()),
+                            node_num, index);
+    is_initialized_ = true;
+  } else {
+    VLOG(3) << "Worker can be initialized only once";
+  }
+#endif
+}
+
+void FleetWrapper::StopServer() {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to stop server";
+  pslib_ptr_->stop_server();
+#endif
+}
+
+uint64_t FleetWrapper::RunServer() {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to run server";
+  return pslib_ptr_->run_server();
+#else
+  return 0;
+#endif
+}
+
+void FleetWrapper::GatherServers(const std::vector<uint64_t>& host_sign_list,
+                                 int node_num) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to gather server ips";
+  pslib_ptr_->gather_servers(const_cast<uint64_t*>(host_sign_list.data()),
+                             node_num);
+#endif
+}
+
+void FleetWrapper::GatherClients(const std::vector<uint64_t>& host_sign_list) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to gather client ips";
+  size_t len = host_sign_list.size();
+  pslib_ptr_->gather_clients(const_cast<uint64_t*>(host_sign_list.data()), len);
+#endif
+}
+
+std::vector<uint64_t> FleetWrapper::GetClientsInfo() {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to get client info";
+  return pslib_ptr_->get_client_info();
+#endif
+  return std::vector<uint64_t>();
+}
+
+void FleetWrapper::CreateClient2ClientConnection() {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "Going to create client2client connection";
+  pslib_ptr_->create_client2client_connection();
+#endif
+}
+// Collects the non-zero ids of var_names into fea_keys and pulls their values.
+void FleetWrapper::PullSparseVarsSync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names, std::vector<uint64_t>* fea_keys,
+    std::vector<std::vector<float>>* fea_values, int fea_value_dim) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<::std::future<int32_t>> pull_sparse_status;
+  pull_sparse_status.resize(0);
+  fea_keys->clear();
+  fea_keys->resize(0);
+  fea_keys->reserve(MAX_FEASIGN_NUM);
+  for (const auto& name : var_names) {
+    Variable* var = scope.FindVar(name);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int64_t* ids = tensor->data<int64_t>();
+    int len = tensor->numel();
+    for (auto i = 0u; i < len; ++i) {
+      if (ids[i] == 0u) {
+        continue;
+      }
+      fea_keys->push_back(static_cast<uint64_t>(ids[i]));
+    }
+  }
+  fea_values->resize(fea_keys->size() + 1);
+  for (auto& t : *fea_values) {
+    t.resize(fea_value_dim);
+  }
+  std::vector<float*> pull_result_ptr;
+  for (auto& t : *fea_values) {
+    pull_result_ptr.push_back(t.data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->pull_sparse(
+      pull_result_ptr.data(), table_id, fea_keys->data(), fea_keys->size());
+  pull_sparse_status.push_back(std::move(status));
+  for (auto& t : pull_sparse_status) {
+    t.wait();
+    auto status = t.get();
+    if (status != 0) {
+      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
+      exit(-1);
+    }
+  }
+#endif
+}
+// Starts a dense pull into var_names; the future goes to pull_dense_status.
+void FleetWrapper::PullDenseVarsAsync(
+    const Scope& scope, const uint64_t tid,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* pull_dense_status) {
+#ifdef PADDLE_WITH_PSLIB
+  auto& regions = _regions[tid];
+  regions.clear();
+  regions.resize(var_names.size());
+  for (auto i = 0u; i < var_names.size(); ++i) {
+    Variable* var = scope.FindVar(var_names[i]);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    float* w = tensor->data<float>();
+    paddle::ps::Region reg(w, tensor->numel());
+    regions[i] = std::move(reg);
+  }
+  auto status =
+      pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid);
+  pull_dense_status->push_back(std::move(status));
+#endif
+}
+// Pulls the dense table tid into var_names and waits for completion.
+void FleetWrapper::PullDenseVarsSync(
+    const Scope& scope, const uint64_t tid,
+    const std::vector<std::string>& var_names) {
+#ifdef PADDLE_WITH_PSLIB
+  auto& regions = _regions[tid];
+  regions.clear();
+  regions.reserve(var_names.size());
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    float* w = tensor->data<float>();
+    paddle::ps::Region reg(w, tensor->numel());
+    regions.emplace_back(std::move(reg));
+  }
+  auto status =
+      pslib_ptr_->_worker_ptr->pull_dense(regions.data(), regions.size(), tid);
+  status.wait();
+#endif
+}
+// Pushes the values of var_names as dense parameters and checks the status.
+void FleetWrapper::PushDenseParamSync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names) {
+#ifdef PADDLE_WITH_PSLIB
+  auto place = platform::CPUPlace();
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    float* g = tensor->mutable_data<float>(place);
+    paddle::ps::Region reg(g, tensor->numel());
+    regions.emplace_back(std::move(reg));
+  }
+  auto push_status = pslib_ptr_->_worker_ptr->push_dense_param(
+      regions.data(), regions.size(), table_id);
+  push_status.wait();
+  auto status = push_status.get();
+  CHECK(status == 0) << "push dense param failed, status[" << status << "]";
+#endif
+}
+// Not implemented: the body is intentionally empty.
+void FleetWrapper::PushDenseVarsSync(
+    Scope* scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names) {}
+// Starts a dense gradient push; the future is appended to push_sparse_status.
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status) {
+#ifdef PADDLE_WITH_PSLIB
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g = tensor->data<float>();
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  push_sparse_status->push_back(std::move(status));
+#endif
+}
+// Packs {1.0, label, gradient[emb_dim]} per non-zero key and pushes them.
+void FleetWrapper::PushSparseVarsWithLabelAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<uint64_t>& fea_keys, const std::vector<float>& fea_labels,
+    const std::vector<std::string>& sparse_key_names,
+    const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+    std::vector<std::vector<float>>* push_values,
+    std::vector<::std::future<int32_t>>* push_sparse_status) {
+#ifdef PADDLE_WITH_PSLIB
+  int offset = 2;
+  uint64_t fea_idx = 0u;
+  for (size_t i = 0; i < sparse_key_names.size(); ++i) {
+    Variable* g_var = scope.FindVar(sparse_grad_names[i]);
+    CHECK(g_var != nullptr) << "var[" << sparse_grad_names[i] << "] not found";
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    if (g_tensor == NULL) {
+      LOG(ERROR) << "var[" << sparse_grad_names[i] << "] not found";
+      exit(-1);
+    }
+    float* g = g_tensor->data<float>();
+    Variable* var = scope.FindVar(sparse_key_names[i]);
+    CHECK(var != nullptr) << "var[" << sparse_key_names[i] << "] not found";
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == NULL) {
+      LOG(ERROR) << "var[" << sparse_key_names[i] << "] not found";
+      exit(-1);
+    }
+    int len = tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    push_values->resize(fea_keys.size() + 1);
+    for (auto& t : *push_values) {
+      t.resize(emb_dim + offset);
+    }
+
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += emb_dim;
+        continue;
+      }
+      CHECK(fea_idx < (*push_values).size());
+      CHECK(fea_idx < fea_labels.size());
+      memcpy((*push_values)[fea_idx].data() + offset, g,
+             sizeof(float) * emb_dim);
+      (*push_values)[fea_idx][0] = 1.0f;
+      (*push_values)[fea_idx][1] = static_cast<float>(fea_labels[fea_idx]);
+      g += emb_dim;
+      fea_idx++;
+    }
+  }
+  CHECK(fea_idx == fea_keys.size()) << "fea_idx: " << fea_idx
+                                    << "features size: " << fea_keys.size();
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < fea_keys.size(); ++i) {
+    push_g_vec.push_back((*push_values)[i].data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_sparse(
+      table_id, fea_keys.data(), (const float**)push_g_vec.data(),
+      fea_keys.size());
+  push_sparse_status->push_back(std::move(status));
+
+#endif
+}
+
+int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type,
+                                                   MsgHandlerFunc handler) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "calling FleetWrapper::RegisterClientToClientMsgHandler";
+  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
+  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
+  return pslib_ptr_->_worker_ptr->registe_client2client_msg_handler(msg_type,
+                                                                    handler);
+#else
+  VLOG(0) << "FleetWrapper::RegisterClientToClientMsgHandler"
+          << " does nothing when no pslib";
+#endif
+  return 0;
+}
+
+std::future<int32_t> FleetWrapper::SendClientToClientMsg(
+    int msg_type, int to_client_id, const std::string& msg) {
+#ifdef PADDLE_WITH_PSLIB
+  return pslib_ptr_->_worker_ptr->send_client2client_msg(msg_type, to_client_id,
+                                                         msg);
+#else
+  VLOG(0) << "FleetWrapper::SendClientToClientMsg"
+          << " does nothing when no pslib";
+#endif
+  return std::future<int32_t>();
+}
+// Writes every element of t into a binary archive and copies it into str.
+template <typename T>
+void FleetWrapper::Serialize(const std::vector<T*>& t, std::string* str) {
+#ifdef PADDLE_WITH_PSLIB
+  paddle::ps::BinaryArchive ar;
+  for (size_t i = 0; i < t.size(); ++i) {
+    ar << *(t[i]);
+  }
+  *str = std::string(ar.buffer(), ar.length());
+#else
+  VLOG(0) << "FleetWrapper::Serialize does nothing when no pslib";
+#endif
+}
+// Appends to t every element read from the binary archive held in str.
+template <typename T>
+void FleetWrapper::Deserialize(std::vector<T>* t, const std::string& str) {
+#ifdef PADDLE_WITH_PSLIB
+  if (str.length() == 0) {
+    return;
+  }
+  paddle::ps::BinaryArchive ar;
+  ar.set_read_buffer(const_cast<char*>(str.c_str()), str.length(), nullptr);
+  if (ar.cursor() == ar.finish()) {
+    return;
+  }
+  while (ar.cursor() < ar.finish()) {
+    t->push_back(ar.get<T>());
+  }
+  CHECK(ar.cursor() == ar.finish());
+  VLOG(3) << "Deserialize size " << t->size();
+#else
+  VLOG(0) << "FleetWrapper::Deserialize does nothing when no pslib";
+#endif
+}
+
+template void FleetWrapper::Serialize<std::vector<MultiSlotType>>(
+    const std::vector<std::vector<MultiSlotType>*>&, std::string*);
+template void FleetWrapper::Deserialize<std::vector<MultiSlotType>>(
+    std::vector<std::vector<MultiSlotType>>*, const std::string&);
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..386e711ff71dbf978cbcb620589490d3f06d3c53
--- /dev/null
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.h
@@ -0,0 +1,165 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include +#ifdef PADDLE_WITH_PSLIB +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace framework { + +// A wrapper class for pslib.h, this class follows Singleton pattern +// i.e. only initialized once in the current process +// Example: +// std::shared_ptr fleet_ptr = +// FleetWrapper::GetInstance(); +// string dist_desc; +// fleet_ptr->InitServer(dist_desc, 0); +// interface design principles: +// Pull +// Sync: PullSparseVarsSync +// Async: PullSparseVarsAsync(not implemented currently) +// Push +// Sync: PushSparseVarsSync +// Async: PushSparseVarsAsync(not implemented currently) +// Async: PushSparseVarsWithLabelAsync(with special usage) +// Push dense variables to server in Async mode +// Param: scope, table_id, var_names +// Param: push_sparse_status + +class FleetWrapper { + public: + virtual ~FleetWrapper() {} + FleetWrapper() {} + // Pull sparse variables from server in Sync mode + // Param: scope, table_id, var_names, fea_keys + // Param: fea_values + void PullSparseVarsSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, + int fea_dim); + + void PullDenseVarsSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names); + + void PullDenseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* pull_dense_status); + + void PushDenseParamSync(const Scope& scope, const uint64_t table_id, + const std::vector& var_names); + + // Push dense variables to server in async mode + // Param: scope, table_id, var_names, + // Param: push_sparse_status + void PushDenseVarsAsync( + const Scope& scope, 
const uint64_t table_id, + const std::vector& var_names, + std::vector<::std::future>* push_sparse_status); + + void PushDenseVarsSync(Scope* scope, const uint64_t table_id, + const std::vector& var_names); + + // Push sparse variables with labels to server in Async mode + // This is specially designed for click/show stats in server + // Param: scope, table_id, var_grad_names, + // fea_keys, fea_labels, sparse_grad_names + // Param: push_values, push_sparse_status + void PushSparseVarsWithLabelAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& fea_keys, + const std::vector& fea_labels, + const std::vector& sparse_key_names, + const std::vector& sparse_grad_names, const int emb_dim, + std::vector>* push_values, + std::vector<::std::future>* push_sparse_status); + + // Push sparse variables to server in Async mode + // Param: scope, table_id, fea_keys, sparse_grad_names + // Param: push_values, push_sparse_status + /* + void PushSparseVarsAsync( + const Scope& scope, + const uint64_t table_id, + const std::vector& fea_keys, + const std::vector& sparse_grad_names, + std::vector>* push_values, + std::vector<::std::future>* push_sparse_status); + */ + + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); + void StopServer(); + uint64_t RunServer(); + void GatherServers(const std::vector& host_sign_list, int node_num); + // gather client ip + void GatherClients(const std::vector& host_sign_list); + // get client info + std::vector GetClientsInfo(); + // create client to client connection + void CreateClient2ClientConnection(); + + // register client to client communication + typedef std::function MsgHandlerFunc; + int RegisterClientToClientMsgHandler(int msg_type, MsgHandlerFunc handler); + // send client to client message + std::future SendClientToClientMsg(int msg_type, int to_client_id, + const std::string& msg); + + template + 
void Serialize(const std::vector& t, std::string* str); + template + void Deserialize(std::vector* t, const std::string& str); + static std::shared_ptr GetInstance() { + if (NULL == s_instance_) { + s_instance_.reset(new paddle::framework::FleetWrapper()); + } + return s_instance_; + } + +#ifdef PADDLE_WITH_PSLIB + static std::shared_ptr pslib_ptr_; +#endif + + private: + static std::shared_ptr s_instance_; +#ifdef PADDLE_WITH_PSLIB + std::map> _regions; +#endif + + protected: + static bool is_initialized_; + DISABLE_COPY_AND_ASSIGN(FleetWrapper); +}; + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index f2f4c53eea2150b68f15d2a655809d94611b2034..25a64b69ae8b459d6daefb502e9fba84b5bcf3ba 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -147,7 +147,7 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const { + std::vector> operator()() const final { std::vector> retv; retv.emplace_back(this->Apply()); return retv; @@ -158,14 +158,14 @@ class SingleGradOpDescMaker : public GradOpDescMakerBase { }; template -class DefaultGradOpDescMaker : public SingleGradOpDescMaker { +class DefaultGradOpDescMaker final : public SingleGradOpDescMaker { public: using SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { + std::unique_ptr Apply() const final { auto* grad = new OpDesc(); - grad->SetType(this->GradOpType()); + grad->SetType(this->ForwardOpType() + "_grad"); for (auto& input_param : this->InputNames()) { grad->SetInput(input_param, this->Input(input_param)); @@ -182,18 +182,12 @@ class DefaultGradOpDescMaker : public SingleGradOpDescMaker { return std::unique_ptr(grad); } - - virtual std::string GradOpType() const { - return this->ForwardOpType() + "_grad"; - } }; 
-class EmptyGradOpMaker : public GradOpDescMakerBase { +class EmptyGradOpMaker final : public GradOpDescMakerBase { public: using GradOpDescMakerBase::GradOpDescMakerBase; - std::vector> operator()() const override { - return {}; - } + std::vector> operator()() const final { return {}; } }; } // namespace framework diff --git a/paddle/fluid/framework/hogwild_worker.cc b/paddle/fluid/framework/hogwild_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..75c985d10f3b24cc1a49f2e6f87a89550f170c5d --- /dev/null +++ b/paddle/fluid/framework/hogwild_worker.cc @@ -0,0 +1,177 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/lodtensor_printer.h" + +namespace paddle { +namespace framework { + +void HogwildWorker::Initialize(const TrainerDesc& desc) { + fetch_config_ = desc.fetch_config(); + param_ = desc.hogwild_param(); + skip_ops_.resize(param_.skip_ops_size()); + for (size_t i = 0; i < param_.skip_ops_size(); ++i) { + skip_ops_[i] = param_.skip_ops(i); + } +} + +void HogwildWorker::CreateThreadOperators(const ProgramDesc& program) { + auto& block = program.Block(0); + op_names_.clear(); + for (auto& op_desc : block.AllOps()) { + std::unique_ptr local_op = OpRegistry::CreateOp(*op_desc); + op_names_.push_back(op_desc->Type()); + OperatorBase* local_op_ptr = local_op.release(); + ops_.push_back(local_op_ptr); + continue; + } +} + +void HogwildWorker::CreateThreadScope(const ProgramDesc& program) { + auto& block = program.Block(0); + + PADDLE_ENFORCE_NOT_NULL( + root_scope_, "root_scope should be set before creating thread scope"); + + thread_scope_ = &root_scope_->NewScope(); + for (auto& var : block.AllVars()) { + if (var->Persistable()) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } else { + auto* ptr = thread_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + } + } +} + +void HogwildWorker::BindingDataFeedMemory() { + const std::vector& input_feed = + device_reader_->GetUseSlotAlias(); + for (auto name : input_feed) { + device_reader_->AddFeedVar(thread_scope_->Var(name), name); + } +} + +void HogwildWorker::CreateDeviceResource(const ProgramDesc& main_prog) { + CreateThreadScope(main_prog); + CreateThreadOperators(main_prog); +} + +void HogwildWorker::TrainFilesWithProfiler() { + platform::SetNumThreads(1); + device_reader_->Start(); + std::vector op_total_time; + std::vector op_name; + for (auto& op : ops_) { + 
op_name.push_back(op->Type()); + } + op_total_time.resize(ops_.size()); + for (size_t i = 0; i < op_total_time.size(); ++i) { + op_total_time[i] = 0.0; + } + platform::Timer timeline; + double total_time = 0.0; + double read_time = 0.0; + int cur_batch; + int batch_cnt = 0; + timeline.Start(); + uint64_t total_inst = 0; + while ((cur_batch = device_reader_->Next()) > 0) { + VLOG(3) << "read a batch in thread " << thread_id_; + timeline.Pause(); + read_time += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + for (size_t i = 0; i < ops_.size(); ++i) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (ops_[i]->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + break; + } + } + timeline.Start(); + VLOG(3) << "Going to run op " << op_name[i]; + if (!need_skip) { + ops_[i]->Run(*thread_scope_, place_); + } + VLOG(3) << "Op " << op_name[i] << " Finished"; + timeline.Pause(); + op_total_time[i] += timeline.ElapsedSec(); + total_time += timeline.ElapsedSec(); + } + total_inst += cur_batch; + ++batch_cnt; + PrintFetchVars(); + if (thread_id_ == 0) { + if (batch_cnt > 0 && batch_cnt % 100 == 0) { + for (size_t i = 0; i < ops_.size(); ++i) { + fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i, + op_name[i].c_str(), op_total_time[i] / batch_cnt); + } + fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt); + fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100); + fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time); + } + } + thread_scope_->DropKids(); + timeline.Start(); + } +} + +void HogwildWorker::TrainFiles() { + platform::SetNumThreads(1); + + // how to accumulate fetched values here + device_reader_->Start(); + int cur_batch; + while ((cur_batch = device_reader_->Next()) > 0) { + for (auto& op : ops_) { + bool need_skip = false; + for (auto t = 0u; t < skip_ops_.size(); ++t) { + if (op->Type().find(skip_ops_[t]) != std::string::npos) { + need_skip = true; + 
break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + + PrintFetchVars(); + thread_scope_->DropKids(); + } +} + +void HogwildWorker::PrintFetchVars() { + // call count + batch_num_++; + int batch_per_print = fetch_config_.print_period(); + if (thread_id_ == 0) { + if (batch_num_ % batch_per_print == 0) { + int fetch_var_num = fetch_config_.fetch_var_names_size(); + for (int i = 0; i < fetch_var_num; ++i) { + platform::PrintVar(thread_scope_, fetch_config_.fetch_var_names(i), + fetch_config_.fetch_var_str_format(i)); + } + } + } +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/io/CMakeLists.txt b/paddle/fluid/framework/io/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..2baef77b9ce32ce616e7781b971665d3d885066c --- /dev/null +++ b/paddle/fluid/framework/io/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(fs SRCS fs.cc DEPS string_helper glog boost) +cc_library(shell SRCS shell.cc DEPS string_helper glog) diff --git a/paddle/fluid/framework/io/fs.cc b/paddle/fluid/framework/io/fs.cc new file mode 100644 index 0000000000000000000000000000000000000000..d5bc5df2565b0f25bc29f2fce37c1bd8626a0dbc --- /dev/null +++ b/paddle/fluid/framework/io/fs.cc @@ -0,0 +1,456 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/io/fs.h" +#include + +namespace paddle { +namespace framework { + +static void fs_add_read_converter_internal(std::string& path, // NOLINT + bool& is_pipe, // NOLINT + const std::string& converter) { + if (converter == "") { + return; + } + + if (!is_pipe) { + path = string::format_string("( %s ) < \"%s\"", converter.c_str(), + path.c_str()); + is_pipe = true; + } else { + path = string::format_string("%s | %s", path.c_str(), converter.c_str()); + } +} + +static void fs_add_write_converter_internal(std::string& path, // NOLINT + bool& is_pipe, // NOLINT + const std::string& converter) { + if (converter == "") { + return; + } + + if (!is_pipe) { + path = string::format_string("( %s ) > \"%s\"", converter.c_str(), + path.c_str()); + is_pipe = true; + } else { + path = string::format_string("%s | %s", converter.c_str(), path.c_str()); + } +} + +static std::shared_ptr fs_open_internal(const std::string& path, + bool is_pipe, + const std::string& mode, + size_t buffer_size, + int* err_no = 0) { + std::shared_ptr fp = nullptr; + + if (!is_pipe) { + fp = shell_fopen(path, mode); + } else { + fp = shell_popen(path, mode, err_no); + } + + if (buffer_size > 0) { + char* buffer = new char[buffer_size]; + CHECK_EQ(0, setvbuf(&*fp, buffer, _IOFBF, buffer_size)); + fp = {&*fp, [fp, buffer](FILE*) mutable { // NOLINT + CHECK(fp.unique()); // NOLINT + fp = nullptr; + delete[] buffer; + }}; + } + + return fp; +} + +static bool fs_begin_with_internal(const std::string& path, + const std::string& str) { + return strncmp(path.c_str(), str.c_str(), str.length()) == 0; +} + +static bool fs_end_with_internal(const std::string& path, + const std::string& str) { + return path.length() >= str.length() && + strncmp(&path[path.length() - str.length()], str.c_str(), + str.length()) == 0; +} + +static size_t& localfs_buffer_size_internal() { + static size_t x = 0; + return x; +} + +size_t localfs_buffer_size() { return localfs_buffer_size_internal(); } + 
+void localfs_set_buffer_size(size_t x) { localfs_buffer_size_internal() = x; } + +std::shared_ptr localfs_open_read(std::string path, + const std::string& converter) { + bool is_pipe = false; + + if (fs_end_with_internal(path, ".gz")) { + fs_add_read_converter_internal(path, is_pipe, "zcat"); + } + + fs_add_read_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "r", localfs_buffer_size()); +} + +std::shared_ptr localfs_open_write(std::string path, + const std::string& converter) { + shell_execute( + string::format_string("mkdir -p $(dirname \"%s\")", path.c_str())); + + bool is_pipe = false; + + if (fs_end_with_internal(path, ".gz")) { + fs_add_write_converter_internal(path, is_pipe, "gzip"); + } + + fs_add_write_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "w", localfs_buffer_size()); +} + +int64_t localfs_file_size(const std::string& path) { + struct stat buf; + if (0 != stat(path.c_str(), &buf)) { + LOG(FATAL) << "file stat not zero"; + return -1; + } + return (int64_t)buf.st_size; +} + +void localfs_remove(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("rm -rf %s", path.c_str())); +} + +std::vector localfs_list(const std::string& path) { + if (path == "") { + return {}; + } + + std::shared_ptr pipe; + int err_no = 0; + pipe = shell_popen( + string::format_string("find %s -type f -maxdepth 1", path.c_str()), "r", + &err_no); + string::LineFileReader reader; + std::vector list; + + while (reader.getline(&*pipe)) { + list.push_back(reader.get()); + } + + return list; +} + +std::string localfs_tail(const std::string& path) { + if (path == "") { + return ""; + } + + return shell_get_command_output( + string::format_string("tail -1 %s ", path.c_str())); +} + +bool localfs_exists(const std::string& path) { + std::string test_f = shell_get_command_output( + string::format_string("[ -f %s ] ; echo $?", path.c_str())); + + if 
(string::trim_spaces(test_f) == "0") { + return true; + } + + std::string test_d = shell_get_command_output( + string::format_string("[ -d %s ] ; echo $?", path.c_str())); + + if (string::trim_spaces(test_d) == "0") { + return true; + } + + return false; +} + +void localfs_mkdir(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("mkdir -p %s", path.c_str())); +} + +static size_t& hdfs_buffer_size_internal() { + static size_t x = 0; + return x; +} + +size_t hdfs_buffer_size() { return hdfs_buffer_size_internal(); } + +void hdfs_set_buffer_size(size_t x) { hdfs_buffer_size_internal() = x; } + +static std::string& hdfs_command_internal() { + static std::string x = "hadoop fs"; + return x; +} + +const std::string& hdfs_command() { return hdfs_command_internal(); } + +void hdfs_set_command(const std::string& x) { hdfs_command_internal() = x; } + +std::shared_ptr hdfs_open_read(std::string path, int* err_no, + const std::string& converter) { + if (fs_end_with_internal(path, ".gz")) { + path = string::format_string("%s -text \"%s\"", hdfs_command().c_str(), + path.c_str()); + } else { + path = string::format_string("%s -cat \"%s\"", hdfs_command().c_str(), + path.c_str()); + } + + bool is_pipe = true; + fs_add_read_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "r", hdfs_buffer_size(), err_no); +} + +std::shared_ptr hdfs_open_write(std::string path, int* err_no, + const std::string& converter) { + path = string::format_string("%s -put - \"%s\"", hdfs_command().c_str(), + path.c_str()); + bool is_pipe = true; + + if (fs_end_with_internal(path, ".gz\"")) { + fs_add_write_converter_internal(path, is_pipe, "gzip"); + } + + fs_add_write_converter_internal(path, is_pipe, converter); + return fs_open_internal(path, is_pipe, "w", hdfs_buffer_size(), err_no); +} + +void hdfs_remove(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("%s -rmr %s 
&>/dev/null; true", + hdfs_command().c_str(), path.c_str())); +} + +std::vector hdfs_list(const std::string& path) { + if (path == "") { + return {}; + } + + std::string prefix = "hdfs:"; + + if (fs_begin_with_internal(path, "afs:")) { + prefix = "afs:"; + } + int err_no = 0; + std::vector list; + do { + err_no = 0; + std::shared_ptr pipe; + pipe = shell_popen( + string::format_string("%s -ls %s | ( grep ^- ; [ $? != 2 ] )", + hdfs_command().c_str(), path.c_str()), + "r", &err_no); + string::LineFileReader reader; + list.clear(); + + while (reader.getline(&*pipe)) { + std::vector line = string::split_string(reader.get()); + if (line.size() != 8) { + continue; + } + list.push_back(prefix + line[7]); + } + } while (err_no == -1); + return list; +} + +std::string hdfs_tail(const std::string& path) { + if (path == "") { + return ""; + } + + return shell_get_command_output(string::format_string( + "%s -text %s | tail -1 ", hdfs_command().c_str(), path.c_str())); +} + +bool hdfs_exists(const std::string& path) { + std::string test = shell_get_command_output(string::format_string( + "%s -test -e %s ; echo $?", hdfs_command().c_str(), path.c_str())); + + if (string::trim_spaces(test) == "0") { + return true; + } + + return false; +} + +void hdfs_mkdir(const std::string& path) { + if (path == "") { + return; + } + + shell_execute(string::format_string("%s -mkdir %s; true", + hdfs_command().c_str(), path.c_str())); +} + +int fs_select_internal(const std::string& path) { + if (fs_begin_with_internal(path, "hdfs:")) { + return 1; + } else if (fs_begin_with_internal(path, "afs:")) { + return 1; + } + + return 0; +} + +std::shared_ptr fs_open_read(const std::string& path, int* err_no, + const std::string& converter) { + switch (fs_select_internal(path)) { + case 0: + return localfs_open_read(path, converter); + + case 1: + return hdfs_open_read(path, err_no, converter); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::shared_ptr fs_open_write(const 
std::string& path, int* err_no, + const std::string& converter) { + switch (fs_select_internal(path)) { + case 0: + return localfs_open_write(path, converter); + + case 1: + return hdfs_open_write(path, err_no, converter); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::shared_ptr fs_open(const std::string& path, const std::string& mode, + int* err_no, const std::string& converter) { + if (mode == "r" || mode == "rb") { + return fs_open_read(path, err_no, converter); + } + + if (mode == "w" || mode == "wb") { + return fs_open_write(path, err_no, converter); + } + + LOG(FATAL) << "Unknown mode: " << mode; + return {}; +} + +int64_t fs_file_size(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_file_size(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return 0; +} + +void fs_remove(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_remove(path); + + case 1: + return hdfs_remove(path); + + default: + LOG(FATAL) << "Not supported"; + } +} + +std::vector fs_list(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_list(path); + + case 1: + return hdfs_list(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return {}; +} + +std::string fs_tail(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_tail(path); + + case 1: + return hdfs_tail(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return ""; +} + +bool fs_exists(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_exists(path); + + case 1: + return hdfs_exists(path); + + default: + LOG(FATAL) << "Not supported"; + } + + return false; +} + +void fs_mkdir(const std::string& path) { + switch (fs_select_internal(path)) { + case 0: + return localfs_mkdir(path); + + case 1: + return hdfs_mkdir(path); + + default: + LOG(FATAL) << "Not supported"; + } +} +} // end 
namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/io/fs.h b/paddle/fluid/framework/io/fs.h new file mode 100644 index 0000000000000000000000000000000000000000..3f0174701c24cc5a3eac38d12792650bdbd9463b --- /dev/null +++ b/paddle/fluid/framework/io/fs.h @@ -0,0 +1,101 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/io/shell.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +int fs_select_internal(const std::string& path); + +// localfs +extern size_t localfs_buffer_size(); + +extern void localfs_set_buffer_size(size_t x); + +extern std::shared_ptr localfs_open_read(std::string path, + const std::string& converter); + +extern std::shared_ptr localfs_open_write(std::string path, + const std::string& converter); + +extern int64_t localfs_file_size(const std::string& path); + +extern void localfs_remove(const std::string& path); + +extern std::vector localfs_list(const std::string& path); + +extern std::string localfs_tail(const std::string& path); + +extern bool localfs_exists(const std::string& path); + +extern void localfs_mkdir(const std::string& path); + +// hdfs +extern size_t hdfs_buffer_size(); + +extern void hdfs_set_buffer_size(size_t x); + +extern const std::string& hdfs_command(); + 
+extern void hdfs_set_command(const std::string& x); + +extern std::shared_ptr hdfs_open_read(std::string path, int* err_no, + const std::string& converter); + +extern std::shared_ptr hdfs_open_write(std::string path, int* err_no, + const std::string& converter); + +extern void hdfs_remove(const std::string& path); + +extern std::vector hdfs_list(const std::string& path); + +extern std::string hdfs_tail(const std::string& path); + +extern bool hdfs_exists(const std::string& path); + +extern void hdfs_mkdir(const std::string& path); + +// auto-detect fs +extern std::shared_ptr fs_open_read(const std::string& path, int* err_no, + const std::string& converter); + +extern std::shared_ptr fs_open_write(const std::string& path, int* err_no, + const std::string& converter); + +extern std::shared_ptr fs_open(const std::string& path, + const std::string& mode, int* err_no, + const std::string& converter = ""); + +extern int64_t fs_file_size(const std::string& path); + +extern void fs_remove(const std::string& path); + +extern std::vector fs_list(const std::string& path); + +extern std::string fs_tail(const std::string& path); + +extern bool fs_exists(const std::string& path); + +extern void fs_mkdir(const std::string& path); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc new file mode 100644 index 0000000000000000000000000000000000000000..bcfa4f44ff1c6561cbbd60b76f75de1c8461a88a --- /dev/null +++ b/paddle/fluid/framework/io/shell.cc @@ -0,0 +1,323 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/io/shell.h" + +namespace paddle { +namespace framework { + +std::shared_ptr shell_fopen(const std::string& path, + const std::string& mode) { +#if defined _WIN32 || defined __APPLE__ + return nullptr; +#else + if (shell_verbose()) { + LOG(INFO) << "Opening file[" << path << "] with mode[" << mode << "]"; + } + FILE* fp; + if (!(fp = fopen(path.c_str(), mode.c_str()))) { + LOG(FATAL) << "fopen fail, path[" << path << "], mode[" << mode << "]"; + } + return {fp, [path](FILE* fp) { + if (shell_verbose()) { + LOG(INFO) << "Closing file[" << path << "]"; + } + if (0 != fclose(fp)) { + LOG(FATAL) << "fclose fail, path[" << path << "]"; + } + }}; +#endif +} + +// Close all open file descriptors +// The implementation is async signal safe +// Mostly copy from CPython code +static int close_open_fds_internal() { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + struct linux_dirent { + long d_ino = 0; // NOLINT + off_t d_off; + unsigned short d_reclen = 0; // NOLINT + char d_name[256]; + }; + + int dir_fd = -1; + if ((dir_fd = open("/proc/self/fd", O_RDONLY)) < 0) { + LOG(FATAL) << "proc/self/fd open fail"; + return -1; + } + char buffer[sizeof(linux_dirent)]; + + for (;;) { + int bytes = 0; + if ((bytes = syscall(SYS_getdents, dir_fd, + reinterpret_cast(buffer), + sizeof(buffer))) < 0) { + LOG(FATAL) << "syscall fail"; + return -1; + } + + if (bytes == 0) { + break; + } + + linux_dirent* entry = NULL; + + for (int offset = 0; offset < bytes; offset += entry->d_reclen) { + entry = reinterpret_cast(buffer + 
offset); + int fd = 0; + const char* s = entry->d_name; + + while (*s >= '0' && *s <= '9') { + fd = fd * 10 + (*s - '0'); + s++; + } + + if (s != entry->d_name && fd != dir_fd && fd >= 3) { + close(fd); + } + } + } + + close(dir_fd); + return 0; +#endif +} + +static int shell_popen_fork_internal(const char* real_cmd, bool do_read, + int parent_end, int child_end) { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + int child_pid = -1; + // Too frequent calls to fork() makes openmpi very slow. Use vfork() instead. + // But vfork() is very dangerous. Be careful. + if ((child_pid = vfork()) < 0) { + return -1; + } + + // The following code is async signal safe (No memory allocation, no access to + // global data, etc.) + if (child_pid != 0) { + return child_pid; + } + + int child_std_end = do_read ? 1 : 0; + close(parent_end); + + if (child_end != child_std_end) { + if (dup2(child_end, child_std_end) != child_std_end) { + return -1; + } + close(child_end); + } + + close_open_fds_internal(); + if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { + return -1; + } + exit(127); +#endif +} + +std::shared_ptr shell_popen(const std::string& cmd, + const std::string& mode, int* err_no) { +#if defined _WIN32 || defined __APPLE__ + return nullptr; +#else + bool do_read = mode == "r"; + bool do_write = mode == "w"; + if (!(do_read || do_write)) { + *err_no = -1; + return NULL; + } + + if (shell_verbose()) { + LOG(INFO) << "Opening pipe[" << cmd << "] with mode[" << mode << "]"; + } + + std::string real_cmd = "set -o pipefail; " + cmd; + + int pipe_fds[2]; + if (pipe(pipe_fds) != 0) { + *err_no = -1; + return NULL; + } + int parent_end = 0; + int child_end = 0; + + if (do_read) { + parent_end = pipe_fds[0]; + child_end = pipe_fds[1]; + } else if (do_write) { + parent_end = pipe_fds[1]; + child_end = pipe_fds[0]; + } + + int child_pid = shell_popen_fork_internal(real_cmd.c_str(), do_read, + parent_end, child_end); + close(child_end); + fcntl(parent_end, F_SETFD, 
FD_CLOEXEC); + FILE* fp; + if ((fp = fdopen(parent_end, mode.c_str())) == NULL) { + *err_no = -1; + return NULL; + } + return {fp, [child_pid, cmd, err_no](FILE* fp) { + if (shell_verbose()) { + LOG(INFO) << "Closing pipe[" << cmd << "]"; + } + + if (fclose(fp) != 0) { + *err_no = -1; + } + int wstatus = -1; + waitpid(child_pid, &wstatus, 0); + if (wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || + (wstatus == -1 && errno == ECHILD)) { + } else { + *err_no = -1; + LOG(WARNING) << "status[" << wstatus << "], cmd[" << cmd << "]" + << ", err_no[" << *err_no << "]"; + } + if (wstatus == -1 && errno == ECHILD) { + LOG(WARNING) << "errno is ECHILD"; + } + }}; +#endif +} + +static int shell_p2open_fork_internal(const char* real_cmd, int pipein_fds[2], + int pipeout_fds[2]) { +#if defined _WIN32 || defined __APPLE__ + return 0; +#else + int child_pid = -1; + if ((child_pid = fork()) < 0) { + return -1; + } + + if (child_pid != 0) { + return child_pid; + } + + close(pipein_fds[0]); + close(pipeout_fds[1]); + + if (pipein_fds[1] != 1) { + if (dup2(pipein_fds[1], 1) != 1) { + return -1; + } + close(pipein_fds[1]); + } + + if (pipeout_fds[0] != 0) { + if (dup2(pipeout_fds[0], 0) != 0) { + return -1; + } + close(pipeout_fds[0]); + } + + close_open_fds_internal(); + if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) { + return -1; + } + exit(127); +#endif +} + +std::pair, std::shared_ptr> shell_p2open( + const std::string& cmd) { +#if defined _WIN32 || defined __APPLE__ + return {}; +#else + if (shell_verbose()) { + LOG(INFO) << "Opening bidirectional pipe[" << cmd << "]"; + } + + std::string real_cmd = "set -o pipefail; " + cmd; + + int pipein_fds[2]; + int pipeout_fds[2]; + if (pipe(pipein_fds) != 0) { + return {NULL, NULL}; + } + if (pipe(pipeout_fds) != 0) { + return {NULL, NULL}; + } + + int child_pid = + shell_p2open_fork_internal(real_cmd.c_str(), pipein_fds, pipeout_fds); + + close(pipein_fds[1]); + close(pipeout_fds[0]); + fcntl(pipein_fds[0], F_SETFD, 
FD_CLOEXEC); + fcntl(pipeout_fds[1], F_SETFD, FD_CLOEXEC); + + std::shared_ptr child_life = { + NULL, [child_pid, cmd](void*) { + if (shell_verbose()) { + LOG(INFO) << "Closing bidirectional pipe[" << cmd << "]"; + } + + int wstatus, ret; + + do { + PCHECK((ret = waitpid(child_pid, &wstatus, 0)) >= 0 || + (ret == -1 && errno == EINTR)); + } while (ret == -1 && errno == EINTR); + + PCHECK(wstatus == 0 || wstatus == (128 + SIGPIPE) * 256 || + (wstatus == -1 && errno == ECHILD)) + << "status[" << wstatus << "], cmd[" << cmd << "]"; + + if (wstatus == -1 && errno == ECHILD) { + LOG(WARNING) << "errno is ECHILD"; + } + }}; + + FILE* in_fp; + PCHECK((in_fp = fdopen(pipein_fds[0], "r")) != NULL); + FILE* out_fp; + PCHECK((out_fp = fdopen(pipeout_fds[1], "w")) != NULL); + return {{in_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}, + {out_fp, [child_life](FILE* fp) { PCHECK(fclose(fp) == 0); }}}; +#endif +} + +std::string shell_get_command_output(const std::string& cmd) { +#if defined _WIN32 || defined __APPLE__ + return ""; +#else + int err_no = 0; + do { + err_no = 0; + std::shared_ptr pipe = shell_popen(cmd, "r", &err_no); + string::LineFileReader reader; + + if (reader.getdelim(&*pipe, 0)) { + pipe = nullptr; + if (err_no == 0) { + return reader.get(); + } + } + } while (err_no == -1); + return ""; +#endif +} + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/io/shell.h b/paddle/fluid/framework/io/shell.h new file mode 100644 index 0000000000000000000000000000000000000000..46fcc92bafa84e4c1b89e4603fe0db364572b73e --- /dev/null +++ b/paddle/fluid/framework/io/shell.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#ifdef _WIN32 +#include +#else +#include +#endif +#include +#ifndef _WIN32 +#include +#endif +#include +#include +#include +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace framework { + +inline bool& shell_verbose_internal() { + static bool x = false; + return x; +} + +inline bool shell_verbose() { return shell_verbose_internal(); } + +inline void shell_set_verbose(bool x) { shell_verbose_internal() = x; } + +extern std::shared_ptr shell_fopen(const std::string& path, + const std::string& mode); + +extern std::shared_ptr shell_popen(const std::string& cmd, + const std::string& mode, int* err_no); + +extern std::pair, std::shared_ptr> shell_p2open( + const std::string& cmd); + +inline void shell_execute(const std::string& cmd) { + int err_no = 0; + do { + err_no = 0; + shell_popen(cmd, "w", &err_no); + } while (err_no == -1); +} + +extern std::string shell_get_command_output(const std::string& cmd); + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index dcc48fb934e7a06f2e85fa34fde335261f551415..a8720ff4bfb5c7fa7aee6d23949b030c328b90e6 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -84,7 +84,8 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { // 1. 
record op nodes of different roles for (auto node : nodes) { - if (node->IsVar()) continue; + if (!node->IsOp()) continue; + PADDLE_ENFORCE(node->Op(), "must find opdesc"); int op_role = boost::get(node->Op()->GetAttr( framework::OpProtoAndCheckerMaker::OpRoleAttrName())); if ((op_role == static_cast(framework::OpRole::kForward)) || diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index c0ed0519b1ff6aa5960c20e9af697fd1da74a8b5..4a29bde0917d3cce97d69ff3b896d09a2aae82ba 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,11 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" + +#include +#include + #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace ir { + Graph* Pass::Apply(Graph* graph) const { PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { diff --git a/paddle/fluid/framework/multi_trainer.cc b/paddle/fluid/framework/multi_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..3a266e4bda91d5962ce09b241cc5e5671d67a142 --- /dev/null +++ b/paddle/fluid/framework/multi_trainer.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/data_feed_factory.h" +#include "paddle/fluid/framework/device_worker_factory.h" +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void MultiTrainer::Initialize(const TrainerDesc& trainer_desc, + Dataset* dataset) { + thread_num_ = trainer_desc.thread_num(); + SetDataset(dataset); + // get filelist from trainer_desc here + dataset->CreateReaders(); + VLOG(3) << "readers created"; + const std::vector> readers = + dataset->GetReaders(); + VLOG(3) << "readers num: " << readers.size(); + // change thread num to readers num + thread_num_ = readers.size(); + VLOG(3) << "worker thread num: " << thread_num_; + workers_.resize(thread_num_); + for (int i = 0; i < thread_num_; ++i) { + workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + workers_[i]->Initialize(trainer_desc); + workers_[i]->SetDeviceIndex(i); + workers_[i]->SetDataFeed(readers[i]); + } + + // set debug here + SetDebug(trainer_desc.debug()); +} + +// call only after all resources are set in current trainer +void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) { + for (int i = 0; i < thread_num_; ++i) { + workers_[i]->SetPlace(place); + workers_[i]->SetRootScope(root_scope_); + workers_[i]->CreateDeviceResource(main_program); // Program + workers_[i]->BindingDataFeedMemory(); + } +} + +void MultiTrainer::Run() { + VLOG(3) << "Going to run"; + for (int thidx = 0; thidx < thread_num_; ++thidx) { + if (!debug_) { + threads_.push_back( + std::thread(&DeviceWorker::TrainFiles, workers_[thidx].get())); + } else { + threads_.push_back(std::thread(&DeviceWorker::TrainFilesWithProfiler, + workers_[thidx].get())); + } + } +} + +void MultiTrainer::Finalize() { + for (auto& th : threads_) { + th.join(); + } + dataset_ptr_->DestroyReaders(); + root_scope_->DropKids(); +} + +} // end namespace framework +} // end namespace paddle diff 
--git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index af75baa5c4b98f7d092834c05eb57e9c7e131b29..c815e194d43e149f9efe0daec820c42e87f81d0c 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_info.h" +#include +#include +#include namespace paddle { namespace framework { @@ -24,5 +27,17 @@ OpInfoMap& OpInfoMap::Instance() { static OpInfoMap g_op_info_map; return g_op_info_map; } + +std::vector OpInfoMap::GetUseDefaultGradOpDescMakerOps() const { + // Use set to sort op names + std::set result_ops; + for (auto& pair : map_) { + if (pair.second.use_default_grad_op_desc_maker_) { + result_ops.insert(pair.first); + } + } + return std::vector(result_ops.begin(), result_ops.end()); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index e200d188b3f2462657bbac086d7659b1f85e55e9..daa72769c4957ff5ad0e7b3141bbf97bd348b408 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" @@ -42,6 +43,10 @@ struct OpInfo { InferInplaceOpFN infer_inplace_; InferNoNeedBufferVarsFN infer_no_need_buffer_vars_; + // NOTE(zjl): this flag is added to check whether + // the grad maker is the default one. 
+ bool use_default_grad_op_desc_maker_{false}; + bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; } @@ -105,6 +110,8 @@ class OpInfoMap { std::unordered_map* mutable_map() { return &map_; } + std::vector GetUseDefaultGradOpDescMakerOps() const; + private: OpInfoMap() = default; std::unordered_map map_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ab0947c631fe9a409406b3b092972ae6512beae7..a2a8083da955c93175ab2f01a37737c145e6f1b8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/details/all_reduce_deps_pass.h" +#include "paddle/fluid/framework/details/async_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" @@ -218,6 +219,18 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } } + std::vector graphs; + if (build_strategy.async_mode_) { + PADDLE_ENFORCE(!member_->use_cuda_, + "gpu mode does not support async_mode_ now!"); + graphs.push_back(graph); + for (int i = 1; i < places.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. @@ -294,19 +307,46 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, if (need_broadcast()) { BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_); } + // Startup Program has been run. All local scopes has correct parameters. -// Startup Program has been run. 
All local scopes has correct parameters. - -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector async_graphs(places.size()); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - graph = build_strategy.Apply(graph, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, + if (build_strategy.async_mode_) { + VLOG(3) << "use local async mode"; + graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, + member_->use_cuda_, member_->nccl_ctxs_.get()); + for (int i = 1; i < member_->places_.size(); ++i) { + graphs[i] = + build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_, member_->nccl_ctxs_.get()); + async_graphs[i] = graphs[i]; + } + } else { + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); + } #else - graph = build_strategy.Apply(graph, member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, - member_->use_cuda_); + if (build_strategy.async_mode_) { + VLOG(3) << "use local async mode"; + graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, + member_->use_cuda_); + for (int i = 1; i < member_->places_.size(); ++i) { + graphs[i] = build_strategy.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_cuda_); + async_graphs[i] = graphs[i]; + } + } else { + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); + } #endif auto max_memory_size = GetEagerDeletionThreshold(); @@ -317,6 +357,8 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, 
static_cast(max_memory_size)); } + async_graphs[0] = graph; + // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; @@ -344,7 +386,12 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } } - if (build_strategy.enable_parallel_graph_) { + if (build_strategy.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, async_graphs)); + } else if (build_strategy.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; #ifdef PADDLE_WITH_CUDA // TODO(Yancey1989): Remove passing in the main_program when // allreduce_seq_pass doesn't need it as the attr. @@ -356,21 +403,27 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, #endif } else { if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, graph)); } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, graph)); } } - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, std::move(var_infos), - member_->places_, std::move(member_->executor_))); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!build_strategy.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, std::move(var_infos), + member_->places_, std::move(member_->executor_))); + } } void ParallelExecutor::BCastParamsToDevices( const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; // the initializing bcast, all vars would be bcast from device(0). 
for (auto &var : vars) { framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); @@ -425,14 +478,22 @@ void ParallelExecutor::BCastParamsToDevices( auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var)->GetMutable(); - // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. - if (member_->use_all_reduce_ || member_->use_cuda_ || - var == "@LR_DECAY_COUNTER@") { + auto copy_memory = [&] { t->Resize(dims); t->mutable_data(cpu, main_tensor.type()); paddle::framework::TensorCopy(main_tensor, cpu, t); + }; + + auto share_memory = [&] { t->ShareDataWith(main_tensor); }; + + // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. + if (member_->build_strategy_.async_mode_) { + share_memory(); + } else if (member_->use_all_reduce_ || member_->use_cuda_ || + var == "@LR_DECAY_COUNTER@") { + copy_memory(); } else { - t->ShareDataWith(main_tensor); + share_memory(); } } } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index d4658b9623fe8c23b6a8b2903e3c48d794ba1652..5756627fbd8583428014e24e5aa3f626c908ce1c 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -81,6 +81,7 @@ class ParallelExecutor { const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; + std::vector> async_graphs_; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr local_nccl_id_; #endif diff --git a/paddle/fluid/framework/pull_dense_worker.cc b/paddle/fluid/framework/pull_dense_worker.cc new file mode 100644 index 0000000000000000000000000000000000000000..c48c7872ec23f6cfaac650b4940752ac9b8fd36c --- /dev/null +++ b/paddle/fluid/framework/pull_dense_worker.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include "paddle/fluid/framework/device_worker.h" + +namespace paddle { +namespace framework { + +std::shared_ptr PullDenseWorker::s_instance_ = NULL; +std::mutex PullDenseWorker::mutex_for_version_; +std::map PullDenseWorker::last_versions_; +std::map PullDenseWorker::current_version_; +std::map> PullDenseWorker::training_versions_; +std::map> + PullDenseWorker::dense_value_names_; + +void PullDenseWorker::Initialize(const TrainerDesc& param) { + running_ = false; + param_ = param.pull_dense_param(); + dwp_param_ = param.downpour_param(); + threshold_ = param_.threshold(); + thread_num_ = param_.device_num(); + sleep_time_ms_ = param_.sleep_time_ms(); + for (size_t i = 0; + i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + TableParameter table; + for (auto i : param_.dense_table()) { + if (i.table_id() == tid) { + table = i; + break; + } + } + // setup dense variables for each table + int var_num = table.dense_value_name_size(); + dense_value_names_[tid].resize(var_num); + for (int j = 0; j < var_num; ++j) { + dense_value_names_[tid][j] = table.dense_value_name(j); + } + // setup training version for each table + training_versions_[tid].resize(thread_num_, 0); + last_versions_[tid] = 0; + current_version_[tid] = 0; + } + fleet_ptr_ = FleetWrapper::GetInstance(); +} + +void PullDenseWorker::Wait(std::vector<::std::future>* status_vec) { + for (auto& t : *status_vec) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "Current 
Pull Dense Thread Failed Times" + << ++pull_dense_fail_times_; + } + } + + int MAX_FAIL_NUM = 20; + if (pull_dense_fail_times_ > MAX_FAIL_NUM) { + LOG(FATAL) << "Pull Dense Failed Times More Than " << MAX_FAIL_NUM + << " Times"; + exit(-1); + } + status_vec->resize(0); +} + +void PullDenseWorker::Stop() { + if (running_) { + running_ = false; + t_.join(); + } +} + +int PullDenseWorker::Start() { + running_ = true; + t_ = std::thread(&PullDenseWorker::Run, this); + return 0; +} + +void PullDenseWorker::Run() { + while (running_) { + pull_dense_status_.resize(0); + for (size_t i = 0; + i < dwp_param_.program_config(0).pull_dense_table_id_size(); ++i) { + uint64_t tid = static_cast( + dwp_param_.program_config(0).pull_dense_table_id(i)); + if (CheckUpdateParam(tid)) { + fleet_ptr_->PullDenseVarsAsync( + *root_scope_, tid, dense_value_names_[tid], &pull_dense_status_); + ResetThreadVersion(tid); + } + } + if (pull_dense_status_.size() != 0) { + Wait(&pull_dense_status_); + } +#ifndef _WIN32 + usleep(sleep_time_ms_ * 1000); +#endif + } +} + +void PullDenseWorker::IncreaseThreadVersion(int thread_id, uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + training_versions_[table_id][thread_id]++; +} + +bool PullDenseWorker::CheckUpdateParam(uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + auto& version = training_versions_[table_id]; + current_version_[table_id] = + *(std::min_element(version.begin(), version.end())); + if (current_version_[table_id] - last_versions_[table_id] < threshold_) { + return false; + } + return true; +} + +void PullDenseWorker::ResetThreadVersion(uint64_t table_id) { + std::lock_guard lock(mutex_for_version_); + last_versions_[table_id] = current_version_[table_id]; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index 40eafda9bf294f7e8ddd067e9014447f4de1cc0e..d3513fb7dbed0413e61796d8a843c38fbbcf93dc 100644 --- 
a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -69,6 +69,9 @@ void ReaderBase::Start() { ReaderBase::~ReaderBase() {} -DecoratedReader::~DecoratedReader() { reader_->Shutdown(); } +DecoratedReader::~DecoratedReader() { + VLOG(1) << "~DecoratedReader"; + reader_->Shutdown(); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 82562bf883d88787858912f7039cf8fef003eccf..4b400e72a4cacd3848b57ac3ba2b3ef5f9a9a9c4 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "paddle/fluid/framework/ddim.h" @@ -77,7 +78,10 @@ class DecoratedReader : public ReaderBase, ~DecoratedReader(); protected: - void ShutdownImpl() override { reader_->Shutdown(); } + void ShutdownImpl() override { + VLOG(1) << "ShutdownImpl"; + reader_->Shutdown(); + } void StartImpl() override { reader_->Start(); } @@ -98,6 +102,8 @@ class ReaderHolder { reader_ = reader_base; } + ~ReaderHolder() { VLOG(1) << "~ReaderHolder"; } + const std::shared_ptr& Get() const { return reader_; } void ReadNext(std::vector* out) { @@ -106,6 +112,7 @@ class ReaderHolder { } void ResetAll() { + VLOG(1) << "ResetAll"; auto end_readers = reader_->GetEndPoints(); for (auto* reader : end_readers) { reader->Shutdown(); @@ -116,11 +123,13 @@ class ReaderHolder { } void Shutdown() { + VLOG(1) << "Shutdown"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Shutdown(); } void Start() { + VLOG(1) << "start"; PADDLE_ENFORCE_NOT_NULL(reader_); reader_->Start(); } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index a96baaf41f3fcd24817421a7b620343558cd78d1..49e22a5ad3093c2d61d0ef513974c9938e287729 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -59,6 +59,10 @@ Scope& Scope::NewScope() const { return *child; } +std::unique_ptr Scope::NewTmpScope() const { + return 
std::unique_ptr(new Scope(this)); +} + Variable* Scope::Var(const std::string& name) { SCOPE_VARS_WRITER_LOCK return VarInternal(name); diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 242cbae7163c48fa44dca9237f1cd35f9ec98442..5f3d106e091ace05cfbdbbde2d79d48fe01b4a38 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -52,6 +52,10 @@ class Scope { /// Mark it to const because that new kid scope cannot change parent scope. Scope& NewScope() const; + /// Create a sub-scope for current scope but do not record it in the kids to + /// avoid performance problems. + std::unique_ptr NewTmpScope() const; + /// Create a variable with given name if it doesn't exist. /// Caller doesn't own the returned Variable. Variable* Var(const std::string& name); diff --git a/paddle/fluid/framework/trainer.cc b/paddle/fluid/framework/trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..644bd33a1420aa0ff54e34005eedd10c28342665 --- /dev/null +++ b/paddle/fluid/framework/trainer.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +void TrainerBase::SetScope(Scope* root_scope) { root_scope_ = root_scope; } + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h new file mode 100644 index 0000000000000000000000000000000000000000..b29736cfbbebc183d969dcf1863a6a1d097d2358 --- /dev/null +++ b/paddle/fluid/framework/trainer.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include // NOLINT +#include +#include // NOLINT +#include + +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/device_worker.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/trainer_desc.pb.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace framework { + +class TrainerBase { + public: + TrainerBase() {} + virtual ~TrainerBase() {} + // model memory are hosted in root_scope + void SetScope(Scope* root_scope); + void SetDebug(const bool debug) { debug_ = debug; } + void SetDataset(Dataset* dataset_ptr) { dataset_ptr_ = dataset_ptr; } + virtual void Initialize(const TrainerDesc& trainer_desc, + Dataset* data_set) = 0; + virtual void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place) = 0; + virtual void InitOtherEnv(const ProgramDesc& main_program) = 0; + virtual void Run() = 0; + virtual void Finalize() = 0; + + protected: + Scope* root_scope_; + bool debug_; + Dataset* dataset_ptr_; +}; + +// general trainer for async execution +// local trainer and distributed trainer are supported +// depends on the assigned device_worker +class MultiTrainer : public TrainerBase { + public: + MultiTrainer() {} + virtual ~MultiTrainer() {} + virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); + virtual void InitTrainerEnv(const ProgramDesc& main_program, + const platform::Place& place); + virtual void InitOtherEnv(const ProgramDesc& main_program) {} + virtual void Run(); + virtual void Finalize(); + + protected: + int thread_num_; + std::vector threads_; + std::vector> readers_; + std::vector> workers_; +}; + +class DistMultiTrainer : public 
MultiTrainer { + public: + DistMultiTrainer() {} + virtual ~DistMultiTrainer() {} + virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set); + virtual void InitOtherEnv(const ProgramDesc& main_program); + virtual void Run(); + virtual void Finalize(); + + protected: + std::shared_ptr pull_dense_worker_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto new file mode 100644 index 0000000000000000000000000000000000000000..389c1a870fb54ad28806ad49632323b1c93676f4 --- /dev/null +++ b/paddle/fluid/framework/trainer_desc.proto @@ -0,0 +1,92 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +syntax = "proto2"; +import "data_feed.proto"; +package paddle.framework; + +message TrainerDesc { + // class name for create trainer desc + // the matchness of trainer name and device worker name + // will be checked in python API + optional string class_name = 1; + // class name for creating device worker + optional string device_worker_name = 2; + // thread number + optional int32 thread_num = 3; + // if we need to binding cpu + optional bool binding_cpu = 4 [ default = false ]; + repeated string filelist = 5; + optional bool debug = 6 [ default = false ]; + optional FetchConfig fetch_config = 7; + + // device worker parameters + optional HogwildWorkerParameter hogwild_param = 101; + optional DownpourWorkerParameter downpour_param = 103; + optional PullDenseWorkerParameter pull_dense_param = 102; + // datafeed desc + optional DataFeedDesc data_desc = 201; +} + +message HogwildWorkerParameter { repeated string skip_ops = 1; } + +message DownpourWorkerParameter { + repeated TableParameter sparse_table = 1; + repeated TableParameter dense_table = 2; + repeated string skip_ops = 3; + repeated ProgramConfig program_config = 4; + optional bool push_sparse = 5 [ default = true ]; + optional bool push_dense = 6 [ default = true ]; +} + +message FetchConfig { + enum Method { PRINT = 0; } + repeated string fetch_var_names = 1; + repeated string fetch_var_str_format = 2; + optional int32 print_period = 3 [ default = 100 ]; + optional Method method = 4 [ default = PRINT ]; +} + +message ProgramConfig { + required string program_id = 1; + repeated int32 push_sparse_table_id = 2; + repeated int32 push_dense_table_id = 3; + repeated int32 pull_sparse_table_id = 4; + repeated int32 pull_dense_table_id = 5; +} + +message PullDenseWorkerParameter { + // dense table only and specialized usage + optional int32 threshold = 1 [ default = 1 ]; + optional int32 device_num = 2; + optional int32 sleep_time_ms = 3 [ default = 2 ]; + repeated TableParameter dense_table = 4; +} + 
+message TableParameter { + // dense table only + optional int64 table_id = 1; + repeated string dense_value_name = 2; + repeated string dense_grad_name = 3; + repeated int32 push_dense_wait_times = 5; + // sparse table only + repeated string sparse_key_name = 6; + repeated string sparse_value_name = 7; + repeated string sparse_grad_name = 8; + repeated int32 push_sparse_wait_times = 9; + // sparse table only and specialized usage + optional int32 emb_dim = 10; + optional int32 fea_dim = 11; + optional string label_var_name = 12; +} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6b4461c0c429d5b1809dd69d91390421cc8b14ad --- /dev/null +++ b/paddle/fluid/framework/trainer_factory.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/trainer_factory.h" +#include +#include +#include + +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +typedef std::shared_ptr (*CreatetrainerFunction)(); +typedef std::unordered_map trainerMap; +trainerMap g_trainer_map; + +#define REGISTER_TRAINER_CLASS(trainer_class) \ + namespace { \ + std::shared_ptr Creator_##trainer_class() { \ + return std::shared_ptr(new trainer_class); \ + } \ + class __Registerer_##trainer_class { \ + public: \ + __Registerer_##trainer_class() { \ + g_trainer_map[#trainer_class] = &Creator_##trainer_class; \ + } \ + }; \ + __Registerer_##trainer_class g_registerer_##trainer_class; \ + } // namespace + +std::string TrainerFactory::TrainerTypeList() { + std::string trainer_types; + for (auto iter = g_trainer_map.begin(); iter != g_trainer_map.end(); ++iter) { + if (iter != g_trainer_map.begin()) { + trainer_types += ", "; + } + trainer_types += iter->first; + } + return trainer_types; +} + +std::shared_ptr TrainerFactory::CreateTrainer( + std::string trainer_class) { + if (g_trainer_map.count(trainer_class) < 1) { + LOG(WARNING) << "Trainer class: " << trainer_class << " not defined"; + LOG(WARNING) << TrainerTypeList(); + exit(-1); + } + return g_trainer_map[trainer_class](); +} + +REGISTER_TRAINER_CLASS(MultiTrainer); +REGISTER_TRAINER_CLASS(DistMultiTrainer); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_factory.h b/paddle/fluid/framework/trainer_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..9c772a4f19ed9ba50f704ed62ef361555b1285fb --- /dev/null +++ b/paddle/fluid/framework/trainer_factory.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/trainer.h" + +namespace paddle { +namespace framework { + +class TrainerFactory { + public: + static std::string TrainerTypeList(); + static std::shared_ptr CreateTrainer(std::string trainer_class); +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/trainer_test.cc b/paddle/fluid/framework/trainer_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f689679d48696ced2ff1fe5c2d3706e3ed2190a4 --- /dev/null +++ b/paddle/fluid/framework/trainer_test.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/trainer.h" +#include + +namespace paddle { +namespace framework { +TEST() { + // create multi trainer + // create hogwild device worker + // create dataset + // train for a while +} +} +} diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index fc4525549caeebb06dea766ccb123b5ebc6d5b13..65c939af173a8a2a22d69c636de355293f95dec6 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -27,7 +27,8 @@ limitations under the License. */ namespace paddle { namespace framework { -void InitializeVariable(Variable* var, proto::VarType::Type var_type) { + +void InitializeVariable(Variable *var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { @@ -37,7 +38,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { @@ -56,5 +57,27 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { var_type); } } + +void CopyVariable(const Variable &src_var, Variable *dst_var) { + // only support cpu now + auto cpu_place = platform::CPUPlace(); + + if (src_var.IsType()) { + auto *tmp_grad_tensor = dst_var->GetMutable(); + auto &src_tensor = src_var.Get(); + tmp_grad_tensor->set_lod(src_tensor.lod()); + framework::TensorCopy(src_tensor, cpu_place, tmp_grad_tensor); + } else if (src_var.IsType()) { + auto &src_slr = src_var.Get(); + auto *tmp_grad_slr = dst_var->GetMutable(); + tmp_grad_slr->set_rows(src_slr.rows()); + tmp_grad_slr->set_height(src_slr.height()); + auto &src_t = src_slr.value(); + auto *dst_t = 
tmp_grad_slr->mutable_value(); + framework::TensorCopy(src_t, cpu_place, dst_t); + } else { + PADDLE_THROW("unknown var type to copy"); + } +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/variable_helper.h b/paddle/fluid/framework/variable_helper.h index 0e0c72c3621dce0a6b372f9a9110a63fbc0a1d71..5a2c267b7388f6c2de89054dc480fd74b4544bed 100644 --- a/paddle/fluid/framework/variable_helper.h +++ b/paddle/fluid/framework/variable_helper.h @@ -17,6 +17,9 @@ limitations under the License. */ #include "paddle/fluid/framework/variable.h" namespace paddle { namespace framework { -void InitializeVariable(Variable *var, proto::VarType::Type var_type); -} -} + +void InitializeVariable(Variable* var, proto::VarType::Type var_type); +void CopyVariable(const Variable& src_var, Variable* dst_var); + +} // end namespace framework +} // end namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index 1e7f5ac799de0d7a1debec0529d262f021bba790..d3d1522dccf0d8af4f26eec4e0c57257279880e0 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,5 +1,4 @@ -cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc - elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) +cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc scale.cc dropout.cc im2sequence.cc sum.cc DEPS anakin_engine framework_proto scope op_registry) cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op SERIAL) 
cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/activation.cc b/paddle/fluid/inference/anakin/convert/activation.cc index c85b958d7b85cb3e21df8714c89eee10b9b3fecc..a9aeb19ffd5f04c03df593e8f48976e7fa6155ab 100644 --- a/paddle/fluid/inference/anakin/convert/activation.cc +++ b/paddle/fluid/inference/anakin/convert/activation.cc @@ -34,6 +34,7 @@ ActivationOpConverter::ActivationOpConverter(const std::string &op_type) } void ActivationOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/activation.h b/paddle/fluid/inference/anakin/convert/activation.h index 49a4518bef418491a7fbc0bcde403bf047f774bd..592a3d5bd9d1272aae8a13d0d0acc77f8990c6b3 100644 --- a/paddle/fluid/inference/anakin/convert/activation.h +++ b/paddle/fluid/inference/anakin/convert/activation.h @@ -27,6 +27,7 @@ class ActivationOpConverter : public AnakinOpConverter { explicit ActivationOpConverter(const std::string &op_type); virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ActivationOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc index 94014802bdbe1792e9eaba28d7134624dd3edc90..38cf6172027b3b200a378a61b6d5b395cc571de7 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.cc +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -29,6 +29,7 @@ namespace inference { namespace anakin { void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, 
nullptr); diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.h b/paddle/fluid/inference/anakin/convert/batch_norm.h index cee5c43ae76bf28284118380ca4c861d5cbedd1c..c56735f15b435b46cf9f623bd284b5731a36c327 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.h +++ b/paddle/fluid/inference/anakin/convert/batch_norm.h @@ -25,6 +25,7 @@ class BatchNormOpConverter : public AnakinOpConverter { BatchNormOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~BatchNormOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc index e2d1111acbb60690167530a25aeaf59858b71987..ae90c083690da6e108a05460de68be2eb0cd9b48 100644 --- a/paddle/fluid/inference/anakin/convert/concat.cc +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -29,6 +29,7 @@ namespace inference { namespace anakin { void ConcatOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/concat.h b/paddle/fluid/inference/anakin/convert/concat.h index 4ff2b6d85b758efc7529c5034a34e094ee06cccb..974ff689bfef681f8993d5dbb0dbbbdde91f33bd 100644 --- a/paddle/fluid/inference/anakin/convert/concat.h +++ b/paddle/fluid/inference/anakin/convert/concat.h @@ -25,6 +25,7 @@ class ConcatOpConverter : public AnakinOpConverter { ConcatOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ConcatOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/conv2d.cc b/paddle/fluid/inference/anakin/convert/conv2d.cc index 
b99c6e71c4dfd2b567d85904f57ebecf0ed9a1cc..308f14604b9c83f2278499359328109d31f9ff17 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d.cc @@ -28,6 +28,7 @@ namespace inference { namespace anakin { void Conv2dOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/conv2d.h b/paddle/fluid/inference/anakin/convert/conv2d.h index 75a30c10d481762fe5579ccb4d79feeba73dc98a..dca5d19f468ac6d6e2f4bcda8ecaa3922d80e6b1 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d.h +++ b/paddle/fluid/inference/anakin/convert/conv2d.h @@ -25,6 +25,7 @@ class Conv2dOpConverter : public AnakinOpConverter { Conv2dOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~Conv2dOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc index 4d105430dd298076fa8aa4c1925329c3a0e356a1..fa1ab0efeeb5cacd112ca1b644735eaaf49e55f8 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc +++ b/paddle/fluid/inference/anakin/convert/conv2d_fusion.cc @@ -28,6 +28,7 @@ namespace inference { namespace anakin { void Conv2dFusionOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h index 07359b9cba05bf7c885eb38d64816bdb718a6aba..0d9ef28183b309c4b50714fcbe64e24c5d9dfbaa 100644 --- a/paddle/fluid/inference/anakin/convert/conv2d_fusion.h +++ 
b/paddle/fluid/inference/anakin/convert/conv2d_fusion.h @@ -25,6 +25,7 @@ class Conv2dFusionOpConverter : public AnakinOpConverter { Conv2dFusionOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~Conv2dFusionOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc index 35e02919aa70c211da5d4a5785a9833747d99ce2..30796f7592427191a4396a154be62838b7e666ad 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -27,9 +27,9 @@ namespace paddle { namespace inference { namespace anakin { -void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope, - bool test_mode) { +void DensityPriorBoxOpConverter::operator()( + const framework::proto::OpDesc& op, const framework::BlockDesc& block_desc, + const framework::Scope& scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); auto input_name = op_desc.Input("Input").front(); auto image_name = op_desc.Input("Image").front(); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h index 44265cbf2e968e8821bc1a9ae3225c9b7d405235..bf9210711a0f69595c241803cd40d42770ccd5d7 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.h +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -27,6 +27,7 @@ class DensityPriorBoxOpConverter : public AnakinOpConverter { DensityPriorBoxOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~DensityPriorBoxOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc 
b/paddle/fluid/inference/anakin/convert/detection_out.cc index 67636651017cfb18967cf8dc76d4f4a552fbd021..262ad28a654609cddde979d387621bb0c7c1a7f9 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.cc +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -26,6 +26,7 @@ namespace inference { namespace anakin { void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h index 5bf1c3ecbc89795d075301a2fd568312236bd874..ca78f10fdc2a7c7064ae0399e7f1afff1383ce67 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.h +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -27,6 +27,7 @@ class DetectionOutOpConverter : public AnakinOpConverter { DetectionOutOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~DetectionOutOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/dropout.cc b/paddle/fluid/inference/anakin/convert/dropout.cc index ed6d7f7561cb78666855146864b33254026926ef..bc9b26dcf2733369e558cde2954e9d0caaba86b0 100644 --- a/paddle/fluid/inference/anakin/convert/dropout.cc +++ b/paddle/fluid/inference/anakin/convert/dropout.cc @@ -31,6 +31,7 @@ namespace inference { namespace anakin { void DropoutOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/dropout.h b/paddle/fluid/inference/anakin/convert/dropout.h index 2a0fb6e76ac8354d884f9d815a4df785248e6475..11412e217ef5fa77bd22d7530d88be1347f2616f 100644 --- 
a/paddle/fluid/inference/anakin/convert/dropout.h +++ b/paddle/fluid/inference/anakin/convert/dropout.h @@ -25,6 +25,7 @@ class DropoutOpConverter : public AnakinOpConverter { DropoutOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~DropoutOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/elementwise.cc b/paddle/fluid/inference/anakin/convert/elementwise.cc index 55b12390baf90a9365fd4d197b19a3c5cd675afd..fe9a896d8266e06250b712be0c75290c039e9a08 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.cc +++ b/paddle/fluid/inference/anakin/convert/elementwise.cc @@ -30,9 +30,9 @@ namespace paddle { namespace inference { namespace anakin { -void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::Scope &scope, - bool test_mode) { +void ElementwiseAddOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); @@ -50,9 +50,9 @@ void ElementwiseAddOpConverter::operator()(const framework::proto::OpDesc &op, engine_->AddOpAttr>(op_name, "coeff", coeff); } -void ElementwiseMulOpConverter::operator()(const framework::proto::OpDesc &op, - const framework::Scope &scope, - bool test_mode) { +void ElementwiseMulOpConverter::operator()( + const framework::proto::OpDesc &op, const framework::BlockDesc &block_desc, + const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1); PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1); diff --git a/paddle/fluid/inference/anakin/convert/elementwise.h b/paddle/fluid/inference/anakin/convert/elementwise.h index 
47525e41daafcbca0c7c86bad44066f18a3ac79c..e4664493a9d3ce1ed9a0c79a05fb466c4e781b3e 100644 --- a/paddle/fluid/inference/anakin/convert/elementwise.h +++ b/paddle/fluid/inference/anakin/convert/elementwise.h @@ -25,6 +25,7 @@ class ElementwiseAddOpConverter : public AnakinOpConverter { ElementwiseAddOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ElementwiseAddOpConverter() {} @@ -37,6 +38,7 @@ class ElementwiseMulOpConverter : public AnakinOpConverter { ElementwiseMulOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ElementwiseMulOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/fc.cc b/paddle/fluid/inference/anakin/convert/fc.cc index 2514eb1e093b4e05b7e6b2814cfd8185b3aede6c..a80a1a47e91aa085935b5febb3858e028f396091 100644 --- a/paddle/fluid/inference/anakin/convert/fc.cc +++ b/paddle/fluid/inference/anakin/convert/fc.cc @@ -27,6 +27,7 @@ namespace inference { namespace anakin { void FcBaseOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/fc.h b/paddle/fluid/inference/anakin/convert/fc.h index 060c649b19ef335a9e926eb205ec691a2a188fe1..fb461908b35e0111065e1a46c52306c64ace7d7c 100644 --- a/paddle/fluid/inference/anakin/convert/fc.h +++ b/paddle/fluid/inference/anakin/convert/fc.h @@ -25,6 +25,7 @@ class FcBaseOpConverter : public AnakinOpConverter { FcBaseOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~FcBaseOpConverter() {} diff --git 
a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc index c6c372bbef87de7f38c1f66a21c170cabac8c0ed..7f5c1510960d1014c33bd565939812fe7c7dfc06 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.cc +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -26,6 +26,7 @@ namespace inference { namespace anakin { void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/flatten.h b/paddle/fluid/inference/anakin/convert/flatten.h index 1ace76b16381980a9eaec12806e0bc94d7b1fb85..c9cc0006eb2448917bbcc0952f5e2cae72b73de1 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.h +++ b/paddle/fluid/inference/anakin/convert/flatten.h @@ -25,6 +25,7 @@ class FlattenOpConverter : public AnakinOpConverter { FlattenOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~FlattenOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.cc b/paddle/fluid/inference/anakin/convert/im2sequence.cc index 568d7e4746f11b13ce8ea9e5a47a1b43d1c12693..2cc330c3829f6033229748523c3df750b951626f 100644 --- a/paddle/fluid/inference/anakin/convert/im2sequence.cc +++ b/paddle/fluid/inference/anakin/convert/im2sequence.cc @@ -31,6 +31,7 @@ namespace inference { namespace anakin { void Im2SequenceConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/im2sequence.h b/paddle/fluid/inference/anakin/convert/im2sequence.h index 3003eac2c6f416663c3e7c4c3e297b6347edfb47..714679c1d9601136f1f54287bb58d611e852f3fe 100644 --- 
a/paddle/fluid/inference/anakin/convert/im2sequence.h +++ b/paddle/fluid/inference/anakin/convert/im2sequence.h @@ -25,6 +25,7 @@ class Im2SequenceConverter : public AnakinOpConverter { Im2SequenceConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~Im2SequenceConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 45db4221747128cd7f6d26c8830fa75ebf81ac72..1ca62658ef26ffebcc068c91ece7d9bbed0a348f 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -40,8 +40,10 @@ class AnakinOpConverter { AnakinOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) {} void ConvertOp(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const std::unordered_set ¶meters, const framework::Scope &scope, AnakinNvEngine *engine, bool test_mode = false) { @@ -58,16 +60,17 @@ class AnakinOpConverter { } PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]", op_type); it->SetEngine(engine); - (*it)(op, scope, test_mode); + (*it)(op, block_desc, scope, test_mode); } - void ConvertBlock(const framework::proto::BlockDesc &block, + void ConvertBlock(framework::BlockDesc *block_desc, const std::unordered_set ¶meters, const framework::Scope &scope, AnakinNvEngine *engine) { std::unique_lock lock(mutex_); - for (auto i = 0; i < block.ops_size(); i++) { - auto &op = block.ops(i); - ConvertOp(op, parameters, scope, engine); + framework::proto::BlockDesc *block = block_desc->Proto(); + for (auto i = 0; i < block->ops_size(); i++) { + auto &op = block->ops(i); + ConvertOp(op, *block_desc, parameters, scope, engine); } } @@ -77,9 +80,7 @@ class 
AnakinOpConverter { const std::vector &inputs, const std::unordered_set ¶meters, const std::vector &outputs, AnakinNvEngine *engine) { - framework::proto::BlockDesc *block_proto = block_desc->Proto(); - ConvertBlock(*block_proto, parameters, *scope, engine); - + ConvertBlock(block_desc, parameters, *scope, engine); engine->Freeze(); // if the max_batch size int max_batch_size = engine->GetMaxBatchSize(); diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc index 9b01d56a126b2ebc194f5b5bb5b2f52c298a316e..87eefe712a5ad2acd8c9b5abe521c832ad2c1ef2 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.cc +++ b/paddle/fluid/inference/anakin/convert/pool2d.cc @@ -31,6 +31,7 @@ namespace inference { namespace anakin { void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/pool2d.h b/paddle/fluid/inference/anakin/convert/pool2d.h index 1931a03c7ac236b4e57236cd1eb2947110f279a8..ec28e48ac848eff1d37c39063725624bf7d65723 100644 --- a/paddle/fluid/inference/anakin/convert/pool2d.h +++ b/paddle/fluid/inference/anakin/convert/pool2d.h @@ -25,6 +25,7 @@ class Pool2dOpConverter : public AnakinOpConverter { Pool2dOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~Pool2dOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/relu.cc b/paddle/fluid/inference/anakin/convert/relu.cc index 2ce96db1804a3d6d6d1afac79e4e1fc55ed4c35d..993437d014b1f951dac94da7a3179b4bcb63466d 100644 --- a/paddle/fluid/inference/anakin/convert/relu.cc +++ b/paddle/fluid/inference/anakin/convert/relu.cc @@ -26,6 +26,7 @@ namespace inference { namespace anakin { void ReluOpConverter::operator()(const 
framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/relu.h b/paddle/fluid/inference/anakin/convert/relu.h index 54c4c2316eb32ef70696a2477211008e04892552..6ede506511917c80faa59d40ee0a7bfff194da97 100644 --- a/paddle/fluid/inference/anakin/convert/relu.h +++ b/paddle/fluid/inference/anakin/convert/relu.h @@ -27,6 +27,7 @@ class ReluOpConverter : public AnakinOpConverter { ReluOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ReluOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc index eee36d2f37ea79c841ac8bf60c6e533069d06240..17e0a1acb5f4e08e848e91bbb051757d85796c0a 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.cc +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -26,6 +26,7 @@ namespace inference { namespace anakin { void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/reshape.h b/paddle/fluid/inference/anakin/convert/reshape.h index 970e8ce5572572bd18c34eeffa902fa2495c1cce..9ce2ea2a4f3f8802225fe8ca8ed602c9f7d27968 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.h +++ b/paddle/fluid/inference/anakin/convert/reshape.h @@ -25,6 +25,7 @@ class ReshapeOpConverter : public AnakinOpConverter { ReshapeOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ReshapeOpConverter() {} diff --git 
a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc index 6f3aa8c5d1111dc2829e241c9331eeb521003c03..dd68af4f79a6d1e8add04bde6a6890bca1b00d14 100644 --- a/paddle/fluid/inference/anakin/convert/scale.cc +++ b/paddle/fluid/inference/anakin/convert/scale.cc @@ -26,6 +26,7 @@ namespace inference { namespace anakin { void ScaleOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h index b858e3c512494f80c7c3818a570e43d90d65251b..ba3bcdd21494a4eeb6190aa8383e17e1b828b5f3 100644 --- a/paddle/fluid/inference/anakin/convert/scale.h +++ b/paddle/fluid/inference/anakin/convert/scale.h @@ -27,6 +27,7 @@ class ScaleOpConverter : public AnakinOpConverter { ScaleOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~ScaleOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc index d5cd8908ebf623f0334a3b4df2b19147c63f77a3..a6c1e971b16fa7fe6a074bcb2cdf391410f8871f 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.cc +++ b/paddle/fluid/inference/anakin/convert/softmax.cc @@ -24,6 +24,7 @@ namespace inference { namespace anakin { void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); @@ -32,8 +33,16 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op, auto input = op_desc.Input("X").front(); auto output = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto 
input_var_desc = block_desc.FindVar(input); + PADDLE_ENFORCE(input_var_desc, + "Cant find %s variable When runing Anakin Softmax converter.", + input); + auto input_shape_in_fluid = input_var_desc->GetShape(); + size_t input_dims = input_shape_in_fluid.size(); + engine_->AddOp(op_name, "Softmax", {input}, {output}); - engine_->AddOpAttr(op_name, "axis", 2); + engine_->AddOpAttr(op_name, "axis", static_cast(input_dims - 1)); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/softmax.h b/paddle/fluid/inference/anakin/convert/softmax.h index 0508da0c6fecaf29b7376005904235dadf04ea28..a16356d5bb61ac2f3b4f7751e257ce36ca604bf1 100644 --- a/paddle/fluid/inference/anakin/convert/softmax.h +++ b/paddle/fluid/inference/anakin/convert/softmax.h @@ -25,6 +25,7 @@ class SoftMaxOpConverter : public AnakinOpConverter { SoftMaxOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~SoftMaxOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/split.cc b/paddle/fluid/inference/anakin/convert/split.cc index b8464a766d21e93426eb4a00b8caab2af5470055..ec582c1812623cd4bcefa2097015ba258f6bacbb 100644 --- a/paddle/fluid/inference/anakin/convert/split.cc +++ b/paddle/fluid/inference/anakin/convert/split.cc @@ -30,6 +30,7 @@ namespace inference { namespace anakin { void SplitOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/split.h b/paddle/fluid/inference/anakin/convert/split.h index a4c6a14e62168ffaf5ff67b5cf953d477ff9e34d..184112e589e2bbdb30bc7a5d2cd053b7f3732a58 100644 --- a/paddle/fluid/inference/anakin/convert/split.h +++ b/paddle/fluid/inference/anakin/convert/split.h @@ -25,6 +25,7 @@ class SplitOpConverter : public 
AnakinOpConverter { SplitOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~SplitOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/sum.cc b/paddle/fluid/inference/anakin/convert/sum.cc index df9104cf4631d86e0cbd87cb0e93a96d984953f5..2a4178e2371389b44557d44ea526c7cc4a731d16 100644 --- a/paddle/fluid/inference/anakin/convert/sum.cc +++ b/paddle/fluid/inference/anakin/convert/sum.cc @@ -31,6 +31,7 @@ namespace inference { namespace anakin { void SumOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { framework::OpDesc op_desc(op, nullptr); PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 2); diff --git a/paddle/fluid/inference/anakin/convert/sum.h b/paddle/fluid/inference/anakin/convert/sum.h index ddecc4b3bcb84f83af95e77399847f191c785563..b5d402b77fcf555ffaf910f8c9d1b7337181a64b 100644 --- a/paddle/fluid/inference/anakin/convert/sum.h +++ b/paddle/fluid/inference/anakin/convert/sum.h @@ -25,6 +25,7 @@ class SumOpConverter : public AnakinOpConverter { SumOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~SumOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc index 6a887401034f9d8c0b8b6aa3eeffb6579e395029..f35372fe5c315ec68bc80a6d03c5931899ff7555 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.cc +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -28,6 +28,7 @@ namespace inference { namespace anakin { void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) { 
framework::OpDesc op_desc(op, nullptr); diff --git a/paddle/fluid/inference/anakin/convert/transpose.h b/paddle/fluid/inference/anakin/convert/transpose.h index 62d26b6a9cc9885682f5750df32018596f014b33..bacbf152bc12319e6296677500b17d55d9772412 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.h +++ b/paddle/fluid/inference/anakin/convert/transpose.h @@ -25,6 +25,7 @@ class TransposeOpConverter : public AnakinOpConverter { TransposeOpConverter() = default; virtual void operator()(const framework::proto::OpDesc &op, + const framework::BlockDesc &block_desc, const framework::Scope &scope, bool test_mode) override; virtual ~TransposeOpConverter() {} diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index e0371d95347a521f499dd9454d284907b3048a04..029aff6704ff1015e5c2378a2202c94043df990d 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -22,6 +22,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -112,6 +113,17 @@ class AnakinConvertValidation { auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); RandomizeTensor(x_tensor, place_, ctx); + + std::vector dim_vec_int64; + for (auto& ele : dim_vec) { + dim_vec_int64.push_back(static_cast(ele)); + } + + // Add var_desc to block_desc + auto* block_desc = program_desc_.MutableBlock(framework::kRootBlockIndex); + + auto* var_desc = block_desc->Var(name); + var_desc->SetShape(dim_vec_int64); } void SetOp(const framework::proto::OpDesc& desc) { @@ -119,8 +131,10 @@ class AnakinConvertValidation { op_desc_.reset(new framework::OpDesc(desc, nullptr)); // should init anakin engine here. 
+ auto& block_desc = program_desc_.Block(framework::kRootBlockIndex); Singleton::Global().ConvertOp( - desc, parameters_, *scope_, engine_.get(), true /*test_mode*/); + desc, block_desc, parameters_, *scope_, engine_.get(), + true /*test_mode*/); engine_->Freeze(); std::map> temp_max_input_shape; @@ -194,6 +208,7 @@ class AnakinConvertValidation { cudaStream_t stream_; std::unique_ptr op_; std::unique_ptr op_desc_; + framework::ProgramDesc program_desc_; const std::unordered_set& parameters_; framework::Scope* scope_; platform::CUDAPlace place_; diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index ccf78ad7e56306d24af829c45c888021f4e3fbc4..ba044c9401a5f0fb5a839c1766fdd9d412d42212 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -91,7 +91,6 @@ void AnakinEngine::Execute( " or equal to the real input shape, Please set the max " "input shape using EnableAnakinEngine"); anakin_input->reshape(fluid_input_shape); - ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, fluid_input_shape); anakin_input->copy_from(tmp_anakin_tensor); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 29f16943e0c13fbe080e8e073b081583f1d14d11..a736ca393ccb7168a9faf650a6bce13f35fffca8 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -168,6 +168,7 @@ struct Argument { DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape, anakin_max_shape_t); DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int); + DECL_ARGUMENT_FIELD(anakin_min_subgraph_size, AnakinMinSubgraphSize, int); DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); // Memory optimized related. 
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 38612d5cc3d093885144f3b1cd6107232885b645..b8d8b6fed8ca237e87cfc67979ec6ddd340b8916 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -151,13 +151,20 @@ void AnakinSubgraphPass::CreateAnakinOp( op_desc->SetType("anakin_engine"); std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, &output_names_with_id, &output_names, &output_name_map, - false); + graph_var_map, false); // When anakin engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -168,13 +175,6 @@ void AnakinSubgraphPass::CreateAnakinOp( output_mapping.push_back(output_name_map[name]); } - auto *vars = block_desc.Proto()->mutable_vars(); - for (framework::ir::Node *node : graph->Nodes()) { - if (node->IsVar() && node->Var()) { - *vars->Add() = *node->Var()->Proto(); - } - } - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index a17ee1b707a7f950cddc62373a9a57c793d5528f..7c4aab06a1d2b3fadc76b46c7e95cea7818c56e2 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -60,6 +60,7 @@ void RenameAndGetOutputs( std::set 
*output_names_with_id, std::set *output_names, std::unordered_map *output_name_map, + const std::unordered_map &graph_var_map, bool is_trt) { //// In the normal case, the paddle-trt exists bug when runing the googlenet. // When there are more than two convolutions of 1 * 1 with the same input, the @@ -69,6 +70,15 @@ void RenameAndGetOutputs( std::unordered_map same_hierarchy_conv2d_num_map; + auto add_block_var = [&](const std::string &graph_arg, + const std::string &block_arg) { + auto arg_var_node = graph_var_map.find(graph_arg); + PADDLE_ENFORCE(arg_var_node != graph_var_map.end()); + auto *var_t = block_desc->Var(block_arg); + var_t->SetShape(arg_var_node->second->Var()->GetShape()); + var_t->SetDataType(arg_var_node->second->Var()->GetDataType()); + }; + for (size_t index = 0; index < block_desc->OpSize(); ++index) { framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); framework::OpDesc op_desc(*op, nullptr); @@ -87,13 +97,20 @@ void RenameAndGetOutputs( auto *in_var = op->mutable_inputs(i); std::vector replaced_names; for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = + const std::string arg_value = in_var->arguments(k); + const std::string arg_value_with_id = arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { replaced_names.push_back(arg_value); + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value); + } } else { replaced_names.push_back(arg_value_with_id); + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value_with_id); + } } } in_var->clear_arguments(); @@ -105,7 +122,6 @@ void RenameAndGetOutputs( for (auto out_var : correspond_node->outputs) { var2id[out_var->Name()] = out_var->id(); } - if (op_desc.Type() == "conv2d" && is_trt) { auto input_var_name = op_desc.Input("Input").front(); auto filter_var_name = op_desc.Input("Filter").front(); @@ 
-125,15 +141,18 @@ void RenameAndGetOutputs( same_hierarchy_conv2d_num_map[input_var_name] += 1; } } - // rename for the output variables of op inside subgraph for (int i = 0; i < op->outputs_size(); i++) { framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); std::vector replaced_names; for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = + const std::string arg_value = out_var->arguments(k); + const std::string arg_value_with_id = arg_value + std::to_string(var2id[arg_value]); + + if (graph_var_map.count(arg_value)) { + add_block_var(arg_value, arg_value_with_id); + } if (output_names_with_id->count(arg_value_with_id)) { (*output_name_map)[arg_value] = arg_value_with_id; } diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 3cf21bf5f426a7142626e6ae1db6ee478418d08a..bb445027821096689965096c69b8183dd9da403c 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -42,6 +42,7 @@ void RenameAndGetOutputs( std::set *output_names_with_id, std::set *output_names, std::unordered_map *output_name_map, + const std::unordered_map &graph_var_map, bool is_trt = true); } // namespace analysis diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 019098a5dd0d372a690955698a2ab6a4039a2416..67650a352d8b8239da228462c21877ff440147b8 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -142,6 +142,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( } std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + 
graph_var_map[node->Name()] = node; + } + } auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate @@ -157,7 +164,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, - &output_names_with_id, &output_names, &output_name_map); + &output_names_with_id, &output_names, &output_name_map, + graph_var_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -168,14 +176,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( output_mapping.push_back(output_name_map[name]); } PADDLE_ENFORCE(!output_mapping.empty()); - - auto *vars = block_desc.Proto()->mutable_vars(); - for (framework::ir::Node *node : graph->Nodes()) { - if (node->IsVar() && node->Var()) { - *vars->Add() = *node->Var()->Proto(); - } - } - PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); @@ -213,7 +213,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); std::string trt_engine_serialized_data = ""; - SetAttr(op_desc->Proto(), "engine_serialized_data", trt_engine_serialized_data); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index aee94e12340597e981ac385a01335d2ffa069191..e5036d940197ef012cbfd8f52700c8aeb54fb6c5 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -115,6 +115,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_anakin_); CP_MEMBER(anakin_max_batchsize_); CP_MEMBER(anakin_max_input_shape_); + CP_MEMBER(anakin_min_subgraph_size_); // Ir related. 
CP_MEMBER(enable_ir_optim_); @@ -322,6 +323,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; ss << use_anakin_; + ss << anakin_min_subgraph_size_; return ss.str(); } @@ -393,10 +395,11 @@ void AnalysisConfig::SwitchIrDebug(int x) { Update(); } void AnalysisConfig::EnableAnakinEngine( - int max_batch_size, - std::map> max_input_shape) { + int max_batch_size, std::map> max_input_shape, + int min_subgraph_size) { anakin_max_batchsize_ = max_batch_size; anakin_max_input_shape_ = max_input_shape; + anakin_min_subgraph_size_ = min_subgraph_size; use_anakin_ = true; Update(); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7d8e9fe8bfada743388afd3ae4eedb5d84961706..6942604b0723f8665f0e8b058d48a5356a1a01f4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -385,6 +385,7 @@ void AnalysisPredictor::PrepareArgument() { if (config_.use_gpu() && config_.anakin_engine_enabled()) { argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_); argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_); + argument_.SetAnakinMinSubgraphSize(config_.anakin_min_subgraph_size_); LOG(INFO) << "Anakin subgraph engine is enabled"; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 2ad4add2945d65037829e0bb453372e38a04421c..c67c4b5bd0bfeea6d022f9e821f6d0b877c71d7a 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -151,7 +151,8 @@ struct AnalysisConfig { */ void EnableAnakinEngine( int max_batch_size = 1, - std::map> max_input_shape = {}); + std::map> max_input_shape = {}, + int min_subgraph_size = 6); /** A boolean state indicating whether the Anakin sub-graph engine is used. 
*/ @@ -288,6 +289,7 @@ struct AnalysisConfig { bool use_anakin_{false}; int anakin_max_batchsize_; + int anakin_min_subgraph_size_{6}; std::map> anakin_max_input_shape_; std::map engine_opt_info_; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 6a31185b097bc0ddf93a6e32e61ac0a9f2d04cfd..647913cc80727786379e2e5525b372818e423d23 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -148,20 +148,20 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_con if(WITH_MKLDNN) set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8") if (NOT EXISTS ${INT8_DATA_DIR}) - inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz") + inference_download_and_uncompress(${INT8_DATA_DIR} ${INFERENCE_URL}"/int8" "imagenet_val_100.tar.gz") endif() #resnet50 int8 set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50") if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR}) - inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" ) + inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} ${INFERENCE_URL}"/int8" "resnet50_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) #mobilenet int8 set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet") if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR}) - inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" ) + inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} ${INFERENCE_URL}"/int8" "mobilenetv1_int8_model.tar.gz" ) endif() inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} 
${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL) endif() diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py index 4d968c83d9c9bf9d947204d73f4460e62039cdda..842865933f2b4741aea034b19952d4c59344ba06 100644 --- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py +++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py @@ -1,5 +1,4 @@ # copyright (c) 2019 paddlepaddle authors. all rights reserved. -# # licensed under the apache license, version 2.0 (the "license"); # you may not use this file except in compliance with the license. # you may obtain a copy of the license at @@ -11,6 +10,7 @@ # without warranties or conditions of any kind, either express or implied. # see the license for the specific language governing permissions and # limitations under the license. +import hashlib import unittest import os import numpy as np @@ -21,16 +21,20 @@ import functools import contextlib from PIL import Image, ImageEnhance import math -from paddle.dataset.common import download +from paddle.dataset.common import download, md5file +import tarfile random.seed(0) np.random.seed(0) DATA_DIM = 224 - SIZE_FLOAT32 = 4 SIZE_INT64 = 8 - +FULL_SIZE_BYTES = 30106000008 +FULL_IMAGES = 50000 +DATA_DIR_NAME = 'ILSVRC2012' +IMG_DIR_NAME = 'var' +TARGET_HASH = '8dc592db6dcc8d521e4d5ba9da5ca7d2' img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1)) img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1)) @@ -70,19 +74,9 @@ def process_image(img_path, mode, color_jitter, rotate): return img -def download_unzip(): - int8_download = 'int8/download' - - target_name = 'data' - - cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + - int8_download) - - target_folder = os.path.join(cache_folder, target_name) - +def download_concat(cache_folder, zip_path): data_urls = [] data_md5s = [] - data_urls.append( 
'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa' ) @@ -91,72 +85,138 @@ def download_unzip(): 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab' ) data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5') - file_names = [] - + print("Downloading full ImageNet Validation dataset ...") for i in range(0, len(data_urls)): download(data_urls[i], cache_folder, data_md5s[i]) - file_names.append(data_urls[i].split('/')[-1]) - - zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz') - + file_name = os.path.join(cache_folder, data_urls[i].split('/')[-1]) + file_names.append(file_name) + print("Downloaded part {0}\n".format(file_name)) if not os.path.exists(zip_path): - cat_command = 'cat' - for file_name in file_names: - cat_command += ' ' + os.path.join(cache_folder, file_name) - cat_command += ' > ' + zip_path - os.system(cat_command) - print('Data is downloaded at {0}\n').format(zip_path) - - if not os.path.exists(target_folder): - cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path) - os.system(cmd) - print('Data is unzipped at {0}\n'.format(target_folder)) - - data_dir = os.path.join(target_folder, 'ILSVRC2012') - print('ILSVRC2012 full val set at {0}\n'.format(data_dir)) - return data_dir + with open(zip_path, "w+") as outfile: + for fname in file_names: + with open(fname) as infile: + outfile.write(infile.read()) + + +def extract(zip_path, extract_folder): + data_dir = os.path.join(extract_folder, DATA_DIR_NAME) + img_dir = os.path.join(data_dir, IMG_DIR_NAME) + print("Extracting...\n") + + if not (os.path.exists(img_dir) and + len(os.listdir(img_dir)) == FULL_IMAGES): + tar = tarfile.open(zip_path) + tar.extractall(path=extract_folder) + tar.close() + print('Extracted. Full Imagenet Validation dataset is located at {0}\n'. 
+ format(data_dir)) + + +def print_processbar(done, total): + done_filled = done * '=' + empty_filled = (total - done) * ' ' + percentage_done = done * 100 / total + sys.stdout.write("\r[%s%s]%d%%" % + (done_filled, empty_filled, percentage_done)) + sys.stdout.flush() + + +def check_integrity(filename, target_hash): + print('\nThe binary file exists. Checking file integrity...\n') + md = hashlib.md5() + count = 0 + total_parts = 50 + chunk_size = 8192 + onepart = FULL_SIZE_BYTES / chunk_size / total_parts + with open(filename) as ifs: + while True: + buf = ifs.read(8192) + if count % onepart == 0: + done = count / onepart + print_processbar(done, total_parts) + count = count + 1 + if not buf: + break + md.update(buf) + hash1 = md.hexdigest() + if hash1 == target_hash: + return True + else: + return False -def reader(): - data_dir = download_unzip() - file_list = os.path.join(data_dir, 'val_list.txt') - output_file = os.path.join(data_dir, 'int8_full_val.bin') +def convert(file_list, data_dir, output_file): + print('Converting 50000 images to binary file ...\n') with open(file_list) as flist: lines = [line.strip() for line in flist] num_images = len(lines) - if not os.path.exists(output_file): - print( - 'Preprocessing to binary file......\n' - ) - with open(output_file, "w+b") as of: - #save num_images(int64_t) to file - of.seek(0) - num = np.array(int(num_images)).astype('int64') - of.write(num.tobytes()) - for idx, line in enumerate(lines): - img_path, label = line.split() - img_path = os.path.join(data_dir, img_path) - if not os.path.exists(img_path): - continue - - #save image(float32) to file - img = process_image( - img_path, 'val', color_jitter=False, rotate=False) - np_img = np.array(img) - of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 - * idx) - of.write(np_img.astype('float32').tobytes()) - - #save label(int64_t) to file - label_int = (int)(label) - np_label = np.array(label_int) - of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 
- * num_images + idx * SIZE_INT64) - of.write(np_label.astype('int64').tobytes()) - - print('The preprocessed binary file path {}\n'.format(output_file)) + with open(output_file, "w+b") as ofs: + #save num_images(int64_t) to file + ofs.seek(0) + num = np.array(int(num_images)).astype('int64') + ofs.write(num.tobytes()) + per_parts = 1000 + full_parts = FULL_IMAGES / per_parts + print_processbar(0, full_parts) + for idx, line in enumerate(lines): + img_path, label = line.split() + img_path = os.path.join(data_dir, img_path) + if not os.path.exists(img_path): + continue + + #save image(float32) to file + img = process_image( + img_path, 'val', color_jitter=False, rotate=False) + np_img = np.array(img) + ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * + idx) + ofs.write(np_img.astype('float32').tobytes()) + ofs.flush() + + #save label(int64_t) to file + label_int = (int)(label) + np_label = np.array(label_int) + ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 * + num_images + idx * SIZE_INT64) + ofs.write(np_label.astype('int64').tobytes()) + ofs.flush() + if (idx + 1) % per_parts == 0: + done = (idx + 1) / per_parts + print_processbar(done, full_parts) + print("Conversion finished.") + + +def run_convert(): + print('Start to download and convert 50000 images to binary file...') + cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download') + extract_folder = os.path.join(cache_folder, 'full_data') + data_dir = os.path.join(extract_folder, DATA_DIR_NAME) + file_list = os.path.join(data_dir, 'val_list.txt') + zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz') + output_file = os.path.join(cache_folder, 'int8_full_val.bin') + retry = 0 + try_limit = 3 + + while not (os.path.exists(output_file) and + os.path.getsize(output_file) == FULL_SIZE_BYTES and + check_integrity(output_file, TARGET_HASH)): + if os.path.exists(output_file): + sys.stderr.write( + "\n\nThe existing binary file is broken. 
Start to generate new one...\n\n". + format(output_file)) + os.remove(output_file) + if retry < try_limit: + retry = retry + 1 + else: + raise RuntimeError( + "Can not convert the dataset to binary file with try limit {0}". + format(try_limit)) + download_concat(cache_folder, zip_path) + extract(zip_path, extract_folder) + convert(file_list, data_dir, output_file) + print("\nSuccess! The binary file can be found at {0}".format(output_file)) if __name__ == '__main__': - reader() + run_convert() diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index df7af71d9b32ba11822e066f574146cfa5c50edd..fc6de70f5a89331cb8940b34c1c9ff5a164c2894 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -11,7 +11,7 @@ function(inference_download INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} URL ${URL}/${FILENAME} - DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" @@ -30,7 +30,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec new file mode 100644 index 0000000000000000000000000000000000000000..712449f6be87a3cfb61f099ad6291875c8ad1292 --- /dev/null +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -0,0 +1,82 @@ +abs +acos +asin +atan +attention_lstm +brelu +conv_shift +cos +cos_sim 
+dequantize +elementwise_div +elementwise_max +elementwise_min +elu +fc +flatten +fsp +fused_embedding_fc_lstm +fused_embedding_seq_pool +fusion_gru +fusion_lstm +fusion_repeated_fc_relu +fusion_seqconv_eltadd_relu +fusion_seqexpand_concat_fc +fusion_seqpool_concat +fusion_squared_mat_sub +gelu +gru +hard_shrink +hierarchical_sigmoid +hinge_loss +huber_loss +leaky_relu +log +logsigmoid +lookup_table +lrn +lstm_unit +lstmp +max_pool2d_with_index +max_pool3d_with_index +maxout +modified_huber_loss +nce +pool2d +pool3d +pow +prelu +quantize +rank_loss +reduce_max +reduce_mean +reduce_min +reduce_prod +reduce_sum +requantize +reshape +rnn_memory_helper +round +row_conv +sequence_softmax +sin +softplus +softshrink +softsign +space_to_depth +spp +square +squared_l2_distance +squared_l2_norm +squeeze +stanh +swish +tanh_shrink +teacher_student_sigmoid_loss +tensor_array_to_tensor +thresholded_relu +transpose +tree_conv +unpool +unsqueeze +warpctc diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 1de59a5165c83a314a0ff8f4e4351aa3326beb67..9d7100cc3db91f5bf7dbd993c9f9ba5d4fc98ea6 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/affine_grid_op.h" +#include #include +#include #include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -173,9 +175,10 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - auto theta_dims = ctx->GetInputDim("Theta"); if (ctx->HasOutput(framework::GradVarName("Theta"))) { - ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + auto output_dims = ctx->GetInputDim(framework::GradVarName("Output")); + ctx->SetOutputDim(framework::GradVarName("Theta"), + {output_dims[0], 2, 3}); } } diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index 9d5b4f6f54ccfc9802cef6abac428e28a72ac293..e4feb14b2271a50c8e8fb7ce4c81dd6c99042e21 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -120,40 +120,8 @@ class AnakinEngineOp : public framework::OperatorBase { inference::Singleton::Global() .Get(engine_key_); } - return anakin_engine_; } - - void Prepare(const framework::Scope &scope, const platform::Place &dev_place, - AnakinNvEngineT *engine) const { - LOG(INFO) << "Prepare Anakin engine (Optimize model structure, Select OP " - "kernel etc). 
This process may cost a lot of time."; - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(Attr("subgraph")); - - std::vector output_maps = - Attr>("output_name_mapping"); - - inference::Singleton::Global() - .ConvertBlock(block_desc, param_names_, scope, engine); - engine->Freeze(); - for (const auto &x : Inputs("Xs")) { - if (param_names_.count(x)) continue; - auto &t = - inference::analysis::GetFromScope(scope, x); - auto t_shape = framework::vectorize2int(t.dims()); - // all input shape should be 4 dims - if (t_shape.size() == 2) { - t_shape.push_back(1); - t_shape.push_back(1); - } - engine->SetInputShape(x, t_shape); - } - - engine->Optimize(); - - engine->InitGraph(); - } }; } // namespace operators diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h index 6cbdaefeda099c36a864289ef8195c20d09c55e6..bf7b83bb7a7d4f4861276a228389e87a42a39ef7 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.h +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -58,6 +58,8 @@ class ArgMinMaxKernel : public framework::OpKernel { auto& out = *(ctx.Output("Out")); out.mutable_data(ctx.GetPlace()); auto axis = ctx.Attr("axis"); + auto x_rank = x.dims().size(); + if (axis < 0) axis += x_rank; auto& dev_ctx = ctx.template device_context(); #define CALL_ARG_MINMAX_FUNCTOR(rank) \ diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 8d261a118a75ee16027faf60341cefd30c3cdbba..bd69f422e5dbd5a5dc95150b10daa302f47ec5ff 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/bilinear_tensor_product_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -121,15 +124,9 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { "The second dimension of input(Out@GRAD) must be equal to " "the third dimension of the Input(Weight)."); - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ( - bias_dims[1], out_dims[1], - "The second dimension of input(Out@GRAD) must be equal to " - "the second dimension of the Input(Bias)."); - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) - ctx->SetOutputDim(bias_grad_name, bias_dims); + auto bias_grad_name = framework::GradVarName("Bias"); + if (ctx->HasOutput(bias_grad_name)) { + ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); } auto x_grad_name = framework::GradVarName("X"); @@ -148,13 +145,39 @@ class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { } }; +class BilinearTensorProductGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bilinear_tensor_product_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput("Weight", Input("Weight")); + if (ForwardOp().Inputs().count("Bias") > 0) { + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + } + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Y"), InputGrad("Y")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; 
REGISTER_OPERATOR(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BilinearTensorProductGradOpDescMaker); REGISTER_OPERATOR(bilinear_tensor_product_grad, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/dgc_clip_by_norm_op.h b/paddle/fluid/operators/dgc_clip_by_norm_op.h index bd22d16f7a21877af4e78c30f7e0985c64b543f2..197bf59b2a470e1f6e4e31c6706d1e3f8e73fbbc 100644 --- a/paddle/fluid/operators/dgc_clip_by_norm_op.h +++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h @@ -24,18 +24,21 @@ class DGCClipByNormKernel : public ClipByNormKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto rampup_begin_step = context.Attr("rampup_begin_step"); - if (static_cast(rampup_begin_step) >= 0) { - auto current_step_tensor = - context.Input("current_step"); - auto* current_step = current_step_tensor->data(); - - if (static_cast(*current_step) < - static_cast(rampup_begin_step)) { - VLOG(10) << "current_step:" << *current_step - << " < rampup_begin_step:" << rampup_begin_step - << " so does't use dgc_clip_by_norm"; - return; - } + if (static_cast(rampup_begin_step) < 0) { + return; + } + + auto current_step_tensor = context.Input("current_step"); + auto* current_step = current_step_tensor->data(); + + VLOG(10) << "current_step:" << *current_step + << ", rampup_begin_step:" << rampup_begin_step; + + if (static_cast(*current_step) < static_cast(rampup_begin_step)) { + VLOG(10) << "current_step:" << *current_step + << " < rampup_begin_step:" << rampup_begin_step + << " so does't use dgc_clip_by_norm"; + return; } return ClipByNormKernel::Compute(context); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index fc28fe818dc0bd2a8607118c015b6b5fd168fb43..972b4f67a8388ce68952fa90aaa224cd45c6d226 100644 --- 
a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -30,7 +30,7 @@ if(WITH_GRPC) else() set(BRPC_SRCS brpc/brpc_client.cc brpc/brpc_server.cc brpc/brpc_sendrecvop_utils.cc brpc/brpc_variable_response.cc brpc/brpc_rdma_pool.cc) - set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(${BRPC_SRCS} parameter_prefetch.cc parameter_send.cc parameter_recv.cc communicator.cc rpc_server_test.cc brpc/brpc_serde_test.cc collective_server.cc collective_server_test.cc collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(BRPC_DEPS brpc ssl crypto protobuf leveldb snappystream snappy zlib) @@ -50,8 +50,12 @@ endif() cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${RPC_DEPS} executor proto_desc lookup_sparse_table_op SERIAL) -cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) +cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) +cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) +cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) +cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc DEPS sendrecvop_rpc executor ${RPC_DEPS} diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc new file mode 100644 index 0000000000000000000000000000000000000000..eba18c67771fa26eed855b0f19591e06101f424d --- /dev/null +++ 
b/paddle/fluid/operators/distributed/communicator.cc @@ -0,0 +1,213 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed/communicator.h" + +#include +#include // NOLINT +#include // NOLINT + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" + +DEFINE_bool(communicator_independent_recv_thread, true, + "use an independent to recv vars from parameter server"); +DEFINE_int32(communicator_send_queue_size, 20, + "queue size to recv gradient before send"); +DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, + "max grad num to send before recv parameters"); +DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_max_merge_var_num, 20, + "max var num to merge and send"); +DEFINE_bool(communicator_fake_rpc, false, + "fake mode does not really send any thing"); + +namespace paddle { +namespace operators { +namespace distributed { + +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + +std::unique_ptr Communicator::communicator_(nullptr); +std::once_flag 
Communicator::init_flag_; + +Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, + const RpcCtxMap &recv_varname_to_ctx, + Scope *recv_scope) + : send_varname_to_ctx_(send_varname_to_ctx), + recv_varname_to_ctx_(recv_varname_to_ctx), + recv_scope_(recv_scope) { + // get all send information from graph, build vars_to_send + VLOG(0) << "communicator_independent_recv_thread: " + << FLAGS_communicator_independent_recv_thread; + VLOG(0) << "communicator_send_queue_size: " + << FLAGS_communicator_send_queue_size; + VLOG(0) << "communicator_max_send_grad_num_before_recv: " + << FLAGS_communicator_max_send_grad_num_before_recv; + VLOG(0) << "communicator_thread_pool_size: " + << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_max_merge_var_num: " + << FLAGS_communicator_max_merge_var_num; + VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; + send_scope_.reset(new Scope()); + for (auto &iter : send_varname_to_ctx_) { + send_varname_to_queue_[iter.first] = + std::make_shared>>( + FLAGS_communicator_send_queue_size); + } + send_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); + recv_threadpool_.reset(new ::ThreadPool(FLAGS_communicator_thread_pool_size)); +} + +Communicator::~Communicator() { + VLOG(3) << "~Communicator"; + running_ = false; + if (send_thread_) send_thread_->join(); + if (recv_thread_) recv_thread_->join(); + VLOG(3) << "~Communicator done"; +} + +void Communicator::SendThread() { + VLOG(3) << "SendThread start!"; + while (running_) { + std::vector> task_futures; + task_futures.reserve(send_varname_to_ctx_.size()); + VLOG(3) << "run send graph"; + auto before_run_send_graph = GetCurrentUS(); + for (auto &iter : send_varname_to_queue_) { + auto &var_name = iter.first; + auto &var_queue = iter.second; + if (var_queue->Size() > 0) { + auto send_task = [this, &var_name, &var_queue] { + VLOG(3) << var_name << " merge and send"; + std::vector> vars; + size_t merged_var_num = 0; + while 
(var_queue->Size() > 0 && + merged_var_num < FLAGS_communicator_max_merge_var_num) { + vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } + merged_var_num++; + } + auto before_merge = GetCurrentUS(); + MergeVars(var_name, vars, send_scope_.get()); + auto after_merge = GetCurrentUS(); + VLOG(3) << "merge " << var_name << " use time " + << after_merge - before_merge; + auto send_functor = distributed::ParameterSend(); + auto &ctx = send_varname_to_ctx_.at(var_name); + if (!FLAGS_communicator_fake_rpc) { + send_functor(ctx, *send_scope_, true); + } + auto after_send = GetCurrentUS(); + VLOG(3) << "send " << var_name << " use time " + << after_send - after_merge; + }; + task_futures.emplace_back( + send_threadpool_->enqueue(std::move(send_task))); + } else { + VLOG(3) << var_name << " queue empty"; + } + } + for (auto &task_f : task_futures) { + task_f.wait(); + } + auto after_run_send_graph = GetCurrentUS(); + auto send_graph_use_time = after_run_send_graph - before_run_send_graph; + if (send_graph_use_time > 100) { + VLOG(1) << "run send graph use time " + << after_run_send_graph - before_run_send_graph; + } + if (!FLAGS_communicator_independent_recv_thread) { + RecvAll(); + } + } +} + +void Communicator::RecvAll() { + VLOG(3) << "parallel run recv graph"; + auto before_send = GetCurrentUS(); + std::vector> task_futures; + task_futures.reserve(recv_varname_to_ctx_.size()); + for (auto &iter : recv_varname_to_ctx_) { + auto recv_task = [this, &iter] { + auto &var_name = iter.first; + VLOG(3) << "recv var " << var_name; + auto recv_functor = distributed::ParameterRecv(); + if (!FLAGS_communicator_fake_rpc) { + recv_functor(iter.second, *recv_scope_); + } + }; + task_futures.emplace_back(recv_threadpool_->enqueue(std::move(recv_task))); + } + for (auto &task : task_futures) { + task.wait(); + } + auto after_recv = 
GetCurrentUS(); + VLOG(1) << "run recv graph use time " << after_recv - before_send; +} + +void Communicator::RecvThread() { + VLOG(3) << "RecvThread start!"; + while (running_) { + auto grad_num = grad_num_.load(); + if (grad_num > FLAGS_communicator_max_send_grad_num_before_recv) { + VLOG(1) << "current grad num " << grad_num; + RecvAll(); + grad_num_.store(0); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + } +} + +void Communicator::Send(const std::string &var_name, + const framework::Scope &scope) { + VLOG(3) << "communicator send " << var_name; + // push var into send queue by var_name + auto *grad_var = scope.FindVar(var_name); + PADDLE_ENFORCE(grad_var->IsInitialized(), "grad var should be inited"); + auto tmp_grad_var = std::make_shared(); + framework::CopyVariable(*grad_var, tmp_grad_var.get()); + auto &queue = send_varname_to_queue_.at(var_name); + VLOG(3) << "send " << var_name << " queue size " << queue->Size(); + queue->Push(tmp_grad_var); +} + +Communicator *Communicator::GetInstance() { return communicator_.get(); } + +void Communicator::Start() { + running_ = true; + // start send and recv thread + send_thread_.reset( + new std::thread(std::bind(&Communicator::SendThread, this))); + if (FLAGS_communicator_independent_recv_thread) { + recv_thread_.reset( + new std::thread(std::bind(&Communicator::RecvThread, this))); + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h new file mode 100644 index 0000000000000000000000000000000000000000..41155bfc31bb31520fdcf5bd50b203f2e1f2c516 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator.h @@ -0,0 +1,219 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using Scope = framework::Scope; +using Variable = framework::Variable; + +template +class BlockingQueue { + public: + explicit BlockingQueue(size_t capacity) : capacity_(capacity) { + PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0."); + } + + bool Push(const T& elem) { + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.push_back(elem); + } + cv_.notify_one(); + return true; + } + + bool Push(T&& elem) { + { + std::unique_lock lock(mutex_); + cv_.wait(lock, [&] { return queue_.size() < capacity_; }); + PADDLE_ENFORCE_LT(queue_.size(), capacity_); + queue_.emplace_back(std::move(elem)); + } + cv_.notify_one(); + return true; + } + + T Pop() { + std::unique_lock lock(mutex_); + cv_.wait(lock, [=] { return !queue_.empty(); }); + T rc(std::move(queue_.front())); + queue_.pop_front(); + cv_.notify_one(); + return rc; + } + + size_t Cap() const { + std::lock_guard lock(mutex_); + 
return capacity_; + } + + size_t Size() const { + std::lock_guard lock(mutex_); + return queue_.size(); + } + + private: + const size_t capacity_; + std::deque queue_; + + mutable std::mutex mutex_; + std::condition_variable cv_; +}; + +template +using EigenVector = framework::EigenVector; + +inline void MergeVars(const std::string& var_name, + const std::vector>& vars, + Scope* scope) { + PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); + auto cpu_place = platform::CPUPlace(); + auto& var0 = vars[0]; + auto* out_var = scope->Var(var_name); + if (var0->IsType()) { + auto dims = var0->Get().dims(); + VLOG(3) << "merge " << var_name << " LoDTensor " << dims; + + // init output tensor + auto* out_t = out_var->GetMutable(); + out_t->mutable_data(dims, cpu_place); + + // check the input dims + for (auto& var : vars) { + auto& var_t = var->Get(); + PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims"); + } + + // set output tensor to 0. + auto cpu_ctx = paddle::platform::CPUDeviceContext(); + math::SetConstant + constant_functor; + constant_functor(cpu_ctx, out_t, static_cast(0)); + + // sum all vars to out + auto result = EigenVector::Flatten(*out_t); + for (auto& var : vars) { + auto& in_t = var->Get(); + auto in = EigenVector::Flatten(in_t); + result.device(*cpu_ctx.eigen_device()) = result + in; + } + } else if (var0->IsType()) { + auto& slr0 = var0->Get(); + auto* out_slr = out_var->GetMutable(); + out_slr->mutable_rows()->clear(); + out_slr->mutable_value()->mutable_data({{}}, cpu_place); + std::vector inputs; + inputs.reserve(vars.size()); + for (auto& var : vars) { + inputs.push_back(&var->Get()); + } + math::scatter::MergeAdd + merge_add; + auto dev_ctx = paddle::platform::CPUDeviceContext(); + merge_add(dev_ctx, inputs, out_slr, false); + VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() + << " dims: " << slr0.value().dims(); + } else { + PADDLE_THROW("unsupported var type!"); + } +} + +using RpcCtxMap = 
std::unordered_map; + +class Communicator { + public: + Communicator(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope); + + ~Communicator(); + + void Start(); + + // send grad + void Send(const std::string& var_name, const framework::Scope& scope); + + private: + // recv all parameter + void RecvAll(); + void SendThread(); + void RecvThread(); + + bool running_ = false; + std::unordered_map>>> + send_varname_to_queue_; + RpcCtxMap send_varname_to_ctx_; + RpcCtxMap recv_varname_to_ctx_; + std::unique_ptr send_thread_; + std::unique_ptr recv_thread_; + Scope* recv_scope_; // should be global scope + std::unique_ptr send_scope_; // an independent scope + std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; + std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; + std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv + + // the following code is for initialize the commnunicator + public: + static void Init(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, Scope* recv_scope) { + InitImpl(send_varname_to_ctx, recv_varname_to_ctx, recv_scope); + } + + static Communicator* GetInstance(); + + private: + // Init is called by GetInstance. 
+ static void InitImpl(const RpcCtxMap& send_varname_to_ctx, + const RpcCtxMap& recv_varname_to_ctx, + Scope* recv_scope) { + if (communicator_ == nullptr) { + communicator_.reset(new Communicator(send_varname_to_ctx, + recv_varname_to_ctx, recv_scope)); + } + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr communicator_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/communicator_test.cc b/paddle/fluid/operators/distributed/communicator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5294ac33d15611a003eeb7971891e8ca85ec6a73 --- /dev/null +++ b/paddle/fluid/operators/distributed/communicator_test.cc @@ -0,0 +1,110 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/communicator.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; + +TEST(communicator, merge_lod_tensors) { + auto cpu_place = platform::CPUPlace(); + auto dims = framework::make_ddim({2, 3}); + std::vector> in_vars; + float out_value = 0; + for (auto i = 0; i < 10; ++i) { + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *tensor = var->GetMutable(); + auto *data = tensor->mutable_data(dims, cpu_place); + for (auto j = 0; j < tensor->numel(); ++j) { + data[j] = static_cast(i); + } + out_value += static_cast(i); + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_tensor = scope->FindVar(out_name)->Get(); + auto *out_data = out_tensor.data(); + ASSERT_EQ(out_tensor.dims(), dims); + for (auto i = 0; i < out_tensor.numel(); ++i) { + ASSERT_EQ(out_data[i], out_value); + } +} + +TEST(communicator, merge_selected_rows) { + auto cpu_place = platform::CPUPlace(); + int64_t width = 10; + std::vector> in_vars; + const int64_t height = 100; + for (auto i = 0; i < 10; ++i) { + std::vector rows; + for (auto k = 0; k <= i; ++k) { + rows.push_back(k); + } + auto var = std::make_shared(); + in_vars.emplace_back(var); + auto *slr = var->GetMutable(); + slr->set_height(height); + slr->set_rows(rows); + auto dims = + framework::make_ddim({static_cast(rows.size()), width}); + auto *data = slr->mutable_value()->mutable_data(dims, cpu_place); + for (auto i = 0; i < rows.size(); ++i) { + for (auto j = 0; j < width; ++j) { + data[i * width + j] = static_cast(rows[i]); + } + } + } + const std::string out_name = "Out"; + std::unique_ptr scope; + scope.reset(new framework::Scope()); + 
scope->Var(out_name); + for (auto i = 0; i < 10; ++i) { + MergeVars(out_name, in_vars, scope.get()); + } + auto &out_slr = scope->FindVar(out_name)->Get(); + auto &out_t = out_slr.value(); + auto *out_data = out_t.data(); + ASSERT_EQ(out_t.dims(), framework::make_ddim({10, width})); + std::vector out_values; + out_values.reserve(10); + for (auto i = 0; i < 10; ++i) { + out_values.push_back(static_cast(i * (10 - i))); + } + for (auto i = 0; i < out_slr.rows().size(); ++i) { + ASSERT_EQ(out_slr.rows()[i], i); + for (auto j = 0; j < width; ++j) { + ASSERT_EQ(out_data[i * width + j], out_values[i]); + } + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 4a9c158cb0ab7f2d6fecbba9f957ae6ef153074c..0eb313f75dfa64f8722faa365128f3111f72bd0b 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include #include #include "paddle/fluid/operators/distributed/grpc/grpc_serde.h" @@ -106,7 +107,6 @@ class RequestSend final : public RequestBase { auto invar = request_->GetVar(); int trainer_id = request_->GetTrainerId(); framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id); Finish(reply_, &responder_); } diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index 65295c2c103ceca50d9de3ae314246256497d084..0e8d877e08cf6186cef79cd550035cb8699271d2 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include #include @@ -38,30 +39,9 @@ using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; -static size_t GetSectionIndex(int64_t id, - const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (id < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - static std::vector> SplitIds( const std::vector& ids_vector, - const std::vector& height_section, framework::Scope* scope) { + const std::vector& height_section) { std::set all_ids; for (auto id : ids_vector) { all_ids.insert(id); @@ -79,7 +59,7 @@ static std::vector> SplitIds( static void SplitIdsIntoMultipleVarsBySection( const std::vector& in_var_names, - const std::vector& height_section, + const std::vector& 
height_section, const std::vector>& splited_ids, framework::Scope* scope) { PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size(), ""); @@ -101,7 +81,7 @@ static void SplitIdsIntoMultipleVarsBySection( static void MergeMultipleVarsIntoOneBySection( const std::string& id_name, const std::vector& ids_vector, const std::string& out_name, const std::vector& out_var_names, - const std::vector& height_section, + const std::vector& height_section, const std::vector>& splited_ids, const framework::ExecutionContext& context, framework::Scope* scope, platform::DeviceContext* actual_ctx) { @@ -178,10 +158,10 @@ static void MergeMultipleVarsIntoOneBySection( void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope) { - auto& local_scope = scope.NewScope(); + std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -225,23 +205,23 @@ void prefetch(const std::string& id_name, const std::string& out_name, #endif } - auto splited_ids = SplitIds(ids_vector, height_sections, &local_scope); + auto splited_ids = SplitIds(ids_vector, height_sections); SplitIdsIntoMultipleVarsBySection(in_var_names, height_sections, splited_ids, - &local_scope); + local_scope.get()); // create output var in local scope for (auto& name : out_var_names) { - local_scope.Var(name)->GetMutable(); + local_scope->Var(name)->GetMutable(); } std::vector rets; for (size_t i = 0; i < in_var_names.size(); i++) { - if (NeedSend(local_scope, in_var_names[i])) { + if (NeedSend(*local_scope.get(), in_var_names[i])) { VLOG(3) << "sending " << in_var_names[i] << " to " << epmap[i] << " to get " << out_var_names[i] << " back"; rets.push_back(rpc_client->AsyncPrefetchVar( - 
epmap[i], cpu_ctx, local_scope, in_var_names[i], out_var_names[i], - table_names[i])); + epmap[i], cpu_ctx, *local_scope.get(), in_var_names[i], + out_var_names[i], table_names[i])); } else { VLOG(3) << "don't send no-initialied variable: " << out_var_names[i]; } @@ -253,8 +233,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, - context, &local_scope, &actual_ctx); - scope.DeleteScope(&local_scope); + context, local_scope.get(), &actual_ctx); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 2f850a0332256d458e79ed9da361c86eb8a2f780..0429ec4415dca19ff620cd7af5a8c0a935e17e2f 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -26,7 +26,7 @@ namespace distributed { void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope); @@ -35,7 +35,7 @@ void prefetch_with_reconstruct(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, - const std::vector& height_sections, + const std::vector& height_sections, const framework::ExecutionContext& context, const framework::Scope& scope, framework::LoDTensor* original) { diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7d4c262aa9fad10a23adc61b94ba0c38577c0e8 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_recv.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" +#include "paddle/fluid/operators/strided_memcpy.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterRecv::operator()(const RpcContext &rpc_ctx, + const framework::Scope &scope) { + VLOG(3) << "ParameterRecv in"; + std::unique_ptr local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance(0); + + auto *recv_var = scope.FindVar(rpc_ctx.var_name); + + std::vector recved_tensors; + + // recv all vars to local scope + if (recv_var->IsType()) { + std::vector 
rets; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &recv_var_name = rpc_ctx.splited_var_names[i]; + framework::Tensor *t = + local_scope->Var(recv_var_name)->GetMutable(); + recved_tensors.push_back(t); + VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; + rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, + *local_scope.get(), recv_var_name, + recv_var_name)); + } + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } else { + PADDLE_THROW("unsupported var type to recv!"); + } + + // concat recved tensor into one var + { + size_t output_offset = 0; + framework::Tensor *recv_tensor = + recv_var->GetMutable(); + auto dev_ctx = paddle::platform::CPUDeviceContext(); + int64_t recv_numel = 0; + for (auto *in : recved_tensors) { + recv_numel += in->numel(); + auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; + } + PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); + } + + VLOG(3) << "ParameterRecv out"; +} + +template struct ParameterRecv; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_recv.h b/paddle/fluid/operators/distributed/parameter_recv.h new file mode 100644 index 0000000000000000000000000000000000000000..e955fca7250ecc88f3b1a08611f380da50df788d --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_recv.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterRecv { + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc new file mode 100644 index 0000000000000000000000000000000000000000..9ce424445229cde0a7e775c95f4af8839f4d4d68 --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include "paddle/fluid/operators/distributed/parameter_send.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/rpc_client.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +using LoDTensor = framework::LoDTensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +void ParameterSend::operator()(const RpcContext &rpc_ctx, + const framework::Scope &scope, bool sync) { + std::unique_ptr local_scope = scope.NewTmpScope(); + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &cpu_ctx = *pool.Get(platform::CPUPlace()); + + distributed::RPCClient *rpc_client = + distributed::RPCClient::GetInstance(0); + + auto *send_var = scope.FindVar(rpc_ctx.var_name); + size_t out_num = rpc_ctx.splited_var_names.size(); + if (send_var->IsType()) { + if (out_num > 1) { + auto &send_tensor = send_var->Get(); + auto &send_tensor_dims = send_tensor.dims(); + std::vector outs_dims; + outs_dims.reserve(out_num); + + // infer output shape + PADDLE_ENFORCE_EQ(rpc_ctx.height_sections.size(), out_num, + "tensor split sections size" + "should be equal to output size."); + for (size_t i = 0; i < out_num; ++i) { + auto dim = send_tensor_dims; + dim[0] = rpc_ctx.height_sections[i]; + outs_dims.push_back(dim); + } + + // create output var in local scope + size_t row_offset = 0; + for (auto i = 0; i < out_num; ++i) { + framework::Tensor *out = local_scope->Var(rpc_ctx.splited_var_names[i]) + ->GetMutable(); + *out = 
send_tensor.Slice(row_offset, row_offset + outs_dims[i][0]); + row_offset += outs_dims[i][0]; + } + } + } else if (send_var->IsType()) { + auto &send_slr = send_var->Get(); + auto abs_sections = ToAbsoluteSection(rpc_ctx.height_sections); + + auto &send_rows = send_slr.rows(); + std::vector> outs_rows_idx; + std::vector> outs_dense_idx; + + outs_rows_idx.resize(out_num); + outs_dense_idx.resize(out_num); + + auto row_numel = send_slr.value().numel() / send_slr.value().dims()[0]; + auto *src = send_slr.value().data(); + + // create output var in local scope + std::vector outs; + for (auto &name : rpc_ctx.splited_var_names) { + auto *out = local_scope->Var(name)->GetMutable(); + outs.push_back(out); + } + + // split rows index into output sparse vars + for (size_t i = 0; i < send_rows.size(); ++i) { + size_t out_idx = GetSectionIndex(send_rows[i], abs_sections); + outs_rows_idx[out_idx].push_back(send_rows[i]); + outs_dense_idx[out_idx].push_back(i); + } + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < outs_rows_idx.size(); ++i) { + auto rows_idx = outs_rows_idx[i]; + outs[i]->set_height(rpc_ctx.height_sections[i]); + auto dims = send_slr.GetCompleteDims(); + dims[0] = rows_idx.size(); + outs[i]->mutable_rows()->clear(); + outs[i]->mutable_value()->mutable_data(dims, send_slr.place()); + if (rows_idx.size() > 0) { + for (auto idx : rows_idx) { + outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + } + auto dst = outs[i]->mutable_value()->mutable_data(place); + for (size_t j = 0; j < rows_idx.size(); j++) { + if (platform::is_cpu_place(place)) { + memory::Copy( + platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(), + src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel); + } else { + PADDLE_THROW("do not support GPU now"); + /* + #ifdef PADDLE_WITH_CUDA + auto stream = ctx.cuda_device_context().stream(); + memory::Copy(platform::CUDAPlace(), dst + j * row_numel, + platform::CUDAPlace(), + src + outs_dense_idx[i][j] * 
row_numel, + sizeof(T) * row_numel, stream); + #else + PADDLE_THROW("Paddle is not compiled with GPU"); + #endif + */ + } + } + } + PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(), + "rows should has the same size with tensor dim 0"); + } + + } else { + PADDLE_THROW("unsupported var type to send!"); + } + + std::vector rets; + for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { + auto &send_var_name = rpc_ctx.splited_var_names[i]; + auto &endpoint = rpc_ctx.epmap[i]; + if (NeedSend(*local_scope.get(), send_var_name)) { + VLOG(3) << "sending " << send_var_name << " to " << endpoint; + rets.push_back(rpc_client->AsyncSendVar( + endpoint, cpu_ctx, *local_scope.get(), send_var_name)); + } else { + VLOG(3) << "don't send non-initialized variable: " + << rpc_ctx.splited_var_names[i]; + } + } + + if (sync) { + for (auto &handle : rets) { + PADDLE_ENFORCE(handle->Wait(), "internal error in RPCClient"); + } + } +} + +template struct ParameterSend; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/parameter_send.h b/paddle/fluid/operators/distributed/parameter_send.h new file mode 100644 index 0000000000000000000000000000000000000000..9077f4a4fb9fd9d7152e8be72519f16b1999e93d --- /dev/null +++ b/paddle/fluid/operators/distributed/parameter_send.h @@ -0,0 +1,35 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" + +namespace paddle { +namespace operators { +namespace distributed { + +template +struct ParameterSend { + void operator()(const RpcContext &rpc_ctx, const framework::Scope &scope, + bool sync); +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index a1c5c0777402b808eed6306862fd6dd41b529dbd..e289ec929dbd6643a2518b92c1a25b7d63e790a9 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -59,13 +59,8 @@ bool RequestSendHandler::Handle(const std::string& varname, "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); } - try { - executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), - scope); - } catch (std::exception& e) { - LOG(ERROR) << "async: run sub program error " << e.what(); - return false; - } + executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), + scope); return true; } else { // sync rpc_server_->WaitCond(kRequestSend); diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h new file mode 100644 index 0000000000000000000000000000000000000000..3de89c2ae89d29edc317ca123882d1c55038b6ca --- /dev/null +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
namespace paddle {
namespace operators {
namespace distributed {

/**
 * Plain value type bundling what a split send/recv needs:
 * the original variable name, the names of its split pieces, the endpoint
 * of each piece and the height (row count) of each section.
 */
struct RpcContext {
  RpcContext() = default;

  RpcContext(const std::string &name, const std::vector<std::string> &names,
             const std::vector<std::string> &emap,
             const std::vector<int64_t> &sections)
      : var_name(name),
        splited_var_names(names),
        epmap(emap),
        height_sections(sections) {}

  // All members are value types, so the compiler-generated copy/move
  // operations are correct (and avoid default-construct-then-assign).
  RpcContext(const RpcContext &ctx) = default;
  RpcContext &operator=(const RpcContext &ctx) = default;
  RpcContext(RpcContext &&ctx) = default;
  RpcContext &operator=(RpcContext &&ctx) = default;

  std::string var_name;                        // variable before splitting
  std::vector<std::string> splited_var_names;  // one name per split piece
  std::vector<std::string> epmap;              // endpoint per split piece
  std::vector<int64_t> height_sections;        // height per section
};

/**
 * Write a human-readable dump of `rpc_ctx` to `os` and return `os`.
 * Every list element is followed by ", " (including the last one).
 */
inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) {
  os << "{";
  os << "var_name: " << rpc_ctx.var_name << "\n";

  os << "splited_var_names: [";
  for (const auto &name : rpc_ctx.splited_var_names) {
    os << name << ", ";
  }
  os << "]\n";

  os << "epmap: [";
  for (const auto &ep : rpc_ctx.epmap) {
    os << ep << ", ";
  }
  os << "]\n";

  os << "height_sections: [";
  for (const auto &section : rpc_ctx.height_sections) {
    os << section << ", ";
  }
  os << "]\n";
  os << "}";
  return os;
}

}  // namespace distributed
}  // namespace operators
}  // namespace paddle
b/paddle/fluid/operators/distributed/variable_response.h @@ -60,13 +60,14 @@ class VariableResponse { bool create_scope = false) : scope_(scope), dev_ctx_(dev_ctx), create_scope_(create_scope) { if (create_scope) { - local_scope_ = &scope->NewScope(); + local_scope_ = scope->NewTmpScope().release(); } } virtual ~VariableResponse() { - if (create_scope_) { - scope_->DeleteScope(local_scope_); + if (local_scope_) { + delete local_scope_; + local_scope_ = nullptr; } } diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index a8bb597cbd59290df1347c164d37104c6ac431e9..a1ef1af39ff2ab1456706ebafbd3d7ce1acc0c07 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 120c65f29699bf2745b09ea312d1de069c8173c5..3fd0700a077321d931e87b1d94c3637d167c9eff 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -20,6 +20,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_recv.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -34,6 +36,11 @@ class RecvOp : public framework::OperatorBase { void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + bool do_not_run = Attr("do_not_run"); + if (do_not_run) { + VLOG(3) << "recv do not run!"; + return; + } std::vector epmap = Attr>("epmap"); std::vector varnames = Attr>("varnames"); @@ -48,32 +55,41 @@ class RecvOp : public framework::OperatorBase { distributed::RPCClient::GetInstance( Attr("trainer_id")); - if (with_barrier) { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVar"; - rets.push_back( - rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); - } - if (sync_mode) { + std::vector recv_varnames = + Attr>("recv_varnames"); + + if (recv_varnames.size() > 0) { + auto recv_functor = distributed::ParameterRecv(); + auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); + recv_functor(rpc_ctx, scope); + } else { + if (with_barrier) { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? 
outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVar"; + rets.push_back( + rpc_client->AsyncGetVar(epmap[i], ctx, scope, varname, outs[i])); + } + if (sync_mode) { + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + } + } else { + std::vector rets; + for (size_t i = 0; i < outs.size(); i++) { + std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; + VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " + << varname << " and with AsyncGetVarNoBarrier"; + rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, + varname, outs[i])); + } for (size_t i = 0; i < rets.size(); i++) { PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } } - } else { - std::vector rets; - for (size_t i = 0; i < outs.size(); i++) { - std::string varname = varnames.size() == 0 ? outs[i] : varnames[i]; - VLOG(4) << "recv " << outs[i] << " from " << epmap[i] << " with " - << varname << " and with AsyncGetVarNoBarrier"; - rets.push_back(rpc_client->AsyncGetVarNoBarrier(epmap[i], ctx, scope, - varname, outs[i])); - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } } } }; @@ -110,6 +126,12 @@ This operator can get variables from server side. "for example: we need var named 'moment_1@127.0.0.1:1001', " "and it real name on parameter server is 'moment_1'. 
") .SetDefault({}); + AddAttr>( + "recv_varnames", + "(vector) " + "the splited parameter varnames to be recved from pserver") + .SetDefault(std::vector{}); + AddAttr("do_not_run", "if recv need to really run").SetDefault(false); } }; diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index e2c2147ab5e9a76498a0fd9e1f18b75eed32e91e..b08cd0942f8c89b60d722c931d0cec2063b96578 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -19,7 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed/communicator.h" #include "paddle/fluid/operators/distributed/distributed.h" +#include "paddle/fluid/operators/distributed/parameter_send.h" +#include "paddle/fluid/operators/distributed/rpc_common.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -37,30 +40,47 @@ class SendOp : public framework::OperatorBase { const platform::Place& place) const override { auto ins = Inputs("X"); - std::vector epmap = Attr>("epmap"); + auto epmap = Attr>("epmap"); int sync_send = Attr("sync_mode"); - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); + auto send_varnames = Attr>("send_varnames"); + auto height_sections = Attr>("sections"); - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); - - std::vector rets; - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - rets.push_back(rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); + if (send_varnames.size() > 0) { + PADDLE_ENFORCE_EQ(ins.size(), 1, ""); + if 
(distributed::Communicator::GetInstance() == nullptr) { + auto send_functor = distributed::ParameterSend(); + auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, + height_sections); + send_functor(rpc_ctx, scope, true); } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; + distributed::Communicator::GetInstance()->Send(ins[0], scope); } - } - if (sync_send) { - for (size_t i = 0; i < rets.size(); i++) { - VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + Attr("trainer_id")); + + std::vector rets; + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(scope, ins[i])) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rets.push_back( + rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i])); + } else { + VLOG(3) << "don't send no-initialied variable: " << ins[i]; + } + } + if (sync_send) { + for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; + } } } } @@ -88,6 +108,21 @@ This operator will send variables to listen_and_serve op at the parameter server "Server endpoints in the order of input " "variables for mapping") .SetDefault({"127.0.0.1:6164"}); + AddAttr>("sections", + "(vector) " + "the length of each output along the " + "specified axis.") + .SetDefault(std::vector{}); + AddAttr>( + "send_varnames", + "(vector) " + "the splited output varnames to send to pserver") + .SetDefault(std::vector{}); + AddAttr("num", + "(int, default 0)" + "Number of sub-tensors. 
namespace paddle {
namespace operators {

/**
 * Convert per-section heights into the absolute start offset of each section
 * (an exclusive prefix sum).
 *
 * Example: {3, 4, 5} -> {0, 3, 7}.
 *
 * @param height_sections height of every section; may be empty.
 * @return a vector of the same length; empty for empty input.
 */
inline std::vector<int64_t> ToAbsoluteSection(
    const std::vector<int64_t>& height_sections) {
  // Zero-filled construction sets the first offset and makes the empty
  // input safe (no write to element 0 of an empty vector).
  std::vector<int64_t> abs_sections(height_sections.size(), 0);
  for (size_t i = 1; i < height_sections.size(); ++i) {
    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
  }
  return abs_sections;
}

/**
 * Find the section that contains `id`.
 *
 * Scans the start offsets from the second entry and returns the index of the
 * section preceding the first offset that is greater than `id`; when no
 * offset is greater, the last section is returned.
 *
 * @param id           row id to locate.
 * @param abs_sections start offsets as produced by ToAbsoluteSection().
 * @return index into `abs_sections`. For an empty `abs_sections` the result
 *         wraps to the largest size_t value, so callers must pass at least
 *         one section.
 */
inline size_t GetSectionIndex(int64_t id,
                              const std::vector<int64_t>& abs_sections) {
  for (size_t i = 1; i < abs_sections.size(); ++i) {
    if (id < abs_sections[i]) {
      return i - 1;
    }
  }
  return abs_sections.size() - 1;
}

}  // namespace operators
}  // namespace paddle
} }; -class FusedEmbeddingSeqPoolOpGradDescMaker - : public framework::DefaultGradOpDescMaker { - using ::paddle::framework::DefaultGradOpDescMaker< - true>::DefaultGradOpDescMaker; - - protected: - virtual std::string GradOpType() const { - return "fused_embedding_seq_pool_grad"; - } -}; - class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -160,7 +149,7 @@ class FusedEmbeddingSeqPoolOpGradVarTypeInference namespace ops = paddle::operators; REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, - ops::FusedEmbeddingSeqPoolOpGradDescMaker, + paddle::framework::DefaultGradOpDescMaker, ops::FusedEmbeddingSeqPoolOpMaker); REGISTER_OPERATOR(fused_embedding_seq_pool_grad, ops::FusedEmbeddingSeqPoolOpGrad, diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 2ab40f482d7a1463703085037bcb94fd4aecf377..09fd6a25d18d5484f4d1c1631faae8da2fbd5473 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include #include +#include namespace paddle { namespace operators { @@ -107,8 +108,6 @@ class GroupNormGradOp : public framework::OperatorWithKernel { // check input PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of GroupNormOp should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Mean"), - "Input(Mean) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("Variance"), "Input(Variance) of GroupNormOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), @@ -159,7 +158,6 @@ class GroupNormGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("Bias", Input("Bias")); op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); op->SetInput("Y", Output("Y")); - op->SetInput("Mean", Output("Mean")); op->SetInput("Variance", Output("Variance")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index d0e1057c4357e372d3ab396841de7b2d0577d365..479b839e473591ba57945b496b83b0e76f620534 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -134,9 +134,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 4d5a84bcafed1ab0739349e1dbc7b5a9f9ad64ec..82c8171ca52ffb128df103f27bafbdba1e72e52f 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ 
b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -13,11 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include #include +#include #include #include #include + #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" @@ -65,12 +68,13 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { size_t num_classes = static_cast(ctx.Attr("num_classes")); // for remote prefetch + auto remote_prefetch = ctx.Attr("remote_prefetch"); auto epmap = ctx.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server - auto height_sections = ctx.Attr>("height_sections"); + auto height_sections = ctx.Attr>("height_sections"); auto table_names = ctx.Attr>("table_names"); std::vector real_rows = PathToRows(*path); framework::Scope& local_scope = ctx.scope().NewScope(); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index d635fc617bc63e1f625e93d21886f6ad134947f6..04323eee02c8dbed6eeffef67ef75b18f351e46b 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -91,9 +91,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" @@ -119,15 +119,6 @@ or not. And the output only shares the LoD information with input Ids. 
} }; -class LookupTableOpGradDescMaker - : public framework::DefaultGradOpDescMaker { - using ::paddle::framework::DefaultGradOpDescMaker< - true>::DefaultGradOpDescMaker; - - protected: - virtual std::string GradOpType() const { return "lookup_table_grad"; } -}; - class LookupTableOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -169,7 +160,8 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, - ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker); + paddle::framework::DefaultGradOpDescMaker, + ops::LookupTableOpMaker); REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, ops::LookupTableOpGradVarTypeInference); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 0af8b9e69cfe09890f28ef2028baa19319a5c379..a863af4af914095a9ee2a7fcc986cc878fd808ea 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -84,7 +84,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); if (!epmap.empty()) { diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 56c6e37ae3c62e1f9af66ef6ed16111dc1e93d9d..62e298e066948c93a84a131a0dffc0a1d53f2a5b 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -50,10 +50,12 @@ class LookupTableKernel : public framework::OpKernel { // for remote prefetch auto epmap = context.Attr>("epmap"); - auto height_sections = context.Attr>("height_sections"); + auto remote_prefetch = context.Attr("remote_prefetch"); + auto height_sections 
= + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 222d761ef91d8aee4843d717dabba7edf131f8dc..db0ee9bc1695f7b1a55b4d111dc470b462210963 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -95,7 +95,7 @@ struct MergeAdd { enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY }; -// out = seleted_rows_in / tensor +// out = selected_rows_in / tensor template struct UpdateToTensor { void operator()(const DeviceContext& context, const ScatterOps& op, diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index fa7cc58c08455457dd129afd130067704ec72c7c..358e4f37b5b45c53b88f5477452ebf6448dcc461 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -156,9 +156,9 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddAttr>( "epmap", "(string vector, default 127.0.0.1:6164)" @@ -187,14 +187,6 @@ By default this operator uses a uniform distribution for sampling. 
} }; -class NCEOpGradDescMaker : public framework::DefaultGradOpDescMaker { - using ::paddle::framework::DefaultGradOpDescMaker< - true>::DefaultGradOpDescMaker; - - protected: - virtual std::string GradOpType() const { return "nce_grad"; } -}; - class NCEOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -259,7 +251,9 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpGradDescMaker, ops::NCEOpMaker); +REGISTER_OPERATOR(nce, ops::NCEOp, + paddle::framework::DefaultGradOpDescMaker, + ops::NCEOpMaker); REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, ops::NCEKernel); diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index 3e48b67a570d41482e358ae3941eb1e2b6ab91f8..12f3118ec775dfce13d1f7ff836d82e1d999c65b 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -156,9 +156,10 @@ class NCEKernel : public framework::OpKernel { auto input_mat = EigenMatrix::From(*(context.Input("Input"))); // for remote prefetch + auto remote_prefetch = context.Attr("remote_prefetch"); auto epmap = context.Attr>("epmap"); - if (!epmap.empty()) { + if (remote_prefetch && !epmap.empty()) { // if epmap is not empty, then the parameter will be fetched from remote // parameter // server @@ -172,7 +173,8 @@ class NCEKernel : public framework::OpKernel { framework::Scope &local_scope = context.scope().NewScope(); - auto height_sections = context.Attr>("height_sections"); + auto height_sections = + context.Attr>("height_sections"); auto table_names = context.Attr>("table_names"); auto *ids = local_scope.Var("Ids@Prefetch"); diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc index 
aa19c62c83648814e86b1e7062424be3693e4b98..81fbe3e514241ecdd2832141eba4250ced2017a9 100644 --- a/paddle/fluid/operators/norm_op.cc +++ b/paddle/fluid/operators/norm_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/norm_op.h" +#include +#include +#include + namespace paddle { namespace operators { @@ -74,6 +78,24 @@ class NormOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; + +class NormOpGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("norm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Norm", Output("Norm")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -81,7 +103,7 @@ namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(norm, ops::NormOp, ops::NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::NormOpGradOpDescMaker); REGISTER_OPERATOR(norm_grad, ops::NormOpGrad); REGISTER_OP_CPU_KERNEL(norm, ops::NormKernel, ops::NormKernel); diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index 6ef2dacb3869ab3b20505699c2fbe2f129c20068..9731aefa95c5243e29ace87ad8c35d5b01904e60 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -612,8 +615,9 @@ class Pad2dOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); } }; @@ -625,7 +629,9 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); bind->SetInput("X", Input("X")); - bind->SetInput("Paddings", Input("Paddings")); + if (ForwardOp().Inputs().count("Paddings") > 0) { + bind->SetInput("Paddings", Input("Paddings")); + } bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); @@ -634,6 +640,10 @@ class Pad2dOpGradMaker : public framework::SingleGradOpDescMaker { } }; +// TODO(zjl): Paddings can also be skipped! 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(Pad2dOpGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -641,6 +651,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, ops::Pad2dOpGradMaker); -REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad); +REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, + ops::Pad2dOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel); REGISTER_OP_CPU_KERNEL(pad2d_grad, ops::Pad2dGradCPUKernel); diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index 78d238aa6115265023d5d87c01048a87180448d0..b23105916bcef4759c5a212ef019e33e21f2a1b7 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -80,12 +80,14 @@ class BlockingQueue { return true; } else { PADDLE_ENFORCE(closed_); + VLOG(3) << "queue is closed! return nothing."; return false; } } void ReOpen() { std::lock_guard lock(mutex_); + VLOG(1) << "reopen queue"; closed_ = false; std::deque new_deque; queue_.swap(new_deque); @@ -95,6 +97,7 @@ class BlockingQueue { void Close() { std::lock_guard lock(mutex_); + VLOG(1) << "close queue"; closed_ = true; send_cv_.notify_all(); receive_cv_.notify_all(); diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index c24e9aedc4ebd8f4fa9e483b1c1cc71fe0bf0aa7..5d93d2e32ef65c7f52723e21e79c825340efc990 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -22,6 +22,7 @@ namespace paddle { namespace operators { namespace reader { BufferedReader::~BufferedReader() { + VLOG(1) << "~BufferedReader"; reader_->Shutdown(); while (!position_.empty()) { position_.front().wait(); @@ -45,6 +46,7 @@ BufferedReader::BufferedReader( thread_pool_(1), place_(place), buffer_size_(buffer_size) { + VLOG(1) << "BufferedReader"; 
#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { platform::SetDeviceId(boost::get(place_).device); @@ -131,6 +133,7 @@ void BufferedReader::ReadAsync(size_t i) { } void BufferedReader::ShutdownImpl() { + VLOG(1) << "ShutdownImpl"; reader_->Shutdown(); while (!position_.empty()) { position_.pop(); diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 5b53edff5d8ea79a03542231dbf34f5a6f254986..be044085f1435089b3fb736df684358136ea7c10 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/framework/ddim.h" @@ -57,7 +58,10 @@ class LoDTensorBlockingQueue { inline void ReOpen() { queue_.ReOpen(); } - inline void Close() { queue_.Close(); } + inline void Close() { + VLOG(1) << "LoDTensorBlockingQueue close"; + queue_.Close(); + } inline bool IsClosed() const { return queue_.IsClosed(); } diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc index 37f1b9dda50ba4b62d7cf75765125e0ad663d9d8..d652f9216f8faf53deeac2c9ce1f737651c3939b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/operators/sequence_ops/sequence_concat_op.h" +#include #include namespace paddle { @@ -73,13 +74,43 @@ class SeqConcatShapeInferer : public framework::InferShapeBase { } }; -class SeqConcatGradShapeInferer : public framework::InferShapeBase { +class SeqConcatGradOpDescMaker : public framework::SingleGradOpDescMaker { public: - void operator()(framework::InferShapeContext *context) const override { + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_concat_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; + } +}; + +class SeqConcatGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *context) const override { context->SetOutputsDim(framework::GradVarName("X"), context->GetInputsDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SeqConcatGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle @@ -87,14 +118,14 @@ namespace op = paddle::operators; REGISTER_OPERATOR(sequence_concat, paddle::framework::OperatorWithKernel, op::SeqConcatOpMaker, op::SeqConcatShapeInferer, - paddle::framework::DefaultGradOpDescMaker); + op::SeqConcatGradOpDescMaker); template using Kernel = op::SeqConcatKernel; REGISTER_OP_CPU_KERNEL(sequence_concat, Kernel, Kernel, Kernel); -REGISTER_OPERATOR(sequence_concat_grad, paddle::framework::OperatorWithKernel, - op::SeqConcatGradShapeInferer); 
+REGISTER_OPERATOR(sequence_concat_grad, op::SeqConcatGradOp, + op::SeqConcatGradNoNeedBufferVarsInference); template using GradKernel = op::SeqConcatGradKernel; diff --git a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h index ff035f421c4907ba940b973b3fd2a9421ed2dbae..f9b2ed3846a0f29bd2b058b944360a8fb66c24f8 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_concat_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_concat_op.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/math/concat_and_split.h" @@ -89,37 +91,49 @@ class SeqConcatGradKernel : public framework::OpKernel { dxs[i]->mutable_data(context.GetPlace()); } } + std::vector sliced_x; - std::vector> sliced_dx; + std::vector> sliced_dx; for (size_t i = 1; i < xs[0]->lod()[0].size(); ++i) { for (size_t j = 0; j < xs.size(); ++j) { const framework::LoDTensor *x = xs[j]; + framework::DDim x_dims = x->dims(); + framework::LoDTensor *dx = dxs[j]; auto &x_lod = x->lod()[0]; - sliced_x.emplace_back(x->Slice(x_lod[i - 1], x_lod[i])); - if (dx != nullptr) { - sliced_dx.emplace_back(dx->Slice(x_lod[i - 1], x_lod[i])); + + auto prev_lod = x_lod[i - 1]; + auto next_lod = x_lod[i]; + + x_dims[0] = next_lod - prev_lod; + + sliced_x.emplace_back(); + sliced_x.back().Resize(x_dims); + + if (dx) { + sliced_dx.emplace_back(dx->Slice(prev_lod, next_lod)); } else { - sliced_dx.emplace_back(boost::blank()); + sliced_dx.emplace_back(boost::none); } } } - math::SplitFunctor functor; std::vector sliced_x_ptr; - std::vector sliced_dx_ptr; + sliced_x_ptr.reserve(sliced_x.size()); for (auto &x : sliced_x) { sliced_x_ptr.emplace_back(&x); } + std::vector sliced_dx_ptr; + sliced_dx_ptr.reserve(sliced_dx.size()); for (auto &dx : sliced_dx) { - try { - 
sliced_dx_ptr.emplace_back(&boost::get(dx)); - } catch (boost::bad_get &) { - sliced_dx_ptr.emplace_back(nullptr); + if (dx) { + sliced_dx_ptr.emplace_back(&dx.get()); } } + + math::SplitFunctor functor; functor(context.template device_context(), detail::Ref( context.Input(framework::GradVarName("Out")), diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc index 65cd9edbc7125f605d6fb437a2e056054eb9a6d7..89c1fe834832802cc86dacd5a2d8c22bafa6072b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h" #include +#include +#include +#include namespace paddle { namespace operators { @@ -171,13 +174,57 @@ context_length, context_stride and context_start. } }; +class SequenceConvGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_conv_grad"); + op->SetAttrMap(Attrs()); + + if (boost::get(Attrs().at("paddingTrainable")) && + ForwardOp().Inputs().count("PaddingData") > 0) { + op->SetInput("PaddingData", Input("PaddingData")); + op->SetOutput(framework::GradVarName("PaddingData"), + InputGrad("PaddingData")); + } + + op->SetInput("X", Input("X")); + op->SetInput("Filter", Input("Filter")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter")); + + return op; + } +}; + +class SequenceConvGradNoNeedBufferVarsInference + : public framework::NoNeedBufferVarsInference { + public: + using framework::NoNeedBufferVarsInference::NoNeedBufferVarsInference; + + 
std::unordered_set operator()() const override { + if (!boost::get(Attrs().at("paddingTrainable"))) { + return {"PaddingData"}; + } else { + return {}; + } + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp); + ops::SequenceConvGradOpDescMaker); + +REGISTER_OPERATOR(sequence_conv_grad, ops::SequenceConvGradOp, + ops::SequenceConvGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_conv, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc index 3b79d0c71975bb740b4085ce80f7d95b65f600c1..e1f6c3e3d599340acfa9bb5b47017b003721e4a3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" +#include +#include namespace paddle { namespace operators { @@ -70,6 +72,12 @@ class SequenceExpandAsOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("Y", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandAsOpMaker : public framework::OpProtoAndCheckerMaker { @@ -131,7 +139,6 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -143,16 +150,48 @@ class SequenceExpandAsOpGrad : public framework::OperatorWithKernel { ctx->ShareLoD("X", x_grad_name); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandAsOpGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_as_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsOpNoNeedBufferVarsInference, "Y"); 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceExpandAsGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand_as, ops::SequenceExpandAsOp, ops::SequenceExpandAsOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad); + ops::SequenceExpandAsOpGradOpDescMaker, + ops::SequenceExpandAsOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_as_grad, ops::SequenceExpandAsOpGrad, + ops::SequenceExpandAsGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand_as, ops::SequenceExpandAsKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc index f6c42415301bc8d6f3509bfba2ff356265643bad..b7c0420636ab60e8a3e0a9332cbd3858aacda1b0 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" +#include namespace paddle { namespace operators { @@ -96,6 +97,12 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } }; class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker { @@ -188,7 +195,6 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); @@ -199,16 +205,47 @@ class SequenceExpandOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; +class SequenceExpandOpGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_expand_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Y", Input("Y")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SequenceExpandOpNoNeedBufferVarsInference, + "Y"); +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + 
SequenceExpandGradOpNoNeedBufferVarsInference, "X", "Y"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad); + ops::SequenceExpandOpGradDescMaker, + ops::SequenceExpandOpNoNeedBufferVarsInference); +REGISTER_OPERATOR(sequence_expand_grad, ops::SequenceExpandOpGrad, + ops::SequenceExpandGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_expand, ops::SequenceExpandKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc index 23c7bf7cea830bb0ccf5e81f99130043c2d5f80b..5290d0e6c6a2569e389345f61a0844ce3cbde10f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_pad_op.h" +#include +#include namespace paddle { namespace operators { @@ -194,18 +196,39 @@ class SequencePadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequencePadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_pad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pad, ops::SequencePadOp, ops::SequencePadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp); + ops::SequencePadGradOpDescMaker); +REGISTER_OPERATOR(sequence_pad_grad, ops::SequencePadGradOp, + ops::SequencePadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pad, ops::SequencePadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc index 1754221e7711b09c38f81c3f5803daa5372ed0dd..b4923571df95432d030d393a69d427f3ae17f298 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cc @@ 
-13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h" +#include #include namespace paddle { @@ -114,8 +115,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -138,13 +140,17 @@ class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequencePoolGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, ops::SequencePoolGradOpMaker); -REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp); +REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp, + ops::SequencePoolGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_pool, ops::SequencePoolKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc index 8267c04f9f20511deba363f9a0aae761736ba90b..5a22212edf29cc79d28b12029dc7595ae5f1aab3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_scatter_op.h" +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" @@ -124,25 +125,49 @@ class SequenceScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - platform::CPUPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + platform::CPUPlace()); } }; +class SequenceScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceScatterGradNoNeedBufferVarsInference, "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_scatter, ops::SequenceScatterOp, ops::SequenceScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp); + 
ops::SequenceScatterGradDescMaker); +REGISTER_OPERATOR(sequence_scatter_grad, ops::SequenceScatterGradOp, + ops::SequenceScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(sequence_scatter, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, ops::SequenceScatterOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc index 35f49f78cedaca59d58ea19b909e5a950281c6e9..4b2ec6e7cad7c04e248c0ffbb117951fba1ec877 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sequence_ops/sequence_slice_op.h" +#include namespace paddle { namespace operators { @@ -70,8 +71,9 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -113,14 +115,35 @@ NOTE: The first dimension size of input, the size of offset and Length, should b } }; +class SequenceSliceGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_slice_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Offset", Input("Offset")); + op->SetInput("Length", Input("Length")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceSliceGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_slice, ops::SequenceSliceOp, - ops::SequenceSliceOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp); + ops::SequenceSliceOpMaker, ops::SequenceSliceGradOpDescMaker); +REGISTER_OPERATOR(sequence_slice_grad, ops::SequenceSliceGradOp, + ops::SequenceSliceGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_slice, ops::SequenceSliceOpKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc index 2cf508e0b707ecc986886e72e5d42fde3c84894d..6c98a3e8731abb989f8dab97eff5c6ad56111742 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/sequence_ops/sequence_unpad_op.h" +#include +#include namespace paddle { namespace operators { @@ -125,19 +127,39 @@ class SequenceUnpadGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("X")); + auto data_type = framework::GetDataTypeOfVar( + ctx.InputVar(framework::GradVarName("Out"))); return framework::OpKernelType(data_type, ctx.device_context()); } }; +class SequenceUnpadGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_unpad_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + SequenceUnpadGradOpNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(sequence_unpad, ops::SequenceUnpadOp, - ops::SequenceUnpadOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp); + ops::SequenceUnpadOpMaker, ops::SequenceUnpadGradOpDescMaker); +REGISTER_OPERATOR(sequence_unpad_grad, ops::SequenceUnpadGradOp, + ops::SequenceUnpadGradOpNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( sequence_unpad, ops::SequenceUnpadOpKernel, diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 07df3dca831d7e646050ae57402c1a493c2e50e9..fe8ca41b698159a782547ce673a374d074d3b73d 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ 
b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -81,10 +81,9 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { auto* d_x = ctx.Output(framework::GradVarName("X")); if (d_x) { const auto* d_out = ctx.Input(framework::GradVarName("Out")); - const auto* x_t = ctx.Input("X"); d_x->mutable_data(ctx.GetPlace()); - int padded_length = x_t->dims()[1]; + int padded_length = d_x->dims()[1]; LoDTensor zero_pads; zero_pads.Resize({1, 1}); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 26355e58615454c8e9aea1d6a5405368e6006e87..ad6fb3510f02ae783c8ae4318f559a8db74a59d1 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -11,6 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" #include +#include namespace paddle { namespace operators { @@ -73,12 +74,7 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@Grad) should not be null"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Output(X@Grad) should not be null"); - - auto input_dims = ctx->GetInputDim("X"); + auto input_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); ctx->SetOutputDim(framework::GradVarName("X"), input_dims); @@ -87,8 +83,9 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ 
-100,7 +97,6 @@ class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { std::unique_ptr Apply() const override { std::unique_ptr op(new framework::OpDesc()); op->SetType("shuffle_channel_grad"); - op->SetInput("X", Input("X")); op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 9506343b3d508459c6e10dc68eba13504b07338f..dbc3e1a7ebe26ffccd24d1749093d014751d866f 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -78,10 +78,14 @@ template class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -91,10 +95,7 @@ class ShuffleChannelGradOpCUDAKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index f6af1bc88598870ebccef81bd37f93f376940851..3ce1e0c770bb3fe6c4b0a54dad14e47f372958af 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -57,10 +57,14 @@ template class ShuffleChannelGradOpKernel : 
public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + int group = ctx.Attr("group"); - auto input_dims = input->dims(); + const auto& input_dims = input_grad->dims(); auto num = input_dims[0]; auto channel = input_dims[1]; auto height = input_dims[2]; @@ -71,10 +75,6 @@ class ShuffleChannelGradOpKernel : public framework::OpKernel { int group_row = group; int group_column = channel / group_row; - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); const T* output_grad_data = output_grad->data(); for (int n = 0; n < num; ++n) { diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index c21b0c13c752b82b80c120cb5a5d4a010ef18287..5c92588cc1d073612d2f6a7b315edf16cc14bedd 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" +#include +#include +#include namespace paddle { namespace operators { @@ -139,6 +142,24 @@ However the output only shares the LoD with input `X`. 
} }; +class SigmoidCrossEntropyWithLogitsGradOpDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sigmoid_cross_entropy_with_logits_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle @@ -146,7 +167,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsOp, ops::SigmoidCrossEntropyWithLogitsOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SigmoidCrossEntropyWithLogitsGradOpDescMaker); REGISTER_OPERATOR(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc index 94995fc99612adb1164e60f1a51747f74eacfb73..589c98e51e32bc9eb7d6ccfb721a6a5f091470cf 100644 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/slice_op.h" #include +#include #include namespace paddle { @@ -135,6 +136,13 @@ class SliceOpGrad : public framework::OperatorWithKernel { ctx->SetOutputDim(x_grad_name, x_dims); } } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } }; class SliceOpGradMaker : public framework::SingleGradOpDescMaker { @@ -153,13 +161,17 @@ class SliceOpGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SliceOpGradNoNeedBufferVarsInference, + "Input"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(slice, ops::SliceOp, ops::SliceOpMaker, ops::SliceOpGradMaker); -REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad); +REGISTER_OPERATOR(slice_grad, ops::SliceOpGrad, + ops::SliceOpGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( slice, ops::SliceKernel, diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 1fef2b3d378c96d087118d0136885e7e29aa237c..9ec459e2a68d85af526e741d7fd9ecd858383132 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -16,31 +16,12 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" namespace paddle { namespace operators { -static int FindOutIdx(int row, const std::vector& abs_sections) { - for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { - return i - 1; - } - } - return abs_sections.size() - 1; -} - -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; - abs_sections.resize(height_sections.size()); - abs_sections[0] = 0; - for (size_t i = 1; i < height_sections.size(); ++i) { - abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; - } - return abs_sections; -} - template class SplitSelectedRowsOpKernel : public framework::OpKernel { public: @@ -51,7 +32,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { auto abs_sections = ToAbsoluteSection(height_sections); - auto x_rows = x->rows(); + auto& x_rows = x->rows(); + auto height = x->height(); std::vector> outs_rows_idx; std::vector> outs_dense_idx; @@ -63,8 +45,10 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { // split rows index into output sparse vars for (size_t i = 0; i < x_rows.size(); ++i) { - int out_idx = FindOutIdx(x_rows[i], abs_sections); - outs_rows_idx[out_idx].push_back(x_rows[i]); + auto& id = x_rows[i]; + PADDLE_ENFORCE_LT(id, height); + int out_idx = GetSectionIndex(id, abs_sections); + outs_rows_idx[out_idx].push_back(id); outs_dense_idx[out_idx].push_back(i); } auto place = ctx.GetPlace(); @@ -78,7 +62,9 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { outs[i]->mutable_rows()->clear(); if (rows_idx.size() > 0) { for (auto idx : rows_idx) { - outs[i]->mutable_rows()->push_back(idx - abs_sections[i]); + auto id_offset = idx - abs_sections[i]; + PADDLE_ENFORCE_LT(id_offset, height_sections[i]); + outs[i]->mutable_rows()->push_back(id_offset); } 
auto dst = outs[i]->mutable_value()->mutable_data(ctx.GetPlace()); for (size_t j = 0; j < rows_idx.size(); j++) { diff --git a/paddle/fluid/operators/temporal_shift_op.cc b/paddle/fluid/operators/temporal_shift_op.cc index 7df649fc5b7bf8671303a28d727be1d85c1fa6e4..3b7d90b795b45d97dfdbe90f7e37ea28b942f2a0 100644 --- a/paddle/fluid/operators/temporal_shift_op.cc +++ b/paddle/fluid/operators/temporal_shift_op.cc @@ -10,6 +10,9 @@ limitations under the License. */ #include "paddle/fluid/operators/temporal_shift_op.h" +#include +#include +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -125,19 +128,32 @@ class TemporalShiftOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { - ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } } framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class TemporalShiftGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("temporal_shift_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -146,8 +162,7 @@ class 
TemporalShiftOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(temporal_shift, ops::TemporalShiftOp, - ops::TemporalShiftOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::TemporalShiftOpMaker, ops::TemporalShiftGradOpDescMaker); REGISTER_OPERATOR(temporal_shift_grad, ops::TemporalShiftOpGrad); REGISTER_OP_CPU_KERNEL(temporal_shift, ops::TemporalShiftKernel, ops::TemporalShiftKernel); diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f889e2e9658eecb4c1931390122fc8b7915bc303..a2669ee2113630332102549fd7e5c1d85e9972b6 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -93,6 +93,9 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) cc_library(timer SRCS timer.cc) cc_test(timer_test SRCS timer_test.cc DEPS timer) +cc_library(lodtensor_printer SRCS lodtensor_printer.cc DEPS ddim place tensor scope lod_tensor variable_helper framework_proto) +cc_test(lodtensor_printer_test SRCS lodtensor_printer_test.cc DEPS lodtensor_printer) + cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) if(WITH_GPU) nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_tracer gpu_info enforce) diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 07159d4a12ef4b628f7705ed206d3334be46dfc8..1697343790d13c37d63505acfe471b379bf897d9 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -17,6 +17,9 @@ if (CUPTI_FOUND) endif(CUPTI_FOUND) nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +if (WITH_WBAES) + cc_library(dynload_wbaes SRCS wbaes.cc DEPS dynamic_loader wbaes) +endif() if (WITH_MKLML) cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) endif() 
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 15d516836652ea4ea4d1bcdf35022e6b79cc3b52..8ac9393787324d3a8a17ac5a800bcf69638a4fed 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -48,6 +48,8 @@ DEFINE_string( DEFINE_string(mklml_dir, "", "Specify path for loading libmklml_intel.so."); +DEFINE_string(wbaes_dir, "", "Specify path for loading libwbaes.so."); + namespace paddle { namespace platform { namespace dynload { @@ -246,6 +248,16 @@ void* GetMKLMLDsoHandle() { #endif } +void* GetWBAESDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dylib"); +#elif defined(_WIN32) + return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.dll"); +#else + return GetDsoHandleFromSearchPath(FLAGS_wbaes_dir, "libwbaes.so"); +#endif +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index edb4c649addfaf941a00588395d9191038217979..5a642967c7666f5d5943214f557786c87491d740 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -32,6 +32,7 @@ void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); +void* GetWBAESDsoHandle(); } // namespace dynload } // namespace platform diff --git a/paddle/fluid/platform/dynload/wbaes.cc b/paddle/fluid/platform/dynload/wbaes.cc new file mode 100644 index 0000000000000000000000000000000000000000..37387b202aadddef859b0eecca55cb9c99d826ee --- /dev/null +++ b/paddle/fluid/platform/dynload/wbaes.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_WBAES + +#include "paddle/fluid/platform/dynload/wbaes.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag wbaes_dso_flag; +void *wbaes_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +WBAES_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/wbaes.h b/paddle/fluid/platform/dynload/wbaes.h new file mode 100644 index 0000000000000000000000000000000000000000..22400d44e4ca5568f1d74e4e194e45e81cbdfefe --- /dev/null +++ b/paddle/fluid/platform/dynload/wbaes.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#ifdef PADDLE_WITH_WBAES + +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag wbaes_dso_flag; +extern void *wbaes_dso_handle; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load wbaes routine + * via operator overloading. + */ + +#define DYNAMIC_LOAD_WBAES_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + using wbaesFunc = decltype(&::__name); \ + std::call_once(wbaes_dso_flag, []() { \ + wbaes_dso_handle = paddle::platform::dynload::GetWBAESDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(wbaes_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define DECLARE_DYNAMIC_LOAD_WBAES_WRAP(__name) DYNAMIC_LOAD_WBAES_WRAP(__name) + +#define WBAES_ROUTINE_EACH(__macro) __macro(GSECF); + +WBAES_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WBAES_WRAP); + +#undef DYNAMIC_LOAD_WBAES_WRAP + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc new file mode 100644 index 0000000000000000000000000000000000000000..a5aa1a4148686b032c52f99497252fde4867438f --- /dev/null +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -0,0 +1,68 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/lodtensor_printer.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +namespace paddle { +namespace platform { + +template +void print_lod_tensor(const std::string& var_name, + const framework::LoDTensor& lod_tensor, + const std::string& print_info) { + auto inspect = lod_tensor.data(); + auto element_num = lod_tensor.numel(); + + std::ostringstream sstream; + sstream << print_info << "\t"; + sstream << var_name << "\t"; + sstream << inspect[0]; + for (int j = 1; j < element_num; ++j) { + sstream << " " << inspect[j]; + } + + std::cout << sstream.str() << std::endl; +} + +void PrintVar(framework::Scope* scope, const std::string& var_name, + const std::string& print_info) { + framework::Variable* var = scope->FindVar(var_name); + if (var == nullptr) { + VLOG(1) << "Variable Name " << var_name << " does not exist in your scope"; + return; + } + framework::LoDTensor* tensor = var->GetMutable(); + if (tensor == nullptr) { + VLOG(1) << "tensor of variable " << var_name + << " does not exist in your scope"; + return; + } + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + print_lod_tensor(var_name, *tensor, print_info); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type(); +} + +} // end namespace platform +} // end namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer.h 
b/paddle/fluid/platform/lodtensor_printer.h new file mode 100644 index 0000000000000000000000000000000000000000..e070e3540c996a0fe248a3b9312c18d948395426 --- /dev/null +++ b/paddle/fluid/platform/lodtensor_printer.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace platform { +void PrintVar(framework::Scope* scope, const std::string& var_name, + const std::string& print_info); +} // end namespace platform +} // end namespace paddle diff --git a/paddle/fluid/platform/lodtensor_printer_test.cc b/paddle/fluid/platform/lodtensor_printer_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..19e85284b8fc8842b2e5662343c74fc451b08d9e --- /dev/null +++ b/paddle/fluid/platform/lodtensor_printer_test.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/platform/lodtensor_printer.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +TEST(LodTensorPrinter, PrintVar) { + paddle::framework::Scope scope; + paddle::platform::PrintVar(&scope, "NotAVar", "We don't have var"); +} diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 0991eff0fdaaca80ada2d8dd3c68eba72fd3f6e6..c8a0aa58859cca06375ce578e5a7097179e23107 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,11 +1,11 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool tracer analysis_predictor imperative_profiler) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) endif() -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc imperative.cc ir.cc inference_api.cc) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/async_executor_py.cc b/paddle/fluid/pybind/async_executor_py.cc index 222c128c66f37a259eb17527fe2586860f701275..009d13c243bdb3ee05d79edf9e47a09127bfc10b 100644 --- a/paddle/fluid/pybind/async_executor_py.cc +++ b/paddle/fluid/pybind/async_executor_py.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #ifdef _XOPEN_SOURCE #undef _XOPEN_SOURCE #endif +#include #include #include diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..b773fd03c003e4c5b51f4876e6ac999f9e830ce4 --- /dev/null +++ b/paddle/fluid/pybind/data_set_py.cc @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif +#include +#include +#include +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/data_set.h" +#include "paddle/fluid/framework/dataset_factory.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/data_set_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { + +void BindDataset(py::module* m) { + py::class_>(*m, + "Dataset") + .def(py::init([](const std::string& name = "MultiSlotDataset") { + return framework::DatasetFactory::CreateDataset(name); + })) + .def("set_filelist", 
&framework::Dataset::SetFileList) + .def("set_thread_num", &framework::Dataset::SetThreadNum) + .def("set_trainer_num", &framework::Dataset::SetTrainerNum) + .def("set_hdfs_config", &framework::Dataset::SetHdfsConfig) + .def("set_data_feed_desc", &framework::Dataset::SetDataFeedDesc) + .def("get_filelist", &framework::Dataset::GetFileList) + .def("get_thread_num", &framework::Dataset::GetThreadNum) + .def("get_trainer_num", &framework::Dataset::GetTrainerNum) + .def("get_hdfs_config", &framework::Dataset::GetHdfsConfig) + .def("get_data_feed_desc", &framework::Dataset::GetDataFeedDesc) + .def("register_client2client_msg_handler", + &framework::Dataset::RegisterClientToClientMsgHandler) + .def("load_into_memory", &framework::Dataset::LoadIntoMemory) + .def("release_memory", &framework::Dataset::ReleaseMemory) + .def("local_shuffle", &framework::Dataset::LocalShuffle) + .def("global_shuffle", &framework::Dataset::GlobalShuffle); +} + +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/data_set_py.h b/paddle/fluid/pybind/data_set_py.h new file mode 100644 index 0000000000000000000000000000000000000000..f60e862ce673119c7b8e8ae5981fc54e8c9bdb2e --- /dev/null +++ b/paddle/fluid/pybind/data_set_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindDataset(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc new file mode 100644 index 0000000000000000000000000000000000000000..77f15db8d68da131c892b1a65946c1994b90fd04 --- /dev/null +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include +#include + +#include "google/protobuf/io/zero_copy_stream_impl.h" +#include "google/protobuf/text_format.h" +#include "paddle/fluid/framework/async_executor.h" +#include "paddle/fluid/framework/data_feed.h" +#include "paddle/fluid/framework/data_feed.pb.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/variant.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" + +namespace py = pybind11; +namespace pd = paddle::framework; + +namespace paddle { +namespace pybind { +void BindFleetWrapper(py::module* m) { + py::class_(*m, "Fleet") + .def(py::init()) + .def("push_dense", &framework::FleetWrapper::PushDenseVarsSync) + .def("init_server", &framework::FleetWrapper::InitServer) + .def("run_server", &framework::FleetWrapper::RunServer) + .def("init_worker", &framework::FleetWrapper::InitWorker) + .def("init_model", &framework::FleetWrapper::PushDenseParamSync) + .def("stop_server", &framework::FleetWrapper::StopServer) + .def("gather_servers", &framework::FleetWrapper::GatherServers) + .def("gather_clients", &framework::FleetWrapper::GatherClients) + .def("get_clients_info", &framework::FleetWrapper::GetClientsInfo) + .def("create_client2client_connection", + &framework::FleetWrapper::CreateClient2ClientConnection); +} // end FleetWrapper +} // end namespace pybind +} // end namespace paddle diff --git a/paddle/fluid/pybind/fleet_wrapper_py.h b/paddle/fluid/pybind/fleet_wrapper_py.h new file mode 100644 index 0000000000000000000000000000000000000000..b2bfa10eecd5b79a1450ad8b9c784fa8af708602 --- /dev/null +++ b/paddle/fluid/pybind/fleet_wrapper_py.h @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindFleetWrapper(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fa978f1c99b144708c660b537142fb56354c9e6b..044677fb756e0368c65b84f15fdf2540abbd14b8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -29,6 +29,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" @@ -50,7 +51,9 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" +#include "paddle/fluid/pybind/data_set_py.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/fleet_wrapper_py.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/inference_api.h" #include "paddle/fluid/pybind/ir.h" @@ -59,7 +62,6 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/reader_py.h" #include "paddle/fluid/pybind/recordio.h" #include "paddle/fluid/pybind/tensor_py.h" - #include "paddle/fluid/string/to_string.h" #ifdef PADDLE_WITH_CUDA @@ -155,6 +157,9 @@ PYBIND11_MODULE(core, m) { return paddle::operators::AppendPythonCallableObjectAndReturnId(py_obj); }); + m.def("_get_use_default_grad_op_desc_maker_ops", + [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); + // NOTE(zjl): ctest would load environment variables at the beginning even // though we have not `import paddle.fluid as fluid`. So we add this API // to enable eager deletion mode in unittest. @@ -620,6 +625,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("init_lod_tensor_blocking_queue", [](Variable &var, size_t capacity) -> std::shared_ptr { + VLOG(1) << "init_lod_tensor_blocking_queue"; auto *holder = var.GetMutable(); holder->InitOnce(capacity, FLAGS_reader_queue_speed_test_mode); return holder->GetQueue(); @@ -922,6 +928,7 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "Executor") .def(py::init()) .def("close", &Executor::Close) + .def("run_from_dataset", &Executor::RunFromDataset) .def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, int block_id, bool create_local_scope, bool create_vars, const std::vector &fetch_vars) { @@ -1138,6 +1145,17 @@ All parameter, weight, gradient are variables in Paddle. 2. In some NLP model, it may cause the GPU memory is insufficient, in this case, you should reduce `num_iteration_per_drop_scope`. 
)DOC") + .def_property( + "num_iteration_per_run", + [](const ExecutionStrategy &self) { + return self.num_iteration_per_run_; + }, + [](ExecutionStrategy &self, size_t num_iteration_per_run) { + self.num_iteration_per_run_ = num_iteration_per_run; + }, + R"DOC(This config that how many iteration the executor will run when + user call pe.run() in python + )DOC") .def_property("_dry_run", [](const ExecutionStrategy &self) { return self.dry_run_; }, [](ExecutionStrategy &self, bool dry_run) { @@ -1281,7 +1299,20 @@ All parameter, weight, gradient are variables in Paddle. to fuse relu and depthwise_conv2d, it will save GPU memory and may make the execution faster. This options is only available in GPU devices. - Default False)DOC") + Default False.)DOC") + .def_property( + "fuse_broadcast_ops", + [](const BuildStrategy &self) { return self.fuse_broadcast_ops_; }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); + self.fuse_broadcast_ops_ = b; + }, + R"DOC(The type is BOOL, fuse_broadcast_op indicates whether + to fuse the broadcast ops. Note that, in Reduce mode, + fusing broadcast ops may make the program faster. Because + fusing broadcast OP equals delaying the execution of all + broadcast Ops, in this case, all nccl streams are used only + for NCCLReduce operations for a period of time. Default False.)DOC") .def_property("fuse_all_optimizer_ops", [](const BuildStrategy &self) { return self.fuse_all_optimizer_ops_; @@ -1314,6 +1345,9 @@ All parameter, weight, gradient are variables in Paddle. 
"is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) + .def_property("async_mode", + [](const BuildStrategy &self) { return self.async_mode_; }, + [](BuildStrategy &self, bool b) { self.async_mode_ = b; }) .def_property( "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, @@ -1356,9 +1390,11 @@ All parameter, weight, gradient are variables in Paddle. BindRecordIOWriter(&m); BindAsyncExecutor(&m); + BindFleetWrapper(&m); BindGraph(&m); BindNode(&m); BindInferenceApi(&m); + BindDataset(&m); } } // namespace pybind } // namespace paddle diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 169a925d12328e7d1df744635445b5674c19b125..49a8fb82dbf67357c1c3f2658538789af51b7cdc 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,5 +1,6 @@ cc_library(stringpiece SRCS piece.cc) cc_library(pretty_log SRCS pretty_log.cc) +cc_library(string_helper SRCS string_helper.cc DEPS boost) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..27708b8eebd2131ebadcc310fd3521ad5ab824f3 --- /dev/null +++ b/paddle/fluid/string/string_helper.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/string_helper.h" +#include +#include +#include +#include +#include +#include "boost/lexical_cast.hpp" +#include "glog/logging.h" + +namespace paddle { +namespace string { + +inline size_t count_spaces(const char* s) { + size_t count = 0; + + while (*s != 0 && isspace(*s++)) { + count++; + } + + return count; +} + +inline size_t count_nonspaces(const char* s) { + size_t count = 0; + + while (*s != 0 && !isspace(*s++)) { + count++; + } + + return count; +} + +// remove leading and tailing spaces +std::string trim_spaces(const std::string& str) { + const char* p = str.c_str(); + + while (*p != 0 && isspace(*p)) { + p++; + } + + size_t len = strlen(p); + + while (len > 0 && isspace(p[len - 1])) { + len--; + } + + return std::string(p, len); +} + +inline int str_to_float(const char* str, float* v) { + const char* head = str; + char* cursor = NULL; + int index = 0; + while (*(head += count_spaces(head)) != 0) { + v[index++] = std::strtof(head, &cursor); + if (head == cursor) { + break; + } + head = cursor; + } + return index; +} + +// A helper class for reading lines from file. +// A line buffer is maintained. It +// doesn't need to know the maximum possible length of a line. 
+char* LineFileReader::getdelim(FILE* f, char delim) { +#ifndef _WIN32 + int32_t ret = ::getdelim(&_buffer, &_buf_size, delim, f); + + if (ret >= 0) { + if (ret >= 1 && _buffer[ret - 1] == delim) { + _buffer[--ret] = 0; + } + + _length = (size_t)ret; + return _buffer; + } else { + _length = 0; + CHECK(feof(f)); + return NULL; + } +#else + return NULL; +#endif +} + +} // end namespace string +} // end namespace paddle diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..e2ded402b1240680684fa6705251dfa4f34e4071 --- /dev/null +++ b/paddle/fluid/string/string_helper.h @@ -0,0 +1,157 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "boost/lexical_cast.hpp" +#include "glog/logging.h" + +namespace paddle { +namespace string { + +inline size_t count_spaces(const char* s); + +inline size_t count_nonspaces(const char* s); + +template +void format_string_append(std::string& str, const char* fmt, // NOLINT + ARGS&&... args) { + int len = snprintf(NULL, 0, fmt, args...); + CHECK_GE(len, 0); + size_t oldlen = str.length(); + str.resize(oldlen + len + 1); + CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) 
== len); + str.resize(oldlen + len); +} + +template +void format_string_append(std::string& str, const std::string& fmt, // NOLINT + ARGS&&... args) { + format_string_append(str, fmt.c_str(), args...); +} + +template +std::string format_string(const char* fmt, ARGS&&... args) { + std::string str; + format_string_append(str, fmt, args...); + return std::move(str); +} + +template +std::string format_string(const std::string& fmt, ARGS&&... args) { + return format_string(fmt.c_str(), args...); +} + +// remove leading and tailing spaces +std::string trim_spaces(const std::string& str); + +int str_to_float(const char* str, float* v); + +// split string by delim +template +std::vector split_string(const std::string& str, const std::string& delim) { + size_t pre_pos = 0; + size_t pos = 0; + std::string tmp_str; + std::vector res_list; + res_list.clear(); + if (str.empty()) { + return res_list; + } + while ((pos = str.find(delim, pre_pos)) != std::string::npos) { + tmp_str.assign(str, pre_pos, pos - pre_pos); + res_list.push_back(tmp_str); + pre_pos = pos + 1; + } + tmp_str.assign(str, pre_pos, str.length() - pre_pos); + if (!tmp_str.empty()) { + res_list.push_back(tmp_str); + } + return res_list; +} + +// split string by spaces. Leading and tailing spaces are ignored. Consecutive +// spaces are treated as one delim. 
+template +std::vector split_string(const std::string& str) { + std::vector list; + const char* p; + int pre_pos = 0; + int pos = 0; + std::string tmp_str; + if (str.empty()) { + return list; + } + for (p = str.c_str(); *p != 0;) { + if (!isspace(*p)) { + pos = pre_pos; + p++; + + while (*p != 0 && !isspace(*p)) { + pos++; + p++; + } + tmp_str.assign(str, pre_pos, pos - pre_pos + 1); + list.push_back(tmp_str); + pre_pos = pos + 1; + } else { + pre_pos++; + p++; + } + } + return list; +} + +template +std::string join_strings(const std::vector& strs, char delim) { + std::string str; + + for (size_t i = 0; i < strs.size(); i++) { + if (i > 0) { + str += delim; + } + + str += boost::lexical_cast(strs[i]); + } + + return str; +} + +// A helper class for reading lines from file. A line buffer is maintained. It +// doesn't need to know the maximum possible length of a line. + +class LineFileReader { + public: + LineFileReader() {} + LineFileReader(LineFileReader&&) = delete; + LineFileReader(const LineFileReader&) = delete; + ~LineFileReader() { ::free(_buffer); } + char* getline(FILE* f) { return this->getdelim(f, '\n'); } + char* getdelim(FILE* f, char delim); + char* get() { return _buffer; } + size_t length() { return _length; } + + private: + char* _buffer = NULL; + size_t _buf_size = 0; + size_t _length = 0; +}; +} // end namespace string +} // end namespace paddle diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 025528e85c4bf4da63b588dd91681d7bf7bb78fe..fc52c281c4f0de2b05ab2b58aa81cdbf1216e6a7 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -425,6 +425,13 @@ function assert_api_not_changed() { sed -i '/.*ComposeNotAligned.*/d' new.spec python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec + + # Currently, we only check in PR_CI python 2.7 + if [ "$SYSTEM" != "Darwin" ]; then + if [ "$1" == "" ] || [ "$1" == "cp27-cp27m" ] || [ "$1" == "cp27-cp27mu" ]; then + python 
${PADDLE_ROOT}/tools/diff_use_default_grad_op_maker.py ${PADDLE_ROOT}/paddle/fluid/op_use_default_grad_op_maker.spec + fi + fi deactivate } @@ -434,9 +441,12 @@ function assert_api_spec_approvals() { fi API_FILES=("paddle/fluid/API.spec" + "paddle/fluid/op_use_default_grad_op_maker.spec" "python/paddle/fluid/parallel_executor.py" "paddle/fluid/framework/operator.h" "paddle/fluid/framework/tensor.h" + "paddle/fluid/framework/details/op_registry.h" + "paddle/fluid/framework/grad_op_desc_maker.h" "paddle/fluid/framework/lod_tensor.h" "paddle/fluid/framework/selected_rows.h" "paddle/fluid/framework/op_desc.h" diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a746f2ed1429c27ffb50cb96e2ddd8b4a5c7d66b..0af883764e157db24e17a1a4ef1bff27f9b39b0f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -24,10 +24,13 @@ from .executor import * from . import data_feed_desc from .data_feed_desc import * +from . import dataset +from .dataset import * + from . import async_executor from .async_executor import * -from . import trainer +from . import trainer_desc from . import inferencer from . import io @@ -43,10 +46,13 @@ from . import regularizer from . import average from . import metrics from . import transpiler +from . import incubate from . import distribute_lookup_table from .param_attr import ParamAttr, WeightNormParamAttr from .data_feeder import DataFeeder from .core import LoDTensor, LoDTensorArray, CPUPlace, CUDAPlace, CUDAPinnedPlace, Scope, _Scope +from .incubate import fleet +from .incubate import data_generator from .transpiler import DistributeTranspiler, \ memory_optimize, release_memory, DistributeTranspilerConfig from .lod_tensor import create_lod_tensor, create_random_int_lodtensor @@ -64,9 +70,9 @@ from . 
import install_check Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + \ - trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \ + trainer_desc.__all__ + inferencer.__all__ + transpiler.__all__ + \ parallel_executor.__all__ + lod_tensor.__all__ + \ - data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [ + data_feed_desc.__all__ + async_executor.__all__ + compiler.__all__ + [ 'io', 'initializer', 'layers', @@ -151,6 +157,7 @@ def __bootstrap__(): read_env_flags.append('use_ngraph') if core.is_compiled_with_dist(): + #env for rpc read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_server_profile_path') read_env_flags.append('enable_rpc_profiler') @@ -158,6 +165,14 @@ def __bootstrap__(): read_env_flags.append('rpc_get_thread_num') read_env_flags.append('rpc_prefetch_thread_num') read_env_flags.append('rpc_disable_reuse_port') + + # env for communicator + read_env_flags.append('communicator_independent_recv_thread') + read_env_flags.append('communicator_send_queue_size') + read_env_flags.append('communicator_max_send_grad_num_before_recv') + read_env_flags.append('communicator_thread_pool_size') + read_env_flags.append('communicator_max_merge_var_num') + read_env_flags.append('communicator_fake_rpc') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size diff --git a/python/paddle/fluid/async_executor.py b/python/paddle/fluid/async_executor.py index 25f95ffbb0acf618f19b36987093d5884369e530..2442d26d3c8cc86c81335fb5d84fcec59f43a054 100644 --- a/python/paddle/fluid/async_executor.py +++ b/python/paddle/fluid/async_executor.py @@ -24,6 +24,7 @@ from paddle.fluid.proto import data_feed_pb2 from google.protobuf import text_format from . 
import io from .data_feed_desc import DataFeedDesc +from .trainer_desc import TrainerDesc, MultiTrainer, DistMultiTrainer from .distributed import ps_instance from .contrib.utils import hdfs_utils as hdfs @@ -77,6 +78,17 @@ class AsyncExecutor(object): """ def __init__(self, place=None, run_mode=""): + """ + Init. + + Example: + >>> place = fluid.CPUPlace() + >>> async_executor = fluid.AsyncExecutor(place) + + Args: + place(Place): CPUPlace only + run_mode(str): default is empty string. + """ if place is None: place = core.CPUPlace() if not isinstance(place, core.CPUPlace): @@ -159,7 +171,8 @@ class AsyncExecutor(object): self.executor.run_from_files(program_desc, data_feed.desc(), filelist, thread_num, - fetch_var_names, mode, debug) + fetch_var_names, mode, debug, + str(id(program_desc))) def download_data(self, afs_path, @@ -172,18 +185,19 @@ class AsyncExecutor(object): """ download_data is a default download method for distributed training a user download data without this method - + Example: >>> exe = fluid.AsyncExecutor() >>> exe.download_data("/xxx/xxx/xx/", - >>> "./data", "afs:// - >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") + >>> "./data", "afs:// + >>> xxx.xxx.xxx.xxx:9901", "xxx,yyy") + Args: afs_path(str): afs_path defined by users local_path(str): download data path fs_default_name(str): file system server address ugi(str): hadoop ugi - file_cn(int): a user can specify file number for debugging + file_cnt(int): a user can specify file number for debugging hadoop_home(str): hadoop home path process_num(int): download process num """ @@ -217,7 +231,7 @@ class AsyncExecutor(object): def config_distributed_nodes(self): """ if a user needs to run distributed async executor - he or she needs to do a global configuration so that + he or she needs to do a global configuration so that information of current process can be obtained """ self.instance = ps_instance.PaddlePSInstance(1, 2) @@ -241,16 +255,19 @@ class AsyncExecutor(object): def init_server(self, 
dist_desc): """ - initialize server of current node if current process is a server + Initialize server of current node if current process is a server. + Args: - dist_desc(str): a protobuf string that describes - how to init a worker and a server + dist_desc(str): a protobuf string that describes + how to init a worker and a server """ if self.instance is None: raise ValueError( 'instance is None, please run config_distributed_nodes init instance' ) - self.executor.init_server(dist_desc, self.instance._rankid) + self.dist_desc_str = text_format.MessageToString(dist_desc) + self.dist_desc = dist_desc + self.executor.init_server(self.dist_desc_str, self.instance._rankid) ip = self.executor.start_server() self.instance.set_ip(ip) self.instance.barrier_all() #wait all server start @@ -260,23 +277,31 @@ class AsyncExecutor(object): def init_worker(self, dist_desc, startup_program): """ - initialize worker of current node if current process is a worker + Initialize worker of current node if current process is a worker. 
+ Args: - dist_desc(str): a protobuf string that describes - how to init a worker and a server - startup_program(fluid.Program): startup program of current process + dist_desc(str): a protobuf string that describes + how to init a worker and a server + startup_program(fluid.Program): startup program of current process """ if self.instance is None: raise ValueError( 'instance is None, please run config_distributed_nodes init instance' ) + + self.dist_desc_str = text_format.MessageToString(dist_desc) + self.dist_desc = dist_desc place = core.CPUPlace() executor = Executor(place) - executor.run(startup_program) + if isinstance(startup_program, list): + for sp in startup_program: + executor.run(sp) + else: + executor.run(startup_program) self.instance.barrier_all() #wait all server start ips = self.instance.gather_ips() - self.executor.init_worker(dist_desc, ips, + self.executor.init_worker(self.dist_desc_str, ips, self.instance.get_node_cnt(), self.instance._rankid) self.instance.barrier_all() #wait all worker start @@ -298,9 +323,10 @@ class AsyncExecutor(object): def save_model(self, save_path): """ save_model command that can be invoked from one of the worker - model parameters are saved in servers and upload to save_path of file system + model parameters are saved in servers and upload to save_path of file system. + Args: - save_path(str): save path to file system + save_path(str): save path to file system """ if self.instance is None: raise ValueError( diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py index 7442059ba07b2ed1d7164b9be60b8bbc92fec651..ca10db0a5450e0a38159fe2e38b2926f6b1900a7 100644 --- a/python/paddle/fluid/contrib/__init__.py +++ b/python/paddle/fluid/contrib/__init__.py @@ -32,6 +32,8 @@ from . import utils from .utils import * from . import extend_optimizer from .extend_optimizer import * +from . 
import model_stat +from .model_stat import * __all__ = [] __all__ += decoder.__all__ diff --git a/python/paddle/fluid/data_feed_desc.py b/python/paddle/fluid/data_feed_desc.py index d2ec74d6cfdeb34c1f48c086a3aa30d5100c3efb..80745aac830d1da46b62ab1bf246b1fa4895a7cc 100644 --- a/python/paddle/fluid/data_feed_desc.py +++ b/python/paddle/fluid/data_feed_desc.py @@ -68,6 +68,7 @@ class DataFeedDesc(object): def __init__(self, proto_file): self.proto_desc = data_feed_pb2.DataFeedDesc() + self.proto_desc.pipe_command = "cat" with open(proto_file, 'r') as f: text_format.Parse(f.read(), self.proto_desc) if self.proto_desc.name == "MultiSlotDataFeed": diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..d63773223ddc0c155f26a656f19c4ba80f482632 --- /dev/null +++ b/python/paddle/fluid/dataset.py @@ -0,0 +1,291 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.proto import data_feed_pb2 +from google.protobuf import text_format +from . import core +__all__ = ['DatasetFactory', 'InMemoryDataset', 'QueueDataset'] + + +class DatasetFactory(object): + """ + DatasetFactory is a factory which create dataset by its name, + you can create "QueueDataset" or "InMemoryDataset", + the default is "QueueDataset". 
+ + Example: + dataset = paddle.fluid.DatasetFactory.create_dataset("InMemoryDataset") + """ + + def __init__(self): + """ + Init + """ + pass + + def create_dataset(self, datafeed_class="QueueDataset"): + """ + Create "QueueDataset" or "InMemoryDataset", + the default is "QueueDataset". + + Examples: + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset() + """ + try: + dataset = globals()[datafeed_class]() + return dataset + except: + raise ValueError("datafeed class %s does not exist" % + datafeed_class) + + +class DatasetBase(object): + """ + Base dataset class + """ + + def __init__(self): + """ + Init + """ + # define class name here + # to decide whether we need create in memory instance + self.proto_desc = data_feed_pb2.DataFeedDesc() + self.proto_desc.pipe_command = "cat" + self.dataset = core.Dataset("MultiSlotDataset") + self.thread_num = 0 + + def set_pipe_command(self, pipe_command): + """ + Set pipe command of current dataset + A pipe command is a UNIX pipeline command that can be used only + + Example: + >>> dataset.set_pipe_command("python my_script.py") + + Args: + pipe_command: pipe command + + """ + self.proto_desc.pipe_command = pipe_command + + def set_batch_size(self, batch_size): + """ + Set batch size. Will be effective during training + + Example: + >>> dataset.set_batch_size(128) + + Args: + batch_size: batch size + + """ + self.proto_desc.batch_size = batch_size + + def set_thread(self, thread_num): + """ + Set thread num, it is the num of readers. + + Example: + >>> dataset.set_thread(12) + + Args: + thread_num: thread num + """ + self.dataset.set_thread_num(thread_num) + self.thread_num = thread_num + + def set_filelist(self, filelist): + """ + Set file list in current worker. + + Example: + >>> dataset.set_filelist(['a.txt', 'b.txt']) + + Args: + filelist: file list + """ + self.dataset.set_filelist(filelist) + + def set_use_var(self, var_list): + """ + Set Variables which you will use. 
+ + Example: + >>> dataset.set_use_var([data, label]) + + Args: + var_list: variable list + """ + multi_slot = self.proto_desc.multi_slot_desc + for var in var_list: + slot_var = multi_slot.slots.add() + slot_var.is_used = True + slot_var.name = var.name + if var.lod_level == 0: + slot_var.is_dense = True + if var.dtype == core.VarDesc.VarType.FP32: + slot_var.type = "float" + elif var.dtype == core.VarDesc.VarType.INT64: + slot_var.type = "uint64" + else: + raise ValueError( + "Currently, fluid.dataset only supports dtype=float32 and dtype=int64" + ) + + def set_hdfs_config(self, fs_name, fs_ugi): + """ + Set hdfs config: fs name ad ugi + + Example: + >>> dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + + Args: + fs_name: fs name + fs_ugi: fs ugi + """ + self.dataset.set_hdfs_config(fs_name, fs_ugi) + + def _prepare_to_run(self): + """ + Set data_feed_desc before load or shuffle, + user no need to call this function. + """ + self.dataset.set_data_feed_desc(self.desc()) + + def desc(self): + """ + Returns a protobuf message for this DataFeedDesc + + Example: + >>> print(dataset.desc()) + + Returns: + A string message + """ + return text_format.MessageToString(self.proto_desc) + + +class InMemoryDataset(DatasetBase): + """ + InMemoryDataset, it will load data into memory + and shuffle data before training. 
+ This class should be created by DatasetFactory + + Example: + dataset = paddle.fluid.DatasetFactory.create_dataset("InMemoryDataset") + """ + + def __init__(self): + """ + Init + """ + super(InMemoryDataset, self).__init__() + self.proto_desc.name = "MultiSlotInMemoryDataFeed" + + def load_into_memory(self): + """ + Load data into memory + + Example: + >>> import paddle.fluid as fluid + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.load_into_memory() + """ + self._prepare_to_run() + self.dataset.load_into_memory() + + def local_shuffle(self): + """ + Local shuffle + + Example: + >>> import paddle.fluid as fluid + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.local_shuffle() + """ + self.dataset.local_shuffle() + + def global_shuffle(self, fleet=None): + """ + Global shuffle. + Global shuffle can be used only in distributed mode. i.e. multiple + processes on single machine or multiple machines training together. + If you run in distributed mode, you should pass fleet instead of None. + + Examples: + >>> import paddle.fluid as fluid + >>> import paddle.fluid.incubate.fleet.parameter_server as fleet + >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") + >>> filelist = ["a.txt", "b.txt"] + >>> dataset.set_filelist(filelist) + >>> dataset.global_shuffle(fleet) + + Args: + fleet: fleet singleton. Default None. 
+ """ + trainer_num = 1 + if fleet is not None: + fleet.fleet_instance.role_maker_._barrier_worker() + trainer_num = fleet.worker_num() + self.dataset.register_client2client_msg_handler() + self.dataset.set_trainer_num(trainer_num) + if fleet is not None: + fleet.fleet_instance.role_maker_._barrier_worker() + self.dataset.global_shuffle() + if fleet is not None: + fleet.fleet_instance.role_maker_._barrier_worker() + + +class QueueDataset(DatasetBase): + """ + QueueDataset, it will process data streamly. + + Example: + import paddle.fluid as fluid + dataset = fluid.DatasetFactory.create_dataset("QueueDataset") + """ + + def __init__(self): + """ + Initialize QueueDataset + This class should be created by DatasetFactory + """ + super(QueueDataset, self).__init__() + self.proto_desc.name = "MultiSlotDataFeed" + + def local_shuffle(self): + """ + Local shuffle + + Local shuffle is not supported in QueueDataset + NotImplementedError will be raised + """ + raise NotImplementedError( + "QueueDataset does not support local shuffle, " + "please use InMemoryDataset for local_shuffle") + + def global_shuffle(self, fleet=None): + """ + Global shuffle is not supported in QueueDataset + NotImplementedError will be raised + """ + raise NotImplementedError( + "QueueDataset does not support global shuffle, " + "please use InMemoryDataset for global_shuffle") diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..7fc72191884020f4cc57c9269b636161635f06d0 --- /dev/null +++ b/python/paddle/fluid/device_worker.py @@ -0,0 +1,181 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = ['DeviceWorker', 'Hogwild', 'DownpourSGD'] + + +class DeviceWorker(object): + """ + DeviceWorker is an abstract class, which generates worker desc. + This class is an inner class that we do computation logics within + the implementation. For example, execution of a program or a graph. + """ + + def __init__(self): + """ + Init. + """ + self.program_ = None + self.infer_ = None + + def _set_infer(self, infer=False): + """ + set inference flag for current device worker + + Args: + infer(bool): whether to do inference + """ + self.infer_ = infer + + def _set_fleet_desc(self, fleet_desc): + """ + Set fleet desc. + + Args: + fleet_desc(PSParameter): pslib.PSParameter object + """ + self.fleet_desc_ = fleet_desc + + def _set_program(self, program): + """ + Set program. + + Args: + program(Program): a Program object + """ + self.program_ = program + + def _gen_worker_desc(self, trainer_desc): + """ + Generator worker desc. + + Args: + trainer_desc(TrainerDesc): a TrainerDesc object + """ + raise NotImplementedError( + "DeviceWorker does not implement gen_worker_desc, " + "please use Hogwild or DownpourSGD, etc.") + + +class Hogwild(DeviceWorker): + """ + Hogwild is a kind of SGD algorithm. + + """ + + def __init__(self): + """ + Init. + """ + super(Hogwild, self).__init__() + + def _gen_worker_desc(self, trainer_desc): + """ + Generator worker desc, which device worker is HogwildWorker. 
+ + Args: + trainer_desc(TrainerDesc): a TrainerDesc object + """ + trainer_desc.device_worker_name = "HogwildWorker" + if self.infer_: + # just ignore feed op for inference model + trainer_desc.hogwild_param.skip_ops.extend(["feed"]) + + +class DownpourSGD(DeviceWorker): + """ + DownpourSGD is a kind of distributed SGD algorithm. + """ + + def __init__(self): + """ + Init. + initialize downpourSGD device worker + """ + super(DownpourSGD, self).__init__() + + def _gen_worker_desc(self, trainer_desc): + """ + Generator worker desc, which device worker is DownpourWorker. + + Args: + trainer_desc(TrainerDesc): a TrainerDesc object + """ + dense_table_set = set() + program_id = str(id(self.program_)) + if self.program_ == None: + print("program of current device worker is not configured") + exit(-1) + opt_info = self.program_._fleet_opt + program_configs = opt_info["program_configs"] + downpour = trainer_desc.downpour_param + + for pid in program_configs: + if pid == program_id: + pc = downpour.program_config.add() + pc.program_id = program_id + for i in program_configs[program_id]["push_sparse"]: + pc.push_sparse_table_id.extend([i]) + for i in program_configs[program_id]["push_dense"]: + pc.push_dense_table_id.extend([i]) + dense_table_set.add(i) + for i in program_configs[program_id]["pull_sparse"]: + pc.pull_sparse_table_id.extend([i]) + for i in program_configs[program_id]["pull_dense"]: + pc.pull_dense_table_id.extend([i]) + dense_table_set.add(i) + break + + trainer_desc.device_worker_name = "DownpourWorker" + pull_thread = trainer_desc.pull_dense_param + pull_thread.device_num = trainer_desc.thread_num + for i in self.fleet_desc_.trainer_param.dense_table: + if i.table_id in dense_table_set: + dense_table = pull_thread.dense_table.add() + dense_table.dense_value_name.extend(i.dense_variable_name) + dense_table.table_id = \ + i.table_id + sparse_table = downpour.sparse_table.add() + sparse_table.table_id = \ + 
self.fleet_desc_.trainer_param.sparse_table[0].table_id + sparse_table.sparse_key_name.extend( + self.fleet_desc_.trainer_param.sparse_table[0].slot_key) + sparse_table.sparse_value_name.extend( + self.fleet_desc_.trainer_param.sparse_table[0].slot_value) + sparse_table.sparse_grad_name.extend( + self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient) + sparse_table.emb_dim = \ + self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[ + 0].accessor.fea_dim - 2 + sparse_table.fea_dim = sparse_table.emb_dim + 2 + # TODO(guru4elephant): hard code here, need to improve + sparse_table.label_var_name = "click" + + for i in self.fleet_desc_.trainer_param.dense_table: + if i.table_id in dense_table_set: + dense_table = downpour.dense_table.add() + dense_table.table_id = i.table_id + dense_table.dense_value_name.extend(i.dense_variable_name) + dense_table.dense_grad_name.extend( + i.dense_gradient_variable_name) + downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op) + if self.infer_: + downpour.push_dense = False + downpour.push_sparse = False + + +class DeviceWorkerFactory(object): + def _create_device_worker(self, worker_type): + classname = worker_type.capitalize() + return globals()[classname]() diff --git a/python/paddle/fluid/distributed/downpour.py b/python/paddle/fluid/distributed/downpour.py index 87dfab92c53d9950d4606e078cc9f51bcda8f4d3..902daf1a4ac754da1cc61cd00a89e3f12b4c2357 100644 --- a/python/paddle/fluid/distributed/downpour.py +++ b/python/paddle/fluid/distributed/downpour.py @@ -33,6 +33,9 @@ class DownpourSGD(object): Examples: .. 
code-block:: python + opt = fluid.DistributedOptimizer(sgd_opt) + opt.minimize() + downpour_sgd = fluid.distributed.DownpourSGD(learning_rate=0.2) downpour_sgd.minimize(cost) """ @@ -43,9 +46,13 @@ class DownpourSGD(object): self.learning_rate_ = learning_rate self.window_ = window self.type = "downpour" + self.data_norm_name = [ + ".batch_size", ".batch_square_sum", ".batch_sum", + ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD" + ] def minimize(self, - loss, + losses, startup_program=None, parameter_list=None, no_grad_set=None): @@ -65,41 +72,97 @@ class DownpourSGD(object): worker_skipped_ops: operator names that need to be skipped during execution """ - params_grads = sorted( - append_backward(loss, parameter_list, no_grad_set), - key=lambda x: x[0].name) - table_name = find_distributed_lookup_table(loss.block.program) + if not isinstance(losses, list): + raise ValueError('losses is a list, just lick [model.cost]') + table_name = find_distributed_lookup_table(losses[0].block.program) prefetch_slots = find_distributed_lookup_table_inputs( - loss.block.program, table_name) + losses[0].block.program, table_name) prefetch_slots_emb = find_distributed_lookup_table_outputs( - loss.block.program, table_name) + losses[0].block.program, table_name) + + ps_param = pslib.PSParameter() server = DownpourServer() - # window is communication strategy worker = DownpourWorker(self.window_) - # Todo(guru4elephant): support multiple tables definitions - # currently support one big sparse table sparse_table_index = 0 - # currently merge all dense parameters into one dense table - dense_table_index = 1 - params = [] - grads = [] - for i in params_grads: - params.append(i[0]) - for i in params_grads: - grads.append(i[1]) server.add_sparse_table(sparse_table_index, self.learning_rate_, prefetch_slots, prefetch_slots_emb) - server.add_dense_table(dense_table_index, self.learning_rate_, params, - grads) worker.add_sparse_table(sparse_table_index, self.learning_rate_, 
prefetch_slots, prefetch_slots_emb) - worker.add_dense_table(dense_table_index, self.learning_rate_, params, - grads) - ps_param = pslib.PSParameter() + dense_table_index = 1 + program_configs = [] + param_grads_list = [] + for loss_index in range(len(losses)): + program_config = ps_param.trainer_param.program_config.add() + program_config.program_id = str( + id(losses[loss_index].block.program)) + program_config.pull_sparse_table_id.extend([sparse_table_index]) + program_config.push_sparse_table_id.extend([sparse_table_index]) + params_grads = sorted( + append_backward(losses[loss_index], parameter_list, + no_grad_set), + key=lambda x: x[0].name) + param_grads_list.append(params_grads) + params = [] + grads = [] + data_norm_params = [] + data_norm_grads = [] + for i in params_grads: + is_data_norm_data = False + for data_norm_name in self.data_norm_name: + if i[0].name.endswith(data_norm_name): + is_data_norm_data = True + data_norm_params.append(i[0]) + if not is_data_norm_data: + params.append(i[0]) + for i in params_grads: + is_data_norm_data = False + for data_norm_grad in self.data_norm_name: + if i[0].name.endswith(data_norm_grad): + is_data_norm_data = True + data_norm_grads.append(i[1]) + if not is_data_norm_data: + grads.append(i[1]) + server.add_dense_table(dense_table_index, self.learning_rate_, + params, grads) + worker.add_dense_table(dense_table_index, self.learning_rate_, + params, grads) + program_config.pull_dense_table_id.extend([dense_table_index]) + program_config.push_dense_table_id.extend([dense_table_index]) + if len(data_norm_params) != 0 and len(data_norm_grads) != 0: + dense_table_index += 1 + server.add_data_norm_table(dense_table_index, + self.learning_rate_, + data_norm_params, data_norm_grads) + worker.add_dense_table(dense_table_index, self.learning_rate_, + data_norm_params, data_norm_grads) + program_config.pull_dense_table_id.extend([dense_table_index]) + program_config.push_dense_table_id.extend([dense_table_index]) + 
dense_table_index += 1 + program_configs.append(program_config) ps_param.server_param.CopyFrom(server.get_desc()) ps_param.trainer_param.CopyFrom(worker.get_desc()) + for program_config in program_configs: + ps_param.trainer_param.program_config.extend([program_config]) # Todo(guru4elephant): figure out how to support more sparse parameters # currently only support lookup_table worker_skipped_ops = ["lookup_table", "lookup_table_grad"] ps_param.trainer_param.skip_op.extend(worker_skipped_ops) - return [ps_param, worker_skipped_ops] + + # all fleet operations should be defined in operators in the future + # we want to return an object here containing: + # 1) worker execution strategy + # 2) pserver execution strategy + # 3) fleet configurations + # 4) skipped operators in runtime + # 5) distributed optimization + opt_info = {} + opt_info["trainer"] = "DistMultiTrainer" + opt_info["device_worker"] = "DownpourSGD" + opt_info["optimizer"] = "DownpourSGD" + opt_info["fleet_desc"] = ps_param + opt_info["worker_skipped_ops"] = worker_skipped_ops + + for loss in losses: + loss.block.program._fleet_opt = opt_info + + return None, param_grads_list diff --git a/python/paddle/fluid/distributed/fleet.py b/python/paddle/fluid/distributed/fleet.py new file mode 100644 index 0000000000000000000000000000000000000000..8f3d2defb9f0631098de3fb9ee1fa7b1abdeb884 --- /dev/null +++ b/python/paddle/fluid/distributed/fleet.py @@ -0,0 +1,76 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +import sys +from .. import core +from . import ps_instance + +__all__ = ['Fleet'] + + +class Fleet(object): + """ + + """ + + def __init__(self): + self.instance_ = ps_instance.PaddlePSInstance() + self.fleet_ = core.FleetWrapper() + + def stop(self): + self.instance_.barrier_worker() + if self.instance.is_first_worker(): + self.fleet_.stop_server() + self.instance_.barrier_worker() + self.instance_.barrier_all() + self.instance.finalize() + + def init_pserver(self, opt_info): + if "fleet_desc" in opt_info: + self.dist_desc_str_ = text_format.MessageToString(opt_info[ + "fleet_desc"]) + self.dist_desc_ = opt_info["fleet_desc"] + else: + print( + "You should run distributed optimization to get opt_info first") + sys.exit(-1) + self.fleet_.init_server(self.dist_desc_str_) + ip = self.fleet_.start_server() + self.instance_.set_ip(ip) + self.instance.barrier_all() + ips = self.instance.gather_ips() + self.fleet.gather_servers(ips, self.instance_.get_node_cnt()) + self.instance_.barrier_all() + + def init_worker(self, opt_info): + if "fleet_desc" in opt_info: + self.dist_desc_str_ = text_format.MessageToString(opt_info[ + "fleet_desc"]) + self.dist_desc_ = opt_info["fleet_desc"] + else: + print( + "You should run distributed optimization to get opt_info first") + sys.exit(-1) + self.instance_.barrier_all() + ips = self.instance.gather_ips() + self.fleet_.init_worker(self.dist_desc_str_, ips, + self.instance_.get_node_cnt(), + self.instance._rankid) + self.instance.barrier_worker() + + def init_pserver_model(self): + if self.instance_.is_first_worker(): + self.fleet_.init_model() + self.instance_.barrier_worker() + + def save_pserver_model(self, save_path): + self.fleet_.save_model(save_path) diff --git a/python/paddle/fluid/distributed/ps_instance.py b/python/paddle/fluid/distributed/ps_instance.py index d3ce3ce6934d08eb06763fea071a83e460c6bf6c..19d661c660efef8394bd2369f7759645ebbf3c5d 100644 --- 
a/python/paddle/fluid/distributed/ps_instance.py +++ b/python/paddle/fluid/distributed/ps_instance.py @@ -121,6 +121,18 @@ class PaddlePSInstance(object): """ return self._nodes + def get_worker_num(self): + """ + Return worker num + """ + return self._worker_num + + def get_server_num(self): + """ + Return server num + """ + return self._server_num + def barrier_all(self): """ barrier workers and servers diff --git a/python/paddle/fluid/distributed/ps_pb2.py b/python/paddle/fluid/distributed/ps_pb2.py index 0d226c4d593473681658fa3e7764d438a65b7116..5c9b2def0761ac96e81181959852c49f0fd03bd8 100644 --- a/python/paddle/fluid/distributed/ps_pb2.py +++ b/python/paddle/fluid/distributed/ps_pb2.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -10,6 +10,8 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and +# limitations under the License. + # Generated by the protocol buffer compiler. DO NOT EDIT! 
# source: ps.proto @@ -30,7 +32,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( package='paddle', syntax='proto2', serialized_pb=_b( - '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xce\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 
\x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 
\x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 \x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 
\x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 \x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 
\x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 
\x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 \x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' )) _sym_db.RegisterFileDescriptor(DESCRIPTOR) @@ -47,8 +49,8 @@ _TABLETYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3286, - serialized_end=3338, ) + 
serialized_start=3489, + serialized_end=3541, ) _sym_db.RegisterEnumDescriptor(_TABLETYPE) TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) @@ -132,8 +134,8 @@ _PSCMDID = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3341, - serialized_end=3658, ) + serialized_start=3544, + serialized_end=3861, ) _sym_db.RegisterEnumDescriptor(_PSCMDID) PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) @@ -166,8 +168,8 @@ _FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( ], containing_type=None, options=None, - serialized_start=3254, - serialized_end=3284, ) + serialized_start=3457, + serialized_end=3487, ) _sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) _PSPARAMETER = _descriptor.Descriptor( @@ -493,6 +495,22 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( is_extension=False, extension_scope=None, options=None), + _descriptor.FieldDescriptor( + name='program_config', + full_name='paddle.DownpourTrainerParameter.program_config', + index=5, + number=6, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), ], extensions=[], nested_types=[], @@ -503,7 +521,106 @@ _DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( extension_ranges=[], oneofs=[], serialized_start=557, - serialized_end=763, ) + serialized_end=810, ) + +_PROGRAMCONFIG = _descriptor.Descriptor( + name='ProgramConfig', + full_name='paddle.ProgramConfig', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='program_id', + full_name='paddle.ProgramConfig.program_id', + index=0, + number=1, + type=9, + cpp_type=9, + label=2, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='push_sparse_table_id', + full_name='paddle.ProgramConfig.push_sparse_table_id', + index=1, + number=2, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_table_id', + full_name='paddle.ProgramConfig.push_dense_table_id', + index=2, + number=3, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_sparse_table_id', + full_name='paddle.ProgramConfig.pull_sparse_table_id', + index=3, + number=4, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_dense_table_id', + full_name='paddle.ProgramConfig.pull_dense_table_id', + index=4, + number=5, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=813, + serialized_end=966, ) _DENSETABLEPARAMETER = _descriptor.Descriptor( name='DenseTableParameter', @@ -585,8 +702,8 @@ _DENSETABLEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=765, - serialized_end=888, ) + serialized_start=968, + serialized_end=1091, ) _SPARSETABLEPARAMETER = _descriptor.Descriptor( name='SparseTableParameter', @@ -684,8 +801,8 @@ 
_SPARSETABLEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=890, - serialized_end=1012, ) + serialized_start=1093, + serialized_end=1215, ) _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( name='DownpourServerParameter', @@ -735,8 +852,8 @@ _DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1015, - serialized_end=1149, ) + serialized_start=1218, + serialized_end=1352, ) _SERVERSERVICEPARAMETER = _descriptor.Descriptor( name='ServerServiceParameter', @@ -834,8 +951,8 @@ _SERVERSERVICEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1152, - serialized_end=1367, ) + serialized_start=1355, + serialized_end=1570, ) _TABLEPARAMETER = _descriptor.Descriptor( name='TableParameter', @@ -949,8 +1066,8 @@ _TABLEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1370, - serialized_end=1561, ) + serialized_start=1573, + serialized_end=1764, ) _TABLEACCESSORPARAMETER = _descriptor.Descriptor( name='TableAccessorParameter', @@ -1096,8 +1213,8 @@ _TABLEACCESSORPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1564, - serialized_end=1933, ) + serialized_start=1767, + serialized_end=2136, ) _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( name='DownpourTableAccessorParameter', @@ -1227,8 +1344,8 @@ _DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=1936, - serialized_end=2142, ) + serialized_start=2139, + serialized_end=2345, ) _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( name='TableAccessorSaveParameter', @@ -1294,8 +1411,8 @@ _TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2144, - serialized_end=2227, ) + serialized_start=2347, + 
serialized_end=2430, ) _PSREQUESTMESSAGE = _descriptor.Descriptor( name='PsRequestMessage', @@ -1393,8 +1510,8 @@ _PSREQUESTMESSAGE = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2229, - serialized_end=2330, ) + serialized_start=2432, + serialized_end=2533, ) _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( name='SparseSGDRuleParameter', @@ -1476,8 +1593,8 @@ _SPARSESGDRULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2332, - serialized_end=2451, ) + serialized_start=2535, + serialized_end=2654, ) _DENSESGDRULEPARAMETER = _descriptor.Descriptor( name='DenseSGDRuleParameter', @@ -1575,8 +1692,8 @@ _DENSESGDRULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2454, - serialized_end=2679, ) + serialized_start=2657, + serialized_end=2882, ) _ADAMSGDPARAMETER = _descriptor.Descriptor( name='AdamSGDParameter', @@ -1674,8 +1791,8 @@ _ADAMSGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2682, - serialized_end=2816, ) + serialized_start=2885, + serialized_end=3019, ) _NAIVESGDPARAMETER = _descriptor.Descriptor( name='NaiveSGDParameter', @@ -1725,8 +1842,8 @@ _NAIVESGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2818, - serialized_end=2884, ) + serialized_start=3021, + serialized_end=3087, ) _SUMMARYSGDPARAMETER = _descriptor.Descriptor( name='SummarySGDParameter', @@ -1760,8 +1877,8 @@ _SUMMARYSGDPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2886, - serialized_end=2945, ) + serialized_start=3089, + serialized_end=3148, ) _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( name='MovingAverageRuleParameter', @@ -1795,8 +1912,8 @@ _MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - 
serialized_start=2947, - serialized_end=2993, ) + serialized_start=3150, + serialized_end=3196, ) _PSRESPONSEMESSAGE = _descriptor.Descriptor( name='PsResponseMessage', @@ -1862,8 +1979,8 @@ _PSRESPONSEMESSAGE = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=2995, - serialized_end=3068, ) + serialized_start=3198, + serialized_end=3271, ) _FSCLIENTPARAMETER = _descriptor.Descriptor( name='FsClientParameter', @@ -1993,8 +2110,8 @@ _FSCLIENTPARAMETER = _descriptor.Descriptor( syntax='proto2', extension_ranges=[], oneofs=[], - serialized_start=3071, - serialized_end=3284, ) + serialized_start=3274, + serialized_end=3487, ) _PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER _PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER @@ -2011,6 +2128,8 @@ _DOWNPOURTRAINERPARAMETER.fields_by_name[ 'dense_table'].message_type = _DENSETABLEPARAMETER _DOWNPOURTRAINERPARAMETER.fields_by_name[ 'sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'program_config'].message_type = _PROGRAMCONFIG _DOWNPOURSERVERPARAMETER.fields_by_name[ 'downpour_table_param'].message_type = _TABLEPARAMETER _DOWNPOURSERVERPARAMETER.fields_by_name[ @@ -2042,6 +2161,7 @@ DESCRIPTOR.message_types_by_name[ 'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER DESCRIPTOR.message_types_by_name[ 'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name['ProgramConfig'] = _PROGRAMCONFIG DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER DESCRIPTOR.message_types_by_name[ @@ -2120,6 +2240,16 @@ DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType( )) _sym_db.RegisterMessage(DownpourTrainerParameter) +ProgramConfig = _reflection.GeneratedProtocolMessageType( + 'ProgramConfig', + (_message.Message, ), + dict( + 
DESCRIPTOR=_PROGRAMCONFIG, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ProgramConfig) + )) +_sym_db.RegisterMessage(ProgramConfig) + DenseTableParameter = _reflection.GeneratedProtocolMessageType( 'DenseTableParameter', (_message.Message, ), diff --git a/python/paddle/fluid/dygraph/__init__.py b/python/paddle/fluid/dygraph/__init__.py index 7281b3ea4b961a14126023a14a2ba2f03c7d1387..2d0c7b7ddaacee28da599d5850e9b3381c01de5c 100644 --- a/python/paddle/fluid/dygraph/__init__.py +++ b/python/paddle/fluid/dygraph/__init__.py @@ -32,6 +32,9 @@ from .profiler import * from . import checkpoint from .checkpoint import * +from . import learning_rate_scheduler +from .learning_rate_scheduler import * + __all__ = [] __all__ += layers.__all__ __all__ += base.__all__ @@ -39,3 +42,4 @@ __all__ += nn.__all__ __all__ += tracer.__all__ __all__ += profiler.__all__ __all__ += checkpoint.__all__ +__all__ += learning_rate_scheduler.__all__ diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py index c56652e103ce93bf5459b30b66c7b1f04e7c14d0..f0be5ff3bf2394f1f7da8fbcc341a0d2dfacdab3 100644 --- a/python/paddle/fluid/dygraph/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -65,7 +65,7 @@ class LayerObjectHelper(LayerHelperBase): def _input(self, inputs_in): inputs = self._multiple_input(inputs_in) if len(inputs) != 1: - raise "{0} layer only takes one input".format(self.layer_type) + raise "{0} layer only takes one input in".format(self.layer_type) return inputs[0] def _multiple_param_attr(self, length, param_attr_in=None): @@ -74,7 +74,8 @@ class LayerObjectHelper(LayerHelperBase): param_attr = [param_attr] if len(param_attr) != 1 and len(param_attr) != length: - raise ValueError("parameter number mismatch") + raise ValueError("parameter number mismatch in {}".format( + self.name)) elif len(param_attr) == 1 and length != 1: tmp = [None] * length for i in 
six.moves.range(length): @@ -91,6 +92,10 @@ class LayerObjectHelper(LayerHelperBase): Returns input, param_attr """ + param_attr_in = ParamAttr._to_attr(param_attr_in) + if isinstance(param_attr_in, bool): + raise ValueError('Param_attr should not be False in {}'.format( + self.name)) inputs = inputs_in if (inputs_in is not None) else [] inputs = self._multiple_input(inputs) param_attrs = self._multiple_param_attr(len(inputs), param_attr_in) @@ -112,8 +117,8 @@ class LayerObjectHelper(LayerHelperBase): if dtype is None: dtype = each.dtype elif dtype != each.dtype: - raise ValueError("Data Type mismatch: %d to %d" % - (dtype, each.dtype)) + raise ValueError("Data Type mismatch: %d to %d in %s" % + (dtype, each.dtype, self.name)) return dtype def get_parameter(self, name): @@ -126,7 +131,8 @@ class LayerObjectHelper(LayerHelperBase): """ param = self.main_program.global_block().var(name) if not isinstance(param, Parameter): - raise ValueError("no Parameter name %s found" % name) + raise ValueError("no Parameter name %s found in %s" % + (name, self.name)) return param def append_bias_op(self, @@ -184,7 +190,8 @@ class LayerObjectHelper(LayerHelperBase): if isinstance(act, six.string_types): act = {'type': act} else: - raise TypeError(str(act) + " should be unicode or str") + raise TypeError( + str(act) + " should be unicode or str in %s ", self.name) if (use_cudnn is not None) and use_cudnn: act['use_cudnn'] = use_cudnn @@ -211,5 +218,6 @@ class LayerObjectHelper(LayerHelperBase): """ param = param if not isinstance(param, cls): - raise TypeError("The input {0} parameter of method {1} must be {2}", - param, self.layer_type, cls.__name__) + raise TypeError( + "The input {0} parameter of method {1} must be {2}, in layer {3}", + param, self.layer_type, cls.__name__, self.name) diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py new file mode 100644 index 
0000000000000000000000000000000000000000..3209fa76d95c35c6c5a1bb36801b9f9354b1a927 --- /dev/null +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -0,0 +1,224 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import math + +from .. import unique_name + +__all__ = [ + 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', 'ExponentialDecay', + 'InverseTimeDecay', 'PolynomialDecay', 'CosineDecay' +] + + +class LearningRateDecay(object): + """ + Base class of learning rate decay + """ + + def __init__(self, begin=0, step=1, dtype='float32'): + self.step_num = begin + self.step_size = step + self.dtype = dtype + + def __call__(self): + lr = self.step() + if isinstance(lr, float): + lr = self.create_lr_var(lr) + self.step_num += self.step_size + return lr + + def create_lr_var(self, lr): + from .. 
import layers + lr = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(lr), + dtype=self.dtype, + persistable=True) + return lr + + def step(self): + raise NotImplementedError() + + +class PiecewiseDecay(LearningRateDecay): + def __init__(self, boundaries, values, begin, step=1, dtype='float32'): + super(PiecewiseDecay, self).__init__(begin, step, dtype) + self.boundaries = boundaries + self.values = values + + self.vars = [] + for value in values: + self.vars.append(self.create_lr_var(value)) + + def step(self): + for i in range(len(self.boundaries)): + if self.step_num < self.boundaries[i]: + return self.vars[i] + return self.vars[len(self.values) - 1] + + +class NaturalExpDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(NaturalExpDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + decayed_lr = self.learning_rate * layers.exp(-1 * self.decay_rate * + div_res) + + return decayed_lr + + +class ExponentialDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(ExponentialDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. 
import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + + decayed_lr = self.learning_rate * (self.decay_rate**div_res) + + return decayed_lr + + +class InverseTimeDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + decay_rate, + staircase=False, + begin=0, + step=1, + dtype='float32'): + super(InverseTimeDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.decay_rate = decay_rate + self.staircase = staircase + + def step(self): + from .. import layers + div_res = self.create_lr_var(self.step_num / self.decay_steps) + if self.staircase: + div_res = layers.floor(div_res) + + decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res) + + return decayed_lr + + +class PolynomialDecay(LearningRateDecay): + def __init__(self, + learning_rate, + decay_steps, + end_learning_rate=0.0001, + power=1.0, + cycle=False, + begin=0, + step=1, + dtype='float32'): + super(PolynomialDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.decay_steps = decay_steps + self.end_learning_rate = end_learning_rate + self.power = power + self.cycle = cycle + + def step(self): + from .. 
import layers + tmp_step_num = self.step_num + tmp_decay_steps = self.decay_steps + if self.cycle: + div_res = layers.ceil( + self.create_lr_var(tmp_step_num / float(self.decay_steps))) + + if tmp_step_num == 0: + div_res = self.create_lr_var(1.0) + tmp_decay_steps = self.decay_steps * div_res + else: + tmp_step_num = self.create_lr_var(tmp_step_num + if tmp_step_num < self.decay_steps + else self.decay_steps) + + decayed_lr = (self.learning_rate - self.end_learning_rate) * \ + ((1 - tmp_step_num / tmp_decay_steps) ** self.power) + self.end_learning_rate + return decayed_lr + + +class CosineDecay(LearningRateDecay): + def __init__(self, + learning_rate, + step_each_epoch, + epochs, + begin=0, + step=1, + dtype='float32'): + super(CosineDecay, self).__init__(begin, step, dtype) + self.learning_rate = learning_rate + self.step_each_epoch = step_each_epoch + self.epochs = epochs + + def step(self): + from .. import layers + cur_epoch = layers.floor( + self.create_lr_var(self.step_num / self.step_each_epoch)) + decayed_lr = self.learning_rate * 0.5 * ( + layers.cos(cur_epoch * math.pi / self.epochs) + 1) + return decayed_lr + + +class NoamDecay(LearningRateDecay): + def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'): + super(NoamDecay, self).__init__(begin, step, dtype) + self.d_model = d_model + self.warmup_steps = warmup_steps + + def step(self): + from .. import layers + a = self.create_lr_var(self.step_num**-0.5) + b = self.create_lr_var((self.warmup_steps**-1.5) * self.step_num) + lr_value = (self.d_model**-0.5) * layers.elementwise_min(a, b) + return lr_value diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 8925381119272d7462562c0952d3e157f78f25af..04da8561a370056a40b374887ef08a4c2110e6cc 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -20,7 +20,7 @@ import numpy as np from .. import core from ..layers import utils from . 
import layers -from ..framework import Variable, OpProtoHolder +from ..framework import Variable, OpProtoHolder, Parameter from ..layers import layer_function_generator from ..param_attr import ParamAttr from ..initializer import Normal, Constant, NumpyArrayInitializer @@ -213,46 +213,69 @@ class FC(layers.Layer): self._param_attr = param_attr self._bias_attr = bias_attr self._act = act + self.__w = list() - def _build_once(self, input): - input_shape = input.shape - param_shape = [ - reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1) - ] + [self._size] - self._w = self.create_parameter( - attr=self._param_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=False) + @property + def _w(self, i=0): + return self.__w[i] - if self._bias_attr: - size = list([self._size]) - self._b = self.create_parameter( - attr=self._bias_attr, - shape=size, - dtype=self._dtype, - is_bias=True) - else: - self._b = None + @_w.setter + def _w(self, value, i=0): + assert isinstance(value, Parameter) + self.__w[i] = value - def forward(self, input): - tmp = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="mul", - inputs={"X": input, - "Y": self._w}, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": self._num_flatten_dims, - "y_num_col_dims": 1 - }) + def _build_once(self, input): + i = 0 + for inp, param in self._helper.iter_inputs_and_params(input, + self._param_attr): + input_shape = inp.shape + + param_shape = [ + reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], + 1) + ] + [self._size] + self.__w.append( + self.add_parameter( + '_w%d' % i, + self.create_parameter( + attr=param, + shape=param_shape, + dtype=self._dtype, + is_bias=False))) + i += 1 + + size = list([self._size]) + self._b = self.create_parameter( + attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True) - pre_bias = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type="sum", - 
inputs={"X": [tmp]}, - outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}) + def forward(self, input): + mul_results = list() + i = 0 + for inp, param in self._helper.iter_inputs_and_params(input, + self._param_attr): + tmp = self._helper.create_variable_for_type_inference(self._dtype) + self._helper.append_op( + type="mul", + inputs={"X": inp, + "Y": self.__w[i]}, + outputs={"Out": tmp}, + attrs={ + "x_num_col_dims": self._num_flatten_dims, + "y_num_col_dims": 1 + }) + i += 1 + mul_results.append(tmp) + + if len(mul_results) == 1: + pre_bias = mul_results[0] + else: + pre_bias = self._helper.create_variable_for_type_inference( + self._dtype) + self._helper.append_op( + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + attrs={"use_mkldnn": False}) if self._b: pre_activation = self._helper.create_variable_for_type_inference( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 018e38cbb3f2676ac05f1a27e9e92b6e0f16efb0..e4666deb7fabe3628856269b6c665aacec1e9ee4 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -23,6 +23,7 @@ from .framework import Program, default_main_program, Variable from . import core from . import compiler from .. 
import compat as cpt +from .trainer_factory import TrainerFactory __all__ = ['Executor', 'global_scope', 'scope_guard'] @@ -610,3 +611,209 @@ class Executor(object): def _run_inference(self, exe, feed): return exe.run(feed) + + def _dump_debug_info(self, program=None, trainer=None): + with open(str(id(program)) + "_train_desc.prototxt", "w") as fout: + fout.write(trainer._desc()) + if program._fleet_opt: + with open("fleet_desc.prototxt", "w") as fout: + fout.write(str(program._fleet_opt["fleet_desc"])) + + def _prepare_trainer(self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100): + if scope is None: + scope = global_scope() + if fetch_list is None: + fetch_list = [] + if fetch_info is None: + fetch_info = [] + assert len(fetch_list) == len(fetch_info) + compiled = isinstance(program, compiler.CompiledProgram) + if not compiled: + trainer = TrainerFactory()._create_trainer(program._fleet_opt) + trainer._set_program(program) + else: + trainer = TrainerFactory()._create_trainer( + program.program._fleet_opt) + trainer._set_program(program.program) + if thread <= 0: + if dataset.thread_num <= 0: + raise RuntimeError( + "You should set thread num first, either in Dataset" + "or in Executor.train_from_dataset") + else: + trainer._set_thread(dataset.thread_num) + else: + trainer._set_thread(thread) + trainer._set_debug(debug) + trainer._set_fetch_var_and_info(fetch_list, fetch_info, print_period) + return scope, trainer + + def infer_from_dataset(self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100): + """ + The document of infer_from_dataset is almost the same as + train_from_dataset, except that in distributed training, + push gradients will be disabled in infer_from_dataset. + infer_from_dataset() can be used for evaluation in multi-thread + very easily. 
+ + Args: + program(Program|CompiledProgram): the program that needs to be run, + if not provided, then default_main_program (not compiled) will be used. + dataset(paddle.fluid.Dataset): dataset created outside this function, + a user should provide a well-defined dataset before calling this function. + Please check the document of Dataset if needed. default is None + scope(Scope): the scope used to run this program, you can switch it to different scope + for each run. default is global_scope + thread(int): number of thread a user wants to run in this function. The actual number + of thread will be min(Dataset.thread_num, thread) if thread > 0, default is 0 + debug(bool): whether a user wants to run infer_from_dataset, default is False + fetch_list(Variable List): fetch variable list, each variable + will be printed during training, default is None + fetch_info(String List): print information for each variable, default is None + print_period(int): the number of mini-batches for each print, default is 100 + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + place = fluid.CPUPlace() + exe = fluid.Executor(place) + x = fluid.layers.data(name="x", type="int64") + y = fluid.layers.data(name="y", type="int64") + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_use_var([x, y]) + filelist = ["dataA.txt", "dataB.txt"] + dataset.set_filelist(filelist) + exe.run(fluid.default_startup_program()) + exe.infer_from_dataset(program=fluid.default_main_program(), + dataset=dataset) + + """ + if dataset == None: + raise RuntimeError("dataset is needed and should be initialized") + + if self.place == paddle.fluid.CUDAPlace(): + raise RuntimeError("infer_from_dataset is verified on CPUPlace" + "We will open CUDAPlace in the future") + + scope, trainer = self._prepare_trainer( + program=program, + dataset=dataset, + scope=scope, + thread=thread, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period) + trainer._set_infer(True) + trainer._gen_trainer_desc() + dataset._prepare_to_run() + if debug: + self._dump_debug_info(program=program, trainer=trainer) + self._default_executor.run_from_dataset(program.desc, scope, + dataset.dataset, + trainer._desc()) + return None + + def train_from_dataset(self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100): + """ + Train from a pre-defined Dataset. Dataset is defined in paddle.fluid.dataset. + Given a program, either a program or compiled program, train_from_dataset will + consume all data samples in dataset. Input scope can be given by users. By default, + scope is global_scope(). The total number of thread run in training is `thread`. + Thread number used in training will be minimum value of threadnum in Dataset and + the value of thread in this interface. Debug can be set so that executor will display + Run-Time for all operators and the throughputs of current training task. 
+ + Note: train_from_dataset will destroy all resources created within executor for each run. + + Args: + program(Program|CompiledProgram): the program that needs to be run, + if not provided, then default_main_program (not compiled) will be used. + dataset(paddle.fluid.Dataset): dataset created outside this function, + a user should provide a well-defined dataset before calling this function. + Please check the document of Dataset if needed. + scope(Scope): the scope used to run this program, you can switch it to different scope + for each run. default is global_scope + thread(int): number of thread a user wants to run in this function. The actual number + of thread will be min(Dataset.thread_num, thread) + debug(bool): whether a user wants to run train_from_dataset + fetch_list(Variable List): fetch variable list, each variable + will be printed during training + fetch_info(String List): print information for each variable + print_period(int): the number of mini-batches for each print + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle.fluid as fluid + place = fluid.CPUPlace() + exe = fluid.Executor(place) + x = fluid.layers.data(name="x", type="int64") + y = fluid.layers.data(name="y", type="int64") + dataset = fluid.DatasetFactory().create_dataset() + dataset.set_use_var([x, y]) + dataset.set_thread(2) + filelist = ["dataA.txt", "dataB.txt"] + dataset.set_filelist(filelist) + exe.run(fluid.default_startup_program()) + exe.train_from_dataset(program=fluid.default_main_program(), + dataset=dataset) + + """ + if dataset == None: + raise RuntimeError("dataset is need and should be initialized") + + if self.place == paddle.fluid.CUDAPlace(): + raise RuntimeError("train_from_dataset is verified on CPUPlace" + "We will open CUDAPlace in the future") + + scope, trainer = self._prepare_trainer( + program=program, + dataset=dataset, + scope=scope, + thread=thread, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period) + trainer._gen_trainer_desc() + dataset._prepare_to_run() + if debug: + self._dump_debug_info(program=program, trainer=trainer) + self._default_executor.run_from_dataset(program.desc, scope, + dataset.dataset, + trainer._desc()) + return None diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 8c71e7dc5b711d7f3a283092f4352eff932b0edc..7953d98bcbb826267fa21f6503e55049c8aff5ba 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2716,6 +2716,11 @@ class Program(object): # whether the program is optimized by memory_optimize_transpiler self.__is_mem_optimized = False + # if this program has been optimized by distributed optimizer + # fleet_opt will be given a value + self._fleet_opt = None + self._program_config = None + @property def _is_mem_optimized(self): # if the program is optimized, operator input/outputs diff --git a/python/paddle/fluid/incubate/__init__.py b/python/paddle/fluid/incubate/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..76c5c6391fde3cafbd9a94e1d11e0ef4401420ed --- /dev/null +++ b/python/paddle/fluid/incubate/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# incubate directory is mainly for internal use +# after we have tested incubate APIs in industrial application for a period +# we will move stable functions into fluid +__version__ = '0.1.0' diff --git a/python/paddle/fluid/incubate/data_generator/__init__.py b/python/paddle/fluid/incubate/data_generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0407d67ea420bdcb3caa5aaf58ce674613091d2d --- /dev/null +++ b/python/paddle/fluid/incubate/data_generator/__init__.py @@ -0,0 +1,330 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import sys + +__all__ = ['MultiSlotDataGenerator'] + + +class DataGenerator(object): + """ + DataGenerator is a general Base class for user to inherit + A user who wants to define his/her own python processing logic + with paddle.fluid.dataset should inherit this class. + """ + + def __init__(self): + self._proto_info = None + self.batch_size_ = 32 + + def _set_line_limit(self, line_limit): + if not isinstance(line_limit, int): + raise ValueError("line_limit%s must be in int type" % + type(line_limit)) + if line_limit < 1: + raise ValueError("line_limit can not less than 1") + self._line_limit = line_limit + + def set_batch(self, batch_size): + ''' + Set batch size of current DataGenerator + This is necessary only if a user wants to define generator_batch + + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", int_words) + return local_iter + + def generate_batch(self, samples): + def local_iter(): + for s in samples: + yield ("words", s[1].extend([s[1][0]])) + mydata = MyData() + mydata.set_batch(128) + + ''' + self.batch_size_ = batch_size + + def run_from_memory(self): + ''' + This function generator data from memory, it is usually used for + debug and benchmarking + + Example: + .. 
code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + yield ("words", [1, 2, 3, 4]) + return local_iter + + mydata = MyData() + mydata.run_from_memory() + ''' + batch_samples = [] + line_iter = self.generate_sample(None) + for user_parsed_line in line_iter(): + if user_parsed_line == None: + continue + batch_samples.append(user_parsed_line) + if len(batch_samples) == self.batch_size_: + batch_iter = self.generate_batch(batch_samples) + for sample in batch_iter(): + sys.stdout.write(self._gen_str(sample)) + batch_samples = [] + if len(batch_samples) > 0: + batch_iter = self.generate_batch(batch_samples) + for sample in batch_iter(): + sys.stdout.write(self._gen_str(sample)) + + def run_from_stdin(self): + ''' + This function reads the data row from stdin, parses it with the + process function, and further parses the return value of the + process function with the _gen_str function. The parsed data will + be wrote to stdout and the corresponding protofile will be + generated. + + Example: + + .. 
code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", [int_words]) + return local_iter + + mydata = MyData() + mydata.run_from_stdin() + + ''' + batch_samples = [] + for line in sys.stdin: + line_iter = self.generate_sample(line) + for user_parsed_line in line_iter(): + if user_parsed_line == None: + continue + batch_samples.append(user_parsed_line) + if len(batch_samples) == self.batch_size_: + batch_iter = self.generate_batch(batch_samples) + for sample in batch_iter(): + sys.stdout.write(self._gen_str(sample)) + batch_samples = [] + if len(batch_samples) > 0: + batch_iter = self.generate_batch(batch_samples) + for sample in batch_iter(): + sys.stdout.write(self._gen_str(sample)) + + def _gen_str(self, line): + ''' + Further processing the output of the process() function rewritten by + user, outputting data that can be directly read by the datafeed,and + updating proto_info infomation. + + Args: + line(str): the output of the process() function rewritten by user. + + Returns: + Return a string data that can be read directly by the datafeed. + ''' + raise NotImplementedError( + "pls use MultiSlotDataGenerator or PairWiseDataGenerator") + + def generate_sample(self, line): + ''' + This function needs to be overridden by the user to process the + original data row into a list or tuple. + + Args: + line(str): the original data row + + Returns: + Returns the data processed by the user. + The data format is list or tuple: + [(name, [feasign, ...]), ...] + or ((name, [feasign, ...]), ...) + + For example: + [("words", [1926, 08, 17]), ("label", [1])] + or (("words", [1926, 08, 17]), ("label", [1])) + + Note: + The type of feasigns must be in int or float. Once the float + element appears in the feasign, the type of that slot will be + processed into a float. + + Example: + + .. 
code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", [int_words]) + return local_iter + + ''' + raise NotImplementedError( + "Please rewrite this function to return a list or tuple: " + + "[(name, [feasign, ...]), ...] or ((name, [feasign, ...]), ...)") + + def generate_batch(self, samples): + ''' + This function needs to be overridden by the user to process the + generated samples from generate_sample(self, str) function + It is usually used as batch processing when a user wants to + do preprocessing on a batch of samples, e.g. padding according to + the max length of a sample in the batch + + Args: + samples(list tuple): generated sample from generate_sample + + Returns: + a python generator, the same format as return value of generate_sample + + Example: + + .. code-block:: python + import paddle.fluid.incubate.data_generator as dg + class MyData(dg.DataGenerator): + + def generate_sample(self, line): + def local_iter(): + int_words = [int(x) for x in line.split()] + yield ("words", int_words) + return local_iter + + def generate_batch(self, samples): + def local_iter(): + for s in samples: + yield ("words", s[1].extend([s[1][0]])) + mydata = MyData() + mydata.set_batch(128) + ''' + + def local_iter(): + for sample in samples: + yield sample + + return local_iter + + +class MultiSlotDataGenerator(DataGenerator): + def _gen_str(self, line): + ''' + Further processing the output of the process() function rewritten by + user, outputting data that can be directly read by the MultiSlotDataFeed, + and updating proto_info infomation. + + The input line will be in this format: + >>> [(name, [feasign, ...]), ...] + >>> or ((name, [feasign, ...]), ...) + The output will be in this format: + >>> [ids_num id1 id2 ...] ... + The proto_info will be in this format: + >>> [(name, type), ...] 
+ + For example, if the input is like this: + >>> [("words", [1926, 08, 17]), ("label", [1])] + >>> or (("words", [1926, 08, 17]), ("label", [1])) + the output will be: + >>> 3 1234 2345 3456 1 1 + the proto_info will be: + >>> [("words", "uint64"), ("label", "uint64")] + + Args: + line(str): the output of the process() function rewritten by user. + + Returns: + Return a string data that can be read directly by the MultiSlotDataFeed. + ''' + if not isinstance(line, list) and not isinstance(line, tuple): + raise ValueError( + "the output of process() must be in list or tuple type") + output = "" + + if self._proto_info is None: + self._proto_info = [] + for item in line: + name, elements = item + if not isinstance(name, str): + raise ValueError("name%s must be in str type" % type(name)) + if not isinstance(elements, list): + raise ValueError("elements%s must be in list type" % + type(elements)) + if not elements: + raise ValueError( + "the elements of each field can not be empty, you need padding it in process()." + ) + self._proto_info.append((name, "uint64")) + if output: + output += " " + output += str(len(elements)) + for elem in elements: + if isinstance(elem, float): + self._proto_info[-1] = (name, "float") + elif not isinstance(elem, int) and not isinstance(elem, + long): + raise ValueError( + "the type of element%s must be in int or float" % + type(elem)) + output += " " + str(elem) + else: + if len(line) != len(self._proto_info): + raise ValueError( + "the complete field set of two given line are inconsistent.") + for index, item in enumerate(line): + name, elements = item + if not isinstance(name, str): + raise ValueError("name%s must be in str type" % type(name)) + if not isinstance(elements, list): + raise ValueError("elements%s must be in list type" % + type(elements)) + if not elements: + raise ValueError( + "the elements of each field can not be empty, you need padding it in process()." 
+ ) + if name != self._proto_info[index][0]: + raise ValueError( + "the field name of two given line are not match: require<%s>, get<%d>." + % (self._proto_info[index][0], name)) + if output: + output += " " + output += str(len(elements)) + for elem in elements: + if self._proto_info[index][1] != "float": + if isinstance(elem, float): + self._proto_info[index] = (name, "float") + elif not isinstance(elem, int) and not isinstance(elem, + long): + raise ValueError( + "the type of element%s must be in int or float" + % type(elem)) + output += " " + str(elem) + return output + "\n" diff --git a/python/paddle/fluid/incubate/data_generator/test_data_generator.py b/python/paddle/fluid/incubate/data_generator/test_data_generator.py new file mode 100644 index 0000000000000000000000000000000000000000..ea42551efb63e00a06d7eca3e7cf6e9d7082f0f3 --- /dev/null +++ b/python/paddle/fluid/incubate/data_generator/test_data_generator.py @@ -0,0 +1,26 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +from __init__ import * + + +class SyntheticData(MultiSlotDataGenerator): + def generate_sample(self, line): + def data_iter(): + for i in range(10000): + yield ("words", [1, 2, 3, 4]), ("label", [0]) + + return data_iter + + +sd = SyntheticData() +sd.run_from_memory() diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/incubate/fleet/__init__.py similarity index 75% rename from python/paddle/fluid/trainer.py rename to python/paddle/fluid/incubate/fleet/__init__.py index b495b6699b5d02ca8c466c984820be5c497d626e..a05baabca392b14a4cb09a3f395ae7687d8a5e62 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/incubate/fleet/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -10,7 +10,5 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License. -# NOTE: Trainer is moved into fluid.contrib.trainer. -__all__ = [] +__version__ = '0.1.0' diff --git a/python/paddle/fluid/incubate/fleet/base/__init__.py b/python/paddle/fluid/incubate/fleet/base/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8647330f3290f3142cabca9a7e3fe162a9838dda --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/base/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
class RoleMakerBase(object):
    """
    RoleMakerBase is a base class for assigning a role to the current process
    in distributed training.
    A paddle developer can subclass RoleMakerBase to design a role maker
    for worker or pserver assignment; subclasses must implement
    _is_worker(), _is_server() and _generate_role().
    """

    def __init__(self):
        self.role_maker_name_ = ""
        self.trainer_endpoints_ = []
        self.pserver_endpoints_ = []
        # set to True by a subclass once _generate_role() has run
        self.role_is_generated_ = False

    def _is_worker(self):
        """
        return is_worker() of current process
        """
        raise NotImplementedError("Please implement this method in child class")

    def _is_server(self):
        """
        return is_server() of current process
        """
        raise NotImplementedError("Please implement this method in child class")

    def _get_local_ip(self):
        """
        resolve the local host name to an ip, cache it in self.ip_ and return it
        """
        import socket
        self.ip_ = socket.gethostbyname(socket.gethostname())
        return self.ip_

    def _get_trainer_endpoints(self):
        """
        return trainer endpoints
        """
        return self.trainer_endpoints_

    def _get_pserver_endpoints(self):
        """
        return pserver endpoints
        """
        return self.pserver_endpoints_

    def _generate_role(self):
        """
        generate_role() should be called to identify current process's role
        """
        raise NotImplementedError("Please implement this method in child class")


class MPIRoleMaker(RoleMakerBase):
    """
    MPIRoleMaker is a MPI-API based role maker which is a counter-part of K8SRoleMaker
    mpi4py will be used if a developer inherits MPIRoleMaker
    """

    def __init__(self):
        super(MPIRoleMaker, self).__init__()
        # imported lazily so that mpi4py is only required when MPI is used
        from mpi4py import MPI
        self.comm_ = MPI.COMM_WORLD
        self.MPI = MPI
        self.ips_ = None

    def _get_rank(self):
        """
        return the rank of current process in comm_ (also cached in self.rank_)
        """
        self.rank_ = self.comm_.Get_rank()
        return self.rank_

    def _get_size(self):
        """
        return the number of processes in comm_ (also cached in self.size_)
        """
        self.size_ = self.comm_.Get_size()
        return self.size_

    def _all_gather(self, obj):
        """
        barrier on all processes, then gather obj from every process
        through the communicator's allgather
        """
        self._barrier_all()
        return self.comm_.allgather(obj)

    def _worker_gather(self, obj):
        """
        gather obj among workers only, using node_type_comm_ (which is
        created by the subclass's _generate_role()); returns None when the
        current process is not a worker
        """
        if self._is_worker():
            self.node_type_comm_.barrier()
            return self.node_type_comm_.allgather(obj)
        return None

    def _barrier_all(self):
        """
        block until every process in comm_ reaches this barrier
        """
        self.comm_.barrier()

    def _get_ips(self):
        """
        collect current distributed job's ip list (gathered once, then cached)
        """
        if self.ips_ is None:
            self.ips_ = self.comm_.allgather(self._get_local_ip())
        return self.ips_

    def _finalize(self):
        """
        finalize the current MPI instance.
        """
        # Finalize is a function of the MPI module; communicator objects
        # do not provide a finalize() method.
        self.MPI.Finalize()


class MPISymetricRoleMaker(MPIRoleMaker):
    """
    MPISymetricRoleMaker is designed for worker and server assignment
    under MPI. Typically, a worker and a server node will be appointed
    on each physical node. This role maker can be only used under MPI.
    """

    def __init__(self):
        super(MPISymetricRoleMaker, self).__init__()
        # 0 means server, 1 means worker; assigned in _generate_role()
        self.node_type_ = None
        self.proc_per_node_ = 2

    def _check_role_generation(self):
        """
        return True when _generate_role() has been called; otherwise report
        the error on stderr and terminate the process with exit code -1
        """
        if not self.role_is_generated_:
            sys.stderr.write("generate_role() should be called first")
            sys.exit(-1)
        return True

    def _is_first_worker(self):
        """
        return whether current process is the first worker assigned by role maker
        """
        if self._check_role_generation():
            return self._is_worker() and 0 == self._worker_index()
        return False

    def _is_worker(self):
        """
        return whether current process is worker assigned by role maker
        """
        if self._check_role_generation():
            return self.node_type_ == 1
        return False

    def _is_server(self):
        """
        return whether current process is server assigned by role maker
        """
        if self._check_role_generation():
            return self.node_type_ == 0
        return False

    def _worker_num(self):
        """
        return the number of workers (half of all processes) when called on
        a worker, 0 otherwise
        """
        if self._check_role_generation():
            if self._is_worker():
                # floor division keeps the count an int on Python 3 as well
                return self._get_size() // 2
        return 0

    def _server_num(self):
        """
        return the number of servers (half of all processes) when called on
        a server, 0 otherwise
        """
        if self._check_role_generation():
            if self._is_server():
                return self._get_size() // 2
        return 0

    def _worker_index(self):
        """
        return the index of worker
        """
        if self._check_role_generation():
            # floor division: an index must stay an int on Python 3 as well
            return self.rank_ // self.proc_per_node_
        return 0

    def _server_index(self):
        """
        return the index of server
        """
        if self._check_role_generation():
            return self.rank_ // self.proc_per_node_
        return 0

    def _barrier_worker(self):
        """
        barrier all workers in current distributed job
        """
        if self._check_role_generation():
            if self._is_worker():
                self.node_type_comm_.barrier()

    def _barrier_server(self):
        """
        barrier all servers in current distributed job
        """
        if self._check_role_generation():
            if self._is_server():
                self.node_type_comm_.barrier()

    def _generate_role(self):
        """
        generate current process's role; calls after the first one are no-ops
        """
        if not self.role_is_generated_:
            # TODO(guru4elephant): only allow to be called once
            self.trainer_endpoints_ = self._get_ips()
            self.pserver_endpoints_ = self._get_ips()

            if 0 == self._get_rank() % self.proc_per_node_ % 2:
                self.node_type_ = 0
            else:
                self.node_type_ = 1
            # one sub-communicator per node type (servers / workers)
            self.node_type_comm_ = self.comm_.Split(self.node_type_)
            self.role_is_generated_ = True
class Fleet(object):
    """
    Fleet in Python. Fleet is used in distributed training. It is designed as a singleton instance
    in c++. A Fleet() object will be initialized automatically when a user imports this package as
    fleet. The general interface Fleet supports is:
    init(): which should be called only once in user's python scripts. init() will initialize
        FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying
        current node's role, e.g. worker, server, etc.
    stop(): will be called after a user finishes his/her training task. Fleet instance will be
        destroyed when stop() is called.
    init_pserver(): will be called by user. When a user knows current process is_server(), he/she
        should call init_pserver() to initialize global information about parameter server
    init_worker(): will be called by user. When a user knows current process is_worker(), he/she
        should call init_worker() to initialize global information about worker and connect
        worker with pserver.
    get_worker_num(): return the number of current task's worker node
    get_server_num(): return the number of current task's pserver node
    is_worker(): return whether current process is a worker
    is_server(): return whether current process is a server
    init_pserver_model(): initialize model parameters in pserver, triggered by the first worker
    save_pserver_model(): save model parameters held by pserver to a path

    Example:

        .. code-block:: python
           import paddle.fluid.incubate.fleet.parameter_server as fleet
           from my_model import bow_net
           model = bow_net()
           fleet.init()
           sgd_optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.0001)
           sgd_optimizer = fleet.DistributedOptimizer(sgd_optimizer)
           sgd_optimizer.minimize(model.loss)
           exe = paddle.fluid.Executor(paddle.fluid.CPUPlace())
           if fleet.is_worker():
              exe.run(paddle.fluid.default_startup_program())
              # init worker should be called before training; it takes the
              # program(s) that minimize() was run on
              fleet.init_worker(model.loss.block.program)
              # do other things like training
           elif fleet.is_server():
              fleet.init_pserver()
           fleet.stop()
    """

    def __init__(self):
        self._opt_info = None  # for fleet only
        self.role_maker_ = None
        self.local_ip_ = 0
        self.is_initialized_ = False

    def init(self):
        """
        init(): which should be called only once in user's python scripts. init() will initialize
            FleetWrapper in CPP, it will also initialize a RoleMaker which is used for identifying
            current node's role, e.g. worker, server, etc.
        """
        # TODO(guru4elephant)
        # this is a temporary solution
        # we will support more configurable RoleMaker for users in the future
        if not self.is_initialized_:
            self.role_maker_ = MPISymetricRoleMaker()
            self.role_maker_._generate_role()
            self._fleet_ptr = fluid.core.Fleet()
            self.is_initialized_ = True

    def stop(self):
        """
        stop(): will be called after a user finishes his/her training task. Fleet instance will be
            destroyed when stop() is called.
        """
        self.role_maker_._barrier_worker()
        if self.role_maker_._is_first_worker():
            self._fleet_ptr.stop_server()
        self.role_maker_._barrier_worker()
        self.role_maker_._barrier_all()
        self.role_maker_._finalize()

    def _load_fleet_desc(self):
        """
        Cache the fleet description stored by DistributedOptimizer.minimize()
        in self._dist_desc (message) and self._dist_desc_str (text format).
        Exits the process with code -1 when minimize() has not been run yet.
        """
        if not self._opt_info or "fleet_desc" not in self._opt_info:
            sys.stderr.write(
                "You should run DistributedOptimizer.minimize() first\n")
            sys.exit(-1)
        self._dist_desc = self._opt_info["fleet_desc"]
        self._dist_desc_str = text_format.MessageToString(self._dist_desc)

    def init_pserver(self):
        """
        init_pserver(): will be called by user. When a user knows current process is_server(), he/she
            should call init_pserver() to initialize global information about parameter server
        """
        self._load_fleet_desc()
        self._fleet_ptr.init_server(self._dist_desc_str,
                                    self.role_maker_._get_rank())
        self.local_ip_ = self._fleet_ptr.run_server()
        # barrier_all for init_server
        self.role_maker_._barrier_all()
        self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)

        self._fleet_ptr.gather_servers(self.all_ips_,
                                       self.role_maker_._get_size())
        # barrier_all for init_worker, wait all workers start
        self.role_maker_._barrier_all()

    def init_worker(self, programs):
        """
        init_worker(): will be called by user. When a user knows current process is_worker(), he/she
                    should call init_worker() to initialize global information about worker and connect
                    worker with pserver.

        Args:
            programs(Program|list): a Program or a list of Programs

        """
        if not isinstance(programs, list):
            programs = [programs]
        self._load_fleet_desc()
        # barrier_all for init_server, wait for server starts
        self.role_maker_._barrier_all()
        self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
        self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
                                    self.role_maker_._get_size(),
                                    self.role_maker_._get_rank())
        # barrier_all for init_worker
        self.role_maker_._barrier_all()
        # prepare for client to client communication
        info = self._fleet_ptr.get_clients_info()
        all_info = self.role_maker_._worker_gather(info[0])
        self._fleet_ptr.gather_clients(all_info)
        self._fleet_ptr.create_client2client_connection()
        # barrier for init model
        self.role_maker_._barrier_worker()
        if self.role_maker_._is_first_worker():
            tables = self._dist_desc.trainer_param.dense_table
            for prog in programs:
                # program configs are keyed by the stringified id() of the program
                prog_id = str(id(prog))
                prog_conf = self._opt_info['program_configs'][prog_id]
                # ids of every dense table this program pulls from or pushes to
                prog_tables = set()
                for key in prog_conf:
                    if "dense" not in key:
                        continue
                    prog_tables.update(
                        int(table_id) for table_id in prog_conf[key])
                for table in tables:
                    if int(table.table_id) not in prog_tables:
                        continue
                    self._fleet_ptr.init_model(prog.desc,
                                               int(table.table_id),
                                               list(table.dense_variable_name))
        # barrier for init model done
        self.role_maker_._barrier_worker()

    def get_worker_num(self):
        """
        return the number of current job's worker num
        """
        return self.role_maker_._worker_num()

    def get_server_num(self):
        """
        return the number of current job's server num
        """
        return self.role_maker_._server_num()

    def get_worker_index(self):
        """
        return the index of current worker as computed by the role maker
        """
        return self.role_maker_._worker_index()

    def is_worker(self):
        """
        return whether current node is a worker
        """
        return self.role_maker_._is_worker()

    def is_server(self):
        """
        return whether current node is pserver
        """
        return self.role_maker_._is_server()

    def init_pserver_model(self):
        """
        init pserver model; only the first worker triggers the
        initialization, then all workers synchronize on a barrier
        """
        if self.role_maker_._is_first_worker():
            self._fleet_ptr.init_model()
        self.role_maker_._barrier_worker()

    def save_pserver_model(self, save_path):
        """
        save pserver model called from a worker
        """
        self._fleet_ptr.save_model(save_path)

    def _set_opt_info(self, opt_info):
        """
        this function saves the result from DistributedOptimizer.minimize()
        """
        self._opt_info = opt_info


class DistributedOptimizer(object):
    """
    DistributedOptimizer is a wrapper for paddle.fluid.optimizer
    A user should pass a paddle.fluid.optimizer to DistributedOptimizer
    minimize() function is implemented.
    DistributedOptimizer is the starting point for a user who wants to
    run distributed training. The optimized information will be stored in
    Fleet() instance who holds the global information about current distributed
    training.
    """

    def __init__(self, optimizer, dist_config={}):
        """
        Args:
            optimizer: the paddle.fluid optimizer to wrap; its `type`
                selects the distributed implementation.
            dist_config(dict): currently unused.
        """
        super(DistributedOptimizer, self).__init__()
        self._optimizer = optimizer
        self._optimizer_name = "Distributed%s" % optimizer.type.capitalize()
        if optimizer.type != "adam":
            # warn on stderr and fall back to the only supported implementation
            sys.stderr.write(
                "Currently, distributed optimizer only supports Adam. "
                "Will config built-in adam for you. "
                "We will support more functions in DistributedOptimizer\n")
            self._optimizer_name = "DistributedAdam"

        # the implementation class is looked up by name in this module
        self._distributed_optimizer = globals()[self._optimizer_name](optimizer)

    def backward(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None,
                 callbacks=None):
        """
        Currently, backward function can not be called through DistributedOptimizer
        """
        raise NotImplementedError()

    def apply_gradients(self, params_grads):
        """
        Currently, apply_gradients function can not be called through DistributedOptimizer
        """
        raise NotImplementedError()

    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """
        minimize a program through loss, loss can be a list in DistributedOptimizer
        Args:
            loss (Variable|Variable List): loss variable or loss variable list to run optimization.
            startup_program (Program): startup_program for initializing parameters
                in `parameter_list`.
            parameter_list (list): list of Variables to update.
            no_grad_set (set|None): set of Variables should be ignored.
        Returns:
            list: [optimize_ops, params_grads] which are, list of operators appended;
            and list of (param, grad) Variables pair for optimization.
        Note that in parameter server mode, a worker will not get anything about optimize_ops
        Because optimizer algorithms run on pserver side. We will make this usable in pserver
        process, but currently the optimization part is written into Fleet(). A user does not
        need to care about how to startup a pserver node.
        """
        optimize_ops, param_grads, opt_info = \
                      self._distributed_optimizer._minimize(
                          loss,
                          startup_program,
                          parameter_list,
                          no_grad_set)

        # hand the distributed configuration over to the module-level Fleet
        fleet_instance._set_opt_info(opt_info)
        return [optimize_ops, param_grads]


# this is a temporary solution
# TODO(guru4elephant)
# will make this more flexible for more Parameter Server Archs
fleet_instance = Fleet()

init = fleet_instance.init
stop = fleet_instance.stop
init_pserver = fleet_instance.init_pserver
init_worker = fleet_instance.init_worker
is_worker = fleet_instance.is_worker
is_server = fleet_instance.is_server
init_pserver_model = fleet_instance.init_pserver_model
save_pserver_model = fleet_instance.save_pserver_model
worker_num = fleet_instance.get_worker_num
server_num = fleet_instance.get_server_num
worker_index = fleet_instance.get_worker_index
class Server(object):
    """
        A Server basic class.
    """

    def __init__(self):
        pass


class Worker(object):
    """
        A Worker basic class.
    """

    def __init__(self):
        pass


class DownpourServer(Server):
    """
        DownpourServer class is used to generate server program_desc
        Args:
            server: it is pslib.ServerParameter()
        Examples:
            server = DownpourServer()
    """

    def __init__(self):
        self.server_ = pslib.ServerParameter()
        service = self.server_.downpour_server_param.service_param
        service.start_server_port = 0
        service.server_class = "DownpourBrpcPsServer"
        service.client_class = "DownpourBrpcPsClient"
        service.service_class = "DownpourPsService"
        service.server_thread_num = 12

    @staticmethod
    def _dense_fea_dim(param_var):
        """
        Sum the element counts (product of `shape`) of every variable in
        param_var whose name does not contain "embedding".
        """
        fea_dim = 0
        for param in param_var:
            if "embedding" in param.name:
                continue
            count = 1
            for dim in param.shape:
                count *= dim
            fea_dim += count
        return fea_dim

    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                         slot_value_var):
        """
        Append a sparse table description to the server parameter.
        Args:
            table_id(int): id of sparse params table
            learning_rate(float): the learning rate used to update parameters. \
                Can be a float value
            slot_key_vars(string): slot key id (not used by the server description)
            slot_value_var(string): slot key value after embedding (not used
                by the server description)
        Returns:
            return None
        """
        table = self.server_.downpour_server_param.downpour_table_param.add()
        table.table_id = table_id
        table.table_class = "DownpourSparseTable"
        table.type = pslib.PS_SPARSE_TABLE

        accessor = table.accessor
        accessor.accessor_class = "DownpourFeatureValueAccessor"
        accessor.sparse_sgd_param.learning_rate = learning_rate
        accessor.sparse_sgd_param.initial_g2sum = 3
        accessor.sparse_sgd_param.initial_range = 1e-4
        accessor.sparse_sgd_param.weight_bounds.extend([-10, 10])

        accessor.embedx_dim = 8
        accessor.embedx_threshold = 5
        accessor.fea_dim = 11
        accessor.downpour_accessor_param.nonclk_coeff = 0.1
        accessor.downpour_accessor_param.click_coeff = 2
        accessor.downpour_accessor_param.base_threshold = 0.2
        accessor.downpour_accessor_param.delta_threshold = 0.15
        accessor.downpour_accessor_param.delta_keep_days = 31
        accessor.downpour_accessor_param.show_click_decay_rate = 0.999
        accessor.downpour_accessor_param.delete_threshold = 0.8

    def add_dense_table(self, table_id, learning_rate, param_var, grad_var):
        """
        Append an adam-configured dense table description to the server parameter.
        Args:
            table_id(int): id of dense params table
            learning_rate(float): the learning rate used to update parameters. \
                Can be a float value
            param_var(list): all dense param. it is a list.
            grad_var(list): all dense grad param. it is a list (not used by
                the server description).
        Returns:
            return None
        """
        table = self.server_.downpour_server_param.downpour_table_param.add()
        table.table_id = table_id
        table.table_class = "DownpourDenseTable"
        table.type = pslib.PS_DENSE_TABLE

        accessor = table.accessor
        accessor.accessor_class = "DownpourDenseValueAccessor"
        accessor.dense_sgd_param.name = "adam"
        accessor.dense_sgd_param.adam.learning_rate = learning_rate
        accessor.dense_sgd_param.adam.avg_decay_rate = 0.999993
        accessor.dense_sgd_param.adam.ada_decay_rate = 0.9999
        accessor.dense_sgd_param.adam.ada_epsilon = 1e-8
        accessor.dense_sgd_param.adam.mom_decay_rate = 0.99
        accessor.dense_sgd_param.naive.learning_rate = 0.0002
        accessor.fea_dim = self._dense_fea_dim(param_var)

    def add_data_norm_table(self, table_id, learning_rate, param_var, grad_var):
        """
        Append a "summary"-configured dense table description (used for data
        norm variables) to the server parameter.
        Args:
            table_id(int): id of data norm params table
            learning_rate(float): accepted for signature symmetry; not used
                by the summary configuration
            param_var(list): all data norm param. it is a list.
            grad_var(list): all data norm grad param. it is a list (not used
                by the server description).
        Returns:
            return None
        """
        table = self.server_.downpour_server_param.downpour_table_param.add()
        table.table_id = table_id
        table.table_class = "DownpourDenseTable"
        table.type = pslib.PS_DENSE_TABLE

        accessor = table.accessor
        accessor.accessor_class = "DownpourDenseValueAccessor"
        accessor.dense_sgd_param.name = "summary"
        accessor.dense_sgd_param.summary.summary_decay_rate = 0.999999
        accessor.fea_dim = self._dense_fea_dim(param_var)

    def get_desc(self):
        """
        Return downpour server program_desc
        """
        return self.server_


class DownpourWorker(Worker):
    """
        DownpourWorker class is used to generate worker program_desc
        Args:
            window (int): push params frequency
            worker: it is pslib.DownpourTrainerParameter
        Examples:
            worker = DownpourWorker(1)
    """

    def __init__(self, window):
        self.window = window
        self.worker_ = pslib.DownpourTrainerParameter()

    def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                         slot_value_vars):
        """
        Append a sparse table description to the worker parameter.
        Args:
            table_id(int): id of sparse params table
            learning_rate(float): the learning rate used to update parameters. \
                Can be a float value (not used by the worker description)
            slot_key_vars(list): variables whose names are the slot keys
            slot_value_vars(list): variables holding the slot values after
                embedding; their gradients are named "<name>@GRAD"
        Returns:
            return None
        """
        table = self.worker_.sparse_table.add()
        table.table_id = table_id
        table.slot_key.extend([var.name for var in slot_key_vars])
        table.slot_value.extend([var.name for var in slot_value_vars])
        table.slot_gradient.extend(
            [var.name + "@GRAD" for var in slot_value_vars])

    def add_dense_table(self, table_id, learning_rate, param_vars, grad_vars):
        """
        Append a dense table description to the worker parameter; variables
        whose name contains "embedding" are left out.
        Args:
            table_id(int): id of dense params table
            learning_rate(float): the learning rate used to update parameters. \
                Can be a float value (not used by the worker description)
            param_vars(list): all dense param. it is a list.
            grad_vars(list): all dense grad param. it is a list.
        Returns:
            return None
        """
        table = self.worker_.dense_table.add()
        table.table_id = table_id
        table.dense_variable_name.extend(
            [p.name for p in param_vars if "embedding" not in p.name])
        table.dense_gradient_variable_name.extend(
            [g.name for g in grad_vars if "embedding" not in g.name])

    def get_desc(self):
        """
        Return downpour worker program_desc
        """
        return self.worker_
__all__ = ["DistributedAdam"]


class DistributedOptimizerImplBase(object):
    """Holds the wrapped optimizer together with its learning rate and regularization."""

    def __init__(self, optimizer):
        self.optimizer_ = optimizer
        self.learning_rate_ = optimizer._learning_rate
        self.regularization_ = optimizer.regularization

    def minimize(self,
                 losses,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        """Placeholder; does nothing and returns None."""
        pass


class DistributedAdam(DistributedOptimizerImplBase):
    """Builds the downpour parameter-server description for a set of losses."""

    def __init__(self, optimizer):
        # todo(guru4elephant): add more optimizers here as argument
        # todo(guru4elephant): make learning_rate as a variable
        super(DistributedAdam, self).__init__(optimizer)
        self.window_ = 1
        self.type = "downpour"
        # name suffixes that mark data-norm statistics and their gradients
        self.data_norm_name = [
            ".batch_size", ".batch_square_sum", ".batch_sum",
            ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
        ]

    def _minimize(self,
                  losses,
                  startup_program=None,
                  parameter_list=None,
                  no_grad_set=None):
        """
        Append backward operators for every loss and describe the sparse,
        dense and data-norm tables that server and worker have to set up.
        Args:
            losses(Variable|list): loss variable, or list of loss variables,
                defined by user
            startup_program(Program): startup program that defined by user
                (not used here)
            parameter_list(str list): parameter names defined by users
            no_grad_set(set): a set of variables that is defined by users
                so that these variables do not need gradient computation
        Returns:
            (None, params_grads of the first loss, opt_info)
        """
        if not isinstance(losses, list):
            losses = [losses]

        first_program = losses[0].block.program
        table_name = find_distributed_lookup_table(first_program)
        prefetch_slots = find_distributed_lookup_table_inputs(first_program,
                                                              table_name)
        prefetch_slots_emb = find_distributed_lookup_table_outputs(
            first_program, table_name)

        ps_param = pslib.PSParameter()
        server = DownpourServer()
        worker = DownpourWorker(self.window_)

        # a single sparse table (id 0) is shared by all programs
        sparse_table_index = 0
        server.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)
        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
                                prefetch_slots, prefetch_slots_emb)

        dense_table_index = 1
        program_configs = {}
        param_grads_list = []

        for loss in losses:
            program_id = str(id(loss.block.program))
            config = {
                "pull_sparse": [sparse_table_index],
                "push_sparse": [sparse_table_index]
            }
            program_configs[program_id] = config

            params_grads = sorted(
                fluid.backward.append_backward(loss, parameter_list,
                                               no_grad_set),
                key=lambda pair: pair[0].name)
            param_grads_list.append(params_grads)

            params, grads = [], []
            data_norm_params, data_norm_grads = [], []
            for pair in params_grads:
                param, grad = pair[0], pair[1]
                # classification is driven by the parameter name for both
                # lists; a variable is recorded once per matching suffix
                matches = [
                    suffix for suffix in self.data_norm_name
                    if param.name.endswith(suffix)
                ]
                if matches:
                    data_norm_params.extend([param] * len(matches))
                    data_norm_grads.extend([grad] * len(matches))
                else:
                    params.append(param)
                    grads.append(grad)

            server.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            worker.add_dense_table(dense_table_index, self.learning_rate_,
                                   params, grads)
            config["pull_dense"] = [dense_table_index]
            config["push_dense"] = [dense_table_index]

            if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                # data-norm variables get a dense table of their own
                dense_table_index += 1
                server.add_data_norm_table(dense_table_index,
                                           self.learning_rate_,
                                           data_norm_params, data_norm_grads)
                worker.add_dense_table(dense_table_index, self.learning_rate_,
                                       data_norm_params, data_norm_grads)
                config["pull_dense"].extend([dense_table_index])
                config["push_dense"].extend([dense_table_index])
            dense_table_index += 1

        ps_param.server_param.CopyFrom(server.get_desc())
        ps_param.trainer_param.CopyFrom(worker.get_desc())
        # Todo(guru4elephant): figure out how to support more sparse parameters
        # currently only support lookup_table
        worker_skipped_ops = ["lookup_table", "lookup_table_grad"]
        ps_param.trainer_param.skip_op.extend(worker_skipped_ops)

        opt_info = {
            "program_configs": program_configs,
            "trainer": "DistMultiTrainer",
            "device_worker": "DownpourSGD",
            "optimizer": "DownpourSGD",
            "fleet_desc": ps_param,
            "worker_skipped_ops": worker_skipped_ops,
        }

        for loss in losses:
            loss.block.program._fleet_opt = opt_info

        return None, param_grads_list[0], opt_info
0000000000000000000000000000000000000000..5c9b2def0761ac96e81181959852c49f0fd03bd8 --- /dev/null +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ps_pb2.py @@ -0,0 +1,2426 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: ps.proto + +import sys +_b = sys.version_info[0] < 3 and (lambda x: x) or (lambda x: x.encode('latin1')) +from google.protobuf.internal import enum_type_wrapper +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf import descriptor_pb2 +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor.FileDescriptor( + name='ps.proto', + package='paddle', + syntax='proto2', + serialized_pb=_b( + '\n\x08ps.proto\x12\x06paddle\"\x9e\x02\n\x0bPSParameter\x12\x14\n\x0cworker_class\x18\x01 \x01(\t\x12\x14\n\x0cserver_class\x18\x02 \x01(\t\x12\x16\n\x0einstance_class\x18\x03 \x01(\t\x12-\n\x0cworker_param\x18\x65 \x01(\x0b\x32\x17.paddle.WorkerParameter\x12-\n\x0cserver_param\x18\x66 \x01(\x0b\x32\x17.paddle.ServerParameter\x12\x38\n\rtrainer_param\x18\xad\x02 \x01(\x0b\x32 .paddle.DownpourTrainerParameter\x12\x33\n\x0f\x66s_client_param\x18\xf5\x03 
\x01(\x0b\x32\x19.paddle.FsClientParameter\"Q\n\x0fWorkerParameter\x12>\n\x15\x64ownpour_worker_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourWorkerParameter\"Q\n\x0fServerParameter\x12>\n\x15\x64ownpour_server_param\x18\x01 \x01(\x0b\x32\x1f.paddle.DownpourServerParameter\"O\n\x17\x44ownpourWorkerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\"\xfd\x01\n\x18\x44ownpourTrainerParameter\x12\x30\n\x0b\x64\x65nse_table\x18\x01 \x03(\x0b\x32\x1b.paddle.DenseTableParameter\x12\x32\n\x0csparse_table\x18\x02 \x03(\x0b\x32\x1c.paddle.SparseTableParameter\x12\x1d\n\x15push_sparse_per_batch\x18\x03 \x01(\x05\x12\x1c\n\x14push_dense_per_batch\x18\x04 \x01(\x05\x12\x0f\n\x07skip_op\x18\x05 \x03(\t\x12-\n\x0eprogram_config\x18\x06 \x03(\x0b\x32\x15.paddle.ProgramConfig\"\x99\x01\n\rProgramConfig\x12\x12\n\nprogram_id\x18\x01 \x02(\t\x12\x1c\n\x14push_sparse_table_id\x18\x02 \x03(\x05\x12\x1b\n\x13push_dense_table_id\x18\x03 \x03(\x05\x12\x1c\n\x14pull_sparse_table_id\x18\x04 \x03(\x05\x12\x1b\n\x13pull_dense_table_id\x18\x05 \x03(\x05\"{\n\x13\x44\x65nseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x1b\n\x13\x64\x65nse_variable_name\x18\x02 \x03(\t\x12$\n\x1c\x64\x65nse_gradient_variable_name\x18\x03 \x03(\t\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\x05\"z\n\x14SparseTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x05\x12\x13\n\x0b\x66\x65\x61ture_dim\x18\x02 \x01(\x05\x12\x10\n\x08slot_key\x18\x03 \x03(\t\x12\x12\n\nslot_value\x18\x04 \x03(\t\x12\x15\n\rslot_gradient\x18\x05 \x03(\t\"\x86\x01\n\x17\x44ownpourServerParameter\x12\x34\n\x14\x64ownpour_table_param\x18\x01 \x03(\x0b\x32\x16.paddle.TableParameter\x12\x35\n\rservice_param\x18\x02 \x01(\x0b\x32\x1e.paddle.ServerServiceParameter\"\xd7\x01\n\x16ServerServiceParameter\x12*\n\x0cserver_class\x18\x01 \x01(\t:\x14\x44ownpourBrpcPsServer\x12*\n\x0c\x63lient_class\x18\x02 \x01(\t:\x14\x44ownpourBrpcPsClient\x12(\n\rservice_class\x18\x03 
\x01(\t:\x11\x44ownpourPsService\x12\x1c\n\x11start_server_port\x18\x04 \x01(\r:\x01\x30\x12\x1d\n\x11server_thread_num\x18\x05 \x01(\r:\x02\x31\x32\"\xbf\x01\n\x0eTableParameter\x12\x10\n\x08table_id\x18\x01 \x01(\x04\x12\x13\n\x0btable_class\x18\x02 \x01(\t\x12\x12\n\nshared_num\x18\x03 \x01(\x04\x12\x30\n\x08\x61\x63\x63\x65ssor\x18\x04 \x01(\x0b\x32\x1e.paddle.TableAccessorParameter\x12\x1f\n\x04type\x18\x05 \x01(\x0e\x32\x11.paddle.TableType\x12\x1f\n\x10\x63ompress_in_save\x18\x06 \x01(\x08:\x05\x66\x61lse\"\xf1\x02\n\x16TableAccessorParameter\x12\x16\n\x0e\x61\x63\x63\x65ssor_class\x18\x01 \x01(\t\x12\x38\n\x10sparse_sgd_param\x18\x02 \x01(\x0b\x32\x1e.paddle.SparseSGDRuleParameter\x12\x36\n\x0f\x64\x65nse_sgd_param\x18\x03 \x01(\x0b\x32\x1d.paddle.DenseSGDRuleParameter\x12\x0f\n\x07\x66\x65\x61_dim\x18\x04 \x01(\r\x12\x12\n\nembedx_dim\x18\x05 \x01(\r\x12\x18\n\x10\x65mbedx_threshold\x18\x06 \x01(\r\x12G\n\x17\x64ownpour_accessor_param\x18\x07 \x01(\x0b\x32&.paddle.DownpourTableAccessorParameter\x12\x45\n\x19table_accessor_save_param\x18\x08 \x03(\x0b\x32\".paddle.TableAccessorSaveParameter\"\xce\x01\n\x1e\x44ownpourTableAccessorParameter\x12\x14\n\x0cnonclk_coeff\x18\x01 \x01(\x02\x12\x13\n\x0b\x63lick_coeff\x18\x02 \x01(\x02\x12\x16\n\x0e\x62\x61se_threshold\x18\x03 \x01(\x02\x12\x17\n\x0f\x64\x65lta_threshold\x18\x04 \x01(\x02\x12\x17\n\x0f\x64\x65lta_keep_days\x18\x05 \x01(\x02\x12\x1d\n\x15show_click_decay_rate\x18\x06 \x01(\x02\x12\x18\n\x10\x64\x65lete_threshold\x18\x07 \x01(\x02\"S\n\x1aTableAccessorSaveParameter\x12\r\n\x05param\x18\x01 \x01(\r\x12\x11\n\tconverter\x18\x02 \x01(\t\x12\x13\n\x0b\x64\x65\x63onverter\x18\x03 \x01(\t\"e\n\x10PsRequestMessage\x12\x0e\n\x06\x63md_id\x18\x01 \x02(\r\x12\x10\n\x08table_id\x18\x02 \x01(\r\x12\x0e\n\x06params\x18\x03 \x03(\x0c\x12\x11\n\tclient_id\x18\x04 \x01(\x05\x12\x0c\n\x04\x64\x61ta\x18\x05 \x01(\x0c\"w\n\x16SparseSGDRuleParameter\x12\x15\n\rlearning_rate\x18\x01 
\x01(\x01\x12\x15\n\rinitial_g2sum\x18\x02 \x01(\x01\x12\x18\n\rinitial_range\x18\x03 \x01(\x01:\x01\x30\x12\x15\n\rweight_bounds\x18\x04 \x03(\x02\"\xe1\x01\n\x15\x44\x65nseSGDRuleParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\x04\x61\x64\x61m\x18\x02 \x01(\x0b\x32\x18.paddle.AdamSGDParameter\x12(\n\x05naive\x18\x03 \x01(\x0b\x32\x19.paddle.NaiveSGDParameter\x12,\n\x07summary\x18\x04 \x01(\x0b\x32\x1b.paddle.SummarySGDParameter\x12:\n\x0emoving_average\x18\x05 \x01(\x0b\x32\".paddle.MovingAverageRuleParameter\"\x86\x01\n\x10\x41\x64\x61mSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\x12\x16\n\x0e\x61\x64\x61_decay_rate\x18\x03 \x01(\x01\x12\x13\n\x0b\x61\x64\x61_epsilon\x18\x04 \x01(\x01\x12\x16\n\x0emom_decay_rate\x18\x05 \x01(\x01\"B\n\x11NaiveSGDParameter\x12\x15\n\rlearning_rate\x18\x01 \x01(\x01\x12\x16\n\x0e\x61vg_decay_rate\x18\x02 \x01(\x01\";\n\x13SummarySGDParameter\x12$\n\x12summary_decay_rate\x18\x01 \x01(\x01:\x08\x30.999999\".\n\x1aMovingAverageRuleParameter\x12\x10\n\x08momentum\x18\x01 \x01(\x01\"I\n\x11PsResponseMessage\x12\x13\n\x08\x65rr_code\x18\x01 \x02(\x05:\x01\x30\x12\x11\n\x07\x65rr_msg\x18\x02 \x02(\t:\x00\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\"\xd5\x01\n\x11\x46sClientParameter\x12:\n\x07\x66s_type\x18\x01 \x01(\x0e\x32#.paddle.FsClientParameter.FsApiType:\x04HDFS\x12\x0b\n\x03uri\x18\x02 \x01(\t\x12\x0c\n\x04user\x18\x03 \x01(\t\x12\x0e\n\x06passwd\x18\x04 \x01(\t\x12\x13\n\x0b\x62uffer_size\x18\x05 \x01(\x05\x12\x12\n\nhadoop_bin\x18\x33 \x01(\t\x12\x10\n\x08\x61\x66s_conf\x18\x65 
\x01(\t\"\x1e\n\tFsApiType\x12\x08\n\x04HDFS\x10\x00\x12\x07\n\x03\x41\x46S\x10\x01*4\n\tTableType\x12\x13\n\x0fPS_SPARSE_TABLE\x10\x00\x12\x12\n\x0ePS_DENSE_TABLE\x10\x01*\xbd\x02\n\x07PsCmdID\x12\x17\n\x13PS_PULL_DENSE_TABLE\x10\x00\x12\x17\n\x13PS_PUSH_DENSE_TABLE\x10\x01\x12\x18\n\x14PS_PULL_SPARSE_TABLE\x10\x02\x12\x18\n\x14PS_PUSH_SPARSE_TABLE\x10\x03\x12\x13\n\x0fPS_SHRINK_TABLE\x10\x04\x12\x15\n\x11PS_SAVE_ONE_TABLE\x10\x05\x12\x15\n\x11PS_SAVE_ALL_TABLE\x10\x06\x12\x15\n\x11PS_LOAD_ONE_TABLE\x10\x07\x12\x15\n\x11PS_LOAD_ALL_TABLE\x10\x08\x12\x16\n\x12PS_CLEAR_ONE_TABLE\x10\t\x12\x16\n\x12PS_CLEAR_ALL_TABLE\x10\n\x12\x17\n\x13PS_PUSH_DENSE_PARAM\x10\x0b\x12\x12\n\x0ePS_STOP_SERVER\x10\x0c\x32K\n\tPsService\x12>\n\x07service\x12\x18.paddle.PsRequestMessage\x1a\x19.paddle.PsResponseMessageB\x03\x80\x01\x01' + )) +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +_TABLETYPE = _descriptor.EnumDescriptor( + name='TableType', + full_name='paddle.TableType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_SPARSE_TABLE', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_DENSE_TABLE', index=1, number=1, options=None, type=None), + ], + containing_type=None, + options=None, + serialized_start=3489, + serialized_end=3541, ) +_sym_db.RegisterEnumDescriptor(_TABLETYPE) + +TableType = enum_type_wrapper.EnumTypeWrapper(_TABLETYPE) +_PSCMDID = _descriptor.EnumDescriptor( + name='PsCmdID', + full_name='paddle.PsCmdID', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='PS_PULL_DENSE_TABLE', + index=0, + number=0, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_TABLE', + index=1, + number=1, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PULL_SPARSE_TABLE', + index=2, + number=2, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_SPARSE_TABLE', + index=3, + 
number=3, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SHRINK_TABLE', index=4, number=4, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ONE_TABLE', + index=5, + number=5, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_SAVE_ALL_TABLE', + index=6, + number=6, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ONE_TABLE', + index=7, + number=7, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_LOAD_ALL_TABLE', + index=8, + number=8, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ONE_TABLE', + index=9, + number=9, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_CLEAR_ALL_TABLE', + index=10, + number=10, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_PUSH_DENSE_PARAM', + index=11, + number=11, + options=None, + type=None), + _descriptor.EnumValueDescriptor( + name='PS_STOP_SERVER', index=12, number=12, options=None, + type=None), + ], + containing_type=None, + options=None, + serialized_start=3544, + serialized_end=3861, ) +_sym_db.RegisterEnumDescriptor(_PSCMDID) + +PsCmdID = enum_type_wrapper.EnumTypeWrapper(_PSCMDID) +PS_SPARSE_TABLE = 0 +PS_DENSE_TABLE = 1 +PS_PULL_DENSE_TABLE = 0 +PS_PUSH_DENSE_TABLE = 1 +PS_PULL_SPARSE_TABLE = 2 +PS_PUSH_SPARSE_TABLE = 3 +PS_SHRINK_TABLE = 4 +PS_SAVE_ONE_TABLE = 5 +PS_SAVE_ALL_TABLE = 6 +PS_LOAD_ONE_TABLE = 7 +PS_LOAD_ALL_TABLE = 8 +PS_CLEAR_ONE_TABLE = 9 +PS_CLEAR_ALL_TABLE = 10 +PS_PUSH_DENSE_PARAM = 11 +PS_STOP_SERVER = 12 + +_FSCLIENTPARAMETER_FSAPITYPE = _descriptor.EnumDescriptor( + name='FsApiType', + full_name='paddle.FsClientParameter.FsApiType', + filename=None, + file=DESCRIPTOR, + values=[ + _descriptor.EnumValueDescriptor( + name='HDFS', index=0, number=0, options=None, type=None), + _descriptor.EnumValueDescriptor( + name='AFS', index=1, number=1, options=None, type=None), + ], + 
containing_type=None, + options=None, + serialized_start=3457, + serialized_end=3487, ) +_sym_db.RegisterEnumDescriptor(_FSCLIENTPARAMETER_FSAPITYPE) + +_PSPARAMETER = _descriptor.Descriptor( + name='PSParameter', + full_name='paddle.PSParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='worker_class', + full_name='paddle.PSParameter.worker_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.PSParameter.server_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='instance_class', + full_name='paddle.PSParameter.instance_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='worker_param', + full_name='paddle.PSParameter.worker_param', + index=3, + number=101, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_param', + full_name='paddle.PSParameter.server_param', + index=4, + number=102, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + 
is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='trainer_param', + full_name='paddle.PSParameter.trainer_param', + index=5, + number=301, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fs_client_param', + full_name='paddle.PSParameter.fs_client_param', + index=6, + number=501, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=21, + serialized_end=307, ) + +_WORKERPARAMETER = _descriptor.Descriptor( + name='WorkerParameter', + full_name='paddle.WorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_worker_param', + full_name='paddle.WorkerParameter.downpour_worker_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=309, + serialized_end=390, ) + +_SERVERPARAMETER = _descriptor.Descriptor( + name='ServerParameter', + full_name='paddle.ServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_server_param', + full_name='paddle.ServerParameter.downpour_server_param', + index=0, + 
number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=392, + serialized_end=473, ) + +_DOWNPOURWORKERPARAMETER = _descriptor.Descriptor( + name='DownpourWorkerParameter', + full_name='paddle.DownpourWorkerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + full_name='paddle.DownpourWorkerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=475, + serialized_end=554, ) + +_DOWNPOURTRAINERPARAMETER = _descriptor.Descriptor( + name='DownpourTrainerParameter', + full_name='paddle.DownpourTrainerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='dense_table', + full_name='paddle.DownpourTrainerParameter.dense_table', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_table', + full_name='paddle.DownpourTrainerParameter.sparse_table', + index=1, + number=2, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_sparse_per_batch', + full_name='paddle.DownpourTrainerParameter.push_sparse_per_batch', + index=2, + number=3, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_per_batch', + full_name='paddle.DownpourTrainerParameter.push_dense_per_batch', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='skip_op', + full_name='paddle.DownpourTrainerParameter.skip_op', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='program_config', + full_name='paddle.DownpourTrainerParameter.program_config', + index=5, + number=6, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=557, + serialized_end=810, ) + +_PROGRAMCONFIG = _descriptor.Descriptor( + name='ProgramConfig', + full_name='paddle.ProgramConfig', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='program_id', + full_name='paddle.ProgramConfig.program_id', + index=0, + 
number=1, + type=9, + cpp_type=9, + label=2, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_sparse_table_id', + full_name='paddle.ProgramConfig.push_sparse_table_id', + index=1, + number=2, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='push_dense_table_id', + full_name='paddle.ProgramConfig.push_dense_table_id', + index=2, + number=3, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_sparse_table_id', + full_name='paddle.ProgramConfig.pull_sparse_table_id', + index=3, + number=4, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='pull_dense_table_id', + full_name='paddle.ProgramConfig.pull_dense_table_id', + index=4, + number=5, + type=5, + cpp_type=1, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=813, + serialized_end=966, ) + +_DENSETABLEPARAMETER = _descriptor.Descriptor( + name='DenseTableParameter', + full_name='paddle.DenseTableParameter', + filename=None, + file=DESCRIPTOR, + 
containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.DenseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_variable_name', + full_name='paddle.DenseTableParameter.dense_variable_name', + index=1, + number=2, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_gradient_variable_name', + full_name='paddle.DenseTableParameter.dense_gradient_variable_name', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.DenseTableParameter.fea_dim', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=968, + serialized_end=1091, ) + +_SPARSETABLEPARAMETER = _descriptor.Descriptor( + name='SparseTableParameter', + full_name='paddle.SparseTableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.SparseTableParameter.table_id', + index=0, + number=1, + type=5, + cpp_type=1, + label=1, + 
has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='feature_dim', + full_name='paddle.SparseTableParameter.feature_dim', + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_key', + full_name='paddle.SparseTableParameter.slot_key', + index=2, + number=3, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_value', + full_name='paddle.SparseTableParameter.slot_value', + index=3, + number=4, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='slot_gradient', + full_name='paddle.SparseTableParameter.slot_gradient', + index=4, + number=5, + type=9, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1093, + serialized_end=1215, ) + +_DOWNPOURSERVERPARAMETER = _descriptor.Descriptor( + name='DownpourServerParameter', + full_name='paddle.DownpourServerParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='downpour_table_param', + 
full_name='paddle.DownpourServerParameter.downpour_table_param', + index=0, + number=1, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_param', + full_name='paddle.DownpourServerParameter.service_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1218, + serialized_end=1352, ) + +_SERVERSERVICEPARAMETER = _descriptor.Descriptor( + name='ServerServiceParameter', + full_name='paddle.ServerServiceParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='server_class', + full_name='paddle.ServerServiceParameter.server_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsServer").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_class', + full_name='paddle.ServerServiceParameter.client_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=True, + default_value=_b("DownpourBrpcPsClient").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='service_class', + full_name='paddle.ServerServiceParameter.service_class', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + 
has_default_value=True, + default_value=_b("DownpourPsService").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='start_server_port', + full_name='paddle.ServerServiceParameter.start_server_port', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='server_thread_num', + full_name='paddle.ServerServiceParameter.server_thread_num', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=True, + default_value=12, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1355, + serialized_end=1570, ) + +_TABLEPARAMETER = _descriptor.Descriptor( + name='TableParameter', + full_name='paddle.TableParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.TableParameter.table_id', + index=0, + number=1, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_class', + full_name='paddle.TableParameter.table_class', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + 
_descriptor.FieldDescriptor( + name='shared_num', + full_name='paddle.TableParameter.shared_num', + index=2, + number=3, + type=4, + cpp_type=4, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='accessor', + full_name='paddle.TableParameter.accessor', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='type', + full_name='paddle.TableParameter.type', + index=4, + number=5, + type=14, + cpp_type=8, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='compress_in_save', + full_name='paddle.TableParameter.compress_in_save', + index=5, + number=6, + type=8, + cpp_type=7, + label=1, + has_default_value=True, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1573, + serialized_end=1764, ) + +_TABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='TableAccessorParameter', + full_name='paddle.TableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='accessor_class', + full_name='paddle.TableAccessorParameter.accessor_class', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + 
enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='sparse_sgd_param', + full_name='paddle.TableAccessorParameter.sparse_sgd_param', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='dense_sgd_param', + full_name='paddle.TableAccessorParameter.dense_sgd_param', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='fea_dim', + full_name='paddle.TableAccessorParameter.fea_dim', + index=3, + number=4, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_dim', + full_name='paddle.TableAccessorParameter.embedx_dim', + index=4, + number=5, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='embedx_threshold', + full_name='paddle.TableAccessorParameter.embedx_threshold', + index=5, + number=6, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='downpour_accessor_param', + full_name='paddle.TableAccessorParameter.downpour_accessor_param', + index=6, + number=7, + 
type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_accessor_save_param', + full_name='paddle.TableAccessorParameter.table_accessor_save_param', + index=7, + number=8, + type=11, + cpp_type=10, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=1767, + serialized_end=2136, ) + +_DOWNPOURTABLEACCESSORPARAMETER = _descriptor.Descriptor( + name='DownpourTableAccessorParameter', + full_name='paddle.DownpourTableAccessorParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='nonclk_coeff', + full_name='paddle.DownpourTableAccessorParameter.nonclk_coeff', + index=0, + number=1, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='click_coeff', + full_name='paddle.DownpourTableAccessorParameter.click_coeff', + index=1, + number=2, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='base_threshold', + full_name='paddle.DownpourTableAccessorParameter.base_threshold', + index=2, + number=3, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + 
containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_threshold', + full_name='paddle.DownpourTableAccessorParameter.delta_threshold', + index=3, + number=4, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delta_keep_days', + full_name='paddle.DownpourTableAccessorParameter.delta_keep_days', + index=4, + number=5, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='show_click_decay_rate', + full_name='paddle.DownpourTableAccessorParameter.show_click_decay_rate', + index=5, + number=6, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='delete_threshold', + full_name='paddle.DownpourTableAccessorParameter.delete_threshold', + index=6, + number=7, + type=2, + cpp_type=6, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2139, + serialized_end=2345, ) + +_TABLEACCESSORSAVEPARAMETER = _descriptor.Descriptor( + name='TableAccessorSaveParameter', + full_name='paddle.TableAccessorSaveParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + 
name='param', + full_name='paddle.TableAccessorSaveParameter.param', + index=0, + number=1, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='converter', + full_name='paddle.TableAccessorSaveParameter.converter', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='deconverter', + full_name='paddle.TableAccessorSaveParameter.deconverter', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2347, + serialized_end=2430, ) + +_PSREQUESTMESSAGE = _descriptor.Descriptor( + name='PsRequestMessage', + full_name='paddle.PsRequestMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='cmd_id', + full_name='paddle.PsRequestMessage.cmd_id', + index=0, + number=1, + type=13, + cpp_type=3, + label=2, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='table_id', + full_name='paddle.PsRequestMessage.table_id', + index=1, + number=2, + type=13, + cpp_type=3, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + 
containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='params', + full_name='paddle.PsRequestMessage.params', + index=2, + number=3, + type=12, + cpp_type=9, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='client_id', + full_name='paddle.PsRequestMessage.client_id', + index=3, + number=4, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsRequestMessage.data', + index=4, + number=5, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2432, + serialized_end=2533, ) + +_SPARSESGDRULEPARAMETER = _descriptor.Descriptor( + name='SparseSGDRuleParameter', + full_name='paddle.SparseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.SparseSGDRuleParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_g2sum', + full_name='paddle.SparseSGDRuleParameter.initial_g2sum', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + 
has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='initial_range', + full_name='paddle.SparseSGDRuleParameter.initial_range', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='weight_bounds', + full_name='paddle.SparseSGDRuleParameter.weight_bounds', + index=3, + number=4, + type=2, + cpp_type=6, + label=3, + has_default_value=False, + default_value=[], + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2535, + serialized_end=2654, ) + +_DENSESGDRULEPARAMETER = _descriptor.Descriptor( + name='DenseSGDRuleParameter', + full_name='paddle.DenseSGDRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='name', + full_name='paddle.DenseSGDRuleParameter.name', + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='adam', + full_name='paddle.DenseSGDRuleParameter.adam', + index=1, + number=2, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='naive', + 
full_name='paddle.DenseSGDRuleParameter.naive', + index=2, + number=3, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='summary', + full_name='paddle.DenseSGDRuleParameter.summary', + index=3, + number=4, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='moving_average', + full_name='paddle.DenseSGDRuleParameter.moving_average', + index=4, + number=5, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2657, + serialized_end=2882, ) + +_ADAMSGDPARAMETER = _descriptor.Descriptor( + name='AdamSGDParameter', + full_name='paddle.AdamSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.AdamSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.AdamSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + 
is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_decay_rate', + full_name='paddle.AdamSGDParameter.ada_decay_rate', + index=2, + number=3, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='ada_epsilon', + full_name='paddle.AdamSGDParameter.ada_epsilon', + index=3, + number=4, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='mom_decay_rate', + full_name='paddle.AdamSGDParameter.mom_decay_rate', + index=4, + number=5, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=2885, + serialized_end=3019, ) + +_NAIVESGDPARAMETER = _descriptor.Descriptor( + name='NaiveSGDParameter', + full_name='paddle.NaiveSGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='learning_rate', + full_name='paddle.NaiveSGDParameter.learning_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='avg_decay_rate', + full_name='paddle.NaiveSGDParameter.avg_decay_rate', + index=1, + number=2, + type=1, + cpp_type=5, + label=1, 
+ has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3021, + serialized_end=3087, ) + +_SUMMARYSGDPARAMETER = _descriptor.Descriptor( + name='SummarySGDParameter', + full_name='paddle.SummarySGDParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='summary_decay_rate', + full_name='paddle.SummarySGDParameter.summary_decay_rate', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=True, + default_value=float(0.999999), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3089, + serialized_end=3148, ) + +_MOVINGAVERAGERULEPARAMETER = _descriptor.Descriptor( + name='MovingAverageRuleParameter', + full_name='paddle.MovingAverageRuleParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='momentum', + full_name='paddle.MovingAverageRuleParameter.momentum', + index=0, + number=1, + type=1, + cpp_type=5, + label=1, + has_default_value=False, + default_value=float(0), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3150, + serialized_end=3196, ) + +_PSRESPONSEMESSAGE = _descriptor.Descriptor( + name='PsResponseMessage', + 
full_name='paddle.PsResponseMessage', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='err_code', + full_name='paddle.PsResponseMessage.err_code', + index=0, + number=1, + type=5, + cpp_type=1, + label=2, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='err_msg', + full_name='paddle.PsResponseMessage.err_msg', + index=1, + number=2, + type=9, + cpp_type=9, + label=2, + has_default_value=True, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='data', + full_name='paddle.PsResponseMessage.data', + index=2, + number=3, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b(""), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + enum_types=[], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3198, + serialized_end=3271, ) + +_FSCLIENTPARAMETER = _descriptor.Descriptor( + name='FsClientParameter', + full_name='paddle.FsClientParameter', + filename=None, + file=DESCRIPTOR, + containing_type=None, + fields=[ + _descriptor.FieldDescriptor( + name='fs_type', + full_name='paddle.FsClientParameter.fs_type', + index=0, + number=1, + type=14, + cpp_type=8, + label=1, + has_default_value=True, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='uri', + full_name='paddle.FsClientParameter.uri', + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + 
has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='user', + full_name='paddle.FsClientParameter.user', + index=2, + number=3, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='passwd', + full_name='paddle.FsClientParameter.passwd', + index=3, + number=4, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='buffer_size', + full_name='paddle.FsClientParameter.buffer_size', + index=4, + number=5, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='hadoop_bin', + full_name='paddle.FsClientParameter.hadoop_bin', + index=5, + number=51, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + _descriptor.FieldDescriptor( + name='afs_conf', + full_name='paddle.FsClientParameter.afs_conf', + index=6, + number=101, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=_b("").decode('utf-8'), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + options=None), + ], + extensions=[], + nested_types=[], + 
enum_types=[_FSCLIENTPARAMETER_FSAPITYPE, ], + options=None, + is_extendable=False, + syntax='proto2', + extension_ranges=[], + oneofs=[], + serialized_start=3274, + serialized_end=3487, ) + +_PSPARAMETER.fields_by_name['worker_param'].message_type = _WORKERPARAMETER +_PSPARAMETER.fields_by_name['server_param'].message_type = _SERVERPARAMETER +_PSPARAMETER.fields_by_name[ + 'trainer_param'].message_type = _DOWNPOURTRAINERPARAMETER +_PSPARAMETER.fields_by_name['fs_client_param'].message_type = _FSCLIENTPARAMETER +_WORKERPARAMETER.fields_by_name[ + 'downpour_worker_param'].message_type = _DOWNPOURWORKERPARAMETER +_SERVERPARAMETER.fields_by_name[ + 'downpour_server_param'].message_type = _DOWNPOURSERVERPARAMETER +_DOWNPOURWORKERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'dense_table'].message_type = _DENSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'sparse_table'].message_type = _SPARSETABLEPARAMETER +_DOWNPOURTRAINERPARAMETER.fields_by_name[ + 'program_config'].message_type = _PROGRAMCONFIG +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'downpour_table_param'].message_type = _TABLEPARAMETER +_DOWNPOURSERVERPARAMETER.fields_by_name[ + 'service_param'].message_type = _SERVERSERVICEPARAMETER +_TABLEPARAMETER.fields_by_name[ + 'accessor'].message_type = _TABLEACCESSORPARAMETER +_TABLEPARAMETER.fields_by_name['type'].enum_type = _TABLETYPE +_TABLEACCESSORPARAMETER.fields_by_name[ + 'sparse_sgd_param'].message_type = _SPARSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'dense_sgd_param'].message_type = _DENSESGDRULEPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'downpour_accessor_param'].message_type = _DOWNPOURTABLEACCESSORPARAMETER +_TABLEACCESSORPARAMETER.fields_by_name[ + 'table_accessor_save_param'].message_type = _TABLEACCESSORSAVEPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name['adam'].message_type = _ADAMSGDPARAMETER 
+_DENSESGDRULEPARAMETER.fields_by_name['naive'].message_type = _NAIVESGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'summary'].message_type = _SUMMARYSGDPARAMETER +_DENSESGDRULEPARAMETER.fields_by_name[ + 'moving_average'].message_type = _MOVINGAVERAGERULEPARAMETER +_FSCLIENTPARAMETER.fields_by_name[ + 'fs_type'].enum_type = _FSCLIENTPARAMETER_FSAPITYPE +_FSCLIENTPARAMETER_FSAPITYPE.containing_type = _FSCLIENTPARAMETER +DESCRIPTOR.message_types_by_name['PSParameter'] = _PSPARAMETER +DESCRIPTOR.message_types_by_name['WorkerParameter'] = _WORKERPARAMETER +DESCRIPTOR.message_types_by_name['ServerParameter'] = _SERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourWorkerParameter'] = _DOWNPOURWORKERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTrainerParameter'] = _DOWNPOURTRAINERPARAMETER +DESCRIPTOR.message_types_by_name['ProgramConfig'] = _PROGRAMCONFIG +DESCRIPTOR.message_types_by_name['DenseTableParameter'] = _DENSETABLEPARAMETER +DESCRIPTOR.message_types_by_name['SparseTableParameter'] = _SPARSETABLEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourServerParameter'] = _DOWNPOURSERVERPARAMETER +DESCRIPTOR.message_types_by_name[ + 'ServerServiceParameter'] = _SERVERSERVICEPARAMETER +DESCRIPTOR.message_types_by_name['TableParameter'] = _TABLEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorParameter'] = _TABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DownpourTableAccessorParameter'] = _DOWNPOURTABLEACCESSORPARAMETER +DESCRIPTOR.message_types_by_name[ + 'TableAccessorSaveParameter'] = _TABLEACCESSORSAVEPARAMETER +DESCRIPTOR.message_types_by_name['PsRequestMessage'] = _PSREQUESTMESSAGE +DESCRIPTOR.message_types_by_name[ + 'SparseSGDRuleParameter'] = _SPARSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name[ + 'DenseSGDRuleParameter'] = _DENSESGDRULEPARAMETER +DESCRIPTOR.message_types_by_name['AdamSGDParameter'] = _ADAMSGDPARAMETER +DESCRIPTOR.message_types_by_name['NaiveSGDParameter'] = _NAIVESGDPARAMETER 
+DESCRIPTOR.message_types_by_name['SummarySGDParameter'] = _SUMMARYSGDPARAMETER +DESCRIPTOR.message_types_by_name[ + 'MovingAverageRuleParameter'] = _MOVINGAVERAGERULEPARAMETER +DESCRIPTOR.message_types_by_name['PsResponseMessage'] = _PSRESPONSEMESSAGE +DESCRIPTOR.message_types_by_name['FsClientParameter'] = _FSCLIENTPARAMETER +DESCRIPTOR.enum_types_by_name['TableType'] = _TABLETYPE +DESCRIPTOR.enum_types_by_name['PsCmdID'] = _PSCMDID + +PSParameter = _reflection.GeneratedProtocolMessageType( + 'PSParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_PSPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PSParameter) + )) +_sym_db.RegisterMessage(PSParameter) + +WorkerParameter = _reflection.GeneratedProtocolMessageType( + 'WorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_WORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.WorkerParameter) + )) +_sym_db.RegisterMessage(WorkerParameter) + +ServerParameter = _reflection.GeneratedProtocolMessageType( + 'ServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerParameter) + )) +_sym_db.RegisterMessage(ServerParameter) + +DownpourWorkerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourWorkerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURWORKERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourWorkerParameter) + )) +_sym_db.RegisterMessage(DownpourWorkerParameter) + +DownpourTrainerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTrainerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTRAINERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTrainerParameter) + )) +_sym_db.RegisterMessage(DownpourTrainerParameter) + +ProgramConfig = _reflection.GeneratedProtocolMessageType( + 'ProgramConfig', + 
(_message.Message, ), + dict( + DESCRIPTOR=_PROGRAMCONFIG, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ProgramConfig) + )) +_sym_db.RegisterMessage(ProgramConfig) + +DenseTableParameter = _reflection.GeneratedProtocolMessageType( + 'DenseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseTableParameter) + )) +_sym_db.RegisterMessage(DenseTableParameter) + +SparseTableParameter = _reflection.GeneratedProtocolMessageType( + 'SparseTableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSETABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseTableParameter) + )) +_sym_db.RegisterMessage(SparseTableParameter) + +DownpourServerParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourServerParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURSERVERPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourServerParameter) + )) +_sym_db.RegisterMessage(DownpourServerParameter) + +ServerServiceParameter = _reflection.GeneratedProtocolMessageType( + 'ServerServiceParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SERVERSERVICEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.ServerServiceParameter) + )) +_sym_db.RegisterMessage(ServerServiceParameter) + +TableParameter = _reflection.GeneratedProtocolMessageType( + 'TableParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableParameter) + )) +_sym_db.RegisterMessage(TableParameter) + +TableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorParameter) + )) 
+_sym_db.RegisterMessage(TableAccessorParameter) + +DownpourTableAccessorParameter = _reflection.GeneratedProtocolMessageType( + 'DownpourTableAccessorParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DOWNPOURTABLEACCESSORPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DownpourTableAccessorParameter) + )) +_sym_db.RegisterMessage(DownpourTableAccessorParameter) + +TableAccessorSaveParameter = _reflection.GeneratedProtocolMessageType( + 'TableAccessorSaveParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_TABLEACCESSORSAVEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.TableAccessorSaveParameter) + )) +_sym_db.RegisterMessage(TableAccessorSaveParameter) + +PsRequestMessage = _reflection.GeneratedProtocolMessageType( + 'PsRequestMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSREQUESTMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsRequestMessage) + )) +_sym_db.RegisterMessage(PsRequestMessage) + +SparseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'SparseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SPARSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SparseSGDRuleParameter) + )) +_sym_db.RegisterMessage(SparseSGDRuleParameter) + +DenseSGDRuleParameter = _reflection.GeneratedProtocolMessageType( + 'DenseSGDRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_DENSESGDRULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.DenseSGDRuleParameter) + )) +_sym_db.RegisterMessage(DenseSGDRuleParameter) + +AdamSGDParameter = _reflection.GeneratedProtocolMessageType( + 'AdamSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_ADAMSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.AdamSGDParameter) + )) +_sym_db.RegisterMessage(AdamSGDParameter) + +NaiveSGDParameter = 
_reflection.GeneratedProtocolMessageType( + 'NaiveSGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_NAIVESGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.NaiveSGDParameter) + )) +_sym_db.RegisterMessage(NaiveSGDParameter) + +SummarySGDParameter = _reflection.GeneratedProtocolMessageType( + 'SummarySGDParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_SUMMARYSGDPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.SummarySGDParameter) + )) +_sym_db.RegisterMessage(SummarySGDParameter) + +MovingAverageRuleParameter = _reflection.GeneratedProtocolMessageType( + 'MovingAverageRuleParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_MOVINGAVERAGERULEPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.MovingAverageRuleParameter) + )) +_sym_db.RegisterMessage(MovingAverageRuleParameter) + +PsResponseMessage = _reflection.GeneratedProtocolMessageType( + 'PsResponseMessage', + (_message.Message, ), + dict( + DESCRIPTOR=_PSRESPONSEMESSAGE, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.PsResponseMessage) + )) +_sym_db.RegisterMessage(PsResponseMessage) + +FsClientParameter = _reflection.GeneratedProtocolMessageType( + 'FsClientParameter', + (_message.Message, ), + dict( + DESCRIPTOR=_FSCLIENTPARAMETER, + __module__='ps_pb2' + # @@protoc_insertion_point(class_scope:paddle.FsClientParameter) + )) +_sym_db.RegisterMessage(FsClientParameter) + +DESCRIPTOR.has_options = True +DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), + _b('\200\001\001')) +# @@protoc_insertion_point(module_scope) diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index be84262297782a20032b907ccdcbd7da32c96ef3..b7d1eeba80d93d549a019455087bb7cc1d2a1083 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ 
b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -22,13 +22,16 @@ strategy according to this module. from __future__ import print_function +import math + from . import control_flow from . import nn from . import ops from . import tensor from ..initializer import init_on_cpu from ..framework import default_main_program, Parameter, unique_name, name_scope -import math +from ..dygraph import base as imperative_base +from ..dygraph import learning_rate_scheduler as imperate_lr __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', @@ -66,13 +69,17 @@ def noam_decay(d_model, warmup_steps): The decayed learning rate. """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter(1) + if imperative_base.enabled(): + decay = imperate_lr.NoamDecay(d_model, warmup_steps) + return decay + else: + global_step = _decay_step_counter(1) - a = global_step**-0.5 - b = (warmup_steps**-1.5) * global_step - lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) + a = global_step**-0.5 + b = (warmup_steps**-1.5) * global_step + lr_value = (d_model**-0.5) * nn.elementwise_min(a, b) - return lr_value + return lr_value def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -112,14 +119,19 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False): """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.ExponentialDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * (decay_rate**div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) - return decayed_lr + return decayed_lr def natural_exp_decay(learning_rate, decay_steps, 
decay_rate, staircase=False): @@ -141,14 +153,19 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False): The decayed learning rate """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.NaturalExpDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) - decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) + decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res) - return decayed_lr + return decayed_lr def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): @@ -187,15 +204,20 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False): sgd_optimizer.minimize(avg_cost) """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.InverseTimeDecay(learning_rate, decay_steps, + decay_rate, staircase) + return decay + else: + global_step = _decay_step_counter() - div_res = global_step / decay_steps - if staircase: - div_res = ops.floor(div_res) + div_res = global_step / decay_steps + if staircase: + div_res = ops.floor(div_res) - decayed_lr = learning_rate / (1 + decay_rate * div_res) + decayed_lr = learning_rate / (1 + decay_rate * div_res) - return decayed_lr + return decayed_lr def polynomial_decay(learning_rate, @@ -227,27 +249,33 @@ def polynomial_decay(learning_rate, Variable: The decayed learning rate """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() - - if cycle: - div_res = ops.ceil(global_step / decay_steps) - zero_var = tensor.fill_constant( - shape=[1], dtype='float32', value=0.0) - one_var = tensor.fill_constant( - 
shape=[1], dtype='float32', value=1.0) - - with control_flow.Switch() as switch: - with switch.case(global_step == zero_var): - tensor.assign(input=one_var, output=div_res) - decay_steps = decay_steps * div_res + if imperative_base.enabled(): + decay = imperate_lr.PolynomialDecay(learning_rate, decay_steps, + end_learning_rate, power, cycle) + return decay else: - decay_steps_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(decay_steps)) - global_step = nn.elementwise_min(x=global_step, y=decay_steps_var) + global_step = _decay_step_counter() + + if cycle: + div_res = ops.ceil(global_step / decay_steps) + zero_var = tensor.fill_constant( + shape=[1], dtype='float32', value=0.0) + one_var = tensor.fill_constant( + shape=[1], dtype='float32', value=1.0) + + with control_flow.Switch() as switch: + with switch.case(global_step == zero_var): + tensor.assign(input=one_var, output=div_res) + decay_steps = decay_steps * div_res + else: + decay_steps_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(decay_steps)) + global_step = nn.elementwise_min( + x=global_step, y=decay_steps_var) - decayed_lr = (learning_rate - end_learning_rate) * \ - ((1 - global_step / decay_steps) ** power) + end_learning_rate - return decayed_lr + decayed_lr = (learning_rate - end_learning_rate) * \ + ((1 - global_step / decay_steps) ** power) + end_learning_rate + return decayed_lr def piecewise_decay(boundaries, values): @@ -279,34 +307,38 @@ def piecewise_decay(boundaries, values): if len(values) - len(boundaries) != 1: raise ValueError("len(values) - len(boundaries) should be 1") - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.PiecewiseDecay(boundaries, values, 0) + return decay + else: + global_step = _decay_step_counter() - lr = tensor.create_global_var( - shape=[1], - value=0.0, - dtype='float32', - persistable=True, - name="learning_rate") + lr = tensor.create_global_var( + shape=[1], + value=0.0, + 
dtype='float32', + persistable=True, + name="learning_rate") - with control_flow.Switch() as switch: - for i in range(len(boundaries)): - boundary_val = tensor.fill_constant( + with control_flow.Switch() as switch: + for i in range(len(boundaries)): + boundary_val = tensor.fill_constant( + shape=[1], + dtype='float32', + value=float(boundaries[i]), + force_cpu=True) + value_var = tensor.fill_constant( + shape=[1], dtype='float32', value=float(values[i])) + with switch.case(global_step < boundary_val): + tensor.assign(value_var, lr) + last_value_var = tensor.fill_constant( shape=[1], dtype='float32', - value=float(boundaries[i]), - force_cpu=True) - value_var = tensor.fill_constant( - shape=[1], dtype='float32', value=float(values[i])) - with switch.case(global_step < boundary_val): - tensor.assign(value_var, lr) - last_value_var = tensor.fill_constant( - shape=[1], - dtype='float32', - value=float(values[len(values) - 1])) - with switch.default(): - tensor.assign(last_value_var, lr) + value=float(values[len(values) - 1])) + with switch.default(): + tensor.assign(last_value_var, lr) - return lr + return lr def cosine_decay(learning_rate, step_each_epoch, epochs): @@ -336,12 +368,17 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): learning_rate = base_lr, step_each_epoch=10000, epochs=120) """ with default_main_program()._lr_schedule_guard(): - global_step = _decay_step_counter() + if imperative_base.enabled(): + decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, + epochs) + return decay + else: + global_step = _decay_step_counter() - cur_epoch = ops.floor(global_step / step_each_epoch) - decayed_lr = learning_rate * 0.5 * ( - ops.cos(cur_epoch * math.pi / epochs) + 1) - return decayed_lr + cur_epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * 0.5 * ( + ops.cos(cur_epoch * math.pi / epochs) + 1) + return decayed_lr def append_LARS(params_grads, learning_rate, weight_decay): @@ -363,6 +400,9 @@ def 
append_LARS(params_grads, learning_rate, weight_decay): / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ + assert not imperative_base.enabled( + ), "append_LARS is NOT supported in dygraph mode now" + def _balanced_weight(param_norm, grad_norm): if weight_decay == 1.0: return grad_norm + param_norm diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 45a065da835a829218761e51555026221ccb3af2..7e6e37116fe23f26eb14dd0573dbe031aec98dd8 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,6 +30,8 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops +from .dygraph import base as imperative_base +from .dygraph.learning_rate_scheduler import LearningRateDecay from paddle.fluid import core from paddle.fluid.layers import tensor from functools import reduce @@ -53,9 +55,19 @@ class Optimizer(object): """ def __init__(self, learning_rate, regularization=None, name=None): - if not isinstance(learning_rate, float) and \ - not isinstance(learning_rate, framework.Variable): - raise TypeError("learning rate should be float or Variable") + if framework._in_dygraph_mode(): + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, LearningRateDecay): + raise TypeError( + "learning rate should be float or LearningRateDecay, got %s here" + % type(learning_rate)) + else: + if not isinstance(learning_rate, float) and \ + not isinstance(learning_rate, framework.Variable): + raise TypeError( + "learning rate should be float or Variable, got %s here" % + type(learning_rate)) + self._name = name self.regularization = regularization self._learning_rate = learning_rate @@ -79,24 +91,49 @@ class Optimizer(object): return self._opti_name_list def _create_global_learning_rate(self): - lr = self._global_learning_rate() + if imperative_base.enabled(): + # create learning rate Variable + if 
isinstance(self._learning_rate, float): + lr = self._global_learning_rate() - if isinstance(lr, framework.Variable): - return - else: - if not isinstance(self._learning_rate, float): + if isinstance(lr, framework.Variable): + return + else: + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) + # get learning rate Variable from LearningRateDecay + elif isinstance(self._learning_rate, LearningRateDecay): + self._learning_rate_map[framework.default_main_program( + )] = self._learning_rate() + else: raise TypeError( - "learning rate variable is create outside optimizer," - "can not create new learning rate variable for new program") + "optimizer's learning rate must be float or LearningRateDecay" + ) + else: + lr = self._global_learning_rate() - # create learning rate in the current main program - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( - name=unique_name.generate("learning_rate"), - shape=[1], - value=float(self._learning_rate), - dtype='float32' if self._dtype is None else self._dtype, - persistable=True) + if isinstance(lr, framework.Variable): + return + else: + if not isinstance(self._learning_rate, float): + raise TypeError( + "learning rate variable is create outside optimizer," + "can not create new learning rate variable for new program" + ) + + # create learning rate in the current main program + self._learning_rate_map[framework.default_main_program( + )] = layers.create_global_var( + name=unique_name.generate("learning_rate"), + shape=[1], + value=float(self._learning_rate), + dtype='float32' if self._dtype is None else self._dtype, + persistable=True) def _global_learning_rate(self, program=None): """ @@ -605,10 +642,10 @@ class DGCMomentumOptimizer(MomentumOptimizer): DGC also uses momentum factor 
masking and warmup training to overcome the staleness problem caused by reduced communication. This optimizer will do two things: - + 1. Compress the gradient by get TopK import value from tensor \ and use it for allreduce to reduce network bandwidth. - + 2. Call momentum to optimize on the cost. Args: @@ -795,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): type=x.type, name=name, dtype=x.dtype, persistable=False) helper.append_op( - type="clip_by_norm", + type="dgc_clip_by_norm", inputs={"X": x, "current_step": self._global_step_var}, attrs={ @@ -808,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer): def _append_clip_norm(self, grad_var, clip_norm): with grad_var.block.program._backward_role_guard(): return self._clip_by_norm( - x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC") + x=grad_var, max_norm=clip_norm, name=grad_var.name) def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var, encoded_var): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index d139feac6ffe5a223a6628e95cd47cabc29cdd14..1390e759d7e309674a2ecb61c59043b0f5032400 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -78,7 +78,8 @@ list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) -list(REMOVE_ITEM TEST_OPS test_imperative_optimizer) +list(REMOVE_ITEM TEST_OPS test_imperative_se_resnext) +list(REMOVE_ITEM TEST_OPS test_imperative_mnist) list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) @@ -89,8 +90,11 @@ py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL) py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL) 
py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS FLAGS_cudnn_deterministic=1) -py_test_modules(test_imperative_optimizer MODULES test_imperative_optimizer ENVS +py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS FLAGS_cudnn_deterministic=1) +py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS + FLAGS_cudnn_deterministic=1 SERIAL) + if(WITH_DISTRIBUTE) py_test_modules(test_dist_train MODULES test_dist_train SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py index 0712e102b30fc72c7f8b62eb9230e7f4ab615ef0..4f9f1ec2253ca01eb4b07a06a248f91d4676c9c4 100644 --- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -64,6 +64,14 @@ class TestCase2(BaseTestCase): self.axis = 0 +class TestCase2_1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = -1 + + class TestCase3(BaseTestCase): def initTestCase(self): self.op_type = 'arg_max' diff --git a/python/paddle/fluid/tests/unittests/test_async_executor.py b/python/paddle/fluid/tests/unittests/test_async_executor.py index 43855b95f9e3096d58ca3e8acfdb25f034bab175..563301691f83dfbbe669503e479743a7c69944ac 100644 --- a/python/paddle/fluid/tests/unittests/test_async_executor.py +++ b/python/paddle/fluid/tests/unittests/test_async_executor.py @@ -81,62 +81,6 @@ class TestAsyncExecutor(unittest.TestCase): tarf.extractall(path='./') tarf.close() - def test_data_feed_desc(self): - data_feed = fluid.DataFeedDesc('./data.prototxt') - # assertEqueal(data_feed.proto_desc.batch, 2) - # assertEqual(len(data_feed.proto_desc.multi_slot_desc), 2) - self.assertEqual(" ".join(data_feed.desc().split()), - " ".join(proto_str.split())) - - def test_run(self): - # 
Initialize dataset description - data_feed = fluid.DataFeedDesc('train_data/data.prototxt') - data_feed.set_batch_size( - 128) # See API doc for how to change other fields - - # define network - # input text data - data = fluid.layers.data( - name="words", shape=[1], dtype="int64", lod_level=1) - # label data - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - - avg_cost, acc, prediction = bow_net(data, label) - sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=0.002) - opt_ops, weight_and_grad = sgd_optimizer.minimize(avg_cost) - - # Run startup program - startup_program = fluid.default_startup_program() - place = fluid.CPUPlace() - executor = fluid.Executor(place) - executor.run(startup_program) - - main_program = fluid.default_main_program() - async_executor = fluid.AsyncExecutor(place) - - self.assertRaises(TypeError, async_executor.run) - self.assertRaises(TypeError, async_executor.run, main_program) - self.assertRaises(TypeError, async_executor.run, main_program, - data_feed) - - filelist = ['train_data/part-%d' % i for i in range(10)] - self.assertRaises(TypeError, async_executor.run, main_program, - data_feed, filelist) - - thread_num = 4 - self.assertRaises(TypeError, async_executor.run, main_program, - data_feed, filelist, thread_num) - - async_executor.run(main_program, data_feed, filelist, thread_num, [acc]) - fluid.io.save_inference_model("imdb.model", [data.name, label.name], - [acc], executor) - statinfo = os.stat('imdb.model/__model__') - self.assertGreater(statinfo.st_size, 0) - - os.remove('./data.prototxt') - shutil.rmtree('./train_data') - shutil.rmtree('./imdb.model') - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..5e77ce9b811bc0474f1e0950e15dedf013dcb4ea --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_async_ssa_graph_executor_mnist.py @@ -0,0 +1,186 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import unittest + +import numpy +import time +import paddle +import paddle.fluid as fluid + +BATCH_SIZE = 64 + + +def convolutional_neural_network(use_py_reader): + with fluid.unique_name.guard(): + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + py_reader = None + if use_py_reader: + py_reader = fluid.layers.create_py_reader_by_data( + capacity=64, + feed_list=[img, label], + name='py_reader', + use_double_buffer=False) + img, label = fluid.layers.read_file(py_reader) + + conv_pool_1 = fluid.nets.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_1 = fluid.layers.batch_norm(conv_pool_1) + conv_pool_2 = fluid.nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + pool_size=2, + pool_stride=2, + act="relu") + + prediction = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + avg_loss = fluid.layers.mean(loss) + acc = fluid.layers.accuracy(input=prediction, label=label) + + return img, label, prediction, avg_loss, acc, py_reader + + +def 
test(): + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + test_reader = paddle.batch( + paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) + + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=False) + feeder = fluid.DataFeeder(feed_list=[img, label], place=place) + + def train_test(train_test_program, train_test_feed, train_test_reader): + acc_set = [] + avg_loss_set = [] + for test_data in train_test_reader(): + acc_np, avg_loss_np = exe.run(program=train_test_program, + feed=train_test_feed.feed(test_data), + fetch_list=[acc, avg_loss]) + acc_set.append(float(acc_np)) + avg_loss_set.append(float(avg_loss_np)) + # get test acc and loss + acc_val_mean = numpy.array(acc_set).mean() + avg_loss_val_mean = numpy.array(avg_loss_set).mean() + return avg_loss_val_mean, acc_val_mean + + # test for epoch + avg_loss_val, acc_val = train_test( + train_test_program=fluid.default_main_program(), + train_test_reader=test_reader, + train_test_feed=feeder) + + print("Test: avg_cost: %s, acc: %s" % (avg_loss_val, acc_val)) + assert acc_val > 0.96 + + +def train(use_cuda, thread_num, cpu_num): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + print("paddle is not compiled with cuda, exit!") + return + + img, label, prediction, avg_loss, acc, py_reader = convolutional_neural_network( + use_py_reader=True) + + optimizer = fluid.optimizer.Adam(learning_rate=0.001) + optimizer.minimize(avg_loss) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=BATCH_SIZE) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + os.environ['CPU_NUM'] = str(cpu_num) + + print("cpu_num:" + str(cpu_num)) + print("thread_num:" + str(thread_num)) + + build_strategy = fluid.BuildStrategy() + build_strategy.async_mode = True + + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.num_threads = thread_num + 
exec_strategy.num_iteration_per_run = 10 + + main_program = fluid.default_main_program() + pe = fluid.ParallelExecutor( + use_cuda=False, + loss_name=avg_loss.name, + main_program=main_program, + build_strategy=build_strategy, + exec_strategy=exec_strategy) + + py_reader.decorate_paddle_reader(train_reader) + + for pass_id in range(2): + step = 0 + py_reader.start() + try: + while True: + loss_val = pe.run(fetch_list=[avg_loss.name]) + loss_val = numpy.mean(loss_val) + if step % 10 == 0: + print("Pass %d, Batch %d, Cost %f, queue size %d" % + (pass_id, step, loss_val, py_reader.queue.size())) + step += 1 + except fluid.core.EOFException: + print("train end pass = " + str(pass_id)) + py_reader.reset() + + return step + + +class TestAsyncSSAGraphExecutor(unittest.TestCase): + def test_check_async_ssa_exe_train(self): + step_list = [] + for cpu_num in [1, 2, 4]: + print("run cpu_num -> " + str(cpu_num)) + with fluid.scope_guard(fluid.core.Scope()): + with fluid.program_guard( + main_program=fluid.Program(), + startup_program=fluid.Program()): + start_time = time.time() + step = train( + use_cuda=False, thread_num=cpu_num, cpu_num=cpu_num) + end_time = time.time() + step_list.append(step) + print("cpu_num -> " + str(cpu_num) + " step -> " + str(step) + + " time -> " + str(end_time - start_time)) + with fluid.program_guard( + main_program=fluid.Program(), + startup_program=fluid.Program()): + test() + assert abs(int(step_list[0] / 2) - int(step_list[1])) < 5 + assert abs(int(step_list[1] / 2) - int(step_list[2])) < 5 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..8c705a095c768c861aac07249467cf75bb289b2d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -0,0 +1,170 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +TestCases for Dataset, +including create, config, run, etc. +""" + +from __future__ import print_function +import paddle.fluid as fluid +import numpy as np +import os +import shutil +import unittest + + +class TestDataset(unittest.TestCase): + """ TestCases for Dataset. """ + + def test_dataset_create(self): + """ Testcase for dataset create. """ + return + try: + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + except: + self.assertTrue(False) + + try: + dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + except: + self.assertTrue(False) + + try: + dataset = fluid.DatasetFactory().create_dataset("MyOwnDataset") + self.assertTrue(False) + except: + self.assertTrue(True) + + def test_dataset_config(self): + """ Testcase for dataset configuration. 
""" + return + dataset = fluid.core.Dataset("MultiSlotDataset") + dataset.set_thread_num(12) + dataset.set_filelist(["a.txt", "b.txt", "c.txt"]) + dataset.set_trainer_num(4) + dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") + + thread_num = dataset.get_thread_num() + self.assertEqual(thread_num, 12) + + filelist = dataset.get_filelist() + self.assertEqual(len(filelist), 3) + self.assertEqual(filelist[0], "a.txt") + self.assertEqual(filelist[1], "b.txt") + self.assertEqual(filelist[2], "c.txt") + + trainer_num = dataset.get_trainer_num() + self.assertEqual(trainer_num, 4) + + name, ugi = dataset.get_hdfs_config() + self.assertEqual(name, "my_fs_name") + self.assertEqual(ugi, "my_fs_ugi") + + def test_in_memory_dataset_run(self): + """ + Testcase for InMemoryDataset from create to run. + """ + return + with open("test_in_memory_dataset_run_a.txt", "w") as f: + data = "1 1 2 3 3 4 5 5 5 5 1 1\n" + data += "1 2 2 3 4 4 6 6 6 6 1 2\n" + data += "1 3 2 3 5 4 7 7 7 7 1 3\n" + f.write(data) + with open("test_in_memory_dataset_run_b.txt", "w") as f: + data = "1 4 2 3 3 4 5 5 5 5 1 4\n" + data += "1 5 2 3 4 4 6 6 6 6 1 5\n" + data += "1 6 2 3 5 4 7 7 7 7 1 6\n" + data += "1 7 2 3 6 4 8 8 8 8 1 7\n" + f.write(data) + + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = fluid.layers.data( + name=slot, shape=[1], dtype="int64", lod_level=1) + slots_vars.append(var) + + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_batch_size(32) + dataset.set_thread(3) + dataset.set_filelist([ + "test_in_memory_dataset_run_a.txt", + "test_in_memory_dataset_run_b.txt" + ]) + dataset.set_pipe_command("cat") + dataset.set_use_var(slots_vars) + dataset.load_into_memory() + dataset.local_shuffle() + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + for i in range(2): + try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + except: + #self.assertTrue(False) + pass + + 
os.remove("./test_in_memory_dataset_run_a.txt") + os.remove("./test_in_memory_dataset_run_b.txt") + + def test_queue_dataset_run(self): + """ + Testcase for QueueDataset from create to run. + """ + return + with open("test_queue_dataset_run_a.txt", "w") as f: + data = "1 1 2 3 3 4 5 5 5 5 1 1\n" + data += "1 2 2 3 4 4 6 6 6 6 1 2\n" + data += "1 3 2 3 5 4 7 7 7 7 1 3\n" + f.write(data) + with open("test_queue_dataset_run_b.txt", "w") as f: + data = "1 4 2 3 3 4 5 5 5 5 1 4\n" + data += "1 5 2 3 4 4 6 6 6 6 1 5\n" + data += "1 6 2 3 5 4 7 7 7 7 1 6\n" + data += "1 7 2 3 6 4 8 8 8 8 1 7\n" + f.write(data) + + slots = ["slot1", "slot2", "slot3", "slot4"] + slots_vars = [] + for slot in slots: + var = fluid.layers.data( + name=slot, shape=[1], dtype="int64", lod_level=1) + slots_vars.append(var) + + dataset = fluid.DatasetFactory().create_dataset("QueueDataset") + dataset.set_batch_size(32) + dataset.set_thread(3) + dataset.set_filelist( + ["test_queue_dataset_run_a.txt", "test_queue_dataset_run_b.txt"]) + dataset.set_pipe_command("cat") + dataset.set_use_var(slots_vars) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + for i in range(2): + try: + exe.train_from_dataset(fluid.default_main_program(), dataset) + except: + #self.assertTrue(False) + pass + + os.remove("./test_queue_dataset_run_a.txt") + os.remove("./test_queue_dataset_run_b.txt") + + +if __name__ == '__main__': + #unittest.main() + import sys + sys.exit(0) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 9c0efe6d905929f87106f18ecf74a7915e39eba9..a5d8cd4660f7428176b82610b1f4e0ace824f1f2 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -52,6 +52,7 @@ class TestDistRunnerBase(object): # NOTE: import fluid until runtime, or else forking processes will cause error. 
config = fluid.DistributeTranspilerConfig() config.enable_dc_asgd = dc_asgd + # config.runtime_split_send_recv = True t = fluid.DistributeTranspiler(config=config) t.transpile( trainer_id=trainer_id, @@ -139,8 +140,7 @@ class TestDistRunnerBase(object): pass_builder = None if args.batch_merge_repeat > 1: pass_builder = build_stra._finalize_strategy_and_create_passes() - mypass = pass_builder.insert_pass( - len(pass_builder.all_passes()) - 3, "multi_batch_merge_pass") + mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass") mypass.set("num_repeats", args.batch_merge_repeat) if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer": diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..4844d930daca75595376b1f1f67ae03011a713c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py @@ -0,0 +1,45 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle.fluid as fluid +import importlib + +fluid.core._set_eager_deletion_mode(0.0, 1.0, True) + +from test_elementwise_add_op import * +from test_elementwise_sub_op import * +from test_concat_op import * +from test_gather_op import * +from test_gaussian_random_batch_size_like_op import * +from test_lod_reset_op import * +from test_scatter_op import * +from test_mean_op import * +from test_slice_op import * +from test_linear_chain_crf_op import * +from test_bilinear_interp_op import * +from test_nearest_interp_op import * +from test_sequence_concat import * +from test_seq_conv import * +from test_seq_pool import * +from test_sequence_expand_as import * +from test_sequence_expand import * +from test_sequence_pad_op import * +from test_sequence_unpad_op import * +from test_sequence_scatter_op import * +from test_sequence_slice_op import * +from test_pad2d_op import * + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..5ab01839fbc20bbd3c242878c4ea23a00f7b0dca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py @@ -0,0 +1,217 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + + +class SimpleImgConvPool(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + filter_size, + pool_size, + pool_stride, + pool_padding=0, + pool_type='max', + global_pooling=False, + conv_stride=1, + conv_padding=0, + conv_dilation=1, + conv_groups=1, + act=None, + use_cudnn=False, + param_attr=None, + bias_attr=None): + super(SimpleImgConvPool, self).__init__(name_scope) + + self._conv2d = Conv2D( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=conv_stride, + padding=conv_padding, + dilation=conv_dilation, + groups=conv_groups, + param_attr=None, + bias_attr=None, + use_cudnn=use_cudnn) + + self._pool2d = Pool2D( + self.full_name(), + pool_size=pool_size, + pool_type=pool_type, + pool_stride=pool_stride, + pool_padding=pool_padding, + global_pooling=global_pooling, + use_cudnn=use_cudnn) + + def forward(self, inputs): + x = self._conv2d(inputs) + x = self._pool2d(x) + return x + + +class MNIST(fluid.dygraph.Layer): + def __init__(self, name_scope): + super(MNIST, self).__init__(name_scope) + + self._simple_img_conv_pool_1 = SimpleImgConvPool( + self.full_name(), 1, 20, 5, 2, 2, act="relu") + + self._simple_img_conv_pool_2 = SimpleImgConvPool( + self.full_name(), 20, 50, 5, 2, 2, act="relu") + + pool_2_shape = 50 * 4 * 4 + SIZE = 10 + scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 + self._fc = FC(self.full_name(), + 10, + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.NormalInitializer( + loc=0.0, scale=scale)), + act="softmax") + + def forward(self, 
inputs): + x = self._simple_img_conv_pool_1(inputs) + x = self._simple_img_conv_pool_2(x) + x = self._fc(x) + return x + + +class TestImperativeMnist(unittest.TestCase): + def test_mnist_float32(self): + seed = 90 + epoch_num = 1 + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + mnist = MNIST("mnist") + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + dy_param_init_value = {} + for epoch in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape(128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + + dy_out = avg_loss._numpy() + + if epoch == 0 and batch_id == 0: + for param in mnist.parameters(): + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + sgd.minimize(avg_loss) + mnist.clear_gradients() + + dy_param_value = {} + for param in mnist.parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + mnist = MNIST("mnist") + sgd = SGDOptimizer(learning_rate=1e-3) + train_reader = paddle.batch( + paddle.dataset.mnist.train(), batch_size=128, drop_last=True) + + img = fluid.layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + cost = mnist(img) + loss = fluid.layers.cross_entropy(cost, label) + avg_loss = fluid.layers.mean(loss) + 
sgd.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + for param in mnist.parameters(): + static_param_name_list.append(param.name) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for epoch in range(epoch_num): + for batch_id, data in enumerate(train_reader()): + static_x_data = np.array( + [x[0].reshape(1, 28, 28) + for x in data]).astype('float32') + y_data = np.array( + [x[1] for x in data]).astype('int64').reshape([128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run( + fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[ + i] + + self.assertTrue(np.allclose(dy_x_data, static_x_data)) # compare the arrays, not their .all() booleans + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + + self.assertTrue(np.allclose(static_out, dy_out)) + + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 829274afc7e17fb0b5f4d8200c5e1f7bbbe02393..8b659a3e08e381dd6f55b666d9f5f1b172a51930 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -22,131 +22,71 @@ import six import paddle import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.dygraph.nn
import Conv2D, Pool2D, FC +from paddle.fluid.optimizer import SGDOptimizer, Adam +from paddle.fluid.dygraph.nn import FC from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope -class SimpleImgConvPool(fluid.dygraph.Layer): - def __init__(self, - name_scope, - num_channels, - num_filters, - filter_size, - pool_size, - pool_stride, - pool_padding=0, - pool_type='max', - global_pooling=False, - conv_stride=1, - conv_padding=0, - conv_dilation=1, - conv_groups=1, - act=None, - use_cudnn=False, - param_attr=None, - bias_attr=None): - super(SimpleImgConvPool, self).__init__(name_scope) - - self._conv2d = Conv2D( - self.full_name(), - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, - stride=conv_stride, - padding=conv_padding, - dilation=conv_dilation, - groups=conv_groups, - param_attr=None, - bias_attr=None, - use_cudnn=use_cudnn) - - self._pool2d = Pool2D( - self.full_name(), - pool_size=pool_size, - pool_type=pool_type, - pool_stride=pool_stride, - pool_padding=pool_padding, - global_pooling=global_pooling, - use_cudnn=use_cudnn) +class MLP(fluid.dygraph.Layer): + def __init__(self, name_scope, param_attr=None, bias_attr=None): + super(MLP, self).__init__(name_scope) - def forward(self, inputs): - x = self._conv2d(inputs) - x = self._pool2d(x) - return x - - -class MNIST(fluid.dygraph.Layer): - def __init__(self, name_scope): - super(MNIST, self).__init__(name_scope) + self._fc1 = FC(self.full_name(), 10) + self._fc2 = FC(self.full_name(), 10) - self._simple_img_conv_pool_1 = SimpleImgConvPool( - self.full_name(), 1, 20, 5, 2, 2, act="relu") - - self._simple_img_conv_pool_2 = SimpleImgConvPool( - self.full_name(), 20, 50, 5, 2, 2, act="relu") + def forward(self, inputs): + y = self._fc1(inputs) + y = self._fc2(y) + return y - pool_2_shape = 50 * 4 * 4 - SIZE = 10 - scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 - self._fc = FC(self.full_name(), - 10, - param_attr=fluid.param_attr.ParamAttr( 
- initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale)), - act="softmax") - def forward(self, inputs): - x = self._simple_img_conv_pool_1(inputs) - x = self._simple_img_conv_pool_2(x) - x = self._fc(x) - return x +class TestImperativeOptimizerBase(unittest.TestCase): + def setUp(self): + self.batch_num = 20 + def get_optimizer(self): + raise NotImplementedError() -class TestDygraphMnist(unittest.TestCase): - def test_mnist_float32(self): + def _check_mlp(self): seed = 90 - epoch_num = 1 with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed - mnist = MNIST("mnist") - sgd = SGDOptimizer(learning_rate=1e-3) + mlp = MLP('mlp') + optimizer = self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) dy_param_init_value = {} - for epoch in range(epoch_num): - for batch_id, data in enumerate(train_reader()): - dy_x_data = np.array( - [x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape(128, 1) - - img = to_variable(dy_x_data) - label = to_variable(y_data) - label._stop_gradient = True - - cost = mnist(img) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - - dy_out = avg_loss._numpy() - - if epoch == 0 and batch_id == 0: - for param in mnist.parameters(): - dy_param_init_value[param.name] = param._numpy() - - avg_loss._backward() - sgd.minimize(avg_loss) - mnist.clear_gradients() - - dy_param_value = {} - for param in mnist.parameters(): - dy_param_value[param.name] = param._numpy() + for batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + dy_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + 128, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + 
label._stop_gradient = True + + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in mlp.parameters(): + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + optimizer.minimize(avg_loss) + mlp.clear_gradients() + dy_param_value = {} + for param in mlp.parameters(): + dy_param_value[param.name] = param._numpy() with new_program_scope(): fluid.default_startup_program().random_seed = seed @@ -155,23 +95,22 @@ class TestDygraphMnist(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) - mnist = MNIST("mnist") - sgd = SGDOptimizer(learning_rate=1e-3) + mlp = MLP('mlp') + optimizer = self.get_optimizer() train_reader = paddle.batch( paddle.dataset.mnist.train(), batch_size=128, drop_last=True) img = fluid.layers.data( name='pixel', shape=[1, 28, 28], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - cost = mnist(img) - loss = fluid.layers.cross_entropy(cost, label) - avg_loss = fluid.layers.mean(loss) - sgd.minimize(avg_loss) + cost = mlp(img) + avg_loss = fluid.layers.reduce_mean(cost) + optimizer.minimize(avg_loss) # initialize params and fetch them static_param_init_value = {} static_param_name_list = [] - for param in mnist.parameters(): + for param in mlp.parameters(): static_param_name_list.append(param.name) out = exe.run(fluid.default_startup_program(), @@ -180,29 +119,26 @@ class TestDygraphMnist(unittest.TestCase): for i in range(len(static_param_name_list)): static_param_init_value[static_param_name_list[i]] = out[i] - for epoch in range(epoch_num): - for batch_id, data in enumerate(train_reader()): - static_x_data = np.array( - [x[0].reshape(1, 28, 28) - for x in data]).astype('float32') - y_data = np.array( - [x[1] for x in data]).astype('int64').reshape([128, 1]) - - fetch_list = [avg_loss.name] - fetch_list.extend(static_param_name_list) - out = exe.run( - 
fluid.default_main_program(), - feed={"pixel": static_x_data, - "label": y_data}, - fetch_list=fetch_list) - - static_param_value = {} - static_out = out[0] - for i in range(1, len(out)): - static_param_value[static_param_name_list[i - 1]] = out[ - i] - - self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all())) + for batch_id, data in enumerate(train_reader()): + if batch_id >= self.batch_num: + break + + static_x_data = np.array( + [x[0].reshape(1, 28, 28) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [128, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_out = out[0] + for i in range(1, len(out)): + static_param_value[static_param_name_list[i - 1]] = out[i] for key, value in six.iteritems(static_param_init_value): self.assertTrue(np.allclose(value, dy_param_init_value[key])) @@ -210,7 +146,92 @@ class TestDygraphMnist(unittest.TestCase): self.assertTrue(np.allclose(static_out, dy_out)) for key, value in six.iteritems(static_param_value): - self.assertTrue(np.allclose(value, dy_param_value[key], atol=1e-5)) + self.assertTrue(np.allclose(value, dy_param_value[key])) + + +class TestImperativeOptimizerPiecewiseDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + bd = [3, 6, 9] + optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)])) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNaturalExpDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.natural_exp_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + 
+class TestImperativeOptimizerExponentialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerInverseTimeDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = Adam(learning_rate=fluid.layers.inverse_time_decay( + learning_rate=0.1, + decay_steps=10000, + decay_rate=0.5, + staircase=True)) + return optimizer + + def test_adam(self): + self._check_mlp() + + +class TestImperativeOptimizerPolynomialDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay( + learning_rate=0.1, decay_steps=5, cycle=self.cycle)) + return optimizer + + def test_sgd_cycle(self): + self.cycle = True + self._check_mlp() + + def test_sgd(self): + self.cycle = False + self._check_mlp() + + +class TestImperativeOptimizerCosineDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay( + learning_rate=0.1, step_each_epoch=10000, epochs=120)) + return optimizer + + def test_sgd(self): + self._check_mlp() + + +class TestImperativeOptimizerNoamDecay(TestImperativeOptimizerBase): + def get_optimizer(self): + optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay( + d_model=512, warmup_steps=8000)) + return optimizer + + def test_sgd(self): + self._check_mlp() if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 0595315f30cf38c32d8b33139caffc909d6d99b4..82eb61ba654636ccc3c2acee8508dfabb62ee9cb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -201,8 +201,6 
@@ class PtbModel(fluid.dygraph.Layer): rnn_out, shape=[-1, self.num_steps, self.hidden_size]) projection = fluid.layers.matmul(rnn_out, self.softmax_weight) projection = fluid.layers.elementwise_add(projection, self.softmax_bias) - projection = fluid.layers.reshape( - projection, shape=[-1, self.vocab_size]) projection = fluid.layers.reshape( projection, shape=[-1, self.vocab_size]) loss = fluid.layers.softmax_with_cross_entropy( @@ -224,6 +222,7 @@ class TestDygraphPtbRnn(unittest.TestCase): num_steps = 3 init_scale = 0.1 batch_size = 4 + batch_num = 200 with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed @@ -243,7 +242,6 @@ class TestDygraphPtbRnn(unittest.TestCase): dy_loss = None last_hidden = None last_cell = None - batch_num = 200 for i in range(batch_num): x_data = np.arange(12).reshape(4, 3).astype('int64') @@ -284,7 +282,8 @@ class TestDygraphPtbRnn(unittest.TestCase): exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) sgd = SGDOptimizer(learning_rate=1e-3) - x = fluid.layers.data(name="x", shape=[-1, 3, 1], dtype='int64') + x = fluid.layers.data( + name="x", shape=[-1, num_steps, 1], dtype='int64') y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32') init_hidden = fluid.layers.data( name="init_hidden", shape=[1], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..69931f0849480b2569a31d04c7b0b0f9db0d61a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -0,0 +1,472 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six + +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.dygraph.base import to_variable +from test_imperative_base import new_program_scope + +batch_size = 8 +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": batch_size, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + }, + "batch_size": batch_size, + "lr": 0.1, + "total_images": 6149, +} + + +def optimizer_setting(params): + ls = params["learning_strategy"] + if ls["name"] == "piecewise_decay": + if "total_images" not in params: + total_images = 6149 + else: + total_images = params["total_images"] + # TODO(Yancey1989): using lr decay if it is ready. 
+ #batch_size = ls["batch_size"] + #step = int(total_images / batch_size + 1) + + #bd = [step * e for e in ls["epochs"]] + #base_lr = params["lr"] + #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + optimizer = fluid.optimizer.SGD(learning_rate=0.1) + + return optimizer + + +class ConvBNLayer(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + super(ConvBNLayer, self).__init__(name_scope) + + self._conv = Conv2D( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=None) + + self._batch_norm = BatchNorm(self.full_name(), num_filters, act=act) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + + return y + + +class SqueezeExcitation(fluid.dygraph.Layer): + def __init__(self, name_scope, num_channels, reduction_ratio): + + super(SqueezeExcitation, self).__init__(name_scope) + self._pool = Pool2D( + self.full_name(), pool_size=0, pool_type='avg', global_pooling=True) + self._squeeze = FC( + self.full_name(), + size=num_channels // reduction_ratio, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') + self._excitation = FC( + self.full_name(), + size=num_channels, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.05)), + act='relu') + + def forward(self, input): + y = self._pool(input) + y = self._squeeze(y) + y = self._excitation(y) + y = fluid.layers.elementwise_mul(x=input, y=y, axis=0) + return y + + +class BottleneckBlock(fluid.dygraph.Layer): + def __init__(self, + name_scope, + num_channels, + num_filters, + stride, + cardinality, + reduction_ratio, + shortcut=True): + super(BottleneckBlock, self).__init__(name_scope) + + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=num_channels, + 
num_filters=num_filters, + filter_size=1) + self.conv1 = ConvBNLayer( + self.full_name(), + num_channels=num_filters, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality) + self.conv2 = ConvBNLayer( + self.full_name(), + num_channels=num_filters, + num_filters=num_filters * 4, + filter_size=1, + act='relu') + + self.scale = SqueezeExcitation( + self.full_name(), + num_channels=num_filters * 4, + reduction_ratio=reduction_ratio) + + if not shortcut: + self.short = ConvBNLayer( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters * 4, + filter_size=1, + stride=stride) + + self.shortcut = shortcut + + self._num_channels_out = num_filters * 4 + + def forward(self, inputs): + y = self.conv0(inputs) + conv1 = self.conv1(y) + conv2 = self.conv2(conv1) + scale = self.scale(conv2) + + if self.shortcut: + short = inputs + else: + short = self.short(inputs) + + y = fluid.layers.elementwise_add(x=short, y=scale) + + layer_helper = LayerHelper(self.full_name(), act='relu') + y = layer_helper.append_activation(y) + return y + + +class SeResNeXt(fluid.dygraph.Layer): + def __init__(self, name_scope, layers=50, class_dim=102): + super(SeResNeXt, self).__init__(name_scope) + + self.layers = layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=64, # stem must emit the 64 channels the first bottleneck expects + filter_size=7, +
stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + self.conv0 = ConvBNLayer( + self.full_name(), + num_channels=3, + num_filters=64, # feeds conv1, which declares num_channels=64 + filter_size=7, + stride=2, + act='relu') + self.conv1 = ConvBNLayer( + self.full_name(), + num_channels=64, + num_filters=64, # feeds conv2, which declares num_channels=64 + filter_size=7, + stride=2, + act='relu') + self.conv2 = ConvBNLayer( + self.full_name(), + num_channels=64, + num_filters=64, # first bottleneck is built with num_channels = 64 + filter_size=7, + stride=2, + act='relu') + self.pool = Pool2D( + self.full_name(), + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + self.bottleneck_block_list = [] + num_channels = 64 + for block in range(len(depth)): + shortcut = False + for i in range(depth[block]): + bottleneck_block = self.add_sublayer( + 'bb_%d_%d' % (block, i), + BottleneckBlock( + self.full_name(), + num_channels=num_channels, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio, + shortcut=shortcut)) + num_channels = bottleneck_block._num_channels_out + self.bottleneck_block_list.append(bottleneck_block) + shortcut = True + + self.pool2d_avg = Pool2D( + self.full_name(), pool_size=7, pool_type='avg', global_pooling=True) + import math + stdv = 1.0 / math.sqrt(2048 * 1.0) + + self.out = FC(self.full_name(), + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, stdv))) + + def forward(self, inputs): + if self.layers == 50 or self.layers == 101: + y = self.conv0(inputs) + y = self.pool(y) + elif self.layers == 152: + y = self.conv0(inputs) + y = self.conv1(y) # chain the stem; passing inputs again discarded conv0 + y = self.conv2(y) # chain the stem; passing inputs again discarded conv1 + y = self.pool(y) + + for bottleneck_block in self.bottleneck_block_list: + y = bottleneck_block(y) + y = self.pool2d_avg(y) + y =
fluid.layers.dropout(y, dropout_prob=0.2) + y = self.out(y) + return y + + +class TestImperativeResneXt(unittest.TestCase): + def test_se_resnext_float32(self): + seed = 90 + + batch_size = train_parameters["batch_size"] + batch_num = 2 + with fluid.dygraph.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + se_resnext = SeResNeXt("se_resnext") + optimizer = optimizer_setting(train_parameters) + np.random.seed(seed) + import random + random.seed(seed) # call it; assigning rebound random.seed to an int + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + dy_param_init_value = {} + for param in se_resnext.parameters(): + dy_param_init_value[param.name] = param._numpy() + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + batch_size, 1) + + img = to_variable(dy_x_data) + label = to_variable(y_data) + label._stop_gradient = True + + out = se_resnext(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + + dy_out = avg_loss._numpy() + + if batch_id == 0: + for param in se_resnext.parameters(): + if param.name not in dy_param_init_value: + dy_param_init_value[param.name] = param._numpy() + + avg_loss._backward() + + dy_grad_value = {} + for param in se_resnext.parameters(): + if param.trainable: + np_array = np.array(param._ivar._grad_ivar().value() + .get_tensor()) + dy_grad_value[param.name + core.grad_var_suffix( + )] = np_array + + optimizer.minimize(avg_loss) + se_resnext.clear_gradients() + + dy_param_value = {} + for param in se_resnext.parameters(): + dy_param_value[param.name] = param._numpy() + + with new_program_scope(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + exe =
fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + + se_resnext = SeResNeXt("se_resnext") + optimizer = optimizer_setting(train_parameters) + + np.random.seed(seed) + import random + random.seed(seed) # call it; assigning rebound random.seed to an int + train_reader = paddle.batch( + paddle.dataset.flowers.train(use_xmap=False), + batch_size=batch_size) + + img = fluid.layers.data( + name='pixel', shape=[3, 224, 224], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + out = se_resnext(img) + loss = fluid.layers.cross_entropy(input=out, label=label) + avg_loss = fluid.layers.mean(x=loss) + optimizer.minimize(avg_loss) + + # initialize params and fetch them + static_param_init_value = {} + static_param_name_list = [] + static_grad_name_list = [] + for param in se_resnext.parameters(): + static_param_name_list.append(param.name) + for param in se_resnext.parameters(): + if param.trainable: + static_grad_name_list.append(param.name + + core.grad_var_suffix()) + + out = exe.run(fluid.default_startup_program(), + fetch_list=static_param_name_list) + + for i in range(len(static_param_name_list)): + static_param_init_value[static_param_name_list[i]] = out[i] + + for batch_id, data in enumerate(train_reader()): + if batch_id >= batch_num: + break + + static_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + [batch_size, 1]) + + fetch_list = [avg_loss.name] + fetch_list.extend(static_param_name_list) + fetch_list.extend(static_grad_name_list) + out = exe.run(fluid.default_main_program(), + feed={"pixel": static_x_data, + "label": y_data}, + fetch_list=fetch_list) + + static_param_value = {} + static_grad_value = {} + static_out = out[0] + param_start_pos = 1 + grad_start_pos = len(static_param_name_list) + param_start_pos + for i in range(param_start_pos, + len(static_param_name_list) + param_start_pos): +
static_param_value[static_param_name_list[ + i - param_start_pos]] = out[i] + for i in range(grad_start_pos, + len(static_grad_name_list) + grad_start_pos): + static_grad_value[static_grad_name_list[ + i - grad_start_pos]] = out[i] + self.assertTrue(np.allclose(static_out, dy_out)) + + self.assertEqual(len(dy_param_init_value), len(static_param_init_value)) + + for key, value in six.iteritems(static_param_init_value): + self.assertTrue(np.allclose(value, dy_param_init_value[key])) + self.assertTrue(np.isfinite(value).all()) # test every element, then reduce + self.assertFalse(np.isnan(value).any()) + + self.assertEqual(len(dy_grad_value), len(static_grad_value)) + for key, value in six.iteritems(static_grad_value): + self.assertTrue(np.allclose(value, dy_grad_value[key])) + self.assertTrue(np.isfinite(value).all()) + self.assertFalse(np.isnan(value).any()) + + self.assertEqual(len(dy_param_value), len(static_param_value)) + for key, value in six.iteritems(static_param_value): + self.assertTrue(np.allclose(value, dy_param_value[key])) + self.assertTrue(np.isfinite(value).all()) + self.assertFalse(np.isnan(value).any()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e92ece7acb41b5a63adaae8edba78486ca3adcf8..674965882d76e142e4dc818374768ae7549120e0 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -76,6 +76,41 @@ class LayerTest(unittest.TestCase): class TestLayer(LayerTest): + def test_fc(self): + # pdb.set_trace() + inp = np.ones([3, 32, 32], dtype='float32') + with self.static_graph(): + t = layers.data( + name='data', + shape=[3, 32, 32], + dtype='float32', + append_batch_size=False) + ret = layers.fc(t, size=4, bias_attr=False, num_flatten_dims=1) + ret2 = layers.fc(ret, size=4) + static_ret = self.get_static_graph_result( + feed={'data': inp}, fetch_list=[ret2])[0] + with self.static_graph(): + t =
layers.data( + name='data', + shape=[3, 32, 32], + dtype='float32', + append_batch_size=False) + fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1) + fc2 = nn.FC('fc2', size=4) + ret = fc1(t) + ret2 = fc2(ret) + static_ret2 = self.get_static_graph_result( + feed={'data': inp}, fetch_list=[ret2])[0] + with self.dynamic_graph(): + t = base.to_variable(inp) + fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1) + fc2 = nn.FC('fc2', size=4) + ret = fc1(t) + dy_ret = fc2(ret) + + self.assertTrue(np.array_equal(static_ret, static_ret2)) + self.assertTrue(np.array_equal(static_ret, dy_ret._numpy())) + def test_layer_norm(self): inp = np.ones([3, 32, 32], dtype='float32') with self.static_graph(): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index bda8b666dcde22b0e4bacdb5db252267f4c7e34b..645b0188d5f45935ace074ba343de246af476b41 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -38,7 +38,15 @@ def Lenet(data, class_dim): class TestFetchAndFeed(unittest.TestCase): - def parallel_exe(self, use_cuda, run_parallel_exe, seed=1): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def parallel_exe(self, + use_cuda, + run_parallel_exe, + use_experimental_executor=False, + seed=1): main_program = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -63,8 +71,12 @@ class TestFetchAndFeed(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = False build_strategy.memory_optimize = False + exec_strategy = fluid.ExecutionStrategy() + exec_strategy.use_experimental_executor = use_experimental_executor train_cp = compiler.CompiledProgram(main_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) + loss_name=loss.name, + 
build_strategy=build_strategy, + exec_strategy=exec_strategy) run_parallel_exe(train_cp, exe, use_cuda, data, label, loss) @@ -131,8 +143,7 @@ class TestFetchAndFeed(unittest.TestCase): if batch_id == 2: break - def test_fetch(self): - os.environ['CPU_NUM'] = str(4) + def test_fetch_with_threaded_executor(self): if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, @@ -140,8 +151,18 @@ class TestFetchAndFeed(unittest.TestCase): self.parallel_exe( use_cuda=False, run_parallel_exe=self.run_parallel_exe_with_fetch) + def test_fetch_with_fast_threaded_executor(self): + if core.is_compiled_with_cuda(): + self.parallel_exe( + use_cuda=True, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + self.parallel_exe( + use_cuda=False, + run_parallel_exe=self.run_parallel_exe_with_fetch, + use_experimental_executor=True) + def test_feed(self): - os.environ['CPU_NUM'] = str(4) if core.is_compiled_with_cuda(): self.parallel_exe( use_cuda=True, run_parallel_exe=self.run_parallel_exe_with_feed) diff --git a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py index f8847e1570dc47d432777faa15f4004f1a7111a6..d8c57d964da706f12b8865195ea94329ca0f10e2 100644 --- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py @@ -38,7 +38,7 @@ class TestSpliteSelectedRows(unittest.TestCase): def check_with_place(self, place): scope = core.Scope() rows = [0, 5, 7, 4, 20] - height = 20 + height = 21 row_numel = 2 # initialize input variable X diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py new file mode 100644 index 0000000000000000000000000000000000000000..380c404fb2d6a36bf3732ebbff4b6cef22f71362 --- /dev/null +++ b/python/paddle/fluid/trainer_desc.py @@ -0,0 +1,101 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer']
+
+
+# Base wrapper around the trainer_desc_pb2.TrainerDesc protobuf message.
+class TrainerDesc(object):
+    def __init__(self):
+        '''
+        Create an empty TrainerDesc proto whose thread_num defaults to
+        multiprocessing.cpu_count(). The fleet desc, device worker and
+        program stay None (infer_ is False) until the _set_* calls.
+        '''
+        from proto import trainer_desc_pb2
+        self.proto_desc = trainer_desc_pb2.TrainerDesc()
+        import multiprocessing as mp
+        # set default thread num == cpu count
+        self.proto_desc.thread_num = mp.cpu_count()
+        self.fleet_desc_ = None
+        self.device_worker_ = None
+        self.program_ = None
+        self.infer_ = False
+
+    def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
+        for i, v in enumerate(fetch_vars):
+            self.proto_desc.fetch_config.fetch_var_names.extend([v.name])
+            self.proto_desc.fetch_config.fetch_var_str_format.extend(
+                [fetch_info[i]])
+        self.proto_desc.fetch_config.print_period = print_period
+
+    def _set_debug(self, debug):
+        self.proto_desc.debug = debug
+
+    def _set_thread(self, thread_num):
+        self.proto_desc.thread_num = thread_num
+
+    def _set_device_worker(self, device_worker):
+        self.device_worker_ = device_worker
+
+    def _set_infer(self, infer):
+        self.infer_ = infer
+
+    def _set_fleet_desc(self, fleet_desc):
+        self.fleet_desc_ = fleet_desc
+
+    def _gen_trainer_desc(self):
+        pass  # hook: subclasses fill in proto_desc
+
+    def _set_program(self, program):
+        self.program_ = program
+
+    def _desc(self):
+        from google.protobuf import text_format
+        return text_format.MessageToString(self.proto_desc)
+
+
+class MultiTrainer(TrainerDesc):
+    '''Trainer description emitted with class_name "MultiTrainer".'''
+    def __init__(self):
+        super(MultiTrainer, self).__init__()
+
+    def _set_program(self, program):
+        # The base class already stores the program in self.program_.
+        super(MultiTrainer, self)._set_program(program)
+
+    def _gen_trainer_desc(self):
+        super(MultiTrainer, self)._gen_trainer_desc()
+        self.proto_desc.class_name = "MultiTrainer"
+        self.device_worker_._set_infer(self.infer_)
+        self.device_worker_._gen_worker_desc(self.proto_desc)
+
+
+class DistMultiTrainer(TrainerDesc):
+    '''Trainer description emitted with class_name "DistMultiTrainer".'''
+    def __init__(self):
+        super(DistMultiTrainer, self).__init__()
+
+    def _set_program(self, program):
+        # The base class already stores the program in self.program_.
+        super(DistMultiTrainer, self)._set_program(program)
+
+    def _gen_trainer_desc(self):
+        super(DistMultiTrainer, self)._gen_trainer_desc()
+        self.proto_desc.class_name = "DistMultiTrainer"
+        if self.program_ is None:
+            raise RuntimeError("None Program")
+        self.device_worker_._set_infer(self.infer_)
+        self.device_worker_._set_program(self.program_)
+        self.device_worker_._gen_worker_desc(self.proto_desc)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e957880f77a41d3dad9582bc7cc09af1d1a253b
--- /dev/null
+++ b/python/paddle/fluid/trainer_factory.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ["TrainerFactory"]
+
+
+class TrainerFactory(object):
+    '''Builds a trainer desc and attaches its device worker to it.'''
+    def __init__(self):
+        pass
+
+    def _create_trainer(self, opt_info=None):
+        '''Return MultiTrainer + Hogwild, or the classes opt_info names.'''
+        from .trainer_desc import MultiTrainer, DistMultiTrainer
+        from .device_worker import Hogwild, DownpourSGD
+        if opt_info is None:
+            trainer = MultiTrainer()
+            trainer._set_device_worker(Hogwild())
+            return trainer
+        # globals() cannot see the function-local imports above, so map
+        # the requested class names to the imported classes explicitly.
+        trainers = {c.__name__: c for c in (MultiTrainer, DistMultiTrainer)}
+        workers = {c.__name__: c for c in (Hogwild, DownpourSGD)}
+        trainer = trainers[opt_info["trainer"]]()
+        device_worker = workers[opt_info["device_worker"]]()
+        device_worker._set_fleet_desc(opt_info["fleet_desc"])
+        trainer._set_device_worker(device_worker)
+        trainer._set_fleet_desc(opt_info["fleet_desc"])
+        return trainer
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index eb54068650e8b3f4e64317778e2ad7c7aa7fe1b2..41e5f47976c566306ad141f655a0f6516831d690 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -156,6 +156,8 @@ class DistributeTranspilerConfig(object):
     mode = "pserver"
     print_log = False
     wait_port = True
+    # split the send recv var in runtime
+    runtime_split_send_recv = False
 
 
 class DistributeTranspiler(object):
@@ -398,8 +400,10 @@ class DistributeTranspiler(object):
                 orig_var = program.global_block().vars[splited_grad_varname]
                 index = find_op_by_output_arg(
                     program.global_block(), splited_grad_varname, reverse=True)
-                self._insert_split_op(program, orig_var, index, splited_vars)
-                index += 1
+                if not self.config.runtime_split_send_recv:
+                    self._insert_split_op(program, orig_var, index,
+                                          splited_vars)
+                    index += 1
             else:
                 AssertionError("Can not insert the send op by original "
                                "variable name :", splited_grad_varname)
@@ -408,6 +412,17 @@ class DistributeTranspiler(object):
                 name=framework.generate_control_dev_var_name())
self.grad_name_to_send_dummy_out[grad_varname] = dummy_output + if self.config.runtime_split_send_recv: + send_input_vars = [ + program.global_block().vars[splited_grad_varname] + ] + sections = self._get_splited_var_sections(splited_vars) + send_varnames = [var.name for var in splited_vars] + else: + send_input_vars = splited_vars + sections = [] + send_varnames = [] + # get send op_role_var, if not splited, the grad should have .trainer suffix # if splited, grad should be the original grad var name (split_by_ref and send # will be on the same place). ParallelExecutor @@ -415,10 +430,12 @@ class DistributeTranspiler(object): program.global_block()._insert_op( index=index + 1, type="send", - inputs={"X": splited_vars}, + inputs={"X": send_input_vars}, outputs={"Out": dummy_output}, attrs={ "epmap": eplist, + "sections": sections, + "send_varnames": send_varnames, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [ self.grad_name_to_param_name[grad_varname], @@ -501,13 +518,20 @@ class DistributeTranspiler(object): self._update_remote_sparse_update_op( param_varname, height_sections, eps, table_names) else: + recv_varnames = [] + if self.config.runtime_split_send_recv: + orig_param = program.global_block().vars[param_varname] + recv_varnames = [var.name for var in splited_var] + splited_var = [orig_param] all_recv_outputs.extend(splited_var) + program.global_block().append_op( type="recv", inputs={"X": [recv_dep_in]}, outputs={"Out": splited_var}, attrs={ "epmap": eps, + "recv_varnames": recv_varnames, "trainer_id": self.trainer_id, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: @@ -532,14 +556,15 @@ class DistributeTranspiler(object): continue orig_param = program.global_block().vars[param_varname] if param_varname not in self.sparse_param_to_height_sections: - program.global_block().append_op( - type="concat", - inputs={"X": splited_var}, - outputs={"Out": [orig_param]}, - attrs={ - "axis": 0, - RPC_OP_ROLE_ATTR_NAME: 
DIST_OP_ROLE_ATTR_VALUE - }) + if not self.config.runtime_split_send_recv: + program.global_block().append_op( + type="concat", + inputs={"X": splited_var}, + outputs={"Out": [orig_param]}, + attrs={ + "axis": 0, + RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE + }) self._get_trainer_startup_program(recv_vars=recv_vars, eplist=eplist) @@ -1552,11 +1577,17 @@ class DistributeTranspiler(object): lod_level=var.lod_level, persistable=persistable) + @staticmethod + def _get_splited_var_sections(splited_vars): + height_sections = [] + for v in splited_vars: + height_sections.append(v.shape[0]) + return height_sections + def _insert_split_op(self, program, orig_var, index, splited_vars): + height_sections = self._get_splited_var_sections(splited_vars) + if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS: - height_sections = [] - for v in splited_vars: - height_sections.append(v.shape[0]) sparse_param_name = self.grad_name_to_param_name[orig_var.name] if self._is_input_of_remote_sparse_update_op(sparse_param_name): self.sparse_param_to_height_sections[ @@ -1571,16 +1602,13 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR: - sections = [] - for v in splited_vars: - sections.append(v.shape[0]) program.global_block()._insert_op( index=index + 1, type="split_byref", inputs={"X": orig_var}, outputs={"Out": splited_vars}, attrs={ - "sections": sections, + "sections": height_sections, RPC_OP_ROLE_ATTR_NAME: DIST_OP_ROLE_ATTR_VALUE }) else: @@ -2052,7 +2080,7 @@ class DistributeTranspiler(object): Get optimizer operators, parameters and gradients from origin_program Returns: opt_ops (list): optimize operators. - params_grads (dict): paramter->gradient. + params_grads (dict): parameter->gradient. 
""" block = self.origin_program.global_block() opt_ops = [] diff --git a/python/setup.py.in b/python/setup.py.in index 75e821582f49f93cb41e4254edd11cb782d18cc7..9ab4e9742cfbaf4e2d08e7c27b6ba231c85c4ec2 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -121,7 +121,13 @@ packages=['paddle', 'paddle.fluid.contrib.utils', 'paddle.fluid.contrib.extend_optimizer', 'paddle.fluid.transpiler', - 'paddle.fluid.transpiler.details'] + 'paddle.fluid.transpiler.details', + 'paddle.fluid.incubate', + 'paddle.fluid.incubate.data_generator', + 'paddle.fluid.incubate.fleet', + 'paddle.fluid.incubate.fleet.base', + 'paddle.fluid.incubate.fleet.parameter_server', + 'paddle.fluid.incubate.fleet.p2p'] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: setup_requires = f.read().splitlines() @@ -151,6 +157,10 @@ package_data['paddle.libs']= [] package_data['paddle.libs']=[('libwarpctc' if os.name != 'nt' else 'warpctc') + ext_name] shutil.copy('${WARPCTC_LIBRARIES}', libs_path) +if '${WITH_WBAES}' == 'ON': + package_data['paddle.libs'] += ['libwbaes' + ext_name] + shutil.copy('${WBAES_SHARED_LIB}', libs_path) + if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_LIB}', libs_path) shutil.copy('${MKLML_SHARED_IOMP_LIB}', libs_path) diff --git a/tools/diff_use_default_grad_op_maker.py b/tools/diff_use_default_grad_op_maker.py new file mode 100644 index 0000000000000000000000000000000000000000..9e362f611bbf381f480be6f216c28a53dc0440fa --- /dev/null +++ b/tools/diff_use_default_grad_op_maker.py @@ -0,0 +1,66 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+
+import paddle.fluid as fluid
+import sys
+
+
+def get_op_diff(filename):
+    """
+    Compare the op list stored in `filename` (one op per line) with
+    fluid.core._get_use_default_grad_op_desc_maker_ops().
+
+    Returns a list of error messages; it is empty when both agree.
+    """
+    ops_created_by_py_func = set(
+        fluid.core._get_use_default_grad_op_desc_maker_ops())
+
+    with open(filename, 'r') as f:
+        # Ignore blank lines so they are not treated as op names.
+        ops_read_from_file = set(
+            line.strip() for line in f if line.strip())
+
+    # Sorted so the reported lists are stable from run to run.
+    only_in_file = sorted(ops_read_from_file - ops_created_by_py_func)
+    only_in_build = sorted(ops_created_by_py_func - ops_read_from_file)
+
+    err_msg = []
+    if only_in_file:
+        err_msg.append('Added grad op with DefaultGradOpDescMaker: ' + str(
+            only_in_file))
+    if only_in_build:
+        err_msg.append('Remove grad op with DefaultGradOpDescMaker: ' + str(
+            only_in_build))
+    return err_msg
+
+
+if len(sys.argv) != 2:
+    print('Usage: python diff_use_default_grad_op_maker.py [filepath]')
+    sys.exit(1)
+
+file_path = str(sys.argv[1])
+err_msg = get_op_diff(file_path)
+
+if len(err_msg) > 0:
+    _, filename = os.path.split(file_path)
+    print('File `{}` is wrong compared to your PR revision!'.format(filename))
+    print(
+        'Please use `python generate_op_use_grad_op_desc_maker_spec.py [filepath]` to generate new `{}` file'.
+        format(filename))
+    print('Error message is: ' + '; '.join(err_msg))
+    sys.exit(1)
diff --git a/tools/generate_op_use_grad_op_desc_maker_spec.py b/tools/generate_op_use_grad_op_desc_maker_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..69b062a8716692f19bbd63928064cf74c171b88f
--- /dev/null
+++ b/tools/generate_op_use_grad_op_desc_maker_spec.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+# Set before paddle is imported so that no CUDA device is visible to it.
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+
+import paddle.fluid as fluid
+import sys
+
+if len(sys.argv) != 2:
+    print('Usage: python generate_op_use_grad_op_desc_maker_spec.py [filepath]')
+    sys.exit(1)
+
+# Fetch first so a failure leaves the old file alone; sort for a stable file.
+ops = sorted(fluid.core._get_use_default_grad_op_desc_maker_ops())
+with open(sys.argv[1], 'w') as f:
+    f.writelines(op + '\n' for op in ops)