diff --git a/CMakeLists.txt b/CMakeLists.txt index cfaab206e1f321a55119d4a8d65c4a99d3819fff..b35290e12f6d50376bffb538d213bf586f4f9e58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,12 +55,13 @@ option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) -option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -147,7 +148,16 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) -include(external/grpc) + +if(WITH_DISTRIBUTE) + if(WITH_GRPC) + include(external/grpc) + else() + include(external/leveldb) + include(external/brpc) + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) diff --git a/benchmark/fluid/README.md b/benchmark/fluid/README.md index f40f3c129741f9b6e3654399a9110b065fec7d6c..28cade4634bb62723bf5120169e202657f548234 100644 --- a/benchmark/fluid/README.md +++ b/benchmark/fluid/README.md @@ -24,10 +24,12 @@ Currently supported `--model` argument include: * Run the following command to start a benchmark job locally: ```bash - python fluid_benchmark.py --model mnist --device GPU + python fluid_benchmark.py --model mnist --device GPU ``` You can choose to use GPU/CPU training. With GPU training, you can specify `--gpus ` to run multi GPU training. + You can set async mode parameter server. With async mode, you can specify + `--async_mode` to train model asynchronous. * Run distributed training with parameter servers: * see [run_fluid_benchmark.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/fluid/run_fluid_benchmark.sh) as an example. * start parameter servers: diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py new file mode 100644 index 0000000000000000000000000000000000000000..68a3d42d7a8a8082730f4cae3b5d4ea33819ca2f --- /dev/null +++ b/benchmark/fluid/args.py @@ -0,0 +1,126 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse + +__all__ = ['parse_args', ] + +BENCHMARK_MODELS = [ + "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" +] + + +def parse_args(): + parser = argparse.ArgumentParser('Fluid model benchmarks.') + parser.add_argument( + '--model', + type=str, + choices=BENCHMARK_MODELS, + default='resnet', + help='The model to run benchmark with.') + parser.add_argument( + '--batch_size', type=int, default=32, help='The minibatch size.') + # args related to learning rate + parser.add_argument( + '--learning_rate', type=float, default=0.001, help='The learning rate.') + # TODO(wuyi): add "--use_fake_data" option back. + parser.add_argument( + '--skip_batch_num', + type=int, + default=5, + help='The first num of minibatch num to skip, for better performance test' + ) + parser.add_argument( + '--iterations', type=int, default=80, help='The number of minibatches.') + parser.add_argument( + '--pass_num', type=int, default=100, help='The number of passes.') + parser.add_argument( + '--data_format', + type=str, + default='NCHW', + choices=['NCHW', 'NHWC'], + help='The data data_format, now only support NCHW.') + parser.add_argument( + '--device', + type=str, + default='GPU', + choices=['CPU', 'GPU'], + help='The device type.') + parser.add_argument( + '--gpus', + type=int, + default=1, + help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') + # this option is available only for vgg and resnet. + parser.add_argument( + '--cpus', + type=int, + default=1, + help='If cpus > 1, will use ParallelDo to run, else use Executor.') + parser.add_argument( + '--data_set', + type=str, + default='flowers', + choices=['cifar10', 'flowers'], + help='Optional dataset for benchmark.') + parser.add_argument( + '--infer_only', action='store_true', help='If set, run forward only.') + parser.add_argument( + '--use_cprof', action='store_true', help='If set, use cProfile.') + parser.add_argument( + '--use_nvprof', + action='store_true', + help='If set, use nvprof for CUDA.') + parser.add_argument( + '--no_test', + action='store_true', + help='If set, do not test the testset during training.') + parser.add_argument( + '--memory_optimize', + action='store_true', + help='If set, optimize runtime memory before start.') + parser.add_argument( + '--use_fake_data', + action='store_true', + help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') + parser.add_argument( + '--update_method', + type=str, + default='local', + choices=['local', 'pserver', 'nccl2'], + help='Choose parameter update method, can be local, pserver, nccl2.') + parser.add_argument( + '--no_split_var', + action='store_true', + default=False, + help='Whether split variables into blocks when update_method is pserver') + parser.add_argument( + '--async_mode', + action='store_true', + default=False, + help='Whether start pserver in async mode to support ASGD') + parser.add_argument( + '--use_reader_op', + action='store_true', + help='Whether to use reader op, and must specify the data path if set this to true.' + ) + parser.add_argument( + '--data_path', + type=str, + default="", + help='Directory that contains all the training recordio files.') + args = parser.parse_args() + return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 62a05234c45ee4fe1dc21f5a74efc269227154db..902dca209fcc07cc8048acda84e917f0f6c6af68 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -24,108 +24,7 @@ import paddle.fluid.core as core import paddle.fluid.profiler as profiler import paddle.fluid.transpiler.distribute_transpiler as distribute_transpiler -BENCHMARK_MODELS = [ - "machine_translation", "resnet", "vgg", "mnist", "stacked_dynamic_lstm" -] - - -def parse_args(): - parser = argparse.ArgumentParser('Fluid model benchmarks.') - parser.add_argument( - '--model', - type=str, - choices=BENCHMARK_MODELS, - default='resnet', - help='The model to run benchmark with.') - parser.add_argument( - '--batch_size', - type=int, - default=32, - help='The batch size on each gpu.') - parser.add_argument( - '--learning_rate', type=float, default=0.001, help='The learning rate.') - parser.add_argument( - '--skip_batch_num', - type=int, - default=5, - help='The first num of minibatch num to skip, for better performance test' - ) - parser.add_argument( - '--iterations', - type=int, - default=80, - help='The number of minibatches, set to -1 to run all batches.') - parser.add_argument( - '--pass_num', type=int, default=100, help='The number of passes.') - parser.add_argument( - '--data_format', - type=str, - default='NCHW', - choices=['NCHW', 'NHWC'], - help='The data data_format, now only support NCHW.') - parser.add_argument( - '--device', - type=str, - default='GPU', - choices=['CPU', 'GPU'], - help='The device type.') - parser.add_argument( - '--gpus', - type=int, - default=1, - help='If gpus > 1, will use ParallelExecutor to run, else use Executor.') - # this option is available only for vgg and resnet. - parser.add_argument( - '--cpus', - type=int, - default=1, - help='If cpus > 1, will use ParallelDo to run, else use Executor.') - parser.add_argument( - '--data_set', - type=str, - default='flowers', - choices=['cifar10', 'flowers', 'imagenet'], - help='Optional dataset for benchmark.') - parser.add_argument( - '--infer_only', action='store_true', help='If set, run forward only.') - parser.add_argument( - '--use_cprof', action='store_true', help='If set, use cProfile.') - parser.add_argument( - '--use_nvprof', - action='store_true', - help='If set, use nvprof for CUDA.') - parser.add_argument( - '--no_test', - action='store_true', - help='If set, do not test the testset during training.') - parser.add_argument( - '--memory_optimize', - action='store_true', - help='If set, optimize runtime memory before start.') - parser.add_argument( - '--use_fake_data', - action='store_true', - help='If set ommit the actual read data operators.') - parser.add_argument( - '--profile', action='store_true', help='If set, profile a few steps.') - parser.add_argument( - '--update_method', - type=str, - default='local', - choices=['local', 'pserver', 'nccl2'], - help='Choose parameter update method, can be local, pserver, nccl2.') - parser.add_argument( - '--use_reader_op', - action='store_true', - help='Whether to use reader op, and must specify the data path if set this to true.' - ) - parser.add_argument( - '--data_path', - type=str, - default="", - help='Directory that contains all the training recordio files.') - args = parser.parse_args() - return args +from args import * def append_nccl2_prepare(trainer_id): @@ -160,7 +59,7 @@ def append_nccl2_prepare(trainer_id): "nccl-based dist train.") -def dist_transpile(trainer_id): +def dist_transpile(trainer_id, args): if trainer_id < 0: return None, None @@ -182,7 +81,12 @@ def dist_transpile(trainer_id): training_role = os.getenv("PADDLE_TRAINING_ROLE") t = distribute_transpiler.DistributeTranspiler() - t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers) + t.transpile( + trainer_id, + pservers=pserver_endpoints, + trainers=trainers, + sync_mode=not args.async_mode, + slice_var_up=not args.no_split_var) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, @@ -417,7 +321,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile(trainer_id) + train_prog, startup_prog = dist_transpile(trainer_id, args) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") diff --git a/benchmark/fluid/models/stacked_dynamic_lstm.py b/benchmark/fluid/models/stacked_dynamic_lstm.py index e1c4857f1a365f6480929ea57296a9801f5ea945..211869af4e8d7180cb485811d3363c50d32f0f74 100644 --- a/benchmark/fluid/models/stacked_dynamic_lstm.py +++ b/benchmark/fluid/models/stacked_dynamic_lstm.py @@ -104,8 +104,9 @@ def get_model(args): loss = fluid.layers.mean(x=loss) # add acc + batch_size_tensor = fluid.layers.create_tensor(dtype='int64') batch_acc = fluid.layers.accuracy(input=logit, label=fluid.layers.data(name='label', \ - shape=[1], dtype='int64')) + shape=[1], dtype='int64'), total=batch_size_tensor) inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): diff --git a/benchmark/fluid/models/vgg.py b/benchmark/fluid/models/vgg.py index 6092cdeb884b3a9b60a3bcf20b022f2b0685e6aa..932601302d2f5d56b53e3462af886429034d8989 100644 --- a/benchmark/fluid/models/vgg.py +++ b/benchmark/fluid/models/vgg.py @@ -82,7 +82,8 @@ def get_model(args): data_file, batch_size=args.batch_size)) images, label = fluid.layers.read_file(data_file) else: - images = fluid.layers.data(name='data', shape=dshape, dtype='float32') + images = fluid.layers.data( + name='data', shape=data_shape, dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') # Train program diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4158d0528a1aea52c2a3f0880fe1000183a9df53..89d7a62fe9aca3a71ad34b976a186a80174bfd5e 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -166,3 +166,7 @@ if(WITH_GOLANG) endif() endif(WITH_GOLANG) + +if(WITH_GRPC) + add_definitions(-DPADDLE_WITH_GRPC) +endif(WITH_GRPC) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8e2c913b2caae0c4eeb844d2b51a8975e81c1592 --- /dev/null +++ b/cmake/external/brpc.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) +SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) +SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) +SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc library." FORCE) + +INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) + +# Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") + +# If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF +ExternalProject_Add( + extern_brpc + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/brpc/brpc" + GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + PREFIX ${BRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DCMAKE_PREFIX_PATH=${prefix_path} + -DBRPC_WITH_GLOG=ON + ${EXTERNAL_OPTIONAL_ARGS} + LIST_SEPARATOR | + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${BRPC_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} +) +ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) +ADD_DEPENDENCIES(brpc extern_brpc) + + +LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake new file mode 100644 index 0000000000000000000000000000000000000000..fb5091731da02b497a14f119e944905eee4979d5 --- /dev/null +++ b/cmake/external/leveldb.cmake @@ -0,0 +1,44 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(LEVELDB_SOURCES_DIR ${THIRD_PARTY_PATH}/leveldb) +SET(LEVELDB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/leveldb) +SET(LEVELDB_INCLUDE_DIR "${LEVELDB_INSTALL_DIR}/include" CACHE PATH "leveldb include directory." FORCE) +SET(LEVELDB_LIBRARIES "${LEVELDB_INSTALL_DIR}/lib/libleveldb.a" CACHE FILEPATH "leveldb library." FORCE) +INCLUDE_DIRECTORIES(${LEVELDB_INCLUDE_DIR}) + +ExternalProject_Add( + extern_leveldb + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LEVELDB_SOURCES_DIR} + URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" + URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + CONFIGURE_COMMAND "" + BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a + INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ + && cp ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/libleveldb.a ${LEVELDB_LIBRARIES} + && cp -r ${LEVELDB_SOURCES_DIR}/src/extern_leveldb/include ${LEVELDB_INSTALL_DIR}/ + BUILD_IN_SOURCE 1 +) + +ADD_DEPENDENCIES(extern_leveldb snappy) + +ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) +ADD_DEPENDENCIES(leveldb extern_leveldb) + +LIST(APPEND external_project_dependencies leveldb) + diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9ddd05b3d9404df29ca1bf634105314b7e6a5b70..0e2df86c19086357ab520edfcd8421e35768c928 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -610,3 +610,21 @@ function(grpc_library TARGET_NAME) COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() + + +function(brpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(brpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating brpc ${brpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${brpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${brpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(brpc_proto_srcs brpc_proto_hdrs "${ABS_PROTO}") + cc_library("${TARGET_NAME}_proto" SRCS "${brpc_proto_srcs}") + cc_library("${TARGET_NAME}" SRCS "${brpc_library_SRCS}" DEPS "${TARGET_NAME}_proto" "${brpc_library_DEPS}") +endfunction() diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md index b50f18f21df0787b9761bf0935ed7f4384ff0f98..7d39b8de1e6dc502ffea5f7882bd6a42b1ed6549 100644 --- a/doc/fluid/dev/api_doc_std_cn.md +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -1,8 +1,9 @@ # API注释撰写标准 -- [API注释模块](#API注释模块) -- [格式及示例](#格式及示例) -- [完整示例](#完整示例) +- [API注释撰写标准](#api) + - [API注释模块](#api) + - [格式及示例](#) + - [完整示例](#) ## API注释模块 @@ -217,4 +218,4 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ## 完整示例 -fc 的完整注释见[示例](src/fc.py)。 +fc 的完整注释见[示例](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md index e57072d52fd162e92a3482aef33f99ab9394c532..f175b219750d1c765a6a111c2ec3aa732fa46175 100644 --- a/doc/fluid/dev/api_doc_std_en.md +++ b/doc/fluid/dev/api_doc_std_en.md @@ -1,8 +1,9 @@ # API Doc Standard -- [API Doc Structure](#API Doc Structure) -- [Format and Examples](#Format and Examples) -- [Complete Example](#Complete Example) +- [API Doc Standard](#api-doc-standard) + - [API Doc Structure](#api-doc-structure) + - [Format and Examples](#format-and-examples) + - [Complete Example](#complete-example) ## API Doc Structure @@ -223,4 +224,4 @@ Format and examples of each part of API documantation are as follows: (take fc f ## Complete Example -Complete Example of fc please see [here](src/fc.py)。 +Complete Example of fc please see [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/fluid/dev/src/fc.py)。 diff --git a/paddle/contrib/inference/test_paddle_inference_api_impl.cc b/paddle/contrib/inference/test_paddle_inference_api_impl.cc index 4b6cb7b051d1ad2c63e895017c7faf1245c22612..5d843010e02b09087e6b328428e80fb40eb5bb97 100644 --- a/paddle/contrib/inference/test_paddle_inference_api_impl.cc +++ b/paddle/contrib/inference/test_paddle_inference_api_impl.cc @@ -109,7 +109,6 @@ void MainWord2Vec(bool use_gpu) { void MainImageClassification(bool use_gpu) { int batch_size = 2; - bool use_mkldnn = false; bool repeat = false; NativeConfig config = GetConfig(); config.use_gpu = use_gpu; @@ -134,12 +133,8 @@ void MainImageClassification(bool use_gpu) { std::vector cpu_fetchs1; cpu_fetchs1.push_back(&output1); - TestInference(config.model_dir, - cpu_feeds, - cpu_fetchs1, - repeat, - is_combined, - use_mkldnn); + TestInference( + config.model_dir, cpu_feeds, cpu_fetchs1, repeat, is_combined); auto predictor = CreatePaddlePredictor(config); std::vector paddle_tensor_feeds; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 627370cd2df7317b4d32aa967565aaf9cf0c7a08..4271e4c1bb6bc7b83f2633191ea2d464f4f56c4c 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,7 +87,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method) -cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index b6b93cf422a60c1d8e9cb8b477efd562f9fe4758..60382faffb8e53870658b2d1ff83abc4008cb4cf 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -28,6 +28,9 @@ struct DataTypeMap { }; static DataTypeMap* InitDataTypeMap(); +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex static DataTypeMap& gDataTypeMap() { static DataTypeMap* g_data_type_map_ = InitDataTypeMap(); return *g_data_type_map_; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index c43826b64cc5140c539df17fdd13d9bee7fefdcd..3c73b6cc55c187c3f6e7edd1ce38cc58f4e8413d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -8,18 +8,19 @@ cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) +cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) if(WITH_GPU) - nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) - set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) else() - set(multi_devices_graph_builder_deps) + cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + variable_visitor) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) endif() @@ -28,10 +29,10 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - scale_loss_grad_op_handle rpc_op_handle ${multi_devices_graph_builder_deps} reduce_op_handle broadcast_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle) -cc_library(graph_builder_factory SRCS graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer) +cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc similarity index 78% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc rename to paddle/fluid/framework/details/all_reduce_op_handle.cc index 5bba089ade801a06e0364835efe5249105dcfcac..b335d3a0d364c916e19574de8d3ed89aaec7de41 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -13,25 +13,33 @@ // limitations under the License. #include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/container_cast.h" -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" namespace paddle { namespace framework { namespace details { -NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( - const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs) + +#ifdef PADDLE_WITH_CUDA +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs) : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { - for (auto &p : places_) { - this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p); + if (nccl_ctxs_) { + for (auto &p : places_) { + this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); + } } } +#else +AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places) + : local_scopes_(local_scopes), places_(places) {} +#endif -void NCCLAllReduceOpHandle::RunImpl() { +void AllReduceOpHandle::RunImpl() { if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; } else { @@ -58,6 +66,8 @@ void NCCLAllReduceOpHandle::RunImpl() { } if (platform::is_gpu_place(lod_tensors[0]->place())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); int dtype = -1; size_t numel = 0; std::vector> all_reduce_calls; @@ -75,7 +85,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } int dev_id = boost::get(p).device; - auto &nccl_ctx = nccl_ctxs_.at(dev_id); + auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); auto comm = nccl_ctx.comm_; all_reduce_calls.emplace_back([=] { @@ -90,22 +100,25 @@ void NCCLAllReduceOpHandle::RunImpl() { call(); } }); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { // Special handle CPU only Operator's gradient. Like CRF auto &trg = *this->local_scopes_[0] ->FindVar(kLocalExecScopeName) ->Get() - ->Var() + ->FindVar(out_var_handles[0]->name_) ->GetMutable(); // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); VisitDataType(ToDataType(lod_tensors[0]->type()), func); - for (size_t i = 0; i < local_scopes_.size(); ++i) { + for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; - auto *var = scope.FindVar(in_var_handles[i]->name_); + auto *var = scope.FindVar(out_var_handles[i]->name_); auto *dev_ctx = dev_ctxes_[p]; RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { @@ -118,7 +131,7 @@ void NCCLAllReduceOpHandle::RunImpl() { } } -std::string NCCLAllReduceOpHandle::Name() const { return "nccl_all_reduce"; } +std::string AllReduceOpHandle::Name() const { return "all_reduce"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h similarity index 73% rename from paddle/fluid/framework/details/nccl_all_reduce_op_handle.h rename to paddle/fluid/framework/details/all_reduce_op_handle.h index 8e98d894b828b4162059b30f5c6a74cfc06f402e..fdd250b0d3eb166249271a95f7592b9fadee5265 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,17 +20,23 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { namespace details { -struct NCCLAllReduceOpHandle : public OpHandleBase { - NCCLAllReduceOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs); - +struct AllReduceOpHandle : public OpHandleBase { +#ifdef PADDLE_WITH_CUDA + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap *ctxs); +#else + AllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places); +#endif std::string Name() const override; // Delay and buffer nccl_all_reduce together can significantly increase @@ -43,7 +49,9 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; std::vector places_; - const platform::NCCLContextMap &nccl_ctxs_; +#ifdef PADDLE_WITH_CUDA + const platform::NCCLContextMap *nccl_ctxs_; +#endif }; } // namespace details diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index e7aa74742f827efabff1189d3213edd748d9082d..716d674fa29bad9321fc20979775c06f26bf4679 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -20,7 +20,7 @@ namespace details { struct ExecutionStrategy { size_t num_threads_{0}; - bool use_event_{true}; + bool use_cuda_{true}; bool allow_op_delay_{false}; size_t num_iteration_per_drop_scope_{100}; }; diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 97242ebf2af304e1498e2ef37cd87d1ef07fb6df..78356cb1be3bd089c26dde663275e2c8109df951 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/details/all_reduce_op_handle.h" #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" @@ -26,10 +27,6 @@ #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#endif - namespace paddle { namespace framework { namespace details { @@ -89,7 +86,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( for (auto *op : program.Block(0).AllOps()) { // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string - if (op->Type() == "send_vars") { + if (op->Type() == "send") { auto op_vars = op->InputArgumentNames(); send_vars.reserve(send_vars.size() + std::distance(op_vars.begin(), op_vars.end())); @@ -243,7 +240,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( CreateReduceOp(&result, g_name, 0); CreateBroadcastOp(&result, g_name, 0); } else { - InsertNCCLAllReduceOp(&result, g_name); + InsertAllReduceOp(&result, g_name); } break; } @@ -286,6 +283,19 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient( return false; } +void MultiDevSSAGraphBuilder::SetCommunicationContext( + OpHandleBase *op_handle, const platform::Place &p) const { +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } +#else + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); +#endif +} + void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const { @@ -300,15 +310,12 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_.at(i).at(p_name); auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_.at(i).at(p_name); auto *out_var = new VarHandle(vars.size(), i, p_name, p); vars.emplace_back(out_var); op_handle->AddOutput(out_var); -#ifndef ADDLE_WITH_CUDA - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif } } @@ -320,15 +327,19 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, CreateOpHandleIOs(result, op, dev_id); } -void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( - SSAGraph *result, const std::string &og) const { +void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, + const std::string &og) const { #ifdef PADDLE_WITH_CUDA result->ops_.emplace_back( - new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); +#else + result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_)); +#endif auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; + SetCommunicationContext(op_handle, p); auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); @@ -338,9 +349,6 @@ void MultiDevSSAGraphBuilder::InsertNCCLAllReduceOp( vars.emplace_back(var); op_handle->AddOutput(var); } -#else - PADDLE_ENFORCE("Not implemented"); -#endif } bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( @@ -379,7 +387,9 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA - auto *communication_dev_ctx = nccl_ctxs_->DevCtx(places_[i]); + auto *communication_dev_ctx = + nccl_ctxs_ ? nccl_ctxs_->DevCtx(places_[i]) + : platform::DeviceContextPool::Instance().Get(places_[i]); #else auto *communication_dev_ctx = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); @@ -424,12 +434,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, auto *op_handle = result->ops_.back().get(); for (size_t i = 0; i < places_.size(); ++i) { - auto &vars = result->vars_[i][og]; -#ifndef PADDLE_WITH_CUDA auto &p = places_[i]; - op_handle->SetDeviceContext(p, - platform::DeviceContextPool::Instance().Get(p)); -#endif + SetCommunicationContext(op_handle, p); + auto &vars = result->vars_[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -468,17 +475,17 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, new RPCOpHandle(op, local_scopes_[0], op.Type(), places_[0])); if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send_vars"); + ConnectOp(result, result->ops_.back().get(), "send"); } else if (op.Type() == "recv") { ConnectOp(result, result->ops_.back().get(), "send_barrier"); } else if (op.Type() == "fetch_barrier") { ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send_vars") { + } else if (op.Type() == "send") { // do nothing } else { PADDLE_THROW( "rpc op should be in [" - "send_vars, send_barrier. recv, fetch_barrier]"); + "send, send_barrier. recv, fetch_barrier]"); } // TODO(Yancey1989): schedule rpc op on different place may diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 544cbe585c7423b5f3eb98ee698ca5668376f1ca..78581755fe4890800636944d6cd89875a852cc19 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -100,7 +100,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector> &var_name_on_devices, const OpDesc &op) const; - void InsertNCCLAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, size_t src_dev_id) const; @@ -111,6 +111,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { private: BuildStrategy strategy_; + + void SetCommunicationContext(OpHandleBase *op_handle, + const platform::Place &p) const; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3849cca59a3347137b769f97261cfbf97da8d6ff..f79565fe71c4aef140475c922cbbf5a1e0b7fe03 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -39,9 +39,9 @@ OpHandleBase::~OpHandleBase() { #endif } -void OpHandleBase::Run(bool use_event) { +void OpHandleBase::Run(bool use_cuda) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_event) { + if (events_.empty() && use_cuda) { for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); @@ -50,7 +50,7 @@ void OpHandleBase::Run(bool use_event) { } } #else - PADDLE_ENFORCE(!use_event); + PADDLE_ENFORCE(!use_cuda); #endif RunImpl(); diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index dc92b0fe9f760d95d4869fdd56c0400b6710437f..fbd90a3296bca92b097cab925b218b91e7f4752f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -36,7 +36,7 @@ class OpHandleBase { virtual std::string Name() const = 0; - void Run(bool use_event); + void Run(bool use_cuda); virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx); diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index 2b95a284990da8f9b7c16d6e4221eb1ed061f74b..a6ffb37313a88120bc9e8d5ce326f60aeebdff69 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -37,7 +37,9 @@ struct ReduceLoDTensor { PADDLE_ENFORCE_NE(t0.numel(), 0); dst_tensor_.Resize(t0.dims()); T *dst = dst_tensor_.mutable_data(platform::CPUPlace()); - std::copy(t0.data(), t0.data() + t0.numel(), dst); + if (dst != t0.data()) { + std::copy(t0.data(), t0.data() + t0.numel(), dst); + } for (size_t i = 1; i < src_tensors_.size(); ++i) { auto &t = *src_tensors_[i]; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 211113c7979ee95d896c0a57879f7b3ad13b36ef..88a21f48879a15450051ad94ed76e1c48bf23014 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. - #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc similarity index 90% rename from paddle/fluid/framework/details/graph_builder_factory.cc rename to paddle/fluid/framework/details/ssa_graph_builder_factory.cc index a04b9bb63c06b40ff5c30c9792cdfad5d64d404c..b4b49d3de6da2e5fd7836668619e42d10bb6b35a 100644 --- a/paddle/fluid/framework/details/graph_builder_factory.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/graph_builder_factory.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" #include "paddle/fluid/framework/details/ssa_graph_printer.h" namespace paddle { @@ -40,6 +41,8 @@ std::unique_ptr SSAGraphBuilderFactory::Create() { res.reset(new SSAGraghBuilderWithPrinter( std::move(fout), std::move(graphviz_printer), std::move(res))); } + res.reset(new SSAGraghBuilderWithChecker(std::move(res))); + return res; } } // namespace details diff --git a/paddle/fluid/framework/details/graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h similarity index 95% rename from paddle/fluid/framework/details/graph_builder_factory.h rename to paddle/fluid/framework/details/ssa_graph_builder_factory.h index 857ab12d684e19788597e144fc0c46571d06aafc..91a119de83ed3d1573803e48faf86c874eed98d6 100644 --- a/paddle/fluid/framework/details/graph_builder_factory.h +++ b/paddle/fluid/framework/details/ssa_graph_builder_factory.h @@ -40,7 +40,11 @@ class SSAGraphBuilderFactory { loss_var_name_(loss_var_name), param_names_(param_names), local_scopes_(local_scopes), - strategy_(strategy) {} + strategy_(strategy) { +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_ = nullptr; +#endif + } #ifdef PADDLE_WITH_CUDA void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc new file mode 100644 index 0000000000000000000000000000000000000000..da5428946ee588e8eac1f78929dc0432df532975 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include +#include "paddle/fluid/framework/details/ssa_graph_checker.h" + +namespace paddle { +namespace framework { +namespace details { + +bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; + + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->generated_op_ == nullptr) { + ready_vars.emplace(var); + } + }; + + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair.get()); + } + } + } + + for (auto &var : graph->dep_vars_) { + insert_pending_var(var.get()); + } + + for (auto &op : graph->ops_) { + if (op->Inputs().empty()) { + ready_ops.insert(op.get()); + } else { + pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + } + } + + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; + + while (!pending_vars.empty()) { + run_all_ops(ready_ops); + + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } + } + } + ready_vars.clear(); + } + return true; +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h new file mode 100644 index 0000000000000000000000000000000000000000..542c4a172891ba9d3621918986089b2e400b6ae8 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +class SSAGraph; + +class SSAGraghBuilderWithChecker : public SSAGraphBuilder { + public: + explicit SSAGraghBuilderWithChecker( + std::unique_ptr&& builder) + : builder_(std::move(builder)) {} + + std::unique_ptr Build(const ProgramDesc& program) const override { + auto graph = builder_->Build(program); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } + + bool IsValidGraph(const SSAGraph* graph) const; + + private: + std::unique_ptr builder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 496fadd04dac982b87b9d9e14f599ed37d9709d0..6c5098ce85b784a3edcf8f48d2cc828aabd8e161 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -185,6 +185,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( ready_vars->Push(var); } } + void ThreadedSSAGraphExecutor::RunOp( BlockingQueue *ready_var_q, details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { @@ -192,7 +193,7 @@ void ThreadedSSAGraphExecutor::RunOp( if (VLOG_IS_ON(10)) { VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); } - op->Run(strategy_.use_event_); + op->Run(strategy_.use_cuda_); VLOG(10) << op << " " << op->Name() << " Done "; running_ops_--; ready_var_q->Extend(op->Outputs()); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 15af9c409002510c870e5f5c91c5bc67f9490f52..0c050ca271da8eb79d3271db470eba4decde42c3 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(benchmark); +DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); namespace paddle { namespace framework { @@ -115,6 +116,7 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool create_local_scope, bool create_vars) { platform::RecordBlock b(block_id); + if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); auto ctx = Prepare(pdesc, block_id); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); } @@ -214,6 +216,7 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& feed_holder_name, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); + if (FLAGS_use_mkldnn) EnableMKLDNN(program); bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -225,7 +228,6 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, unique_ptr_of_copy_program.reset(new ProgramDesc(program)); copy_program = unique_ptr_of_copy_program.get(); } - auto* global_block = copy_program->MutableBlock(0); if (!has_feed_ops) { @@ -382,5 +384,19 @@ void Executor::RunPreparedContext( } } +void Executor::EnableMKLDNN(const ProgramDesc& program) { +#ifdef PADDLE_WITH_MKLDNN + VLOG(3) << "use_mkldnn=True"; + for (size_t bid = 0; bid < program.Size(); ++bid) { + auto* block = const_cast(program).MutableBlock(bid); + for (auto* op : block->AllOps()) { + if (op->HasAttr("use_mkldnn")) { + op->SetAttr("use_mkldnn", true); + } + } + } +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 0c3c23611d95e0da67cabfb8fb2755a4a52c991b..e6f9c3d31c18f762ef2de269977e0642a79fb174 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -81,6 +81,8 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + void EnableMKLDNN(const ProgramDesc& program); + private: const platform::Place place_; }; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index d35125fe8c3c8018c38650dc87b2b1474ded6058..68fcc104d48b2b39929ed2198a2dd2eabae10e94 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -71,6 +71,7 @@ message OpProto { optional bool duplicable = 3 [ default = false ]; optional bool intermediate = 4 [ default = false ]; optional bool dispensable = 5 [ default = false ]; + optional string reuse = 6; } // AttrProto describes the C++ type Attribute. diff --git a/paddle/fluid/framework/op_info.cc b/paddle/fluid/framework/op_info.cc index b99e82f8c4358b60a014c6fc7c61c9bbb8683834..f1261dee0319440995951d1bee145404186a8ad4 100644 --- a/paddle/fluid/framework/op_info.cc +++ b/paddle/fluid/framework/op_info.cc @@ -17,12 +17,11 @@ limitations under the License. */ namespace paddle { namespace framework { -static OpInfoMap* g_op_info_map = nullptr; - +// C++11 removes the need for manual locking. Concurrent execution shall wait if +// a static local variable is already being initialized. +// https://stackoverflow.com/questions/11711920/how-to-implement-multithread-safe-singleton-in-c11-without-using-mutex OpInfoMap& OpInfoMap::Instance() { - if (g_op_info_map == nullptr) { - g_op_info_map = new OpInfoMap(); - } + static OpInfoMap* g_op_info_map = new OpInfoMap(); return *g_op_info_map; } } // namespace framework diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index ae9f4efd44acdcdff2806deea6826e4089459a78..001b5cb5a8eb57cbe0a2e0ad7f64ef05f8149922 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -21,6 +21,7 @@ namespace framework { void OpProtoAndCheckerMaker::Validate() { validated_ = true; CheckNoDuplicatedInOutAttrs(); + CheckReuseVars(); } OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput( @@ -56,6 +57,24 @@ void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() { } } +void OpProtoAndCheckerMaker::CheckReuseVars() { + std::unordered_set names; + for (auto& input : proto_->inputs()) { + names.insert(input.name()); + } + auto checker = [&](const std::string& name, const std::string& reused) { + PADDLE_ENFORCE( + names.count(reused), + "Output [%s] reuse Input [%s], but the input is not registered.", name, + reused); + }; + for (auto& output : proto_->outputs()) { + if (output.has_reuse()) { + checker(output.name(), output.reuse()); + } + } +} + void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, OpAttrChecker* attr_checker) { proto_ = proto; diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 8493b9d8b326c71a33b95bf95e5fc1743c686eb7..92f86bb5de520878d0a7b8d7214620580242c061 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include +#include + #include "glog/logging.h" #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/framework.pb.h" @@ -64,6 +66,11 @@ class OpProtoAndCheckerMaker { var_->set_dispensable(true); return *this; } + + VariableBuilder &Reuse(const std::string &name) { + var_->set_reuse(name); + return *this; + } }; VariableBuilder AddInput(const std::string &name, const std::string &comment); @@ -89,6 +96,8 @@ class OpProtoAndCheckerMaker { void CheckNoDuplicatedInOutAttrs(); void Validate(); + void CheckReuseVars(); + proto::OpProto *proto_; OpAttrChecker *op_checker_; bool validated_{false}; diff --git a/paddle/fluid/framework/op_proto_maker_test.cc b/paddle/fluid/framework/op_proto_maker_test.cc index a8030d377fdb4d4aef74b315e21792dad10fac96..58f70cb39c0d96ed3b9ff35ea132ba75a37f5405 100644 --- a/paddle/fluid/framework/op_proto_maker_test.cc +++ b/paddle/fluid/framework/op_proto_maker_test.cc @@ -47,3 +47,23 @@ TEST(ProtoMaker, DuplicatedInOut) { ASSERT_THROW(proto_maker(&op_proto, &op_checker), paddle::platform::EnforceNotMet); } + +class TestInplaceProtoMaker : public paddle::framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "input of test op"); + AddOutput("XOut", "output of test op").Reuse("X"); + AddOutput("NoOut", "output of test op").Reuse("NotExists"); + } +}; + +TEST(ProtoMaker, InplaceOutput) { + paddle::framework::proto::OpProto op_proto; + paddle::framework::OpAttrChecker op_checker; + TestInplaceProtoMaker proto_maker; + ASSERT_THROW(proto_maker(&op_proto, &op_checker), + paddle::platform::EnforceNotMet); + // proto_maker(&op_proto, &op_checker); + // proto_maker.Make(); + // ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet); +} diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ce56f55e4195a0625cd0754152285b80e4282183..ac4d1f58a5b3b11f034af7618681ebd913d8afb9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif -#include "paddle/fluid/framework/details/graph_builder_factory.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -43,7 +43,8 @@ class ParallelExecutorPrivate { #ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; #endif - bool own_local_scope; + bool own_local_scope_; + bool use_cuda_; }; std::vector &ParallelExecutor::GetLocalScopes() { @@ -60,35 +61,40 @@ ParallelExecutor::ParallelExecutor( size_t num_trainers, size_t trainer_id) : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; + member_->use_cuda_ = exec_strategy.use_cuda_; // Step 1. Bcast the params to devs. // Create local scopes if (local_scopes.empty()) { - member_->own_local_scope = true; + member_->own_local_scope_ = true; member_->local_scopes_.emplace_back(member_->global_scope_); for (size_t i = 1; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&scope->NewScope()); } } else { - member_->own_local_scope = false; + member_->own_local_scope_ = false; PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size()); for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); } } + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #ifdef PADDLE_WITH_CUDA - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); - ncclUniqueId *nccl_id = nullptr; - if (nccl_id_var != nullptr) { - nccl_id = nccl_id_var->GetMutable(); - } - member_->nccl_ctxs_.reset(new platform::NCCLContextMap( - member_->places_, nccl_id, num_trainers, trainer_id)); + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); + ncclUniqueId *nccl_id = nullptr; + if (nccl_id_var != nullptr) { + nccl_id = nccl_id_var->GetMutable(); + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( + member_->places_, nccl_id, num_trainers, trainer_id)); +#else + PADDLE_THROW("Not compiled with CUDA"); #endif - if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1 && - local_scopes.empty()) { // Is CUDA + } + + if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToGPUs(bcast_vars); } // Startup Program has been run. All local scopes has correct parameters. @@ -108,9 +114,13 @@ ParallelExecutor::ParallelExecutor( details::SSAGraphBuilderFactory builder_factory( member_->places_, loss_var_name, params, member_->local_scopes_, build_strategy); + if (member_->use_cuda_) { #ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); + builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); +#else + PADDLE_THROW("Not compiled with CUDA"); #endif + } member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, @@ -123,7 +133,6 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::BCastParamsToGPUs( const std::unordered_set &vars) const { -#ifdef PADDLE_WITH_CUDA auto *main_scope = member_->local_scopes_[0]; for (auto &var : vars) { @@ -135,6 +144,7 @@ void ParallelExecutor::BCastParamsToGPUs( auto &main_tensor = main_var->Get(); auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { +#ifdef PADDLE_WITH_CUDA size_t numel = main_tensor.numel(); ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); platform::NCCLGroupGuard guard; @@ -153,6 +163,10 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm_, nccl_ctx.stream()); } + member_->nccl_ctxs_->WaitAll(); +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif } else { platform::CPUPlace cpu; for (size_t i = 1; i < member_->places_.size(); ++i) { @@ -163,11 +177,7 @@ void ParallelExecutor::BCastParamsToGPUs( paddle::framework::TensorCopy(main_tensor, cpu, t); } } - member_->nccl_ctxs_->WaitAll(); } -#else - PADDLE_THROW("Not compiled with CUDA"); -#endif } void ParallelExecutor::Run(const std::vector &fetch_tensors, @@ -213,7 +223,7 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { - if (member_->own_local_scope) { + if (member_->own_local_scope_) { for (size_t i = 1; i < member_->local_scopes_.size(); ++i) { member_->global_scope_->DeleteScope(member_->local_scopes_[i]); } diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index 236d169017f65e4c9d513c3ca4511daa2dfee06e..3b1f531adc5d756259df1c350f7f44bf71ee1f93 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -64,7 +64,8 @@ class TRTConvertValidation { TRTConvertValidation(int batch_size, const std::unordered_set& parameters, - framework::Scope& scope, int workspace_size = 1 << 10) + framework::Scope& scope, // NOLINT + int workspace_size = 1 << 10) : parameters_(parameters), scope_(scope) { // create engine. engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_)); diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index 987da18116cc6f4902bd66ae317f2470a8bc5057..60c761c5281e2f535aab0200c93fb738addcdb87 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -21,7 +21,6 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model."); DEFINE_int32(batch_size, 1, "Batch size of input data"); DEFINE_int32(repeat, 1, "Running the inference program repeat times"); DEFINE_bool(skip_cpu, false, "Skip the cpu test"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); TEST(inference, image_classification) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { @@ -59,10 +58,8 @@ TEST(inference, image_classification) { // Run inference on CPU LOG(INFO) << "--- CPU Runs: ---"; LOG(INFO) << "Batch size is " << FLAGS_batch_size; - LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn; TestInference( - dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined, - FLAGS_use_mkldnn); + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); LOG(INFO) << output1.dims(); } diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index a0e83a17058a4edcb8f23f23ce155e644ae0cf3b..9dcd79c3bb9ed713ff0f12024969cc5798750988 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -27,7 +27,6 @@ limitations under the License. */ DEFINE_string(model_path, "", "Directory of the inference model."); DEFINE_string(data_file, "", "File of input index data."); DEFINE_int32(repeat, 100, "Running the inference program repeat times"); -DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference"); DEFINE_bool(prepare_vars, true, "Prepare variables before executor"); DEFINE_int32(num_threads, 1, "Number of threads should be used"); @@ -190,9 +189,6 @@ TEST(inference, nlp) { std::unique_ptr inference_program; inference_program = InitProgram(&executor, scope.get(), FLAGS_model_path, /*model combined*/ false); - if (FLAGS_use_mkldnn) { - EnableMKLDNN(inference_program); - } // always prepare context std::unique_ptr ctx; ctx = executor.Prepare(*inference_program, 0); diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 01b8dc0be662da22fe15a79cd9abfe5fa92c9577..44c36b1683b037832a218df02184e7cd2ba143e9 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -22,6 +22,8 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/profiler.h" +DECLARE_bool(use_mkldnn); + template void SetupTensor(paddle::framework::LoDTensor* input, paddle::framework::DDim dims, T lower, T upper) { @@ -133,24 +135,11 @@ std::vector> GetFeedTargetShapes( return feed_target_shapes; } -void EnableMKLDNN( - const std::unique_ptr& program) { - for (size_t bid = 0; bid < program->Size(); ++bid) { - auto* block = program->MutableBlock(bid); - for (auto* op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -} - template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, const std::vector& cpu_fetchs, - const int repeat = 1, const bool is_combined = false, - const bool use_mkldnn = false) { + const int repeat = 1, const bool is_combined = false) { // 1. Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); @@ -182,9 +171,6 @@ void TestInference(const std::string& dirname, "init_program", paddle::platform::DeviceContextPool::Instance().Get(place)); inference_program = InitProgram(&executor, scope, dirname, is_combined); - if (use_mkldnn) { - EnableMKLDNN(inference_program); - } } // Disable the profiler and print the timing information paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, @@ -210,7 +196,10 @@ void TestInference(const std::string& dirname, fetch_targets[fetch_target_names[i]] = cpu_fetchs[i]; } - // 6. Run the inference program + // 6. If export Flags_use_mkldnn=True, use mkldnn related ops. + if (FLAGS_use_mkldnn) executor.EnableMKLDNN(*inference_program); + + // 7. Run the inference program { if (!CreateVars) { // If users don't want to create and destroy variables every time they diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5e86b16ba1ff69c798372a144fb3bf699768f2e6..d6a36eff09c7f70803d3be619b26d16660da1ec2 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -186,19 +186,23 @@ endif() add_subdirectory(detail) if(WITH_DISTRIBUTE) - - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + + set(DISTRIBUTE_DEPS "") + if(WITH_GRPC) + set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) + else() + set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + endif() + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - op_library(send_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) - set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) op_library(fetch_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) @@ -208,15 +212,18 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op - listen_and_serv_op executor SERIAL) - op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + if(WITH_GRPC) + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) + else() + op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_brpc) + endif() set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) else() set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op) endif() else() - set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op fetch_barrier_op gen_nccl_id_op) + set(DEPS_OPS ${DEPS_OPS} prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op) endif() op_library(cross_entropy_op DEPS cross_entropy) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 96e4c0e04cc30db6d0b86376434d5ea02694ae21..af1d85047e519df6766b2139a0445ae9dc5945e2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -25,7 +25,7 @@ namespace operators { public: \ void Make() override { \ AddInput("X", "Input of " #OP_NAME " operator"); \ - AddOutput("Out", "Output of " #OP_NAME " operator"); \ + AddOutput("Out", "Output of " #OP_NAME " operator").Reuse("X"); \ AddAttr("use_mkldnn", \ "(bool, default false) Only used in mkldnn kernel") \ .SetDefault(false); \ diff --git a/paddle/fluid/operators/adam_op.cc b/paddle/fluid/operators/adam_op.cc index 99b0239855d6241b064a5883c2be3d58078b3b61..6ee73c3000fb45b4e1cd5bbb730da7d61b494b6f 100644 --- a/paddle/fluid/operators/adam_op.cc +++ b/paddle/fluid/operators/adam_op.cc @@ -89,9 +89,9 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator"); AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator"); - AddOutput("ParamOut", "(Tensor) Output parameter"); - AddOutput("Moment1Out", "(Tensor) Output first moment"); - AddOutput("Moment2Out", "(Tensor) Output second moment"); + AddOutput("ParamOut", "(Tensor) Output parameter").Reuse("Param"); + AddOutput("Moment1Out", "(Tensor) Output first moment").Reuse("Moment1"); + AddOutput("Moment2Out", "(Tensor) Output second moment").Reuse("Moment2"); AddAttr("beta1", "(float, default 0.9) " diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8174d3735859b1fac40cd4c07545f34874d31ab7 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_max, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMaxOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..a147d77a9e9c577984028e1a6ed9582dda622069 --- /dev/null +++ b/paddle/fluid/operators/arg_max_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_max, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel, + paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_max_op_base.h b/paddle/fluid/operators/arg_min_max_op_base.h new file mode 100644 index 0000000000000000000000000000000000000000..6cbdaefeda099c36a864289ef8195c20d09c55e6 --- /dev/null +++ b/paddle/fluid/operators/arg_min_max_op_base.h @@ -0,0 +1,160 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace operators { + +enum ArgMinMaxType { kArgMin, kArgMax }; + +template +struct ArgMinMaxFunctor {}; + +#define DECLARE_ARG_MIN_MAX_FUNCTOR(eigen_op_type, enum_argminmax_value) \ + template \ + struct ArgMinMaxFunctor { \ + void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ + framework::LoDTensor* out, int64_t axis) { \ + auto in_eigen = framework::EigenTensor::From(in); \ + auto out_eigen = framework::EigenTensor::From(*out); \ + out_eigen.device(*(ctx.eigen_device())) = \ + in_eigen.eigen_op_type(axis).template cast(); \ + } \ + } + +DECLARE_ARG_MIN_MAX_FUNCTOR(argmin, ArgMinMaxType::kArgMin); +DECLARE_ARG_MIN_MAX_FUNCTOR(argmax, ArgMinMaxType::kArgMax); + +template +class ArgMinMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto& dev_ctx = ctx.template device_context(); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMinMaxFunctor \ + functor##rank; \ + functor##rank(dev_ctx, x, &out, axis) + + switch (x.dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + PADDLE_THROW( + "%s operator doesn't supports tensors whose ranks are greater " + "than 6.", + (EnumArgMinMaxValue == kArgMin ? "argmin" : "argmax")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + } +}; + +template +using ArgMinKernel = + ArgMinMaxKernel; + +template +using ArgMaxKernel = + ArgMinMaxKernel; + +class ArgMinMaxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null"); + const auto& x_dims = ctx->GetInputDim("X"); + int64_t axis = ctx->Attrs().Get("axis"); + PADDLE_ENFORCE(axis >= -x_dims.size() && axis < x_dims.size(), + "'axis' must be inside [-Rank(X), Rank(X))"); + + auto x_rank = x_dims.size(); + if (axis < 0) axis += x_rank; + + std::vector vec; + for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]); + for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]); + ctx->SetOutputDim("Out", framework::make_ddim(vec)); + } +}; + +class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { + protected: + virtual const char* OpName() const = 0; + virtual const char* Name() const = 0; + + public: + void Make() override { + AddInput("X", "Input tensor."); + AddOutput("Out", "Output tensor."); + AddAttr("axis", "The axis in which to compute the arg indics."); + AddComment(string::Sprintf(R"DOC( + %s Operator. + + Computes the indices of the %s elements of the input tensor's element + along the provided axis. +)DOC", + OpName(), Name())); + } +}; + +class ArgMinOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMin"; } + const char* Name() const override { return "min"; } +}; + +class ArgMaxOpMaker : public BaseArgMinMaxOpMaker { + protected: + const char* OpName() const override { return "ArgMax"; } + const char* Name() const override { return "max"; } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41f188029f17dbe8717afc0ca0760a39edc24b54 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cc @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OPERATOR(arg_min, paddle::operators::ArgMinMaxOp, + paddle::operators::ArgMinOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..4d020508505a6ebac8be41ce1e4f99d436b67ab5 --- /dev/null +++ b/paddle/fluid/operators/arg_min_op.cu @@ -0,0 +1,31 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/arg_min_max_op_base.h" + +REGISTER_OP_CUDA_KERNEL( + arg_min, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel, + paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index d7e0af28c1bfa6a9073b25b0a301234cc5d194f5..92fbb9adaf6a6a335abee3c9443d4b1d6097021b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -151,13 +151,15 @@ class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Variance", "The global variance (for training) " "or estimated Variance (for testing)"); - AddOutput("Y", "result after normalization"); + AddOutput("Y", "result after normalization").Reuse("X"); AddOutput("MeanOut", "Share memory with Mean. " - "Store the global mean when training"); + "Store the global mean when training") + .Reuse("Mean"); AddOutput("VarianceOut", "Share memory with Variance. " - "Store the global Variance when training"); + "Store the global Variance when training") + .Reuse("Variance"); AddOutput("SavedMean", "Mean of the current mini batch, " "will apply to output when training") diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index 483c9f8c2191fa4eb98b91112f9d6753e2fbddc3..fc15d56891cf7af10a91ca22a09c84fa2e52d465 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -54,18 +54,18 @@ class BatchSizeLikeOp : public framework::OperatorWithKernel { class BatchSizeLikeOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() final { - AddInput("Input", - "(Tensor) Tensor " - "whose input_dim_idx'th dimension specifies the batch_size"); + AddInput( + "Input", + "Tensor whose input_dim_idx'th dimension specifies the batch_size"); AddOutput("Out", - "(Tensor) Tensor of specified shape will be filled " + "Tensor of specified shape will be filled " "with the specified value"); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", "The shape of the output"); AddAttr("input_dim_idx", - "(int, default 0) The index of input's batch size dimension") + "default 0. The index of input's batch size dimension") .SetDefault(0); AddAttr("output_dim_idx", - "(int, default 0) The index of output's batch size dimension") + "default 0. The index of output's batch size dimension") .SetDefault(0); Apply(); } diff --git a/paddle/fluid/operators/bilinear_interp_op.cc b/paddle/fluid/operators/bilinear_interp_op.cc index 3321adf2743c28f6eeca8b5cc91ef89beed6b97c..2572e813d656353a2187c29da89266733a32f3ce 100644 --- a/paddle/fluid/operators/bilinear_interp_op.cc +++ b/paddle/fluid/operators/bilinear_interp_op.cc @@ -56,17 +56,16 @@ class BilinearInterpOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor) The input tensor of bilinear interpolation, " + "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of (N x C x h x w)"); AddInput("OutSize", - "(Tensor) This is a 1-D tensor with two number. " + "This is a 1-D tensor with two number. " "The first number is height and the second number is width.") .AsDispensable(); - AddOutput("Out", - "(Tensor) The dimension of output is (N x C x out_h x out_w]"); + AddOutput("Out", "The dimension of output is (N x C x out_h x out_w)"); - AddAttr("out_h", "(int) output height of bilinear interpolation op."); - AddAttr("out_w", "(int) output width of bilinear interpolation op."); + AddAttr("out_h", "output height of bilinear interpolation op."); + AddAttr("out_w", "output width of bilinear interpolation op."); AddComment(R"DOC( Bilinear interpolation is an extension of linear interpolation for interpolating functions of two variables (e.g. H-direction and diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 850297a2327f33a4a765f64f201e217fce5db89b..27f1313116aad99d34fa8f1d3d6a1e7aced4d394 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -125,7 +125,8 @@ void Conv2DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); + "The format of output tensor is also NCHW.") + .Reuse("Input"); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -220,7 +221,8 @@ void Conv3DOpMaker::Make() { "input image channels divided by the groups."); AddOutput("Output", "(Tensor) The output tensor of convolution operator." - "The format of output tensor is also NCDHW."); + "The format of output tensor is also NCDHW.") + .Reuse("Input"); AddAttr>("strides", "(vector, default:{1, 1, 1}), the " "strides(d_stride, h_stride, w_stride) of " diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a3bec3da45136bca5cb2763e7ffd6b67703a1813..d5e095f9cad95b74b8ff79e4a60ccbdf11512a5a 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -124,7 +124,8 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { "Tensor with shape [N x D]."); AddOutput("Y", "(Tensor, default Tensor), a 2-D tensor with shape " - "[N x 1]. The cross entropy loss."); + "[N x 1]. The cross entropy loss.") + .Reuse("X"); AddAttr("soft_label", "(bool, default false), a flag indicating whether to " "interpretate the given labels as soft labels.") diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index c29dc5d7e077a73b7db6a0c9204c2029ec1392bc..abc5aad0430e71928a441c9488dda16dfdd63b9c 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,12 +1,38 @@ -if(WITH_DISTRIBUTE) +if(NOT WITH_DISTRIBUTE) + return() +endif() + + +if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS grpc_serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc SERIAL) - cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc + cc_test(grpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL) + return() endif() + + +set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc + PROTO send_recv.proto + DEPS lod_tensor selected_rows memory) + +find_library(OPENSSL_CRYPTO_LIBRARY_STATIC NAMES libcrypto.so) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY_STATIC}) + + +find_library(OPENSSL_SSL_LIBRARY_STATIC NAMES libssl.so) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY_STATIC}) + +cc_test(brpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_brpc + brpc protobuf leveldb gflags glog + protobuf executor proto_desc lookup_table_op snappystream snappy ssl crypto SERIAL) diff --git a/paddle/fluid/operators/detail/brpc_client.cc b/paddle/fluid/operators/detail/brpc_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..9a4e410f1d83e93883438fae116c38eb60787673 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.cc @@ -0,0 +1,180 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/framework/threadpool.h" + +namespace paddle { +namespace operators { +namespace detail { + +DEFINE_int32(brpc_channel_num, 24, + "Number of channels to send requests connected to one server"); +DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); +DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); + +BRPCClient::~BRPCClient() { Wait(); } + +void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; +} + +bool BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch_ptr = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + google::protobuf::Closure* done = + brpc::NewCallback(&HandleSendResponse, cntl, response); + + sendrecv::VariableMessage request; + ch_ctx->stub->SendVariable(cntl, &request, response, done); + }); + req_count_++; + + return true; +} + +void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + if (cntl->Failed()) { + LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + return; + } + LOG(INFO) << "Received response from " << cntl->remote_side() + << " latency=" << cntl->latency_us() << "us"; + + // framework::Variable* outvar = nullptr; + // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); +} + +bool BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string var_name_val = var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO( + [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + + req_count_++; + + return true; +} + +bool BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] {}); + + req_count_++; + return true; +} + +void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + req_count_++; +} + +void BRPCClient::Wait() { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +} + +ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + { + std::lock_guard guard(chan_mutex_); + auto it = channels_.find(ep); + if (it != channels_.end()) { + return it->second; + } + } + + ChannelQueuePtr q(new framework::BlockingQueue()); + + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.connection_type = "pooled"; + options.connect_timeout_ms = 100; + options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; + options.max_retry = FLAGS_max_retry; + for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + std::shared_ptr c(new ChannelContext()); + if (c->channel.Init(ep.c_str(), &options) != 0) { + LOG(ERROR) << "Fail to initialize channel"; + return nullptr; + } + + c->stub.reset(new sendrecv::SendRecvService_Stub( + static_cast(&c->channel))); + q->Push(c); + } + + { + std::lock_guard guard(chan_mutex_); + channels_[ep] = q; + } + + return q; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_client.h b/paddle/fluid/operators/detail/brpc_client.h new file mode 100644 index 0000000000000000000000000000000000000000..1e953ea431d51a9586bfd0b352c7f27d079ff1a8 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_client.h @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include // NOLINT +#include +#include +#include +#include +#include // NOLINT +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN + +namespace paddle { +namespace operators { +namespace detail { + +struct ChannelContext { + brpc::Channel channel; + std::shared_ptr stub; +}; + +typedef std::shared_ptr ChannelContextPtr; +typedef std::shared_ptr> + ChannelQueuePtr; + +class BRPCClient : public RPCClient { + public: + BRPCClient() {} + virtual ~BRPCClient(); + + bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + bool AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendBatchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void AsyncSendFetchBarrier( + const std::string& ep, + int64_t time_out = RPCClient::rpc_time_out) override; + + void Wait() override; + + private: + void Proceed(); + ChannelQueuePtr GetChannel(const std::string& ep); + + private: + std::unordered_map channels_; + + // mutex for Wait client sync + std::mutex sync_mutex_; + std::condition_variable sync_cond_; + std::atomic req_count_{0}; + + // mutex for GetChannel thread safety + std::mutex chan_mutex_; + DISABLE_COPY_AND_ASSIGN(BRPCClient); +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.cc b/paddle/fluid/operators/detail/brpc_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..2170abe679f9ededff3b53e3139e56f8aad227cb --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/brpc_server.h" +#include "paddle/fluid/operators/detail/request_handler.h" + +namespace sendrecv { + +typedef std::unordered_map + HandlerMap; + +class BRPCServiceImpl : public SendRecvService { + public: + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) + : request_send_h_(nullptr), + request_get_h_(nullptr), + request_prefetch_h_(nullptr) { + auto it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_send_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestSend); + if (it != rpc_call_map.end()) { + request_get_h_ = it->second; + } + + it = rpc_call_map.find(paddle::operators::detail::kRequestPrefetch); + if (it != rpc_call_map.end()) { + request_prefetch_h_ = it->second; + } + } + + virtual ~BRPCServiceImpl() {} + + void SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_send_h_ != nullptr, + "RequestSend handler should be registed first!"); + brpc::ClosureGuard done_guard(done); + + paddle::framework::Scope* local_scope = request_send_h_->scope(); + paddle::framework::Variable* outvar = nullptr; + paddle::framework::Variable* invar = nullptr; + + std::string varname = request->varname(); + + if (!request_send_h_->sync_mode()) { + local_scope = &request_send_h_->scope()->NewScope(); + invar = local_scope->Var(varname); + } else { + invar = local_scope->FindVar(varname); + } + + request_send_h_->Handle(varname, local_scope, invar, &outvar); + + if (!request_send_h_->sync_mode()) { + request_send_h_->scope()->DeleteScope(local_scope); + } + } + + void GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_get_h_ != nullptr, + "RequestGet handler should be registed first!"); + } + + void PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE(request_prefetch_h_ != nullptr, + "kRequestPrefetch handler should be registed first!"); + } + + private: + paddle::operators::detail::RequestHandler* request_send_h_; + paddle::operators::detail::RequestHandler* request_get_h_; + paddle::operators::detail::RequestHandler* request_prefetch_h_; +}; +} // namespace sendrecv + +namespace paddle { +namespace operators { +namespace detail { + +void AsyncBRPCServer::StartServer() { + // Instance of your service. + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + + // Add the service into server. Notice the second parameter, because the + // service is put on stack, we don't want server to delete it, otherwise + // use brpc::SERVER_OWNS_SERVICE. + if (server_.AddService(&service_impl, brpc::SERVER_DOESNT_OWN_SERVICE) != 0) { + LOG(FATAL) << "Fail to add service"; + return; + } + + brpc::ServerOptions options; + options.idle_timeout_sec = idle_timeout_s_; + options.max_concurrency = max_concurrency_; + if (server_.Start(bind_address_.c_str(), &options) != 0) { + LOG(FATAL) << "Fail to start EchoServer" << bind_address_; + return; + } + + butil::EndPoint ep = server_.listen_address(); + selected_port_ = ep.port; + + { + std::lock_guard lock(this->mutex_ready_); + ready_ = 1; + } + condition_ready_.notify_all(); + + server_.Join(); +} + +void AsyncBRPCServer::ShutDownImpl() { server_.Stop(1000); } + +void AsyncBRPCServer::WaitServerReady() { + VLOG(3) << "AsyncGRPCServer is wait server ready"; + std::unique_lock lock(this->mutex_ready_); + condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); + VLOG(3) << "AsyncGRPCServer WaitSeverReady"; +} + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/brpc_server.h b/paddle/fluid/operators/detail/brpc_server.h new file mode 100644 index 0000000000000000000000000000000000000000..0105c8074a46849031d8fa9c21a5507a982ec3c3 --- /dev/null +++ b/paddle/fluid/operators/detail/brpc_server.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include // NOLINT +#include // NOLINT +#include + +#include "brpc/server.h" +#include "paddle/fluid/operators/detail/rpc_server.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace detail { + +class AsyncBRPCServer final : public RPCServer { + public: + explicit AsyncBRPCServer(const std::string& address, int client_num) + : RPCServer(address, client_num), ready_(0) {} + + virtual ~AsyncBRPCServer() {} + void StartServer() override; + void WaitServerReady() override; + + private: + void ShutDownImpl() override; + + brpc::Server server_; + + static constexpr int idle_timeout_s_ = -1; + static constexpr int max_concurrency_ = 0; + + std::mutex mutex_ready_; + std::condition_variable condition_ready_; + int ready_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index fae39418b4166c9110a40b550e0b9801075edc7e..6b8373b1509c898e6ae70a18833df39a4898714a 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/detail/request_handler.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/detail/serde_test.cc b/paddle/fluid/operators/detail/grpc_serde_test.cc similarity index 100% rename from paddle/fluid/operators/detail/serde_test.cc rename to paddle/fluid/operators/detail/grpc_serde_test.cc diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 57867aad4d679f75ea790b65b5773a73586fd96e..18651544a1c0207127be335c37fe85c6e24dc16e 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -41,11 +41,22 @@ class RequestBase { virtual ~RequestBase() {} virtual void Process() = 0; - CallStatus Status() { return status_; } - void SetStatus(CallStatus status) { status_ = status; } + CallStatus Status() const { + std::lock_guard l(status_mu_); + return status_; + } + + template + void Finish(const T& reply, ServerAsyncResponseWriter* responder) { + std::lock_guard l(status_mu_); + status_ = FINISH; + responder->Finish(reply, ::grpc::Status::OK, + reinterpret_cast(static_cast(req_id_))); + } virtual std::string GetReqName() = 0; protected: + mutable std::mutex status_mu_; ::grpc::ServerContext ctx_; GrpcService::AsyncService* service_; ::grpc::ServerCompletionQueue* cq_; @@ -80,9 +91,7 @@ class RequestSend final : public RequestBase { framework::Variable* outvar = nullptr; request_handler_->Handle(varname, scope, invar, &outvar); - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -122,9 +131,7 @@ class RequestGet final : public RequestBase { SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), &reply_); } - status_ = FINISH; - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); + Finish(reply_, &responder_); } protected: @@ -166,9 +173,7 @@ class RequestPrefetch final : public RequestBase { SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(), &reply_); - responder_.Finish(reply_, ::grpc::Status::OK, - reinterpret_cast(static_cast(req_id_))); - status_ = FINISH; + Finish(reply_, &responder_); } protected: diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index e6ffc7066f24d5088a95801ed1c0670b24d5771f..f1db7590f6f14d5d44acc12453861a446e278cd2 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -53,6 +53,7 @@ class AsyncGRPCServer final : public RPCServer { void StartServer() override; private: + // HandleRequest needs to be thread-safe. void HandleRequest( ::grpc::ServerCompletionQueue* cq, const std::string& rpc_name, std::function TryToRegisterNewOne); diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..da1de72dad00db3ffe609e17bd198ef0a56bbfcd --- /dev/null +++ b/paddle/fluid/operators/detail/macros.h @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" +#define RPCSERVER_T detail::AsyncGRPCServer +#define RPCCLIENT_T detail::GRPCClient +#else +#include "paddle/fluid/operators/detail/brpc_client.h" +#include "paddle/fluid/operators/detail/brpc_server.h" +#define RPCSERVER_T detail::AsyncBRPCServer +#define RPCCLIENT_T detail::BRPCClient +#endif diff --git a/paddle/fluid/operators/detail/request_handler.h b/paddle/fluid/operators/detail/request_handler.h index d74206aaba6a79ee06475985e642221bd84d9382..fa979024e37f435b918568a1c5e603f8962f9172 100644 --- a/paddle/fluid/operators/detail/request_handler.h +++ b/paddle/fluid/operators/detail/request_handler.h @@ -28,7 +28,6 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { @@ -38,6 +37,10 @@ constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; +#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" +#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" + class RPCServer; class RequestHandler { diff --git a/paddle/fluid/operators/detail/request_handler_impl.cc b/paddle/fluid/operators/detail/request_handler_impl.cc index 9473dce55029f2a4e0987ab8f6f5e7205d7fff47..5f1a346e93b1a0239af77b86d10782d67c403e23 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.cc +++ b/paddle/fluid/operators/detail/request_handler_impl.cc @@ -16,15 +16,12 @@ #include #include -#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/detail/rpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/variable_response.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/request_handler_impl.h b/paddle/fluid/operators/detail/request_handler_impl.h index 443d951914dd0f40e8831abc637848363d9fef16..c392267cfaeb7e94e6a23d6445c09b756e7e58b1 100644 --- a/paddle/fluid/operators/detail/request_handler_impl.h +++ b/paddle/fluid/operators/detail/request_handler_impl.h @@ -29,7 +29,6 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/request_handler.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/rpc_client.h b/paddle/fluid/operators/detail/rpc_client.h index 8e3717f076db6a52916d0e15813f9f61148c6553..7e76ac0348574d4090793b191be0ff3ff8666b37 100644 --- a/paddle/fluid/operators/detail/rpc_client.h +++ b/paddle/fluid/operators/detail/rpc_client.h @@ -26,6 +26,8 @@ namespace detail { class RPCClient { public: + RPCClient() {} + virtual ~RPCClient() {} virtual bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/rpc_server_test.cc similarity index 92% rename from paddle/fluid/operators/detail/grpc_server_test.cc rename to paddle/fluid/operators/detail/rpc_server_test.cc index a1f9ba15e656a686c4bb0d81cf00dea120b8c0ad..f49274a7b53d04f82718887aabaffd5d33053dfe 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/rpc_server_test.cc @@ -17,15 +17,14 @@ limitations under the License. */ #include // NOLINT #include "gtest/gtest.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/rpc_client.h" - #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" +#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/rpc_server.h" namespace framework = paddle::framework; namespace platform = paddle::platform; @@ -33,7 +32,7 @@ namespace detail = paddle::operators::detail; USE_OP(lookup_table); -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { @@ -112,20 +111,19 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); server_thread.join(); } TEST(PREFETCH, CPU) { g_req_handler.reset(new detail::RequestPrefetchHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); + detail::RPCClient* client = detail::RPCClient::GetInstance(); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); int port = g_rpc_service->GetSelectedPort(); std::string ep = paddle::string::Sprintf("127.0.0.1:%d", port); diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index a244afc46f3247c7e6e8481b09b5c729a2a569f7..54cb93e04d18b3784be187c9c8885bbccc55488b 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -14,6 +14,8 @@ limitations under the License. */ syntax = "proto3"; package sendrecv; +// option cc_generic_services = true; + service SendRecvService { // For parameter server round-robin like hashing, do not split tensors. // Send and recv only one tensor diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index c72e1bd076f670458f3915072154847db6205092..bd16bf1dab8d933ffd18b6d6d9e3ce1c7d73029b 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -32,16 +32,6 @@ namespace paddle { namespace operators { namespace detail { -#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" -#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" -#define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" - -static int64_t GetTimestamp() { - struct timeval tp; - gettimeofday(&tp, NULL); - return tp.tv_sec * 1000 + tp.tv_usec / 1000; -} - typedef void (*DestroyCallback)(void*); void SerializeToByteBuffer(const std::string& name, framework::Variable* var, diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h index f4cec8ad971abebe8d6dff1a384c8414269148a5..0803a6035d342fefdae69297461fc78abbf18414 100644 --- a/paddle/fluid/operators/elementwise_op.h +++ b/paddle/fluid/operators/elementwise_op.h @@ -59,7 +59,7 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { void Make() final { AddInput("X", "(Tensor), The first input tensor of elementwise op."); AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - AddOutput("Out", "The output of elementwise op."); + AddOutput("Out", "The output of elementwise op.").Reuse("X"); AddAttr("axis", "(int, default -1). The start dimension index " "for broadcasting Y onto X.") diff --git a/paddle/fluid/operators/fetch_barrier_op.cc b/paddle/fluid/operators/fetch_barrier_op.cc index ad67585e485b237f5d4b809af712ce658ef602bb..98b051afb551f373009d2bd3df1a8daa64b7e6c7 100644 --- a/paddle/fluid/operators/fetch_barrier_op.cc +++ b/paddle/fluid/operators/fetch_barrier_op.cc @@ -19,9 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/rpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +43,7 @@ class FetchBarrierOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); rpc_client->Wait(); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc index 1ae78675a0cac8a72aeaef1227b631a41e4a10b2..453a1b32a0171a2ca88879ab3287e89c4d3c7759 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc @@ -32,16 +32,16 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp { class FillConstantBatchSizeLikeOpMaker : public BatchSizeLikeOpMaker { protected: void Apply() override { - AddAttr("dtype", - "(int, default 5 (FP32)) " - "Output data type") + AddAttr( + "dtype", + "It could be numpy.dtype. Output data type. Default is float32") .SetDefault(framework::proto::VarType::FP32); - AddAttr("value", "(float, default 0) The value to be filled") + AddAttr("value", "default 0. The value to be filled") .SetDefault(0.0f); AddComment(R"DOC( -FillConstantBatchSizeLike Operator. - -Fill up a variable with specified constant value. +This function creates a tensor of specified *shape*, *dtype* and batch size, +and initializes this with a constant supplied in *value*. The batch size is +obtained from the `input` tensor. )DOC"); } diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc index 547de4fa49dc16182c424118c0f5705d2396100a..111e58844c83806af4ebe0aa9e2126a9ddec1d8a 100644 --- a/paddle/fluid/operators/gen_nccl_id_op.cc +++ b/paddle/fluid/operators/gen_nccl_id_op.cc @@ -21,8 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -61,8 +60,8 @@ class GenNCCLIdOp : public framework::OperatorBase { std::vector endpoint_list = Attr>("endpoint_list"); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); + detail::RPCClient* client = detail::RPCClient::GetInstance(); + for (auto& ep : endpoint_list) { VLOG(3) << "sending nccl id to " << ep; client->AsyncSendVar(ep, dev_ctx, *scope, NCCL_ID_VARNAME); @@ -78,9 +77,11 @@ class GenNCCLIdOp : public framework::OperatorBase { // deleter will call GRPC Server's base class's dtor and // that will cause a wired crash. detail::RequestSendHandler rpc_h(true); - detail::AsyncGRPCServer rpc_service(endpoint, 1); - rpc_service.RegisterRPC(detail::kRequestSend, &rpc_h); - rpc_h.SetRPCServer(&rpc_service); + std::unique_ptr rpc_service( + new RPCSERVER_T(endpoint, 1)); + + rpc_service->RegisterRPC(detail::kRequestSend, &rpc_h); + rpc_h.SetRPCServer(rpc_service.get()); framework::ProgramDesc empty_program; framework::Executor executor(dev_ctx.GetPlace()); @@ -90,12 +91,13 @@ class GenNCCLIdOp : public framework::OperatorBase { rpc_h.SetExecutor(&executor); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, &rpc_service)); - rpc_service.SetCond(detail::kRequestSend); + std::bind(&detail::RPCServer::StartServer, rpc_service.get())); + + rpc_service->SetCond(detail::kRequestSend); VLOG(3) << "start getting nccl id from trainer 0..."; - rpc_service.WaitBarrier(detail::kRequestSend); + rpc_service->WaitBarrier(detail::kRequestSend); VLOG(3) << "got nccl id and stop server..."; - rpc_service.ShutDown(); + rpc_service->ShutDown(); VLOG(3) << "rpc server stopped"; server_thread.join(); } diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e38525cd7f44de020f364ffd16e71a439048347f..a711da362771353891f900f544d97e64510dc0ba 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -67,8 +67,6 @@ class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker { "mini-batch. Note: S is equal to the sequence number in a mini-batch. " "The output is no longer a LoDTensor."); AddComment(R"DOC( -LinearChainCRF Operator. - Conditional Random Field defines an undirected probabilistic graph with nodes denoting random variables and edges denoting dependencies between these variables. CRF learns the conditional probability $P(Y|X)$, where diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66d31c88951926a6dd9b7262942a69bb1564a416..76106004404bb0bb108cca003869f07c43bbf62f 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,7 +19,8 @@ limitations under the License. */ #include // NOLINT #include -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" + #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" @@ -89,6 +90,12 @@ void ListenAndServOp::SavePort() const { rpc_service_->SavePort(); } +static int64_t GetTimestamp() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + void ListenAndServOp::RunSyncLoop(framework::Executor *executor, framework::ProgramDesc *program, framework::Scope *recv_scope, @@ -127,7 +134,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, int32_t last_parent_blkid = program->Block(1).Parent(); std::vector parallel_blkids; parallel_blkids.push_back(1); - double ts = detail::GetTimestamp(); + double ts = GetTimestamp(); for (size_t blkid = 2; blkid < num_blocks; ++blkid) { if (blkid != static_cast(prefetch_block->ID())) { if (program->Block(blkid).Parent() != last_parent_blkid) { @@ -141,7 +148,7 @@ void ListenAndServOp::RunSyncLoop(framework::Executor *executor, } ParallelExecuteBlocks(parallel_blkids, executor, optimize_prepared, program, recv_scope); - VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)"; + VLOG(2) << "run all blocks spent " << GetTimestamp() - ts << "(ms)"; rpc_service_->SetCond(detail::kRequestGet); rpc_service_->WaitBarrier(detail::kRequestGet); @@ -235,8 +242,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, LOG(INFO) << "sync_mode:" << sync_mode << ", fan_in:" << fan_in << ", end_point:" << endpoint; - // request_handler_.reset(new detail::GRPCRequestSendHandler(sync_mode)); - rpc_service_.reset(new detail::AsyncGRPCServer(endpoint, fan_in)); + rpc_service_.reset(new RPCSERVER_T(endpoint, fan_in)); + request_send_handler_.reset(new detail::RequestSendHandler(sync_mode)); request_get_handler_.reset(new detail::RequestGetHandler(sync_mode)); request_prefetch_handler_.reset( diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 93f45cff8a26201b1fbb1c44141e125a67c44037..8f4b5049271c9592d2db268ea7ff2f5c8abc28b6 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -74,25 +74,18 @@ class LoadOp : public framework::OperatorBase { class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddOutput("Out", "(Tensor) The tensor need to be loaded"); + AddOutput("Out", "The tensor need to be loaded"); AddAttr( "load_as_fp16", - "(boolean, default false)" "If true, the tensor will be first loaded and then " "converted to float16 data type. Otherwise, the tensor will be " - "directly loaded without data type conversion.") + "directly loaded without data type conversion. Default is false.") .SetDefault(false); AddAttr("file_path", - "(string) " - "Variable will be loaded from \"file_path\".") + R"(Variable will be loaded from "file_path")") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); - AddComment(R"DOC( -Load Operator. - -Load operator will load a tensor variable from disk file. - -)DOC"); + AddComment("Load operator will load a tensor variable from disk file."); } }; } // namespace operators diff --git a/paddle/fluid/operators/max_sequence_len_op.cc b/paddle/fluid/operators/max_sequence_len_op.cc index 8e508b68eeab69a4595904dcc3ea0a541d9ab6e6..b1e69f375d3274aade3184af02f7f914dba5db71 100644 --- a/paddle/fluid/operators/max_sequence_len_op.cc +++ b/paddle/fluid/operators/max_sequence_len_op.cc @@ -42,10 +42,15 @@ class MaxSeqenceLenOp : public framework::OperatorBase { class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("RankTable", "The lod_rank_table."); - AddOutput("Out", "The max sequence length."); - AddComment( - R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + AddInput("RankTable", "Input variable which is a LoDRankTable object"); + AddOutput("Out", "The max sequence length"); + AddComment(R"DOC( + Given a LoDRankTable object, this layer returns the max length of + a batch of sequences. In fact, a LoDRankTable object contains a list of + tuples() and the list is already sorted by + sequence length in descending order, so the operator just returns the + sequence length of the first tuple element +)DOC"); } }; diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 74477eb439dc202c3f5f17fdf3e1647bc5c23512..4881cff4a368ffae9b030f04b7fff01d6ee7d26e 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -34,7 +34,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "The input of mean op"); - AddOutput("Out", "The output of mean op"); + AddOutput("Out", "The output of mean op").Reuse("X"); AddComment(R"DOC( Mean Operator. diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index a045f9e98dd7348973c3c4506f44d3e261599a14..5341187d1ce9400ac34750ab691608e76158ae0d 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -18,9 +18,14 @@ limitations under the License. */ namespace paddle { namespace operators { -using mkldnn::memory; // Note: paddle has also "memory" namespace -using mkldnn::pooling_forward; +using framework::DataLayout; +using mkldnn::memory; using mkldnn::pooling_backward; +using mkldnn::pooling_forward; +using mkldnn::primitive; +using mkldnn::reorder; +using mkldnn::stream; +using platform::to_void_cast; // Generate keys for storing/retriving primitives for this operator // TODO(jczaja): Make hashing function more optimial @@ -55,8 +60,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - // Get an unique name from "argument" name of "Out" variable - // This name will be used as key when saving info into device context + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); @@ -82,6 +88,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto input_format = input->format(); + memory::format output_format{memory::format::format_undef}; + const std::string key = gethash(src_tz, pooling_type, ksize, strides, paddings, ctx.op().Output("Out")); const std::string key_pool_p = key + "@pool_p"; @@ -94,16 +103,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { - // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), input_format); - auto src_md = - platform::MKLDNNMemDesc(src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto dst_md = - platform::MKLDNNMemDesc(dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::any); - std::shared_ptr pool_pd = + std::shared_ptr pool_pd = CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, pooling_type, mkldnn_engine); @@ -116,20 +126,22 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_workspace_memory to be referred in backward path dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); - auto pool_src_memory_p = std::make_shared( - memory::primitive_desc{src_md, mkldnn_engine}, - static_cast(const_cast(input_data))); - dev_ctx.SetBlob(key_pool_src_mem_p, pool_src_memory_p); + auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + auto dst_memory = + std::make_shared(pool_pd->dst_primitive_desc(), output_data); - auto pool_dst_memory_p = std::make_shared( - memory::primitive_desc{dst_md, mkldnn_engine}, - static_cast(output_data)); - dev_ctx.SetBlob(key_pool_dst_mem_p, pool_dst_memory_p); + dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); + dev_ctx.SetBlob(key_pool_dst_mem_p, dst_memory); + + pool_p = std::make_shared(*pool_pd, *(src_memory.get()), + *(dst_memory.get()), + *workspace_memory); - pool_p = std::make_shared( - *pool_pd, *(pool_src_memory_p.get()), *(pool_dst_memory_p.get()), - *workspace_memory); dev_ctx.SetBlob(key_pool_p, pool_p); + + output_format = + (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist auto pool_src_memory_p = @@ -140,14 +152,20 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); PADDLE_ENFORCE(pool_dst_memory_p != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_src_memory_p->set_data_handle( - reinterpret_cast(const_cast(input_data))); + pool_src_memory_p->set_data_handle(to_void_cast(input_data)); pool_dst_memory_p->set_data_handle(output_data); + + output_format = (memory::format)pool_dst_memory_p->get_primitive_desc() + .desc() + .data.format; } // push primitive to stream and wait until it's executed std::vector pipeline{*(pool_p.get())}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } private: @@ -194,6 +212,13 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + PADDLE_ENFORCE(in_x->layout() == DataLayout::kMKLDNN && + in_x->format() != memory::format::format_undef, + "Wrong layout/format set for Input X tensor"); + PADDLE_ENFORCE(out_grad->layout() == DataLayout::kMKLDNN && + out_grad->format() != memory::format::format_undef, + "Wrong layout/format set for Input output_grad tensor"); + std::string pooling_type = ctx.Attr("pooling_type"); std::vector ksize = ctx.Attr>("ksize"); std::vector strides = ctx.Attr>("strides"); @@ -212,6 +237,7 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const T* out_grad_data = out_grad->data(); T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + memory::format in_x_grad_format{memory::format::format_undef}; std::vector diff_src_tz = paddle::framework::vectorize2int(in_x_grad->dims()); @@ -225,39 +251,48 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { const std::string key_pool_bwd_p = key + "@pool_bwd_p"; const std::string key_pool_diff_src_mem_p = key + "@pool_diff_src_mem_p"; const std::string key_pool_diff_dst_mem_p = key + "@pool_diff_dst_mem_p"; + const std::string key_pool_src_mem_p = key + "@pool_src_mem_p"; + const std::string key_pool_dst_mem_p = key + "@pool_dst_mem_p"; const std::string key_pool_pd = key + "@pool_pd"; const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + auto user_diff_dst_memory = + memory({{{diff_dst_tz}, memory::data_type::f32, out_grad->format()}, + mkldnn_engine}, + to_void_cast(out_grad_data)); + + std::shared_ptr diff_src_memory; + std::shared_ptr diff_dst_memory; + auto dst_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_dst_mem_p)); + PADDLE_ENFORCE(dst_memory != nullptr, + "Fail to find dst_memory in device context"); + + primitive reorder_diff_dst; + bool is_diff_dst_reordered = false; auto pool_bwd_p = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_bwd_p)); if (pool_bwd_p == nullptr) { - auto diff_src_md = - platform::MKLDNNMemDesc(diff_src_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); - auto diff_dst_md = - platform::MKLDNNMemDesc(diff_dst_tz, platform::MKLDNNGetDataType(), - mkldnn::memory::format::nchw); + // Retrieve src_memory/dst_memory saved in forward pass + auto src_memory = + std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); + PADDLE_ENFORCE(src_memory != nullptr, + "Fail to find src_memory in device context"); // Retrieve pool_pd/pool_workspace_memory from device context auto pool_pd = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_pd)); PADDLE_ENFORCE(pool_pd != nullptr, "Fail to find pool_pd in device context"); - - auto workspace_memory = std::static_pointer_cast( + auto workspace_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_workspace_memory)); PADDLE_ENFORCE(workspace_memory != nullptr, "Fail to find workspace_memory in device context"); - auto pool_diff_src_memory_p = std::make_shared(memory( - {diff_src_md, mkldnn_engine}, static_cast(in_x_grad_data))); - dev_ctx.SetBlob(key_pool_diff_src_mem_p, pool_diff_src_memory_p); - - auto pool_diff_dst_memory_p = std::make_shared( - memory({diff_dst_md, mkldnn_engine}, - static_cast(const_cast(out_grad_data)))); - dev_ctx.SetBlob(key_pool_diff_dst_mem_p, pool_diff_dst_memory_p); + // create memory descriptors for pooling + auto diff_src_md = src_memory.get()->get_primitive_desc().desc(); + auto diff_dst_md = dst_memory.get()->get_primitive_desc().desc(); auto pool_bwd_desc = mkldnn::pooling_backward::desc( pooling_type == "max" ? mkldnn::algorithm::pooling_max @@ -267,35 +302,74 @@ class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( pool_bwd_desc, mkldnn_engine, *pool_pd); + // reorder between user_diff_dst and pool diff_dst if needed + diff_dst_memory = std::make_shared(user_diff_dst_memory); + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } + + diff_src_memory = std::make_shared( + pool_bwd_pd.diff_src_primitive_desc(), in_x_grad_data); + + dev_ctx.SetBlob(key_pool_diff_src_mem_p, diff_src_memory); + dev_ctx.SetBlob(key_pool_diff_dst_mem_p, diff_dst_memory); + pool_bwd_p = std::make_shared( - pool_bwd_pd, *(pool_diff_dst_memory_p.get()), *workspace_memory, - *(pool_diff_src_memory_p)); + pool_bwd_pd, *(diff_dst_memory.get()), *workspace_memory, + *(diff_src_memory)); dev_ctx.SetBlob(key_pool_bwd_p, pool_bwd_p); + } else { // Primitives already exist - auto pool_diff_src_memory_p = std::static_pointer_cast( + diff_src_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_src_mem_p)); - PADDLE_ENFORCE(pool_diff_src_memory_p != nullptr, + PADDLE_ENFORCE(diff_src_memory != nullptr, "Fail to find pooling src mem_p in device context"); - auto pool_diff_dst_memory_p = std::static_pointer_cast( + diff_dst_memory = std::static_pointer_cast( dev_ctx.GetBlob(key_pool_diff_dst_mem_p)); - PADDLE_ENFORCE(pool_diff_dst_memory_p != nullptr, + PADDLE_ENFORCE(diff_dst_memory != nullptr, "Fail to find pooling dst mem_p in device context"); - pool_diff_src_memory_p->set_data_handle( - reinterpret_cast(in_x_grad_data)); - pool_diff_dst_memory_p->set_data_handle(const_cast(out_grad_data)); + + diff_src_memory->set_data_handle(reinterpret_cast(in_x_grad_data)); + diff_dst_memory->set_data_handle(const_cast(out_grad_data)); + + // reorder between user_diff_dst and pool diff_dst if needed + if (memory::primitive_desc(dst_memory->get_primitive_desc()) != + user_diff_dst_memory.get_primitive_desc()) { + diff_dst_memory = + std::make_shared(dst_memory.get()->get_primitive_desc()); + reorder_diff_dst = reorder(user_diff_dst_memory, *diff_dst_memory); + is_diff_dst_reordered = true; + } } + in_x_grad_format = (memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format; + // push primitive to stream and wait until it's executed - std::vector pipeline{*(pool_bwd_p.get())}; + std::vector pipeline; + if (is_diff_dst_reordered) { + pipeline.push_back(reorder_diff_dst); + } + pipeline.push_back(*(pool_bwd_p.get())); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + + in_x_grad->set_layout(DataLayout::kMKLDNN); + in_x_grad->set_format(in_x_grad_format); } // Compute() }; } // namespace operators } // namespace paddle +namespace ops = paddle::operators; + REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNOpKernel); + ops::PoolMKLDNNOpKernel); REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::PoolMKLDNNGradOpKernel); + ops::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 18aa2bd352c5d184b5748e57b4af17c1ae0d7a82..6707cdded4020fe3e2b01ba399dfc279a9da677d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -151,7 +151,8 @@ void Pool2dOpMaker::Make() { "The format of output tensor is also NCHW, " "where N is batch size, C is the number of channels, " "H is the height of the feature, " - "and W is the width of the feature."); + "and W is the width of the feature.") + .Reuse("X"); AddAttr("pooling_type", "(string), pooling type, can be \"max\" for max-pooling " @@ -244,7 +245,8 @@ void Pool3dOpMaker::Make() { "The format of output tensor is also NCDHW, " "where N is batch size, C is " "the number of channels, and D, H and W is the depth, height and " - "width of the feature, respectively."); + "width of the feature, respectively.") + .Reuse("X"); AddAttr("pooling_type", "(string) Pooling type, can be \"max\" for max-pooling " diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc index d96359d6befa68bdd0c255dde9c63bfc7fffc0a5..f71ba84b318c1f8b0604310f3db8a0826124e207 100644 --- a/paddle/fluid/operators/prefetch_op.cc +++ b/paddle/fluid/operators/prefetch_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" namespace paddle { @@ -42,7 +42,7 @@ class PrefetchOp : public framework::OperatorBase { auto& ctx = *pool.Get(place); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { diff --git a/paddle/fluid/operators/recv_op.cc b/paddle/fluid/operators/recv_op.cc index 1ea1cc458b2a20017cc36457af81387a8c808642..15dfb5469bf51330b98d6699fb3ce708222212ed 100644 --- a/paddle/fluid/operators/recv_op.cc +++ b/paddle/fluid/operators/recv_op.cc @@ -19,8 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +44,7 @@ class RecvOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < outs.size(); i++) { VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; @@ -78,9 +77,15 @@ This operator can get variables from server side. } }; +class RecvOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); +REGISTER_OPERATOR(recv, ops::RecvOp, paddle::framework::EmptyGradOpMaker, + ops::RecvOpMaker, ops::RecvOpShapeInference); diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc index 511ad753876dd26a4d6bc8c2c727c7c9253ce59c..c6c975a23ce846464388c72af5d8902144ceb16a 100644 --- a/paddle/fluid/operators/send_barrier_op.cc +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -19,8 +19,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/macros.h" -#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -45,7 +45,7 @@ class SendBarrierOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode; diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 9697579707f1a510ba7db8a1a9616b59918b971b..84ec36625314572d16e5c537884b6efec420cc60 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -16,10 +16,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" @@ -36,12 +35,9 @@ class SendOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& place) const override { auto ins = Inputs("X"); - auto outs = Outputs("Out"); - std::vector epmap = Attr>("epmap"); - std::vector endpoints = - Attr>("endpoints"); - bool sync_mode = Attr("sync_mode"); + std::vector epmap = Attr>("epmap"); + int sync_send = Attr("sync_mode"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -50,37 +46,19 @@ class SendOp : public framework::OperatorBase { platform::RecordEvent record_event(Type(), &ctx); detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); + detail::RPCClient::GetInstance(); for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - rpc_client->Wait(); - - if (sync_mode) { - for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; - rpc_client->AsyncSendBatchBarrier(ep); - } - rpc_client->Wait(); - } - - if (outs.size() > 0) { - for (size_t i = 0; i < outs.size(); i++) { - VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; - rpc_client->AsyncGetVar(epmap[i], ctx, scope, outs[i]); - } - rpc_client->Wait(); - // tell pservers that current trainer have called fetch - for (auto& ep : endpoints) { - VLOG(2) << "send fetch barrier, ep: " << ep; - rpc_client->AsyncSendFetchBarrier(ep); - } + if (sync_send) { rpc_client->Wait(); } } @@ -89,26 +67,22 @@ class SendOp : public framework::OperatorBase { class SendOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { - AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable(); - AddOutput("Out", "(Tensor) Output tensor to be received from server") + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddComment(R"DOC( Send operator -This operator will send tensor to recv_op at the parameter server. +This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - // TODO(typhoonzero): remove this attr generate de-duplicated vector from - // epmap when initializing. - AddAttr>("endpoints", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints to send variables to.") - .SetDefault({}); + AddAttr("sync_mode", + "(int, default 0)" + "sync send or async send.") + .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" "Server endpoints in the order of input " "variables for mapping") - .SetDefault({}); - AddAttr("sync_mode", "work in sync_mode or not").SetDefault(true); + .SetDefault({"127.0.0.1:6164"}); } }; diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc deleted file mode 100644 index 564e40461f8f894cffab11e26cc538b7964b6f19..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/send_vars_op.cc +++ /dev/null @@ -1,101 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include // NOLINT -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/send_recv_util.h" -#include "paddle/fluid/platform/profiler.h" - -namespace paddle { -namespace operators { - -class SendVarsOp : public framework::OperatorBase { - public: - SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - void RunImpl(const framework::Scope& scope, - const platform::Place& place) const override { - auto ins = Inputs("X"); - - std::vector epmap = Attr>("epmap"); - int sync_send = Attr("sync_send"); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - // For profiling - platform::RecordEvent record_event(Type(), &ctx); - - detail::RPCClient* rpc_client = - detail::RPCClient::GetInstance(); - - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; - // TODO(Yancey1989): we need to use an IO threadpool which has - // a larger number of threads than the computing threadpool. - rpc_client->AsyncSendVar(epmap[i], ctx, scope, ins[i]); - } else { - VLOG(3) << "don't send no-initialied variable: " << ins[i]; - } - } - if (sync_send) { - rpc_client->Wait(); - } - } -}; - -class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") - .AsDuplicable(); - AddComment(R"DOC( -Send operator - -This operator will send variables to listen_and_serve op at the parameter server. -)DOC"); - AddAttr("sync_send", - "(int, default 0)" - "sync send or async send.") - .SetDefault(0); - AddAttr>("epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input " - "variables for mapping") - .SetDefault({"127.0.0.1:6164"}); - } -}; - -class SendVarsOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override {} -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(send_vars, ops::SendVarsOp, - paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, - ops::SendVarsOpShapeInference); diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc index 7a2bdeac09d61603f437ff10d58d0542bb3c3689..fef230e42d07a5ed73b7a7a6ab682694675bb9d2 100644 --- a/paddle/fluid/operators/sgd_op.cc +++ b/paddle/fluid/operators/sgd_op.cc @@ -74,7 +74,8 @@ class SGDOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Grad", "(Tensor or SelectedRows) Input gradient"); AddOutput("ParamOut", "(Tensor or SelectedRows, same with Param) " - "Output parameter, should share the same memory with Param"); + "Output parameter, should share the same memory with Param") + .Reuse("Param"); AddComment(R"DOC( SGD operator diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index c90a3be964a3a309a182d3620abec619c366dd84..847b3cbd1bd416ae1326211c98ba9d145c103298 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -83,7 +83,8 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of softmax. " "2-D with shape [batch_size, input_feature_dimensions]."); - AddOutput("Out", "The normalized values with the same shape as X."); + AddOutput("Out", "The normalized values with the same shape as X.") + .Reuse("X"); AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index bcc5e22d4a77349e7cde9a43b83f23d4c867d994..863baba9ea7663d0b21875e0b423dc4a6ce2d59a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -115,7 +115,7 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "(vector) The input tensors of sum operator.") .AsDuplicable(); - AddOutput("Out", "(Tensor) The output tensor of sum operator."); + AddOutput("Out", "(Tensor) The output tensor of sum operator.").Reuse("X"); AddComment(R"DOC( Sum operator. diff --git a/paddle/fluid/operators/test_send_nccl_id.cc b/paddle/fluid/operators/test_send_nccl_id.cc index a01a75cbb070f7f835b93f5ddc62f5aad5a5e667..5015b1005569ba70b147ebb795243e24ab81ea5c 100644 --- a/paddle/fluid/operators/test_send_nccl_id.cc +++ b/paddle/fluid/operators/test_send_nccl_id.cc @@ -20,8 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/detail/grpc_client.h" -#include "paddle/fluid/operators/detail/grpc_server.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/detail/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/operators/math/math_function.h" @@ -29,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #include "paddle/fluid/string/printf.h" +#ifdef PADDLE_WITH_GRPC +#include "paddle/fluid/operators/send_recv_util.h" +#endif + USE_NO_KERNEL_OP(listen_and_serv); namespace f = paddle::framework; @@ -37,7 +40,7 @@ namespace m = paddle::operators::math; namespace detail = paddle::operators::detail; namespace string = paddle::string; -std::unique_ptr g_rpc_service; +std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; void StartServer() { @@ -58,7 +61,7 @@ void StartServer() { g_req_handler->SetRPCServer(g_rpc_service.get()); std::thread server_thread( - std::bind(&detail::AsyncGRPCServer::StartServer, g_rpc_service.get())); + std::bind(&detail::RPCServer::StartServer, g_rpc_service.get())); g_rpc_service->SetCond(detail::kRequestSend); g_rpc_service->WaitBarrier(detail::kRequestSend); @@ -68,9 +71,9 @@ void StartServer() { server_thread.join(); } -TEST(SendNcclId, GrpcServer) { +TEST(SendNcclId, RPCServer) { g_req_handler.reset(new detail::RequestSendHandler(true)); - g_rpc_service.reset(new detail::AsyncGRPCServer("127.0.0.1:0", 1)); + g_rpc_service.reset(new RPCSERVER_T("127.0.0.1:0", 1)); std::thread server_thread(StartServer); g_rpc_service->WaitServerReady(); @@ -87,8 +90,9 @@ TEST(SendNcclId, GrpcServer) { int port = g_rpc_service->GetSelectedPort(); std::string ep = string::Sprintf("127.0.0.1:%d", port); - detail::RPCClient* client = - detail::RPCClient::GetInstance(); + + detail::RPCClient* client = detail::RPCClient::GetInstance(); + LOG(INFO) << "connect to server" << ep; client->AsyncSendVar(ep, dev_ctx, scope, NCCL_ID_VARNAME); client->Wait(); diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index c17d1afc309c65035063348d4934ea1783b018ed..4a8ac441cfaf642fde58ee30865a22e83c065498 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -50,7 +50,7 @@ class TopkOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", "(Tensor) The input of Topk op"); - AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Out", "(Tensor) The output tensor of Topk op").Reuse("X"); AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); AddComment(R"DOC( Top K operator diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 6b82d93237b6baa20703c5b54b56f5381dd858df..292ffef1aef12732812b8c5b0020cad73b1d06fc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -11,6 +11,7 @@ limitations under the License. */ #pragma once #include +#include // NOLINT #include #include #include @@ -100,6 +101,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -116,6 +118,8 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + + std::mutex mtx_; }; template <> diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 669d1bdaa3ec194be817cdc5e1f8484770c70c68..c88fbef63cf26c671246b15ea9872da0e7a92c1a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -509,10 +509,10 @@ All parameter, weight, gradient are variables in Paddle. self.num_threads_ = num_threads; }) .def_property( - "use_event", - [](const ExecutionStrategy &self) { return self.use_event_; }, - [](ExecutionStrategy &self, bool use_event) { - self.use_event_ = use_event; + "use_cuda", + [](const ExecutionStrategy &self) { return self.use_cuda_; }, + [](ExecutionStrategy &self, bool use_cuda) { + self.use_cuda_ = use_cuda; }) .def_property( "allow_op_delay", diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 586ec48477f085a14d2f15b265a95d596705694f..507479c8622c8d33722e08bba018ad1ba5452e15 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -30,7 +30,7 @@ int main(int argc, char** argv) { new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); #else - new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,use_mkldnn")); #endif int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index f082e33be3357fbe405ab1a1ef5e0e601108a363..527044b415533cc640e3cfc5837c08ab0f8b74b1 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -119,7 +119,8 @@ def reader_creator(data_file, yield sample, int(label) - 1 if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 68aee304a6761e97a0dab4183611d9d07152da16..bd985ad733aa8eece2f8374d033f452a0175a011 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -117,7 +117,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope' + 'eager_delete_scope', 'use_mkldnn' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 7940dabcfb03cc9eb46f678365685a6e99bcceec..e2013137b14f73bb0fcfb57b4bdc35fcc043bdc0 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -15,6 +15,7 @@ from __future__ import print_function import core import numpy +import os import six.moves as six import multiprocessing @@ -150,7 +151,9 @@ class DataFeeder(object): elif isinstance(self.place, core.CUDAPlace): return core.get_cuda_device_count() else: - return multiprocessing.cpu_count() + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + return cpu_num def decorate_reader(self, reader, diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index d1ea9f148566d20988a43f4c9d421c4452697ef1..80e8ff484a4c04df1b41bbca284d7c604962934c 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib -from layer_function_generator import autodoc +from layer_function_generator import autodoc, templatedoc from tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator @@ -721,26 +721,22 @@ def lod_rank_table(x, level=0): return table +@templatedoc() def max_sequence_len(rank_table): - """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of a batch of sequences. In fact, a LoDRankTable - object contains a list of tuples() and - the list is already sorted by sequence length in descending order, so the - operator just returns the sequence length of the first tuple element. + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> x = fluid.layers.data(name='x', shape=[10], dtype='float32', + >>> lod_level=1) + >>> rank_table = layers.lod_rank_table(x=x, level=0) + >>> max_seq_len = layers.max_sequence_len(rank_table) Args: - rank_table (Variable): Input variable which is a LoDRankTable object. + rank_table(${rank_table_type}): ${rank_table_comment}. Returns: - Variable: The max length of sequence. - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10], - dtype='float32', lod_level=1) - rank_table = layers.lod_rank_table(x=x, level=0) - max_seq_len = layers.max_sequence_len(rank_table) + ${out_comment}. """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a56f3ea9db6b9fabf9d78f102d394a0817a44a98..9de88e2c3205ace74beff43df7ae8956897d965a 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -19,11 +19,12 @@ from ..unique_name import generate as unique_name from control_flow import BlockGuard from ..layer_helper import LayerHelper from ..executor import global_scope +from layer_function_generator import generate_layer_fn, templatedoc __all__ = [ 'data', 'BlockGuardServ', 'ListenAndServ', 'Send', 'open_recordio_file', 'open_files', 'read_file', 'shuffle', 'batch', 'double_buffer', - 'random_data_generator', 'Preprocessor' + 'random_data_generator', 'Preprocessor', 'load' ] @@ -662,3 +663,29 @@ class Preprocessor(object): "sink_var_names": self.sink_var_names }) return monkey_patch_reader_methods(self.reader) + + +@templatedoc() +def load(out, file_path, load_as_fp16=None): + """ + ${comment} + + >>> import paddle.fluid as fluid + >>> tmp_tensor = fluid.layers.create_tensor(dtype='float32') + >>> fluid.layers.load(tmp_tensor, "./tmp_tensor.bin") + + Args: + out(${out_type}): ${out_comment}. + + file_path(${file_path_type}): ${file_path_comment}. + + load_as_fp16(${load_as_fp16_type}): ${load_as_fp16_comment}. + + Returns: + None + """ + helper = LayerHelper("load", **locals()) + attrs = {"file_path": file_path} + if load_as_fp16 is not None: + attrs['load_as_fp16'] = load_as_fp16 + helper.append_op(type="load", inputs={}, output={"Out": out}, args=attrs) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index 904413cc11b50f80d3c4730bf66ec359f9285ae6..cb60a3aec9a5a69f1eed281eb017384a621c66a8 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -224,7 +224,10 @@ def autodoc(comment=""): return __impl__ -def templatedoc(): +_inline_math_single_dollar = re.compile(r"\$([^\$]+)\$") + + +def templatedoc(op_type=None): """ Decorator of layer function. It will use the docstring from the layer function as the template. The template arguments are: @@ -238,32 +241,47 @@ def templatedoc(): Decorated function. """ + def trim_ending_dot(msg): + return msg.rstrip('.') + + def escape_inline_math(msg): + return _inline_math_single_dollar.sub(repl=r':math:`\1`', string=msg) + def __impl__(func): - op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) + if op_type is None: + op_type_name = func.__name__ + else: + op_type_name = op_type + op_proto = OpProtoHolder.instance().get_op_proto(op_type_name) tmpl = string.Template(func.__doc__) comment_lines = op_proto.comment.split("\n") comment = "" for line in comment_lines: - line = line.lstrip() - comment += line - comment += "\n" - - args = {"comment": comment} + line = line.strip() + if len(line) != 0: + comment += escape_inline_math(line) + comment += " " + elif len(comment) != 0: + comment += "\n \n " + + args = {"comment": trim_ending_dot(comment)} for each_input in op_proto.inputs: input_name = _convert_(each_input.name) - args["{0}_comment".format(input_name)] = each_input.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_input.comment) args["{0}_type".format(input_name)] = "Variable" for each_attr in op_proto.attrs: input_name = _convert_(each_attr.name) - args["{0}_comment".format(input_name)] = each_attr.comment + args["{0}_comment".format(input_name)] = trim_ending_dot( + each_attr.comment) args["{0}_type".format(input_name)] = _type_to_str_(each_attr.type) for each_opt in op_proto.outputs: output_name = _convert_(each_opt.name) - args["{0}_comment".format(output_name)] = each_opt.comment + args["{0}_comment".format(output_name)] = trim_ending_dot( + each_opt.comment) args["{0}_type".format(output_name)] = "Variable" - func.__doc__ = tmpl.substitute(args) return func diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index d13c54daa5a985e2e1bf9357630fe29d24a17bb4..716cc7824eff0c56cc55a055310fa8b1913ac5e6 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -11,6 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +When training a model, it's often useful to decay the +learning rate during training process, this is called +learning_rate_decay. There are many strategies to do +this, this module will provide some classical method. +User can also implement their own learning_rate_decay +strategy according to this module. +""" import control_flow import nn @@ -22,14 +30,6 @@ __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', 'polynomial_decay', 'piecewise_decay', 'noam_decay' ] -""" -When training a model, it's often useful to decay the -learning rate during training process, this is called -learning_rate_decay. There are many strategies to do -this, this module will provide some classical method. -User can also implement their own learning_rate_decay -strategy according to this module. -""" def _decay_step_counter(begin=0): @@ -41,18 +41,20 @@ def _decay_step_counter(begin=0): def noam_decay(d_model, warmup_steps): - """Apply decay to learning rate. - ```python - lr_value = np.power(d_model, -0.5) * np.min([ - np.power(current_steps, -0.5), - np.power(warmup_steps, -1.5) * current_steps - ]) - ``` + """ + Noam decay method. The numpy implementation of noam decay as follows. + + >>> import numpy as np + >>> lr_value = np.power(d_model, -0.5) * np.min([ + >>> np.power(current_steps, -0.5), + >>> np.power(warmup_steps, -1.5) * current_steps]) + + Please reference `attention is all you need + `_. Args: d_model(Variable): The dimensionality of input and output of model. - Reference: attention is all you need - https://arxiv.org/pdf/1706.03762.pdf + warmup_steps(Variable): A super parameter. Returns: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ddaeb415af4320c233aa7d01130fe1da2cdcbfa8..b9ea74fc81e0eb9b52e9cd1e9af8cba005a10f21 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4037,18 +4037,25 @@ def image_resize(input, return out +@templatedoc(op_type="bilinear_interp") def resize_bilinear(input, out_shape=None, scale=None, name=None): """ - This is an alias of layer 'image_resize' with bilinear interpolation. + ${comment} + + Args: + input(${x_type}): ${x_comment}. + + out_shape(${out_size_type}): ${out_size_comment}. - The mathematical meaning of resize bilinear layer is - Bilinear interpolation. - Bilinear interpolation is an extension of linear interpolation for - interpolating functions of two variables (e.g. H-direction and - W-direction in this layer) on a rectilinear 2D grid. + scale(float|None): The multiplier for the input height or width. At + least one of out_shape or scale must be set. And out_shape has + a higher priority than scale. Default: None. + + name(str|None): The output variable name. + + Returns: - For details, please refer to Wikipedia: - https://en.wikipedia.org/wiki/Bilinear_interpolation + ${out_comment}. """ return image_resize(input, out_shape, scale, name, 'BILINEAR') diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 69cfde852dd087bb9192da1f7582f925582dbce4..3260f81e9edcd9ed83e98a681c43a5d9dbfd1312 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -73,6 +73,7 @@ __all__ = [ 'sum', 'polygon_box_transform', 'shape', + 'maxout', ] + __activations__ for _OP in set(__all__): diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 75d3bf879703a1db1108eae45d879164e0024156..62b01d595a812ee8fc094e40b6dfb5c3f56cd012 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -18,6 +18,7 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc +from layer_function_generator import templatedoc import numpy __all__ = [ @@ -30,6 +31,8 @@ __all__ = [ 'assign', 'fill_constant_batch_size_like', 'fill_constant', + 'argmin', + 'argmax', 'ones', 'zeros', ] @@ -266,6 +269,7 @@ def fill_constant(shape, dtype, value, force_cpu=False, out=None): return out +@templatedoc() def fill_constant_batch_size_like(input, shape, dtype, @@ -273,30 +277,28 @@ def fill_constant_batch_size_like(input, input_dim_idx=0, output_dim_idx=0): """ - **fill_constant_batch_size_like** - - This function creates a tensor of specified *shape*, *dtype* and batch size, - and initializes this with a constant supplied in *value*. The batch size is - obtained from the `input` tensor. + ${comment} It also sets *stop_gradient* to True. + >>> data = fluid.layers.fill_constant_batch_size_like( + >>> input=like, shape=[1], value=0, dtype='int64') + Args: - input(Variable): Tensor whose dimensions will be used to get batch size - shape(tuple|list|None): Shape of output tensor - dtype(np.dtype|core.VarDesc.VarType|str): Data type of output tensor - value(float): Constant value to initialize the output tensor - input_dim_idx(int): Index of input's batch size dimension - output_dim_idx(int): Index of output's batch size dimension + input(${input_type}): ${input_comment}. - Returns: - Variable: The tensor variable storing the output + shape(${shape_type}): ${shape_comment}. - Examples: - .. code-block:: python + dtype(${dtype_type}): ${dtype_comment}. + + value(${value_type}): ${value_comment}. + + input_dim_idx(${input_dim_idx_type}): ${input_dim_idx_comment}. + + output_dim_idx(${output_dim_idx_type}): ${output_dim_idx_comment}. - data = fluid.layers.fill_constant_batch_size_like( - input=like, shape=[1], value=0, dtype='int64') + Returns: + ${out_comment}. """ helper = LayerHelper("fill_constant_batch_size_like", **locals()) out = helper.create_tmp_variable(dtype=dtype) @@ -315,6 +317,68 @@ def fill_constant_batch_size_like(input, return out +def argmin(x, axis=0): + """ + **argmin** + + This function computes the indices of the min elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the min elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmin(x=in, axis=0) + out = fluid.layers.argmin(x=in, axis=-1) + """ + helper = LayerHelper("arg_min", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_min', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + +def argmax(x, axis=0): + """ + **argmax** + + This function computes the indices of the max elements + of the input tensor's element along the provided axis. + + Args: + x(Variable): The input to compute the indices of + the max elements. + axis(int): Axis to compute indices along. + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + out = fluid.layers.argmax(x=in, axis=0) + out = fluid.layers.argmax(x=in, axis=-1) + """ + helper = LayerHelper("arg_max", **locals()) + out = helper.create_tmp_variable(VarDesc.VarType.INT64) + helper.append_op( + type='arg_max', + inputs={'X': x}, + outputs={'Out': [out]}, + attrs={'axis': axis}) + return out + + def ones(shape, dtype, force_cpu=False): """ **ones** @@ -437,22 +501,6 @@ def save_combine(x, file_path, overwrite=True): "overwrite": overwrite}) -def load(out, file_path): - """ - Loads a variable from a given file. - - Args: - out(variable): The variable to be read from the disk file. - file_path(str): The path of the disk file. - """ - helper = LayerHelper("load", **locals()) - helper.append_op( - type="load", - inputs={}, - output={"Out": out}, - args={"file_path": file_path}) - - def load_combine(out, file_path): """ Loads a list of vairables from a single file. diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3117dfe00c7a3df1035c439dc31b81e67781d0cc..0fdc9a035292b3390cece6c5821a60b1b281e54d 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -18,6 +18,7 @@ import framework import executor import warnings import sys +import os __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] @@ -101,7 +102,9 @@ class ParallelExecutor(object): p.set_place(self._act_places[-1]) self._places.append(p) else: - for i in xrange(multiprocessing.cpu_count()): + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in xrange(cpu_num): p = core.Place() self._act_places.append(core.CPUPlace()) p.set_place(self._act_places[-1]) @@ -110,19 +113,17 @@ class ParallelExecutor(object): if exec_strategy is None: exec_strategy = ExecutionStrategy() - if use_cuda: - exec_strategy.use_event = True - else: - exec_strategy.use_event = False + exec_strategy.use_cuda = use_cuda if exec_strategy.num_threads == 0: if use_cuda: # Experiments on se-resnext shows that too many threads hurt # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 2 + exec_strategy.num_threads = len(self._places) * 4 else: - exec_strategy.num_threads = min( - len(self._places) * 2, multiprocessing.cpu_count()) + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num if build_strategy is None: build_strategy = BuildStrategy() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 673bd728718ca233b426fe2aaae307413d875174..ab683bc101728ba008e01f26ff4d3828b3b99787 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -41,8 +41,8 @@ function(py_test_modules TARGET_NAME) endfunction() list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +#list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) +#list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) # TODO(wuyi): this test hungs on CI, will add it back later list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) foreach(TEST_OP ${TEST_OPS}) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c9c3c648717814c28c39a401487925824e885946..829c5a1a5fd099543e9e98b9587d4f316a91b587 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import multiprocessing +import os import unittest import paddle.fluid as fluid import time @@ -23,6 +25,7 @@ __all__ = ['TestParallelExecutorBase'] class TestParallelExecutorBase(unittest.TestCase): def check_network_convergence(self, method, + use_cuda=True, memory_opt=True, iter=50, batch_size=None, @@ -53,7 +56,7 @@ class TestParallelExecutorBase(unittest.TestCase): adam.minimize(loss) if memory_opt: fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() startup_exe = fluid.Executor(place) startup_exe.run(startup) exec_strategy = fluid.ExecutionStrategy() @@ -64,7 +67,7 @@ class TestParallelExecutorBase(unittest.TestCase): if use_parallel_executor: exe = fluid.ParallelExecutor( - True, + use_cuda, loss_name=loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) @@ -72,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase): exe = fluid.Executor(place=place) if batch_size is not None: - batch_size *= fluid.core.get_cuda_device_count() + batch_size *= fluid.core.get_cuda_device_count( + ) if use_cuda else int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() first_loss, = run_executor( exe=exe, feed=feed_dict, fetch_list=[loss.name]) diff --git a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e04412f809cdd75d07d28a60f0c2f19041a684f6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py @@ -0,0 +1,82 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class BaseTestCase(OpTest): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + def setUp(self): + self.initTestCase() + self.x = (1000 * np.random.random(self.dims)).astype(self.dtype) + self.inputs = {'X': self.x} + self.attrs = {'axis': self.axis} + if self.op_type == "arg_min": + self.outputs = {'Out': np.argmin(self.x, axis=self.axis)} + else: + self.outputs = {'Out': np.argmax(self.x, axis=self.axis)} + + def test_check_output(self): + self.check_output() + + +class TestCase0(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4, 5) + self.dtype = 'float32' + self.axis = 0 + + +class TestCase1(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (3, 4) + self.dtype = 'float64' + self.axis = 1 + + +class TestCase2(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, 4) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase3(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_max' + self.dims = (3, ) + self.dtype = 'int64' + self.axis = 0 + + +class TestCase4(BaseTestCase): + def initTestCase(self): + self.op_type = 'arg_min' + self.dims = (1, ) + self.dtype = 'int32' + self.axis = 0 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 32647f9aa81431a3ecc798df6f1360a14fd978af..b4379ad447e01683325dfcbb6a5b322f0b8eac3d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import unittest import paddle.fluid as fluid from paddle.fluid.transpiler.distribute_transpiler import delete_ops @@ -54,10 +55,10 @@ class TestDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "split_byref", "send_vars", "send_barrier", "recv", "recv", + "split_byref", "send", "send_barrier", "recv", "recv", "fetch_barrier", "concat" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 621a450fa6a6a8f47e3f1c1de609614b2359c33b..8b0ebe3cf52bf5b4514eacbd5d1bdd7c7a9b8b67 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -387,6 +387,14 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_maxout(self): + program = Program() + with program_guard(program): + data = layers.data(name='x', shape=[8, 6, 6], dtype="float32") + output = layers.maxout(x=data, groups=2) + self.assertIsNotNone(output) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py index 1226027ddc9c0b9dce9cedc5d1d20c0708647b6f..d1d709551c77908db88be6fda7ac74d4e922138e 100644 --- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py +++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py @@ -70,17 +70,18 @@ class TestListenAndServOp(OpTest): return p.pid def _wait_ps_ready(self, pid): - retry_times = self.ps_timeout + start_left_time = self.ps_timeout + sleep_time = 0.5 while True: - assert retry_times >= 0, "wait ps ready failed" - time.sleep(0.5) + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) try: # the listen_and_serv_op would touch a file which contains the listen port # on the /tmp directory until it was ready to process all the RPC call. os.stat("/tmp/paddle.%d.port" % pid) return except os.error: - retry_times -= 1 + start_left_time -= sleep_time def test_rpc_interfaces(self): # TODO(Yancey1989): need to make sure the rpc interface correctly. diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 66e138b03f3b170aca4fb2207438eb9af1783c33..163975555ec2cea5c169cc1da3c4324d91ba3616 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -17,6 +17,7 @@ import paddle.fluid as fluid import unittest import paddle import numpy as np +import os word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -101,7 +102,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, class TestCRFModel(unittest.TestCase): - def check_network_convergence(self, is_sparse, build_strategy=None): + def check_network_convergence(self, + is_sparse, + build_strategy=None, + use_cuda=True): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -145,12 +150,12 @@ class TestCRFModel(unittest.TestCase): paddle.dataset.conll05.test(), buf_size=8192), batch_size=16) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) pe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=avg_cost.name, build_strategy=build_strategy) @@ -172,25 +177,33 @@ class TestCRFModel(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) def test_update_dense_parameter_all_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) def test_update_sparse_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy) + is_sparse=True, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=True, build_strategy=build_strategy, use_cuda=False) def test_update_dense_parameter_reduce(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy) + is_sparse=False, build_strategy=build_strategy, use_cuda=True) + self.check_network_convergence( + is_sparse=False, build_strategy=build_strategy, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 24f8d28c0304a77a99213374b25d0db728eca265..79702475cca86ca22107d4b1824fda277dd83157 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -18,6 +18,7 @@ import paddle.fluid as fluid import unittest import numpy as np import paddle +import os def Lenet(data, class_dim): @@ -35,7 +36,7 @@ def Lenet(data, class_dim): class TestFetchOp(unittest.TestCase): - def parallel_exe(self, train_inputs, seed): + def parallel_exe(self, train_inputs, seed, use_cuda): main = fluid.Program() startup = fluid.Program() startup.random_seed = seed @@ -59,13 +60,13 @@ class TestFetchOp(unittest.TestCase): # conv2d_1.b_0@GRAD. Those variables should not be pruned. # fluid.memory_optimize(main) - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) fetch_list = [] all_vars = main.global_block().vars @@ -88,14 +89,16 @@ class TestFetchOp(unittest.TestCase): for i in range(iters): train_inputs.append(tst_reader_iter.next()) - self.parallel_exe(train_inputs, seed=1) + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(train_inputs, seed=1, use_cuda=True) + self.parallel_exe(train_inputs, seed=1, use_cuda=False) class TestFeedParallel(unittest.TestCase): - def test_main(self): + def parallel_exe(self, use_cuda, seed): main = fluid.Program() startup = fluid.Program() - startup.random_seed = 1 + startup.random_seed = seed with fluid.scope_guard(fluid.core.Scope()): with fluid.program_guard(main, startup): data = fluid.layers.data( @@ -111,15 +114,18 @@ class TestFeedParallel(unittest.TestCase): regularization=fluid.regularizer.L2Decay(1e-4)) opt.minimize(loss) - place = fluid.CUDAPlace(0) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() feeder = fluid.DataFeeder(place=place, feed_list=[data, label]) reader = feeder.decorate_reader( paddle.batch( flowers.train(), batch_size=16), multi_devices=True) + exe = fluid.Executor(place) exe.run(startup) + pe = fluid.ParallelExecutor( - use_cuda=True, loss_name=loss.name, main_program=main) + use_cuda=use_cuda, loss_name=loss.name, main_program=main) for batch_id, data in enumerate(reader()): loss_np = np.array(pe.run(feed=data, fetch_list=[loss.name])[0]) @@ -127,6 +133,11 @@ class TestFeedParallel(unittest.TestCase): if batch_id == 2: break + def test_feed_op(self): + os.environ['CPU_NUM'] = str(4) + self.parallel_exe(use_cuda=True, seed=1) + self.parallel_exe(use_cuda=False, seed=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 015703c3e25f4e11e64ab6a7de99da12bee608f6..a801d99aa1ced35eb7f081fde63ad541f0eb2589 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -18,6 +18,7 @@ import numpy as np import paddle import paddle.dataset.mnist as mnist import unittest +import os MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio" @@ -85,6 +86,7 @@ def fc_with_batchnorm(use_feed): class TestMNIST(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) # Convert mnist to recordio file with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(mnist.train(), batch_size=4) @@ -99,9 +101,12 @@ class TestMNIST(TestParallelExecutorBase): fluid.recordio_writer.convert_reader_to_recordio_file( MNIST_RECORDIO_FILE, reader, feeder) - def check_simple_fc_convergence(self, balance_parameter_opt_between_cards): - self.check_network_convergence(simple_fc_net) - self.check_network_convergence(simple_fc_net, allow_op_delay=True) + def check_simple_fc_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True): + self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) + self.check_network_convergence( + simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') @@ -109,17 +114,21 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_simple_fc(self): - self.check_simple_fc_convergence(False) + self.check_simple_fc_convergence(False, use_cuda=True) + self.check_simple_fc_convergence(False, use_cuda=False) def test_simple_fc_with_new_strategy(self): - self.check_simple_fc_convergence(True) + self.check_simple_fc_convergence(True, use_cuda=True) + self.check_simple_fc_convergence(True, use_cuda=False) def check_simple_fc_parallel_accuracy(self, - balance_parameter_opt_between_cards): + balance_parameter_opt_between_cards, + use_cuda=True): img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -127,12 +136,14 @@ class TestMNIST(TestParallelExecutorBase): seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, seed=1000, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, use_parallel_executor=True, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) @@ -143,28 +154,33 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(p_l, single_last_loss[0], delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(False, use_cuda=True) + self.check_simple_fc_parallel_accuracy(False, use_cuda=False) def test_simple_fc_parallel_accuracy_with_new_strategy(self): - self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=True) + self.check_simple_fc_parallel_accuracy(True, use_cuda=False) - def check_batchnorm_fc_convergence(self, - balance_parameter_opt_between_cards): - self.check_network_convergence(fc_with_batchnorm) + def check_batchnorm_fc_convergence( + self, balance_parameter_opt_between_cards, use_cuda): + self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_batchnorm_fc(self): - self.check_batchnorm_fc_convergence(False) + self.check_batchnorm_fc_convergence(False, use_cuda=True) + self.check_batchnorm_fc_convergence(False, use_cuda=False) def test_batchnorm_fc_with_new_strategy(self): - self.check_batchnorm_fc_convergence(True) + self.check_batchnorm_fc_convergence(True, use_cuda=True) + self.check_batchnorm_fc_convergence(True, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index a3fa140cbb7994a36d2cbee26d598165f1f771d2..066299e6c6f7f6c159cb0886e86d3404b027b698 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid from parallel_executor_test_base import TestParallelExecutorBase import unittest +import os def squeeze_excitation(input, num_channels, reduction_ratio): @@ -130,22 +131,30 @@ def SE_ResNeXt50Small(batch_size=2, use_feed=False): class TestResnet(TestParallelExecutorBase): - def check_resnet_convergence(self, balance_parameter_opt_between_cards): + def check_resnet_convergence(self, + balance_parameter_opt_between_cards, + use_cuda=True, + iter=20): + os.environ['CPU_NUM'] = str(4) + import functools batch_size = 2 self.check_network_convergence( functools.partial( SE_ResNeXt50Small, batch_size=batch_size), - iter=20, + iter=iter, batch_size=batch_size, + use_cuda=use_cuda, balance_parameter_opt_between_cards=balance_parameter_opt_between_cards ) def test_resnet(self): - self.check_resnet_convergence(False) + self.check_resnet_convergence(False, use_cuda=True) + self.check_resnet_convergence(False, use_cuda=False, iter=5) def test_resnet_with_new_strategy(self): - self.check_resnet_convergence(True) + self.check_resnet_convergence(True, use_cuda=True) + self.check_resnet_convergence(True, use_cuda=False, iter=5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 93a5f767867d68110cf7b8f441cc740ecd843cf9..31ba8c1d6096c9c89e0695c8eca8e16a5e303a61 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -15,6 +15,7 @@ import paddle.fluid as fluid import numpy as np import unittest +import os def simple_fc_net(): @@ -35,7 +36,8 @@ def simple_fc_net(): class ParallelExecutorTestingDuringTraining(unittest.TestCase): - def check_network_convergence(self, build_strategy=None): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -49,19 +51,19 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): image = np.random.normal(size=(batch_size, 784)).astype('float32') label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") - place = fluid.CUDAPlace(0) + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) feed_dict = {'image': image, 'label': label} train_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, loss_name=loss.name, main_program=main, build_strategy=build_strategy) test_exe = fluid.ParallelExecutor( - use_cuda=True, + use_cuda=use_cuda, main_program=test_program, share_vars_from=train_exe, build_strategy=build_strategy) @@ -81,12 +83,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): def test_parallel_testing(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence(build_strategy) + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) + self.check_network_convergence( + use_cuda=False, build_strategy=build_strategy) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index c81df66d987f3d3856af0e19fc935df7de2edacc..b6215fddb11bb6b3a76b5a6395e7254d21971c13 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -19,6 +19,7 @@ from parallel_executor_test_base import TestParallelExecutorBase import unittest import paddle import paddle.dataset.wmt16 as wmt16 +import os WMT16_RECORDIO_FILE = "./wmt16_test_pe.recordio" @@ -149,6 +150,7 @@ def transformer(use_feed): class TestTransformer(TestParallelExecutorBase): @classmethod def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) reader = paddle.batch( wmt16.train(ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size), @@ -167,7 +169,8 @@ class TestTransformer(TestParallelExecutorBase): @unittest.skip("transformer is buggy in multi gpu") def test_main(self): - self.check_network_convergence(transformer) + self.check_network_convergence(transformer, use_cuda=True) + self.check_network_convergence(transformer, use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py index 5ae2844e295194f95701e1cdccd43bf919bf964f..f4aa7426bc315be501348a64e2f15caed6dc8810 100644 --- a/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_simple_dist_transpiler.py @@ -59,9 +59,9 @@ class TestSimpleDistTranspiler(TranspilerTest): delete_ops(trainer.global_block(), optimize_ops) ops = [op.type for op in trainer.global_block().ops] + [ - "send_vars", "send_barrier", "recv", "recv", "fetch_barrier" + "send", "send_barrier", "recv", "recv", "fetch_barrier" ] - ops.insert(ops.index("elementwise_add_grad") + 1, "send_vars") + ops.insert(ops.index("elementwise_add_grad") + 1, "send") return ops def _transpiler_instance(self): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index bf06f4c2ca73c8cfd79f39eb37499f07d854543c..5ec5b41544c686f88c95da36cedaa682f516caae 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -24,9 +24,9 @@ Steps to transpile trainer: 1. split variable to multiple blocks, aligned by product(dim[1:]) (width). 2. rename splited grad variables to add trainer_id suffix ".trainer_%d". 3. modify trainer program add split_op to each grad variable. -4. append send_op to send splited variables to server and fetch - params(splited blocks or origin param) from server. -5. append concat_op to merge splited blocks to update local weights. +4. append send_op to send splited variables to server and +5. add recv_op to fetch params(splited blocks or origin param) from server. +6. append concat_op to merge splited blocks to update local weights. Steps to transpile pserver: 1. create new program for parameter server. @@ -317,7 +317,7 @@ class DistributeTranspiler: program.global_block().insert_op( index=index + 1, - type="send_vars", + type="send", inputs={"X": splited_vars}, outputs={}, attrs={ @@ -683,7 +683,7 @@ class DistributeTranspiler: break def _split_table_grad_and_add_send_vars(self, program, pserver_endpoints): - # 2. add split_ids_op and send_vars_op to send gradient to pservers + # 2. add split_ids_op and send_op to send gradient to pservers # there should only be one table_name all_ops = program.global_block().ops table_grad_name = grad_var_name(self.table_name) @@ -700,11 +700,11 @@ class DistributeTranspiler: outputs={"Out": self.trainer_side_table_grad_list}) program.global_block().insert_op( index=op_index + 2, - type="send_vars", + type="send", inputs={'X': self.trainer_side_table_grad_list}, outputs={}, attrs={ - "sync_send": True, + "sync_mode": True, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 7bdddeaabec733ef26b3f766c6437f5c53d65044..357a4e9b000ea81afe291ff39dde2bed5c67e619 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -119,7 +119,8 @@ def reader_creator(data_file, yield sample, int(label) - 1 if use_xmap: - return xmap_readers(mapper, reader, cpu_count(), buffered_size) + cpu_num = int(os.environ.get('CPU_NUM', cpu_count())) + return xmap_readers(mapper, reader, cpu_num, buffered_size) else: return map_readers(mapper, reader)