diff --git a/.gitignore b/.gitignore index 90138f996cf9cacc3c1cbff0cf2600eefca3f305..fa0c8882606b76ac71b43dcde7e1df6770c46c31 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ third_party/ build_* # clion workspace. cmake-build-* +model_test diff --git a/CMakeLists.txt b/CMakeLists.txt index 6aa2e1715b92d73aa4e5e97d5e52ffac51451d80..ed704585d8a6bf3befd9a549aa5a62a33fea3da9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -62,13 +62,12 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) -option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) -option(WITH_INFERENCE "Compile fluid inference library" ON) +option(ON_INFER "Turn on inference optimization." OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) @@ -179,6 +178,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/xxhash) # download xxhash if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows @@ -301,3 +301,11 @@ if(WITH_DOC) find_python_module(recommonmark REQUIRED) add_subdirectory(doc) endif() + +if (ON_INFER) + message(STATUS "On inference mode, will take place some specific optimization.") + add_definitions(-DPADDLE_ON_INFERENCE) +else() + #TODO(luotao), combine this warning with `make inference_lib_dist` command. + message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") +endif() diff --git a/Dockerfile b/Dockerfile index 738bba9bc2e1ab19709722fe04f1490b1b13bd4f..c8b9eed6d60e5d3b32fc14c0c7af80a785145d1b 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,14 +75,14 @@ RUN pip3 install -U wheel && \ pip3 install -U docopt PyYAML sphinx==1.5.6 && \ pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ easy_install -U pip && \ - pip install -U wheel && \ + pip install -U pip setuptools wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3 install pre-commit 'ipython==5.3.0' && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3 install opencv-python && \ - pip install pre-commit 'ipython==5.3.0' && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip install opencv-python diff --git a/README.md b/README.md index 8ee67f66423df8bce27f70015be8752457cd9784..56d6c10c642787836abb55cb2974bda0b8d22da4 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.0/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Latest PaddlePaddle Release: [Fluid 1.0.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.0.0) +### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.0.1.post87 +pip install paddlepaddle-gpu==1.1.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.0.1.post85 +pip install paddlepaddle-gpu==1.1.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.0.1.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.0.0/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.0/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.0/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. -- [Python API](http://paddlepaddle.org/documentation/api/zh/1.0/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.0/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html) We appreciate your contributions! diff --git a/benchmark/fluid/args.py b/benchmark/fluid/args.py index 9540900b112f54594bbfdbc8d7cd3b6e1f5269dd..ff616ddbb2cb1cb7f348d6d164815823b08b7629 100644 --- a/benchmark/fluid/args.py +++ b/benchmark/fluid/args.py @@ -142,5 +142,10 @@ def parse_args(): choices=['reduce', 'all_reduce'], default='all_reduce', help='Specify the reduce strategy, can be reduce, all_reduce') + parser.add_argument( + '--fuse_broadcast_op', + action='store_true', + help='If set, would fuse multiple broadcast operators into one fused_broadcast operator.' + ) args = parser.parse_args() return args diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index ddd9fe809853a830ca676cc98f1819f683866def..5f3ce300acc44ad8d2898c27296b866c403f3cc8 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -177,6 +177,7 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, else: build_strategy.reduce_strategy = fluid.BuildStrategy( ).ReduceStrategy.AllReduce + build_strategy.fuse_broadcast_op = args.fuse_broadcast_op avg_loss = train_args[0] @@ -240,7 +241,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog, if args.use_fake_data or args.use_reader_op: try: - fetch_ret = exe.run(fetch_list) except fluid.core.EOFException as eof: break diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake new file mode 100644 index 0000000000000000000000000000000000000000..c227e09719bd5f0e825f81fb96f78105aa10c79b --- /dev/null +++ b/cmake/external/xxhash.cmake @@ -0,0 +1,50 @@ +INCLUDE(ExternalProject) + +set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash) +set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) +set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") + +IF(WITH_STATIC_LIB) + SET(BUILD_CMD make lib) +ELSE() + IF(APPLE) + SET(BUILD_CMD sed -i \"\" "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib) + ELSE(APPLE) + SET(BUILD_CMD sed -i "s/-Wstrict-prototypes -Wundef/-Wstrict-prototypes -Wundef -fPIC/g" ${XXHASH_SOURCE_DIR}/src/extern_xxhash/Makefile && make lib) + ENDIF(APPLE) +ENDIF() + +ExternalProject_Add( + extern_xxhash + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/Cyan4973/xxHash" + GIT_TAG "v0.6.5" + PREFIX ${XXHASH_SOURCE_DIR} + DOWNLOAD_NAME "xxhash" + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + PATCH_COMMAND + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND export PREFIX=${XXHASH_INSTALL_DIR}/ && make install + TEST_COMMAND "" +) + +set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") +INCLUDE_DIRECTORIES(${XXHASH_INCLUDE_DIR}) + +add_library(xxhash STATIC IMPORTED GLOBAL) +set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) +include_directories(${XXHASH_INCLUDE_DIR}) +add_dependencies(xxhash extern_xxhash) + +LIST(APPEND external_project_dependencies xxhash) + +IF(WITH_C_API) + INSTALL(DIRECTORY ${XXHASH_INCLUDE_DIR} DESTINATION third_party/xxhash) + IF(ANDROID) + INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib/${ANDROID_ABI}) + ELSE() + INSTALL(FILES ${XXHASH_LIBRARIES} DESTINATION third_party/xxhash/lib) + ENDIF() +ENDIF() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 67cca09b64c1ed7a503a886e78347d786eae0de7..efdb093a7b28e19f3b2a774dd54f2e7f042e9ca7 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -31,7 +31,7 @@ function(copy TARGET) foreach(index RANGE ${len}) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) - add_custom_command(TARGET ${TARGET} PRE_BUILD + add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}" COMMAND cp -r "${src}" "${dst}" COMMENT "copying ${src} -> ${dst}") @@ -67,6 +67,13 @@ copy(boost_lib DEPS boost ) +set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/xxhash") +copy(xxhash_lib + SRCS ${XXHASH_INCLUDE_DIR} ${XXHASH_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS xxhash +) + if(NOT PROTOBUF_FOUND) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib @@ -186,7 +193,7 @@ copy(cmake_cache DSTS ${FLUID_INSTALL_DIR}) # This command generates a complete fluid library for both train and inference -add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) +add_custom_target(fluid_lib_dist DEPENDS ${fluid_lib_dist_dep}) # Following commands generate a inference-only fluid library # third_party, version.txt and CMakeCache.txt are the same position with ${FLUID_INSTALL_DIR} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 6653244507742b33d9524a7a0e4a5b2b575d358a..6b665a9effba4bef083d007c0c74f2f4c79e647e 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -24,6 +24,7 @@ if(NOT WITH_FLUID_ONLY) endif() add_subdirectory(testing) +set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory") if(NOT MOBILE_INFERENCE AND NOT RPI AND NOT WITH_C_API) add_subdirectory(fluid) endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 31ccaa03065f42a4b5ba5905b6f66094aa534ac7..3aeba53572885de85b27d3058054bf8ae06d48a6 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -64,7 +64,7 @@ paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', ' paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) paddle.fluid.layers.conv2d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size', 'stride', 'padding', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(1, 0, 1, None, None, None, True, None, None)) -paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None)) @@ -86,7 +86,7 @@ paddle.fluid.layers.reduce_prod ArgSpec(args=['input', 'dim', 'keep_dim', 'name' paddle.fluid.layers.sequence_first_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_last_step ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.sequence_slice ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) +paddle.fluid.layers.dropout ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed', 'name', 'dropout_implementation'], varargs=None, keywords=None, defaults=(False, None, None, 'downgrade_in_infer')) paddle.fluid.layers.split ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)) paddle.fluid.layers.ctc_greedy_decoder ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.edit_distance ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)) @@ -107,7 +107,7 @@ paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)) -paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, True, None)) +paddle.fluid.layers.reshape ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)) paddle.fluid.layers.squeeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.unsqueeze ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lod_reset ArgSpec(args=['x', 'y', 'target_lod'], varargs=None, keywords=None, defaults=(None, None)) @@ -174,7 +174,12 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) +paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) +paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -353,6 +358,8 @@ paddle.fluid.optimizer.ModelAverage.__init__ ArgSpec(args=['self', 'average_wind paddle.fluid.optimizer.ModelAverage.apply ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) paddle.fluid.optimizer.ModelAverage.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.ModelAverage.restore ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None) +paddle.fluid.optimizer.LarsMomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'lars_coeff', 'lars_weight_decay', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.0005, None, None)) +paddle.fluid.optimizer.LarsMomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.backward.append_backward ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.regularizer.L1DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) paddle.fluid.regularizer.L2DecayRegularizer.__init__ ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 48b36df6499e59fe742766b5f81fd30a9fb8b900..7d48f0057140cf021a21ea7e304b7e38cc8b9ec2 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,8 +9,6 @@ add_subdirectory(pybind) add_subdirectory(recordio) endif(NOT WIN32) -if(WITH_INFERENCE) - # NOTE: please add subdirectory inference at last. - add_subdirectory(inference) - add_subdirectory(train) -endif() +# NOTE: please add subdirectory inference at last. +add_subdirectory(inference) +add_subdirectory(train) diff --git a/paddle/fluid/framework/attribute.cc b/paddle/fluid/framework/attribute.cc index 0dcecb62dba971b48c4f11c0ef47494be40eeea0..fabf2abfc803b8838edb48aa01ab8896799c97ac 100644 --- a/paddle/fluid/framework/attribute.cc +++ b/paddle/fluid/framework/attribute.cc @@ -64,6 +64,13 @@ Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) { case proto::AttrType::LONG: { return attr_desc.l(); } + case proto::AttrType::LONGS: { + std::vector val(attr_desc.longs_size()); + for (int i = 0; i < attr_desc.longs_size(); ++i) { + val[i] = attr_desc.longs(i); + } + return val; + } default: PADDLE_THROW("Unsupport attr type %d", attr_desc.type()); } diff --git a/paddle/fluid/framework/attribute.h b/paddle/fluid/framework/attribute.h index 14ca3e96209ed17f12e87fda8506806514698977..d9c76881b7e98d0b7cd29024b98c8f7720398c66 100644 --- a/paddle/fluid/framework/attribute.h +++ b/paddle/fluid/framework/attribute.h @@ -26,6 +26,113 @@ limitations under the License. */ namespace paddle { namespace framework { + +template +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + T* operator()(Attribute& attr) const { + T* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", + attr_name_, paddle::platform::demangle(typeid(T).name()), + paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +// special handle bool +// FIXME(yuyang18): Currently we cast bool into int in python binding. It is +// hard to change the logic there. In another way, we should correct handle +// if the user set `some_flag=1`. +// +// FIX ME anytime if there is a better solution. +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + bool* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + float val = boost::get(attr); + attr = static_cast(val); + } + bool* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + int64_t* operator()(Attribute& attr) const { + if (attr.type() == typeid(int)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } else if (attr.type() == typeid(float)) { // NOLINT + int val = boost::get(attr); + attr = static_cast(val); + } + int64_t* attr_value = nullptr; + try { + attr_value = &boost::get(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + +template <> +struct ExtractAttribute> { + explicit ExtractAttribute(const std::string& attr_name) + : attr_name_(attr_name) {} + + std::vector* operator()(Attribute& attr) const { + if (attr.type() == typeid(std::vector)) { // NOLINT + std::vector val = boost::get>(attr); + std::vector vec(val.begin(), val.end()); + attr = vec; + } else if (attr.type() == typeid(std::vector)) { // NOLINT + std::vector val = boost::get>(attr); + std::vector vec(val.begin(), val.end()); + attr = vec; + } + std::vector* attr_value = nullptr; + try { + attr_value = &boost::get>(attr); + } catch (boost::bad_get& bad_get) { + PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", + attr_name_, paddle::platform::demangle(attr.type().name())); + } + return attr_value; + } + + const std::string& attr_name_; +}; + template inline proto::AttrType AttrTypeID() { Attribute tmp = T(); @@ -42,7 +149,11 @@ class AttrReader { inline const T& Get(const std::string& name) const { PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap", name); - return boost::get(attrs_.at(name)); + + Attribute& attr = const_cast(attrs_.at(name)); + ExtractAttribute extract_attr(name); + T* attr_value = extract_attr(attr); + return *attr_value; } private: @@ -82,7 +193,7 @@ class DefaultValueSetter { public: explicit DefaultValueSetter(T default_value) : default_value_(default_value) {} - void operator()(T& value) const { value = default_value_; } + void operator()(T& value) const { value = default_value_; } // NOLINT private: T default_value_; @@ -117,84 +228,6 @@ class EnumInContainer { std::unordered_set container_; }; -template -struct ExtractAttribute { - explicit ExtractAttribute(const std::string& attr_name) - : attr_name_(attr_name) {} - - T* operator()(Attribute& attr) const { - T* attr_value = nullptr; - try { - attr_value = &boost::get(attr); - } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s", - attr_name_, paddle::platform::demangle(typeid(T).name()), - paddle::platform::demangle(attr.type().name())); - } - return attr_value; - } - - const std::string& attr_name_; -}; - -// special handle bool -// FIXME(yuyang18): Currently we cast bool into int in python binding. It is -// hard to change the logic there. In another way, we should correct handle -// if the user set `some_flag=1`. -// -// FIX ME anytime if there is a better solution. -template <> -struct ExtractAttribute { - explicit ExtractAttribute(const std::string& attr_name) - : attr_name_(attr_name) {} - - bool* operator()(Attribute& attr) const { - if (attr.type() == typeid(int)) { // NOLINT - int val = boost::get(attr); - attr = static_cast(val); - } else if (attr.type() == typeid(float)) { // NOLINT - float val = boost::get(attr); - attr = static_cast(val); - } - bool* attr_value = nullptr; - try { - attr_value = &boost::get(attr); - } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); - } - return attr_value; - } - - const std::string& attr_name_; -}; - -template <> -struct ExtractAttribute { - explicit ExtractAttribute(const std::string& attr_name) - : attr_name_(attr_name) {} - - int64_t* operator()(Attribute& attr) const { - if (attr.type() == typeid(int)) { // NOLINT - int val = boost::get(attr); - attr = static_cast(val); - } else if (attr.type() == typeid(float)) { // NOLINT - int val = boost::get(attr); - attr = static_cast(val); - } - int64_t* attr_value = nullptr; - try { - attr_value = &boost::get(attr); - } catch (boost::bad_get& bad_get) { - PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s", - attr_name_, paddle::platform::demangle(attr.type().name())); - } - return attr_value; - } - - const std::string& attr_name_; -}; - // check whether a certain attribute fit its limits // an attribute can have more than one limits template @@ -235,7 +268,7 @@ class TypedAttrChecker { return *this; } - void operator()(AttributeMap& attr_map) const { + void operator()(AttributeMap& attr_map) const { // NOLINT if (!attr_map.count(attr_name_)) { // user do not set this attr PADDLE_ENFORCE(!default_value_setter_.empty(), @@ -271,7 +304,7 @@ class OpAttrChecker { return *(checker.target>()); } - void Check(AttributeMap& attr_map) const { + void Check(AttributeMap& attr_map) const { // NOLINT for (const auto& checker : attr_checkers_) { checker(attr_map); } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2..aa6b7db5562f8596b1b30a16c0f08fcc433cfcd7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -16,12 +16,14 @@ if(WITH_GPU) dynload_cuda variable_visitor) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) + nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) + cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_base scope lod_tensor) @@ -34,7 +36,7 @@ if(WITH_GPU) endif() cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle - scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) + scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) if(WITH_GPU) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass) @@ -54,8 +56,9 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fused_broadcast_op_handle) cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 7c5f5bd80a937bf1a1c891155764833d7b21c5c2..b8690156763e4037811245b8016982710445e6a2 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -34,7 +34,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, nccl_ctxs_(ctxs) { if (nccl_ctxs_) { for (auto &p : places_) { - this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); + this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p)); } } } @@ -46,7 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, #endif void AllReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); if (NoDummyInputSize() == 1) { return; // No need to all reduce when GPU count = 1; @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); auto &p = places_[i]; auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_[p]; + auto *dev_ctx = dev_ctxes_.at(p); RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { auto &tensor_gpu = *var->GetMutable(); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 4fdab5cd94358d08eac7f8b041bf16d09042f0bd..7f0d06c892541a2697a4ed083f6f4c0fc774a2a4 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -48,16 +48,27 @@ void BroadcastOpHandle::RunImpl() { var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get()); } + BroadcastOneVar(*in_var_handle, out_var_handles, var_scopes); +} + +void BroadcastOpHandle::BroadcastOneVar( + const VarHandle &in_var_handle, + const std::vector &out_var_handles, + const std::vector &var_scopes) { auto *in_var = - var_scopes.at(in_var_handle->scope_idx_)->FindVar(in_var_handle->name_); + var_scopes.at(in_var_handle.scope_idx_)->FindVar(in_var_handle.name_); PADDLE_ENFORCE_NOT_NULL(in_var); Tensor &in_tensor = VariableVisitor::GetMutableTensor(in_var); + if (UNLIKELY(!in_tensor.IsInitialized())) { + VLOG(3) << "in var " << in_var_handle.name_ << "not inited, return!"; + return; + } - InitOutputValue(*in_var_handle, out_var_handles); + InitOutputValue(in_var_handle, out_var_handles); if (platform::is_cpu_place(in_tensor.place())) { for (auto *out_var_handle : out_var_handles) { - if (out_var_handle->IsTheSameVar(*in_var_handle)) { + if (out_var_handle->IsTheSameVar(in_var_handle)) { continue; } auto &out_p = out_var_handle->place_; @@ -114,12 +125,12 @@ void BroadcastOpHandle::RunImpl() { } } - if (!out_handle->IsTheSameVar(*in_var_handle)) { - auto out_var = var_scopes.at(in_var_handle->scope_idx_) + if (!out_handle->IsTheSameVar(in_var_handle)) { + auto out_var = var_scopes.at(in_var_handle.scope_idx_) ->FindVar(out_var_handles[0]->name_); paddle::framework::TensorCopy( - in_tensor, in_var_handle->place_, - *(dev_ctxes_.at(in_var_handle->place_)), + in_tensor, in_var_handle.place_, + *(dev_ctxes_.at(in_var_handle.place_)), &VariableVisitor::GetMutableTensor(out_var)); } }); diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index fe4e733e43417977df324fde808f52b228a27d19..72180fac864256ddda076c57e50ab1083c113d32 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -44,7 +44,8 @@ struct BroadcastOpHandle : public OpHandleBase { nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { - dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); } } } @@ -61,7 +62,10 @@ struct BroadcastOpHandle : public OpHandleBase { protected: void RunImpl() override; - private: + void BroadcastOneVar(const VarHandle &in_var_handle, + const std::vector &out_var_handles, + const std::vector &var_scopes); + std::vector local_scopes_; std::vector places_; #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index ab7412a19fbd13fa39dbae9af528d158cc9ddbd0..650de5a48de6b1fdab120cdeda563a169fd1a1c1 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -12,232 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/broadcast_op_handle.h" -#include "gtest/gtest.h" - -#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" namespace paddle { namespace framework { namespace details { -namespace f = paddle::framework; -namespace p = paddle::platform; - -// test data amount -const f::DDim kDims = {20, 20}; - -struct TestBroadcastOpHandle { - std::vector> ctxs_; - std::vector local_scopes_; - std::vector param_scopes_; - Scope g_scope_; - std::unique_ptr op_handle_; - std::vector> vars_; - std::vector gpu_list_; - bool use_gpu_; -#ifdef PADDLE_WITH_CUDA - std::unique_ptr nccl_ctxs_; -#endif - - void WaitAll() { - for (size_t j = 0; j < ctxs_.size(); ++j) { - ctxs_[j]->Wait(); - } -#ifdef PADDLE_WITH_CUDA - if (nccl_ctxs_) { - nccl_ctxs_->WaitAll(); - } -#endif - } - - void InitCtxOnGpu(bool use_gpu) { - use_gpu_ = use_gpu; - if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA - int count = p::GetCUDADeviceCount(); - if (count <= 1) { - LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " - "device count is " - << count; - exit(0); - } - for (int i = 0; i < count; ++i) { - auto p = p::CUDAPlace(i); - gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CUDADeviceContext(p)); - } - nccl_ctxs_.reset(new platform::NCCLContextMap(gpu_list_)); -#else - PADDLE_THROW("CUDA is not support."); -#endif - } else { - int count = 8; - for (int i = 0; i < count; ++i) { - auto p = p::CPUPlace(); - gpu_list_.push_back(p); - ctxs_.emplace_back(new p::CPUDeviceContext(p)); - } -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_.reset(nullptr); -#endif - } - } - - void InitBroadcastOp(size_t input_scope_idx) { - for (size_t j = 0; j < gpu_list_.size(); ++j) { - local_scopes_.push_back(&(g_scope_.NewScope())); - Scope& local_scope = local_scopes_.back()->NewScope(); - *local_scopes_.back() - ->Var(details::kLocalExecScopeName) - ->GetMutable() = &local_scope; - local_scope.Var("out"); - param_scopes_.emplace_back(&local_scope); - } - param_scopes_[input_scope_idx]->Var("input"); - - std::unique_ptr n = - ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); - if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, - nccl_ctxs_.get())); -#else - PADDLE_THROW("CUDA is not support."); -#endif - } else { -#ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, - nccl_ctxs_.get())); -#else - op_handle_.reset( - new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_)); -#endif - } - - std::unique_ptr v = - ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); - auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", - gpu_list_[input_scope_idx]); - vars_.emplace_back(in_var_handle); - op_handle_->AddInput(in_var_handle); - - // add dummy var - - std::unique_ptr v2 = - ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v2.get())); - DummyVarHandle* dummy_var_handle = - static_cast(vars_.back().get()); - dummy_var_handle->ClearGeneratedOp(); - op_handle_->AddInput(dummy_var_handle); - - for (size_t j = 0; j < gpu_list_.size(); ++j) { - if (!use_gpu_) { - op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - } - std::unique_ptr v3 = - ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); - VarHandle* out_var_handle = - new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]); - vars_.emplace_back(out_var_handle); - op_handle_->AddOutput(out_var_handle); - } - - // add dummy var - std::unique_ptr v4 = - ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v4.get())); - DummyVarHandle* out_dummy_var_handle = - static_cast(vars_.back().get()); - out_dummy_var_handle->ClearGeneratedOp(); - op_handle_->AddOutput(out_dummy_var_handle); - } - - void TestBroadcastLodTensor(size_t input_scope_idx) { - auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); - auto in_lod_tensor = in_var->GetMutable(); - in_lod_tensor->mutable_data(kDims, gpu_list_[input_scope_idx]); - - std::vector send_vector(static_cast(f::product(kDims))); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - f::LoD lod{{0, 10, 20}}; - paddle::framework::TensorFromVector( - send_vector, *(ctxs_[input_scope_idx]), in_lod_tensor); - in_lod_tensor->set_lod(lod); - in_lod_tensor->Resize(kDims); - - op_handle_->Run(false); - - WaitAll(); - - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = param_scopes_[j]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); - auto out_tensor = out_var->Get(); - PADDLE_ENFORCE_EQ(out_tensor.lod(), lod, "lod is not equal."); - - f::Tensor result_tensor; - f::TensorCopySync(out_tensor, cpu_place, &result_tensor); - float* ct = result_tensor.mutable_data(cpu_place); - - for (int64_t i = 0; i < f::product(kDims); ++i) { - ASSERT_NEAR(ct[i], send_vector[i], 1e-5); - } - } - } - - void TestBroadcastSelectedRows(size_t input_scope_idx) { - auto in_var = param_scopes_[input_scope_idx]->FindVar("input"); - PADDLE_ENFORCE_NOT_NULL(in_var); - auto in_selected_rows = in_var->GetMutable(); - auto value = in_selected_rows->mutable_value(); - value->mutable_data(kDims, gpu_list_[input_scope_idx]); - int height = static_cast(kDims[0]) * 2; - std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, - 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; - in_selected_rows->set_height(height); - in_selected_rows->set_rows(rows); - - std::vector send_vector(static_cast(f::product(kDims))); - for (size_t k = 0; k < send_vector.size(); ++k) { - send_vector[k] = k; - } - paddle::framework::TensorFromVector( - send_vector, *(ctxs_[input_scope_idx]), value); - - op_handle_->Run(false); - - WaitAll(); - - p::CPUPlace cpu_place; - for (size_t j = 0; j < gpu_list_.size(); ++j) { - auto out_var = param_scopes_[j]->FindVar("out"); - PADDLE_ENFORCE_NOT_NULL(out_var); - auto& out_select_rows = out_var->Get(); - auto rt = out_select_rows.value(); - - PADDLE_ENFORCE_EQ(out_select_rows.height(), height, - "height is not equal."); - for (size_t k = 0; k < out_select_rows.rows().size(); ++k) { - PADDLE_ENFORCE_EQ(out_select_rows.rows()[k], rows[k]); - } - - f::Tensor result_tensor; - f::TensorCopySync(rt, cpu_place, &result_tensor); - float* ct = result_tensor.data(); - - for (int64_t i = 0; i < f::product(kDims); ++i) { - ASSERT_NEAR(ct[i], send_vector[i], 1e-5); - } - } - } -}; - TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h new file mode 100644 index 0000000000000000000000000000000000000000..1a2a9ac328c4a9b89bfb89106af81b9fb3ed3028 --- /dev/null +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle.h" + +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +namespace f = paddle::framework; +namespace p = paddle::platform; + +// test data amount +const f::DDim kDims = {20, 20}; + +struct TestBroadcastOpHandle { + std::vector> ctxs_; + std::vector local_scopes_; + std::vector param_scopes_; + Scope g_scope_; + std::unique_ptr op_handle_; + std::vector> vars_; + std::vector place_list_; + bool use_gpu_; +#ifdef PADDLE_WITH_CUDA + std::unique_ptr nccl_ctxs_; +#endif + + void WaitAll() { + for (size_t j = 0; j < ctxs_.size(); ++j) { + ctxs_[j]->Wait(); + } +#ifdef PADDLE_WITH_CUDA + if (nccl_ctxs_) { + nccl_ctxs_->WaitAll(); + } +#endif + } + + void InitCtxOnGpu(bool use_gpu) { + use_gpu_ = use_gpu; + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + int count = p::GetCUDADeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::CUDAPlace(i); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CUDADeviceContext(p)); + } + nccl_ctxs_.reset(new platform::NCCLContextMap(place_list_)); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { + int count = 8; + for (int i = 0; i < count; ++i) { + auto p = p::CPUPlace(); + place_list_.push_back(p); + ctxs_.emplace_back(new p::CPUDeviceContext(p)); + } +#ifdef PADDLE_WITH_CUDA + nccl_ctxs_.reset(nullptr); +#endif + } + } + + void InitBroadcastOp(size_t input_scope_idx) { + for (size_t j = 0; j < place_list_.size(); ++j) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + local_scope.Var("out"); + param_scopes_.emplace_back(&local_scope); + } + param_scopes_[input_scope_idx]->Var("input"); + + std::unique_ptr n = + ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, + place_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not support."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, + place_list_, nccl_ctxs_.get())); +#else + op_handle_.reset( + new BroadcastOpHandle(n.get(), local_scopes_, place_list_)); +#endif + } + + std::unique_ptr v = + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); + auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", + place_list_[input_scope_idx]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add dummy var + + std::unique_ptr v2 = + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); + vars_.emplace_back(new DummyVarHandle(v2.get())); + DummyVarHandle* dummy_var_handle = + static_cast(vars_.back().get()); + dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddInput(dummy_var_handle); + + for (size_t j = 0; j < place_list_.size(); ++j) { + if (!use_gpu_) { + op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get()); + } + std::unique_ptr v3 = + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); + VarHandle* out_var_handle = + new VarHandle(v3.get(), 2, j, "out", place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + + // add dummy var + std::unique_ptr v4 = + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); + vars_.emplace_back(new DummyVarHandle(v4.get())); + DummyVarHandle* out_dummy_var_handle = + static_cast(vars_.back().get()); + out_dummy_var_handle->ClearGeneratedOp(); + op_handle_->AddOutput(out_dummy_var_handle); + } + + std::vector InitLoDTensor(const std::string& varname, + size_t input_scope_idx, const f::LoD& lod, + float val_scalar = 0.0) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + + PADDLE_ENFORCE_NOT_NULL(var); + auto lod_tensor = var->GetMutable(); + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + val_scalar; + } + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), lod_tensor); + lod_tensor->set_lod(lod); + lod_tensor->Resize(kDims); + return send_vector; + } + + std::vector InitSelectedRows(const std::string& varname, + size_t input_scope_idx, + const std::vector& rows, + int height, float value_scalar = 0.0) { + std::vector send_vector(static_cast(f::product(kDims))); + for (size_t k = 0; k < send_vector.size(); ++k) { + send_vector[k] = k + value_scalar; + } + + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto selected_rows = var->GetMutable(); + auto value = selected_rows->mutable_value(); + value->mutable_data(kDims, place_list_[input_scope_idx]); + selected_rows->set_height(height); + selected_rows->set_rows(rows); + + paddle::framework::TensorFromVector( + send_vector, *(ctxs_[input_scope_idx]), value); + + return send_vector; + } + + void SelectedRowsEqual(const std::string& varname, int input_scope_idx, + const std::vector& send_vector, + const std::vector& rows, int height) { + auto var = param_scopes_[input_scope_idx]->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto& selected_rows = var->Get(); + auto rt = selected_rows.value(); + PADDLE_ENFORCE_EQ(selected_rows.height(), height, "height is not equal."); + + for (size_t k = 0; k < selected_rows.rows().size(); ++k) { + PADDLE_ENFORCE_EQ(selected_rows.rows()[k], rows[k]); + } + + p::CPUPlace cpu_place; + f::Tensor result_tensor; + f::TensorCopySync(rt, cpu_place, &result_tensor); + float* ct = result_tensor.data(); + + for (int64_t i = 0; i < f::product(kDims); ++i) { + ASSERT_NEAR(ct[i], send_vector[i], 1e-5); + } + } + + void LoDTensorEqual(const std::string& varname, + const std::vector& send_vec, const f::LoD& lod, + framework::Scope* scope) { + p::CPUPlace cpu_place; + auto var = scope->FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var); + auto tensor = var->Get(); + PADDLE_ENFORCE_EQ(tensor.lod(), lod, "lod is not equal."); + f::Tensor result_tensor; + f::TensorCopySync(tensor, cpu_place, &result_tensor); + float* ct = result_tensor.mutable_data(cpu_place); + for (int64_t k = 0; k < f::product(kDims); ++k) { + ASSERT_NEAR(ct[k], send_vec[k], 1e-5); + } + } + + void TestBroadcastLodTensor(size_t input_scope_idx) { + f::LoD lod{{0, 10, 20}}; + auto send_vector = InitLoDTensor("input", input_scope_idx, lod); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual("out", send_vector, lod, param_scopes_[j]); + } + } + + void TestBroadcastSelectedRows(size_t input_scope_idx) { + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height); + + op_handle_->Run(false); + + WaitAll(); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual("out", input_scope_idx, send_vector, rows, height); + } + } +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 6a6b497fa897e3882995688bf36704b1d77ea962..fefd27fc86fb8dce3311fa580d90f518906dd862 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -121,6 +121,7 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); +USE_PASS(multi_batch_merge_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 02c4bea16916d58a6d0fce8918f8fceb9ff9356e..f3ffaf6ecd7c4dd99c40fe58ba88c0cbdc14bde7 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -69,6 +69,8 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool fuse_broadcast_op_{false}; + // User normally doesn't need to call this API. // The PassBuilder allows for more customized insert, remove of passes // from python side. diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index b6282debdb4eb6b1f29c39e54ac4f3e2296838da..f9bbfe0016ce0ea0d15a83cb532c44518549b8ad 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -37,7 +37,7 @@ void ComputationOpHandle::RunImpl() { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool need_wait = in_var && in_var->GeneratedOp() && - in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_.at(place_); return need_wait; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 525d24322442ef4dd6e8c24212af61c908959b87..0b772f9b63e2cfb78175f5e0d7011db8e6a5ec20 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -28,7 +28,7 @@ DataBalanceOpHandle::DataBalanceOpHandle( : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { if (ctxs) { for (auto &p : places_) { - this->dev_ctxes_[p] = ctxs->DevCtx(p); + this->SetDeviceContext(p, ctxs->DevCtx(p)); } } } @@ -89,8 +89,8 @@ void DataBalanceOpHandle::RunImpl() { PADDLE_ENFORCE_GT(places_.size(), 1, "Data balance can only be enabled when the number of " "places to run larger than 1."); - auto in_var_handles = DynamicCast(inputs_); - auto out_var_handles = DynamicCast(outputs_); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE(in_var_handles.size() % places_.size() == 0); PADDLE_ENFORCE_EQ( in_var_handles.size(), out_var_handles.size(), diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 6e22fedf1c39428528c00cce4c9a4460dfb95cb3..98fc390e72fab3701538fd6f974460fa5114fdb0 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -92,13 +92,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( size_t num_complete = 0; remaining_ = 0; - BlockingQueue complete_q; + auto complete_q = std::make_shared>(); for (auto op : bootstrap_ops_) { - RunOpAsync(op_deps.get(), op, &complete_q); + RunOpAsync(op_deps.get(), op, complete_q); } while (num_complete != op_deps->size()) { - size_t num_comp = complete_q.Pop(); + size_t num_comp = complete_q->Pop(); if (num_comp == -1UL) { int remaining = 0; while (true) { @@ -107,7 +107,7 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( break; } for (int i = 0; i < remaining; ++i) { - complete_q.Pop(); + complete_q->Pop(); } } exception_.ReThrow(); @@ -120,7 +120,8 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( } void FastThreadedSSAGraphExecutor::RunOpAsync( std::unordered_map> *op_deps, - OpHandleBase *op, BlockingQueue *complete_q) { + OpHandleBase *op, + const std::shared_ptr> &complete_q) { ++remaining_; this->pool_.enqueue([=] { OpHandleBase *op_to_run = op; @@ -144,7 +145,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( if (op_to_run == nullptr) { op_to_run = pending_op; } else { - this->RunOpAsync(op_deps, pending_op, complete_q); + RunOpAsync(op_deps, pending_op, complete_q); } } } @@ -156,8 +157,7 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( } void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() { atomic_op_deps_ = pool_.enqueue([&] { - std::unordered_map> *op_deps = - new std::unordered_map>; + auto *op_deps = new std::unordered_map>; for (auto &pair : op_deps_) { (*op_deps)[pair.first] = pair.second; } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index dad3a231cba6402f57ba654a9ac5fb520b9c8f04..8b8382447105c8caa36963214684d6ee9fa15200 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -50,7 +50,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::atomic remaining_; void RunOpAsync(std::unordered_map> *op_deps, - OpHandleBase *op, BlockingQueue *complete_q); + OpHandleBase *op, + const std::shared_ptr> &complete_q); void PrepareAtomicOpDeps(); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..51dfa2d0711f49aaefab0af3549283dbf77eee4a --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include "paddle/fluid/framework/details/container_cast.h" +#include "paddle/fluid/framework/details/variable_visitor.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace framework { +namespace details { + +void FusedBroadcastOpHandle::RunImpl() { + platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + + if (places_.size() == 1UL) return; + + auto in_var_handles = DynamicCast(inputs_); + auto out_var_handles = DynamicCast(outputs_); + + WaitInputVarGenerated(); + + std::vector var_scopes; + for (auto *s : local_scopes_) { + var_scopes.emplace_back(s->FindVar(kLocalExecScopeName)->Get()); + } + + size_t place_num = places_.size(); + PADDLE_ENFORCE_EQ(in_var_handles.size() * place_num, out_var_handles.size()); + + for (size_t i = 0; i < in_var_handles.size(); ++i) { + BroadcastOneVar( + *in_var_handles[i], + std::vector(out_var_handles.begin() + i * place_num, + out_var_handles.begin() + (i + 1) * place_num), + var_scopes); + } +} + +std::string FusedBroadcastOpHandle::Name() const { return "fused_broadcast"; } + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..e37259526a5f6f57d51a0ca8bca96a18211a4790 --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -0,0 +1,57 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/details/broadcast_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/platform/device_context.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +struct FusedBroadcastOpHandle : public BroadcastOpHandle { + public: +#ifdef PADDLE_WITH_CUDA + FusedBroadcastOpHandle(ir::Node *node, + const std::vector local_scopes, + const std::vector &places, + const platform::NCCLContextMap *nccl_ctx) + : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {} +#else + FusedBroadcastOpHandle(ir::Node* node, const std::vector local_scopes, + const std::vector& places) + : BroadcastOpHandle(node, local_scopes, places) {} +#endif + std::string Name() const override; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f12bd2b4e857648342aeb5ad33b6c0fe01c9c73 --- /dev/null +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -0,0 +1,165 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/details/broadcast_op_handle_test.h" + +namespace paddle { +namespace framework { +namespace details { + +struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { + std::vector out_varnames_; + + void InitFusedBroadcastOp(std::vector input_scope_idxes) { + // initialize scope and var + for (size_t i = 0; i < place_list_.size(); ++i) { + local_scopes_.push_back(&(g_scope_.NewScope())); + Scope& local_scope = local_scopes_.back()->NewScope(); + *local_scopes_.back() + ->Var(details::kLocalExecScopeName) + ->GetMutable() = &local_scope; + for (size_t j = 0; j < input_scope_idxes.size(); ++j) { + local_scope.Var("out_var" + j); + if (i == j) local_scope.Var("in_var" + j); + } + param_scopes_.emplace_back(&local_scope); + } + + // create op handle node + std::unique_ptr n = + ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation); + if (use_gpu_) { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new FusedBroadcastOpHandle( + n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); +#else + PADDLE_THROW("CUDA is not supported."); +#endif + } else { +#ifdef PADDLE_WITH_CUDA + op_handle_.reset(new FusedBroadcastOpHandle( + n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); +#else + op_handle_.reset( + new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_)); +#endif + } + + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + // add input var handle + std::unique_ptr in_node = + ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable); + VarHandle* in_var_handle = + new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i, + place_list_[input_scope_idxes[i]]); + vars_.emplace_back(in_var_handle); + op_handle_->AddInput(in_var_handle); + + // add output var handle + for (size_t j = 0; j < place_list_.size(); ++j) { + std::unique_ptr out_node = + ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable); + VarHandle* out_var_handle = + new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]); + vars_.emplace_back(out_var_handle); + op_handle_->AddOutput(out_var_handle); + } + } + } + + void TestFusedBroadcastLoDTensor(std::vector input_scope_idxes) { + std::vector> send_vec; + f::LoD lod{{0, 10, 20}}; + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + i); + float val_scalar = static_cast(i); + send_vec.push_back( + InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + i); + for (size_t j = 0; j < place_list_.size(); ++j) { + LoDTensorEqual(varname, send_vec[i], lod, param_scopes_[j]); + } + } + } + + void TestFusedBroadcastSelectedRows(std::vector input_scope_idxes) { + std::vector> send_vector; + std::vector rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1, + 2, 4, 6, 3, 1, 1, 1, 1, 3, 7}; + int height = static_cast(kDims[0] * 2); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string varname("in_var" + i); + float val_scalar = static_cast(i); + send_vector.push_back(InitSelectedRows(varname, input_scope_idxes[i], + rows, height, val_scalar)); + } + + op_handle_->Run(false); + + WaitAll(); + for (size_t i = 0; i < input_scope_idxes.size(); ++i) { + const std::string& varname("out_var" + i); + for (size_t j = 0; j < place_list_.size(); ++j) { + SelectedRowsEqual(varname, input_scope_idxes[i], send_vector[i], rows, + height); + } + } + } +}; + +TEST(FusedBroadcastTester, CPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, CPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(false); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} + +#ifdef PADDLE_WITH_CUDA +TEST(FusedBroadcastTester, GPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} + +TEST(FusedBroadcastTester, GPUSelectedRows) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnGpu(true); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); +} +#endif + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 9aae19fc73de4387186da47c55710c94d53f1b88..ca4633c5a8f22fc9f7319b06aa766f9fe37dc68c 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -36,7 +36,7 @@ void GatherOpHandle::RunImpl() { VarHandle *out_var_handle; { - auto out_var_handles = DynamicCast(outputs_); + auto out_var_handles = DynamicCast(this->Outputs()); PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, "The number of output should be one."); out_var_handle = out_var_handles.front(); @@ -99,7 +99,7 @@ void GatherOpHandle::RunImpl() { Tensor *out_tensor = out_value->mutable_value(); // copy - auto dev_ctx = dev_ctxes_[out_var_handle->place_]; + auto dev_ctx = dev_ctxes_.at(out_var_handle->place_); RunAndRecordEvent(out_var_handle->place_, [in_tensors, out_tensor, &dev_ctx, t_out_p] { int s = 0, e = 0; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index ebd1d644bcce0554904ee0931827ef43a6d52b11..f3819887a196a7c8bf35897467bb9d68b428094e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/broadcast_op_handle.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/data_balance_op_handle.h" +#include "paddle/fluid/framework/details/fused_broadcast_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" @@ -347,7 +348,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( BuildStrategy::GradientScaleStrategy::kCustomized) { // TODO(paddle-dev): Why is there no input for this op_handle? auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - CreateScaleLossGradOp(&result, loss_grad_name); + CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0]); } // This assumes the backward generating code will ensure IsScaleLossOp // is true only for the op that scale the final scalar loss. @@ -436,10 +437,14 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( if ((use_gpu && strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || is_dist_train) { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(&result, bcast_var_name_set); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(&result, bcast_name, dev_id); + } } } } @@ -508,6 +513,44 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } +void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( + ir::Graph *result, + const std::vector> &bcast_varnames) const { +#ifdef PADDLE_WITH_CUDA + auto *op_handle = new FusedBroadcastOpHandle( + result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_); +#else + auto *op_handle = new FusedBroadcastOpHandle( + result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), + local_scopes_, places_); +#endif + result->Get(kGraphOps).emplace_back(op_handle); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + SetCommunicationContext(op_handle, p); + } + + for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) { + for (auto &p_name : bcast_varnames[dev_id]) { + auto *in = + result->Get(kGraphVars).at(dev_id).at(p_name).back().get(); + op_handle->AddInput(in); + for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) { + auto &p = places_[out_dev_id]; + auto &vars = + result->Get(kGraphVars).at(out_dev_id).at(p_name); + auto *out_var = new VarHandle( + result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), + vars.size(), out_dev_id, p_name, p); + vars.emplace_back(out_var); + op_handle->AddOutput(out_var); + } + } + } +} + void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { @@ -602,7 +645,8 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( - ir::Graph *result, const std::string &loss_grad_name) const { + ir::Graph *result, const std::string &loss_grad_name, + ir::Node *out_var_node) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); @@ -617,10 +661,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - CreateOpOutput( - result, op_handle, - result->CreateEmptyNode(loss_grad_name, ir::Node::Type::kVariable), - places_[i], i); + CreateOpOutput(result, op_handle, + result->CreateVarNode(out_var_node->Var()), places_[i], i); } } @@ -680,7 +722,8 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, } if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows") { + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. op_dev_id = GetVarDeviceID(*result, input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index cdf9f13cde608b546d17a1e53e0f6acea9e12566..03b2de2f04da4bac8d342a76c80fd12beaeba4b7 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -61,7 +61,8 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, - const std::string &loss_grad_name) const; + const std::string &loss_grad_name, + ir::Node *out_var_node) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; @@ -78,6 +79,10 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; + void CreateFusedBroadcastOp( + ir::Graph *result, + const std::vector> &bcast_varnames) const; + bool IsSparseGradient(const std::string &og) const; size_t GetAppropriateDeviceID( diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 3812f0abf1b7069525c4420054c61c01c908acfe..4822627ac3b65972f41d9a23d9fe3dba3de3f97d 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -103,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { for (auto *in : inputs_) { if (NeedWait(in)) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(place)); } } } diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 7fc06f234d42a992328c0b6164f17945d8075c28..4503123eac810917cabcf1e62cff98552ed2f742 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -27,7 +27,7 @@ namespace framework { namespace details { void ReduceOpHandle::RunImpl() { - platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); + platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); if (places_.size() == 1) return; // the input and output may have dummy var. diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index a6289b055f97b7b0e57928358d84117b33cf2df8..999828ae457ba43541da06088ce7c25331fd05ec 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -46,7 +46,8 @@ struct ReduceOpHandle : public OpHandleBase { nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { - dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); + this->SetDeviceContext(platform::CUDAPlace(p_ctx.first), + p_ctx.second.ctx_.get()); } } } diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index f44b374edb29228dff5a8bf003d945291f166d49..65df7f2d510bf4e3e930398182c6dd1eae89241f 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -38,7 +38,7 @@ void RPCOpHandle::RunImpl() { continue; } if (in->GeneratedOp()) { - in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_.at(p)); } } auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index ba243979b34aa1f683de707525403becaf0a1c00..ef1626599795a553e654fe5d3ed74ef3a3a67d78 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -27,7 +27,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { - dev_ctxes_[place_] = dev_ctx; + this->SetDeviceContext(place_, dev_ctx); } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} @@ -46,9 +46,9 @@ void ScaleLossGradOpHandle::RunImpl() { } else { #ifdef PADDLE_WITH_CUDA this->RunAndRecordEvent([&] { - auto stream = - static_cast(this->dev_ctxes_[place_]) - ->stream(); + auto stream = static_cast( + this->dev_ctxes_.at(place_)) + ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); VLOG(10) << place_ << "RUN Scale loss grad op"; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 31beef3ae829d72570ee7c879dac71ed600cd216..dc63effd1b7c8fe5bb3fc91058eb855e552d3926 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -39,7 +39,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); std::unordered_map pending_ops; std::unordered_set pending_vars; - BlockingQueue ready_vars; + auto ready_vars = std::make_shared>(); std::unordered_set ready_ops; // For ops (e.g. nccl_all_reduce) that need to coordinate multiple // streams from multiple GPUs, it's faster to buffer them and schedule @@ -51,12 +51,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get()); } } } for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, &ready_vars, var.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), var.get()); } for (auto &op : graph_->Get(details::kGraphOps)) { @@ -73,12 +73,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList fetch_data(fetch_tensors.size()); InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, &ready_vars, &fetch_data); + &pending_vars, ready_vars.get(), &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { running_ops_++; - RunOp(&ready_vars, op); + RunOp(ready_vars, op); } set.clear(); }; @@ -87,7 +87,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( run_op_futures_.clear(); exception_holder_.Clear(); event.reset(nullptr); - // Step 3. Execution while (!pending_vars.empty()) { // 1. Run All Ready ops @@ -103,7 +102,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // 2. Find ready variable bool timeout; - auto cur_ready_vars = ready_vars.PopAll(1, &timeout); + auto cur_ready_vars = ready_vars->PopAll(1, &timeout); if (timeout) { if (exception_holder_.IsCaught()) { @@ -133,7 +132,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } } PADDLE_ENFORCE(ready_ops.empty()); - // Wait FetchOps. ClearFetchOp(graph_.get(), &fetch_ops); @@ -206,7 +204,8 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( } void ThreadedSSAGraphExecutor::RunOp( - BlockingQueue *ready_var_q, details::OpHandleBase *op) { + const std::shared_ptr> &ready_var_q, + details::OpHandleBase *op) { auto op_run = [ready_var_q, op, this] { try { if (VLOG_IS_ON(10)) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 512f8a4ca5a9b82a395dde11722b8db44ea5ec27..dbb0b498d995a897b109bd4ef98521b2193276ed 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -51,7 +51,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() {} private: - void RunOp(BlockingQueue *ready_var_q, + void RunOp(const std::shared_ptr> &ready_var_q, details::OpHandleBase *op); private: diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index c99406799ba5f664c4b9f80e0567b293e4ffea51..efdabffb9b33ddf007c13008d0f3afb7a3961eda 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -35,6 +35,7 @@ enum AttrType { BLOCK = 8; LONG = 9; BLOCKS = 10; + LONGS = 11; } // OpDesc describes an instance of a C++ framework::OperatorBase @@ -55,6 +56,7 @@ message OpDesc { optional int32 block_idx = 12; optional int64 l = 13; repeated int32 blocks_idx = 14; + repeated int64 longs = 15; }; message Var { diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index a145b2fafe64f8c80ac7808583d6670ca0218c06..28231a53bad50fe9f19cfe3e73c3dc09aa3762cf 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -36,10 +36,12 @@ pass_library(fc_lstm_fuse_pass inference) pass_library(embedding_fc_lstm_fuse_pass inference) pass_library(fc_gru_fuse_pass inference) pass_library(seq_concat_fc_fuse_pass inference) +pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) + pass_library(depthwise_conv_mkldnn_pass base) pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference) @@ -58,6 +60,7 @@ cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) if (WITH_MKLDNN) + cc_test(test_depthwise_conv_mkldnn_pass SRCS depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) endif () diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h index b5de0d548713772e7ad41cfb6d8b3e9460683efb..fe585bd7c41bb32ae00462e989ab4c0051fc89a8 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h @@ -31,7 +31,8 @@ class ConvReLUFusePass : public FusePassBase { virtual ~ConvReLUFusePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 8f4bab25ed4919881baf19a961a52aa229e06a8f..19248b4dfee1da81d18cd2effac08ba68dde80fb 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.h" #include +#include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { @@ -36,6 +37,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", inputs); } op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); } // a->OP0->b diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..19056e18aa892dbc83dfbf7305b6ad8b6b6bc51c --- /dev/null +++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_NODE(id, pattern) \ + PADDLE_ENFORCE(subgraph.count(pattern.RetrieveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + +std::unique_ptr DepthwiseConvMKLDNNPass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get()); + GraphPatternDetector gpd; + + auto* pattern = gpd.mutable_pattern(); + pattern->NewNode("depthwise_conv") + ->assert_is_op("depthwise_conv2d") + ->assert_op_attr("use_mkldnn", true); + + int found_depthwise_conv_mkldnn_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(3) << "handle DepthwiseConvMKLDNN fuse"; + GET_NODE(depthwise_conv, (*pattern)); + depthwise_conv->Op()->SetType("conv2d"); + found_depthwise_conv_mkldnn_count++; + }; + + gpd(graph.get(), handler); + AddStatis(found_depthwise_conv_mkldnn_count); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(depthwise_conv_mkldnn_pass, + paddle::framework::ir::DepthwiseConvMKLDNNPass); diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..8ca6a7325186401c26eb7f9375cf83b7b97cc1c9 --- /dev/null +++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class DepthwiseConvMKLDNNPass : public FusePassBase { + public: + virtual ~DepthwiseConvMKLDNNPass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..09d0b15f46a7e50afb6aea46383013ce6a6c6118 --- /dev/null +++ b/paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass_tester.cc @@ -0,0 +1,123 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/depthwise_conv_mkldnn_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", outputs); +} + +// (a, weights, bias)->depthwise conv mkldnn->b +// (b, weights2, bias2)->depthwise conv no mkldnn->c +// (c, weights3, bias3)->conv mkldnn->d +// (d, weights3, bias3)->conv no mkldnn->e +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector( + {"a", "b", "c", "d", "e", "weights", "bias", "weights2", "bias2", + "weights3", "bias3", "weights4", "bias4"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias" || v == "weights2" || v == "bias2" || + v == "weights3" || v == "bias3" || v == "weights4" || v == "bias4") { + var->SetPersistable(true); + } + } + + // depthwise conv with MKL-DNN + SetOp(&prog, "depthwise_conv2d", "conv1", + std::vector({"a", "weights", "bias"}), + std::vector({"b"}), true); + // depthwise conv without MKL-DNN + SetOp(&prog, "depthwise_conv2d", "conv2", + std::vector({"b", "weights2", "bias2"}), + std::vector({"c"}), false); + // conv with MKL-DNN + SetOp(&prog, "conv2d", "conv3", + std::vector({"c", "weights3", "bias3"}), + std::vector({"d"}), true); + // conv without MKL-dNN + SetOp(&prog, "conv2d", "conv4", + std::vector({"d", "weights4", "bias4"}), + std::vector({"e"}), false); + + return prog; +} + +TEST(DepthwiseConvMKLDNNPass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("depthwise_conv_mkldnn_pass"); + + struct counters { + int mkldnn_depthwise_conv_nodes; + int other_depthwise_conv_nodes; + int mkldnn_conv_nodes; + int other_conv_nodes; + }; + + counters before{1, 1, 1, 1}; + + graph = pass->Apply(std::move(graph)); + + // initialize counters before loop + counters after{0, 0, 0, 0}; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + if (boost::get(op->GetAttr("use_mkldnn"))) + after.mkldnn_conv_nodes++; + else + after.other_conv_nodes++; + } else if (op->Type() == "depthwise_conv2d") { + if (boost::get(op->GetAttr("use_mkldnn"))) + after.mkldnn_depthwise_conv_nodes++; + else + after.other_depthwise_conv_nodes++; + } + } + } + + EXPECT_EQ(after.other_depthwise_conv_nodes, + before.other_depthwise_conv_nodes); + EXPECT_EQ(after.other_conv_nodes, before.other_conv_nodes); + EXPECT_EQ(after.mkldnn_depthwise_conv_nodes, + before.mkldnn_depthwise_conv_nodes - 1); + EXPECT_EQ(after.mkldnn_conv_nodes, before.mkldnn_conv_nodes + 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(depthwise_conv_mkldnn_pass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 06286a109d01af638e74e06ccc83e2a5500663ea..2db7d95cae1c8c59691fd642e2462e92ed58814f 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include +#include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { namespace framework { @@ -32,6 +33,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, op->SetInput("X", inputs); } op->SetOutput("Out", outputs); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); } // a->OP0->b diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 398f7095968e62f92d610f560d7574b27706d13e..813f620d7ceb6f341e5fa6e96fb5ca0face96f51 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -23,80 +23,78 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { - -std::vector FindDistTrainSendVars( - const std::vector &nodes) { - std::vector send_vars; - // since parameters are all in block 0, - // it's enough to only scan send ops in block 0 - for (auto &node : nodes) { - auto op_vars = node->Op()->InputArgumentNames(); - send_vars.reserve(send_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); - } - return send_vars; -} - -std::vector FindDistTrainRecvVars( - const std::vector &nodes) { - std::vector recv_vars; - for (auto &node : nodes) { - auto op_vars = node->Op()->OutputArgumentNames(); - recv_vars.reserve(recv_vars.size() + - std::distance(op_vars.begin(), op_vars.end())); - recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); - } - return recv_vars; -} - -bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, - const std::vector &recv_vars) { - if (send_vars.size() == 0 || recv_vars.size() == 0) { - return false; - } - - /** - * Check any of opvars contains `.block` and in sendvars - */ - auto checker = [](const std::vector &opvars, - const std::vector &rpc_vars) -> bool { - for (auto &var : opvars) { - // a variable name with the suffix `.block` means it's a splited - // variable by (DistributeTranspiler) - // [python/paddle/fluid/transpiler/distribute_transpiler.py] - if (var.find(".block") != std::string::npos && - std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { - return true; +namespace { + +void CheckProgram(const ProgramDesc &program) { + std::map visit; +#define _INT(role) static_cast(role) + + for (size_t i = 0; i < program.Size(); ++i) { + for (OpDesc *op : program.Block(i).AllOps()) { + // For backward compatibility, some program doesn't have role added. + if (!op->HasAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) continue; + int role_id = boost::get( + op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + visit[role_id] = true; + switch (role_id) { + case _INT(OpRole::kForward): + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kBackward)) == visit.end(), + "Cannot add forward operator before backward operator."); + break; + case _INT(OpRole::kBackward): + case _INT(OpRole::kBackward) | _INT(OpRole::kLoss): + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator before optimize operator."); + break; + case _INT(OpRole::kForward) | _INT(OpRole::kLoss): + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward) | + _INT(OpRole::kLoss)) == visit.end(), + "Cannot add backward|loss operator before " + "forward|loss operator."); + PADDLE_ENFORCE( + visit.find(_INT(OpRole::kOptimize)) == visit.end(), + "Cannot add backward operator before optimize operator."); + break; + case _INT(OpRole::kOptimize): + case _INT(OpRole::kOptimize) | _INT(OpRole::kLRSched): + PADDLE_ENFORCE(visit.find(_INT(OpRole::kBackward)) != visit.end(), + "Optimize operators must follow backward operator."); + break; + case _INT(OpRole::kLRSched): + case _INT(OpRole::kDist): + case _INT(OpRole::kRPC): + case _INT(OpRole::kNotSpecified): + break; + default: + LOG(FATAL) << "Unknown operator role. Don't add new role because " + "you don't know what you are doing."; } } - return false; - }; - - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); - } - - return checker(output_var_names, send_vars) || - checker(input_var_names, recv_vars); +#undef _INT } +} // namespace Graph::Graph(const ProgramDesc &program) : program_(program) { + CheckProgram(program_); // Make the nodes id start from 0. Node::ResetId(); + auto var_nodes = InitFromProgram(program_); + ResolveHazard(var_nodes); +} +std::map> Graph::InitFromProgram( + const ProgramDesc &program) { VLOG(3) << "block in program:" << program_.Size(); std::unordered_map all_vars; + // var nodes for each var name, will have multiple versions in SSA + std::map> var_nodes; for (auto *var : program.Block(0).AllVars()) { all_vars.emplace(var->Name(), var); } - std::map> var_nodes; for (auto *op : program.Block(0).AllOps()) { ir::Node *node = CreateOpNode(op); // For input args, reuse the same var name if it was created before. @@ -134,7 +132,11 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { var->inputs.push_back(node); } } + return std::move(var_nodes); +} +void Graph::ResolveHazard( + const std::map> &var_nodes) { /** * We should handle write after read(WAR) and write after write(WAW) here. * Because some of the operators of the program can be executed parallelly. @@ -153,6 +155,7 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { auto it_old = versions.rbegin(); ++it_old; for (; it_old != versions.rend(); it_new = it_old, ++it_old) { + VLOG(3) << "deal with var: " << (*it_new)->Name(); ir::Node *write_op = (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index ab687e760a761d4e445726bd5149966adc2403d0..9d7aa5d32deb274fbf29481b0d4754c05d1e21b5 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -160,6 +160,12 @@ class Graph { return nullptr; } + std::map> InitFromProgram( + const ProgramDesc &program); + + void ResolveHazard( + const std::map> &var_nodes); + private: // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index c54766d95a61ac1a4b61566c6de62cbc86685a1d..01e878089171e4620f32b57a65d92d1c86d307db 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -120,19 +120,25 @@ size_t GraphNum(const Graph &graph) { std::deque q_nodes; std::vector> graph_nodes; std::unordered_set g_nodes; + // q_set used to record records in the queue. + std::unordered_set q_set; size_t graph_count = 0; - auto traverse_nodes = [&visited_nodes, - &q_nodes](const std::vector &nodes) { - std::copy_if( - nodes.begin(), nodes.end(), std::back_inserter(q_nodes), - [&visited_nodes](Node *node) { return !visited_nodes.count(node); }); + auto traverse_nodes = [&visited_nodes, &q_nodes, + &q_set](const std::vector &nodes) { + for (auto n : nodes) { + if (visited_nodes.count(n) == 0 && q_set.count(n) == 0) { + q_nodes.push_back(n); + q_set.insert(n); + } + } }; while (visited_nodes.size() != nodes.size()) { if (!q_nodes.empty()) { auto cur_node = q_nodes.front(); q_nodes.pop_front(); + q_set.erase(cur_node); visited_nodes.insert(cur_node); g_nodes.insert(cur_node); traverse_nodes(cur_node->inputs); @@ -146,6 +152,7 @@ size_t GraphNum(const Graph &graph) { for (auto &n : nodes) { if (visited_nodes.count(n) == 0) { q_nodes.push_back(n); + q_set.insert(n); break; } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 29b604afbfcfc2bac67e447db8cd4c671c036dbe..b20d70132256bd5df7411c46ff4eb246b1f14ba8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -259,6 +259,15 @@ GraphPatternDetector::DetectPatterns() { return result; } +bool GraphItemCMP(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } +} + // TODO(Superjomn) enhance the function as it marks unique unique as duplicates // see https://github.com/PaddlePaddle/Paddle/issues/13550 void GraphPatternDetector::UniquePatterns( @@ -267,12 +276,16 @@ void GraphPatternDetector::UniquePatterns( std::vector result; std::unordered_set set; + std::hash hasher; for (auto &g : *subgraphs) { - size_t key = 0; - for (auto &item : g) { - key ^= std::hash{}(item.first); - key ^= std::hash{}(item.second); + // Sort the items in the sub-graph, and transform to a string key. + std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemCMP); + std::stringstream ss; + for (auto &item : sorted_keys) { + ss << item.first << ":" << item.second; } + auto key = hasher(ss.str()); if (!set.count(key)) { result.emplace_back(g); set.insert(key); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..bd5b76426eb55cebdabfccd700439a4c418a10f0 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -0,0 +1,315 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/multi_batch_merge_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_proto_maker.h" + +namespace paddle { +namespace framework { +namespace ir { + +static const char kNumRepeats[] = "num_repeats"; +typedef std::unordered_map> SSAVarList; + +ir::Node* SameNameVar(std::unordered_set all, ir::Node* target) { + for (auto n : all) { + if (target->IsVar() && target->Name() == n->Name()) { + return n; + } + } + return nullptr; +} + +VarDesc CopyVarDesc(VarDesc* var_desc) { + VarDesc repeated_var(var_desc->Name()); + // copy other variable attributes + if (var_desc->GetType() != proto::VarType::READER) { + repeated_var.SetType(var_desc->GetType()); + repeated_var.SetShape(var_desc->GetShape()); + repeated_var.SetDataType(var_desc->GetDataType()); + repeated_var.SetLoDLevel(var_desc->GetLoDLevel()); + repeated_var.SetPersistable(var_desc->Persistable()); + } else { + // TODO(typhoonzero): copy reader var + } + return repeated_var; +} + +VarDesc UpdateGradVarDesc( + VarDesc* var_desc, int repeat, + const std::unordered_set& grad_names, + const std::unordered_set& bn_vars_need_rename) { + if (grad_names.find(var_desc->Name()) != grad_names.end() || + bn_vars_need_rename.find(var_desc->Name()) != bn_vars_need_rename.end()) { + std::string new_gname = + string::Sprintf("%s.repeat.%d", var_desc->Name(), repeat); + VarDesc repeated_var = CopyVarDesc(var_desc); + repeated_var.SetName(new_gname); + VLOG(3) << "update " << var_desc->Name() << " to repeat " << repeat; + return repeated_var; + } + return *var_desc; +} + +std::unique_ptr BatchMergePass::ApplyImpl( + std::unique_ptr graph) const { + int num_repeats = Get(kNumRepeats); + std::vector forward_backward_ops; + std::vector optimize_ops; + std::vector lr_ops; // ops other than forward/backward/optimize + std::unordered_set grad_names; + + std::vector nodes = TopologySortOperations(*graph); + auto origin_nodes = graph->ReleaseNodes(); + VLOG(3) << "origin nodes count: " << origin_nodes.size(); + ir::Graph& result = *graph; + + // 1. record op nodes of different roles + for (auto node : nodes) { + if (node->IsVar()) continue; + int op_role = boost::get(node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + if ((op_role == static_cast(framework::OpRole::kForward)) || + (op_role & static_cast(framework::OpRole::kBackward)) || + (op_role & static_cast(framework::OpRole::kLoss))) { + forward_backward_ops.push_back(node); + } else if ((op_role & static_cast(framework::OpRole::kOptimize)) || + (op_role & static_cast(framework::OpRole::kDist)) || + (op_role & static_cast(framework::OpRole::kRPC))) { + optimize_ops.push_back(node); + auto op_role_var = node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName()); + auto op_role_vars = boost::get>(op_role_var); + for (size_t i = 0; i < op_role_vars.size(); i += 2) { + grad_names.insert(op_role_vars[i + 1]); + } + } else if (op_role & static_cast(framework::OpRole::kLRSched)) { + lr_ops.push_back(node); + } else { // NOLINT + PADDLE_THROW("Invalid op_role: %d", static_cast(op_role)); + } + } + + // 2. copy forward backward + ir::Node* prev_repeat_last_op_node = nullptr; + // record origin_grad -> repeated grad list map. + std::map> grad_repeated_map; + std::map> created; + std::unordered_set bn_vars_need_rename; + for (int i = 0; i < num_repeats; ++i) { + std::unordered_set copied; + for (size_t node_idx = 0; node_idx < forward_backward_ops.size(); + ++node_idx) { + auto node = forward_backward_ops[node_idx]; + OpDesc repeated_op(*(node->Op()), node->Op()->Block()); + // 3. rename grad outputs to current repeat. + for (auto outname : repeated_op.OutputArgumentNames()) { + if (grad_names.find(outname) != grad_names.end()) { + std::string new_gname = string::Sprintf("%s.repeat.%d", outname, i); + repeated_op.RenameOutput(outname, new_gname); + } + } + // 3.5 let batch_norm ops use independent vars, note batch_norm_grad do + // not need this update + if (node->Name() == "batch_norm") { + // NOTE: assume bn op created by layers use save var as output mean and + // variance + std::string new_mean_name = + string::Sprintf("%s.repeat.%d", repeated_op.Input("Mean")[0], i); + std::string new_var_name = string::Sprintf( + "%s.repeat.%d", repeated_op.Input("Variance")[0], i); + bn_vars_need_rename.insert(repeated_op.Input("Mean")[0]); + bn_vars_need_rename.insert(repeated_op.Input("Variance")[0]); + VLOG(3) << "renaming " << repeated_op.Input("Mean")[0] << " to " + << new_mean_name; + repeated_op.RenameInput(repeated_op.Input("Mean")[0], new_mean_name); + repeated_op.RenameInput(repeated_op.Input("Variance")[0], new_var_name); + repeated_op.RenameOutput(repeated_op.Output("MeanOut")[0], + new_mean_name); + repeated_op.RenameOutput(repeated_op.Output("VarianceOut")[0], + new_var_name); + } + + // 3.9 do copy + auto repeated_node = result.CreateOpNode(&repeated_op); + copied.insert(node); + + // 4. add deps between repeats + if (node_idx == forward_backward_ops.size() - 1) { + prev_repeat_last_op_node = repeated_node; + } + if (node_idx == 0 && prev_repeat_last_op_node) { + auto* depvar = result.CreateControlDepVar(); + prev_repeat_last_op_node->outputs.push_back(depvar); + depvar->inputs.push_back(prev_repeat_last_op_node); + repeated_node->inputs.push_back(depvar); + depvar->outputs.push_back(repeated_node); + } + + for (auto in_node : node->inputs) { + if (in_node->IsCtrlVar()) { + continue; + } + ir::Node* var = nullptr; + auto updated_var = UpdateGradVarDesc(in_node->Var(), i, grad_names, + bn_vars_need_rename); + // should be initialized by startup, how to initilize tensor in the + // scope? + if (node->Name() == "batch_norm" && + bn_vars_need_rename.find(in_node->Name()) != + bn_vars_need_rename.end()) { + // Create bn mean/variance for each repeat + var = result.CreateVarNode(&updated_var); + created[updated_var.Name()].push_back(var); + copied.insert(in_node); + repeated_node->inputs.push_back(var); + var->outputs.push_back(repeated_node); + continue; + } + + // for other ops + if (in_node->inputs.empty() && i > 0) { + // do not copy head vars (inputs, params) in repeats > 0 + var = created.at(in_node->Name()).back(); + } else { + if (copied.find(in_node) == copied.end()) { + var = result.CreateVarNode(&updated_var); + if (grad_names.find(in_node->Var()->Name()) != grad_names.end()) { + grad_repeated_map[in_node].push_back(var); + } + copied.insert(in_node); + created[updated_var.Name()].push_back(var); + } else { + var = created.at(updated_var.Name()).back(); + } + } + repeated_node->inputs.push_back(var); + var->outputs.push_back(repeated_node); + } + for (auto out_node : node->outputs) { + if (out_node->IsCtrlVar()) { + continue; + } + ir::Node* var = nullptr; + auto updated_var = UpdateGradVarDesc(out_node->Var(), i, grad_names, + bn_vars_need_rename); + if (copied.find(out_node) == copied.end()) { + var = result.CreateVarNode(&updated_var); + if (grad_names.find(out_node->Var()->Name()) != grad_names.end()) { + grad_repeated_map[out_node].push_back(var); + } + copied.insert(out_node); + created[updated_var.Name()].push_back(var); + } else { + var = created.at(updated_var.Name()).back(); + } + repeated_node->outputs.push_back(var); + var->inputs.push_back(repeated_node); + } + } + } + + // 5. create GRAD merge op node + for (auto kv : grad_repeated_map) { + OpDesc sum_op; + sum_op.SetType("sum"); + std::vector repeated_grad_names; + for (auto r : kv.second) { + repeated_grad_names.push_back(r->Var()->Name()); + } + sum_op.SetInput("X", repeated_grad_names); + sum_op.SetOutput("Out", {kv.first->Var()->Name()}); + sum_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kBackward)); + auto sum_op_node = result.CreateOpNode(&sum_op); + for (auto r : kv.second) { + sum_op_node->inputs.push_back(r); + r->outputs.push_back(sum_op_node); + } + auto sum_out_var_node = result.CreateVarNode(kv.first->Var()); + sum_op_node->outputs.push_back(sum_out_var_node); + sum_out_var_node->inputs.push_back(sum_op_node); + created[sum_out_var_node->Name()].push_back(sum_out_var_node); + + OpDesc scale_op; + scale_op.SetType("scale"); + scale_op.SetInput("X", {sum_out_var_node->Var()->Name()}); + // NOTE: inplace scale. + scale_op.SetOutput("Out", {sum_out_var_node->Var()->Name()}); + scale_op.SetAttr("scale", static_cast(1.0f / num_repeats)); + scale_op.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kBackward)); + auto scale_op_node = result.CreateOpNode(&scale_op); + scale_op_node->inputs.push_back(sum_out_var_node); + sum_out_var_node->outputs.push_back(scale_op_node); + auto scale_out_var_node = result.CreateVarNode(sum_out_var_node->Var()); + scale_op_node->outputs.push_back(scale_out_var_node); + scale_out_var_node->inputs.push_back(scale_op_node); + created[scale_out_var_node->Name()].push_back(scale_out_var_node); + } + // 6. add optimize ops + { + auto copy_node = [&result, &created](ir::Node* node) { + auto op_node = result.CreateOpNode(node->Op()); + // copy op ins/outs + // NOTE: for send/recv ops, the OpDesc uses ctrldepvar to describe + // dependencies, so create those depvars if OpDesc have in/outs. + for (auto in_node : node->inputs) { + if (in_node->IsCtrlVar() && !in_node->Var()) { + continue; + } + ir::Node* var = nullptr; + if (created.find(in_node->Name()) == created.end()) { + var = result.CreateVarNode(in_node->Var()); + created[in_node->Name()].push_back(var); + } else { + var = created.at(in_node->Name()).back(); + } + op_node->inputs.push_back(var); + var->outputs.push_back(op_node); + } + for (auto out_node : node->outputs) { + if (out_node->IsCtrlVar() && !out_node->Var()) { + continue; + } + auto var = result.CreateVarNode(out_node->Var()); + created[out_node->Name()].push_back(var); + op_node->outputs.push_back(var); + var->inputs.push_back(op_node); + } + }; + for (auto node : lr_ops) { + copy_node(node); + } + for (auto node : optimize_ops) { + copy_node(node); + } + } + + result.ResolveHazard(created); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(multi_batch_merge_pass, paddle::framework::ir::BatchMergePass) + .RequirePassAttr(paddle::framework::ir::kNumRepeats); diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..c1e5aef20dbc60c18ed03038818bfd8ab217bf28 --- /dev/null +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +// BatchMergePass is used to copy forward and backward ops for several +// times to run several batches to simulate large batch size training +// as if we have more than 1 GPUs. +// User can define how many batches to run, gradients will be merged +// through those repeats, and then do optimization using merged gradients. +// This pass is extremely useful when doing large batch-size distributed +// sync training, we can simulate even large batch size as if we have more +// GPUs. + +class BatchMergePass : public Pass { + public: + virtual ~BatchMergePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 5d6da9f1d76a3c0fc64b7ff35264e385cf19a14b..d6d42f5e92080aa57445e2d6ce59aa3faa89d22d 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -44,6 +44,7 @@ class Node { return op_desc_.get(); } + // Please don't use this API! int id() const { return id_; } bool IsOp() const { return type_ == Type::kOperation; } @@ -92,6 +93,7 @@ class Node { Node() = delete; static int count_; + // Please don't use this API or make this public. static void ResetId() { count_ = 0; } DISABLE_COPY_AND_ASSIGN(Node); }; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 1e7da9a69c7cbf8c13306656599a759515802b76..669d08c70c9b7453264806b346a6c9eb211cfd4a 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -418,7 +418,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE_EQ(new_lod.size(), lod.size()); for (size_t j = 0; j < lod.size(); ++j) { auto &sub_lod = new_lod[j]; - auto &offset = sub_lod.back(); + size_t offset = sub_lod.back(); for (size_t k = 1; k < lod[j].size(); ++k) { sub_lod.push_back(lod[j][k] + offset); } diff --git a/paddle/fluid/framework/lod_tensor_array.h b/paddle/fluid/framework/lod_tensor_array.h index 6d7b6a4ada8729e3698dab5d2b1861aac632be79..36a5c3c5d601390beedaf37ceb98ee2c63ecf5a6 100644 --- a/paddle/fluid/framework/lod_tensor_array.h +++ b/paddle/fluid/framework/lod_tensor_array.h @@ -18,6 +18,8 @@ limitations under the License. */ namespace paddle { namespace framework { + using LoDTensorArray = std::vector; -} + +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 77386f4f069489b6ff7b927a281bdc286ff816e0..e1aac6dc5a92fb616f00de5806f044b83c2f503f 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -542,6 +542,33 @@ class CPUVector : public std::vector> { this->reserve(this->size() + size_t(end - begin)); this->insert(this->end(), begin, end); } + + const T *CUDAData(platform::Place place) const { + PADDLE_THROW( + "Vector::CUDAData() method is not supported in CPU-only version"); + } + + T *CUDAMutableData(platform::Place place) { + PADDLE_THROW( + "Vector::CUDAMutableData() method is not supported in CPU-only " + "version"); + } + + const T *Data(platform::Place place) const { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::Data() method is not supported when not in CPUPlace"); + return this->data(); + } + + T *MutableData(platform::Place place) { + PADDLE_ENFORCE( + platform::is_cpu_place(place), + "Vector::MutableData() method is not supported when not in CPUPlace"); + return this->data(); + } + + const void *Handle() const { return static_cast(this); } }; template diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1454271afb309efdd435225ab077dc0..7fb42feb95b4d54aec693228721c052f683f4d80 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -146,22 +146,5 @@ void NaiveExecutor::CleanFeedFetchOps() { ops_.swap(ops); } -void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) { -#ifdef PADDLE_WITH_MKLDNN - VLOG(3) << "use_mkldnn=True"; - for (size_t block_id = 0; block_id < program.Size(); ++block_id) { - auto *block = const_cast(program).MutableBlock(block_id); - for (auto *op : block->AllOps()) { - if (op->HasAttr("use_mkldnn")) { - op->SetAttr("use_mkldnn", true); - } - } - } -#else - LOG(WARNING) - << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; -#endif -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.h b/paddle/fluid/framework/naive_executor.h index 9374f3f4a35cc0f90e5b2d6e8b397784b8eae123..ddfa6e1f4d8b73f594fc381ab505797491cdd378 100644 --- a/paddle/fluid/framework/naive_executor.h +++ b/paddle/fluid/framework/naive_executor.h @@ -48,8 +48,6 @@ class NaiveExecutor { void CleanFeedFetchOps(); - void EnableMKLDNN(const ProgramDesc& program); - protected: void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index c293cf92b4f3d530109c76850df184af9cad7399..8ece618f3f72552fedcffab3e03ebb30476b7cab 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -419,8 +419,15 @@ struct SetAttrDescVisitor : public boost::static_visitor { } VectorToRepeated(blocks_idx, attr_->mutable_blocks_idx()); } + void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); } + void operator()(int64_t v) const { attr_->set_l(v); } + + void operator()(const std::vector &v) const { + VectorToRepeated(v, attr_->mutable_longs()); + } + void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); } }; diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 440e0509be727ec2b84abc76fca44edda11f8a0a..30c8a26c3d2f0068674aa70b4ff875a2f73c1dca 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -121,10 +121,6 @@ class OpDesc { BlockDesc *Block() { return this->block_; } - const BlockDesc &BlockRef() const { return *this->block_; } - - void SetBlock(BlockDesc *block) { this->block_ = block; } - private: template static std::vector MapKeys(const MapType &map) { diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 5527783faad6f7ea6e0b9e776cbde3c743277f16..4c59c73d8779eceb267ad532aabccabbd54b0df2 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -28,12 +28,12 @@ enum class OpRole { kBackward = 0x0001, kOptimize = 0x0002, // RPC role is for send/recv releated op - kRPC = 0x0003, + kRPC = 0x0004, // Dist role is for split_byref/split_selected_rows/concat // used for distributed training. - kDist = 0x0004, + kDist = 0x0008, // Tag all learning rate scheduler operators. - kLRSched = 0x0005, + kLRSched = 0x0010, kLoss = 0x0100, // The default value of op's role. This should be only used for unittests and diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 14fcde2fe3b1c3acfc0994e9cd37a784c57826d7..45fc36c70633204dbfadbd10757c08b009d2cc74 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -354,18 +354,18 @@ void OperatorBase::GenerateTemporaryNames() { } } -static bool VarIsTensor(const Variable* var) { - return var->IsType() || var->IsType(); +static bool VarIsTensor(const Variable& var) { + return var.IsType() || var.IsType(); } -static const Tensor* GetTensorFromVar(Variable* var) { - if (var->IsType()) { - return var->GetMutable(); - } else if (var->IsType()) { - return var->GetMutable()->mutable_value(); +const Tensor* GetTensorFromVar(const Variable& var) { + if (var.IsType()) { + return static_cast(&(var.Get())); + } else if (var.IsType()) { + return &(var.Get().value()); } else { PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.", - var->Type().name()); + var.Type().name()); } } @@ -415,8 +415,7 @@ bool ExecutionContext::HasOutput(const std::string& name) const { template <> const Tensor* ExecutionContext::Input(const std::string& name) const { auto* var = InputVar(name); - return var == nullptr ? nullptr - : GetTensorFromVar(const_cast(var)); + return var == nullptr ? nullptr : GetTensorFromVar(*var); } template <> @@ -428,7 +427,7 @@ const std::vector ExecutionContext::MultiInput( std::transform(names.begin(), names.end(), std::back_inserter(res), [&](const std::string& sub_name) { auto var = scope_.FindVar(sub_name); - return var == nullptr ? nullptr : GetTensorFromVar(var); + return var == nullptr ? nullptr : GetTensorFromVar(*var); }); return res; } @@ -770,8 +769,10 @@ void OperatorWithKernel::TransferInplaceVarsBack( for (auto& var_name : inplace_vars) { VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); - auto* transformed_tensor = - GetTensorFromVar(transfer_scope.FindVar(var_name)); + auto* var = transfer_scope.FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, "The var[%s] should not be nullptr", + var_name); + auto* transformed_tensor = GetTensorFromVar(*var); original_tensor->ShareDataWith(*transformed_tensor); } } @@ -784,11 +785,11 @@ Scope* OperatorWithKernel::TryTransferData( for (auto& var_name : var_name_item.second) { auto* var = scope.FindVar(var_name); // Only tensor can be tranfer to another device. - if (var == nullptr || !VarIsTensor(var)) { + if (var == nullptr || !VarIsTensor(*var)) { continue; } - auto* tensor_in = GetTensorFromVar(var); + auto* tensor_in = GetTensorFromVar(*var); if (!tensor_in->IsInitialized()) { continue; } diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 626b50edfd39424473be33e9f8baec5970471477..96ad3205235b921a7cf60ed674a8350f74d18509 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -63,6 +63,7 @@ inline std::string GradVarName(const std::string& var_name) { } proto::VarType::Type GetDataTypeOfVar(const Variable* var); +const Tensor* GetTensorFromVar(const Variable& var); class OperatorBase; class ExecutionContext; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7dad872dd04754d2866c361b74ab1236ff743da5..a45b9ec7a20ac3629d182f009b735d4d82fb5dc2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -109,18 +109,9 @@ ParallelExecutor::ParallelExecutor( if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } - // Startup Program has been run. All local scopes has correct parameters. +// Startup Program has been run. All local scopes has correct parameters. - // Step 2. Create vars in each scope; - std::vector var_infos; - for (auto *var : main_program.Block(0).AllVars()) { - var_infos.emplace_back(); - var_infos.back().name_ = var->Name(); - var_infos.back().type_ = var->GetType(); - var_infos.back().persistable_ = var->Persistable(); - } - -// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA std::unique_ptr graph = build_strategy.Apply( @@ -156,6 +147,23 @@ ParallelExecutor::ParallelExecutor( params, member_->local_scopes_, member_->use_cuda_); #endif + // Step 3. Create vars in each scope. Passes may also create new vars. + // skip control vars and empty vars + std::vector var_infos; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } + } + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); @@ -179,6 +187,10 @@ void ParallelExecutor::BCastParamsToDevices( } auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } auto &dims = main_tensor.dims(); if (paddle::platform::is_gpu_place(main_tensor.place())) { #ifdef PADDLE_WITH_CUDA @@ -291,10 +303,8 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } ParallelExecutor::~ParallelExecutor() { - const auto dev_ctxs = - platform::DeviceContextPool::Instance().GetAllDeviceContexts(); - for (auto &dev_ctx : dev_ctxs) { - dev_ctx->Wait(); + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); } if (member_->own_local_scope_) { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 14f9f36812d690fc4a7440f2e7e6a85e9993a535..9462620e829ec815e1553f6378a67463ea3b8aa3 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -78,6 +78,8 @@ class Scope { /// Drop all kids scopes belonged to this scope. void DropKids(); + std::list& kids() const { return kids_; } + /// Find if a scope exists in the kid scopes bool HasKid(const Scope* scope) const; diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index cb2061c06a429d8e8116001a4aa4e8c46ea13428..a0a9a573603ceb6b577529101cb331adbc81337a 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -75,6 +75,19 @@ TEST(Tensor, MutableData) { platform::CPUPlace()); EXPECT_EQ(p1, p2); } + // Not sure if it's desired, but currently, Tensor type can be changed. + { + framework::Tensor src_tensor; + int8_t* p1 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p1, nullptr); + *p1 = 1; + + uint8_t* p2 = src_tensor.mutable_data(framework::make_ddim({1}), + platform::CPUPlace()); + EXPECT_NE(p2, nullptr); + EXPECT_EQ(static_cast(p2[0]), 1); + } #ifdef PADDLE_WITH_CUDA { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 69bcbc0e5891f95af4de8dfd49a25648ca920ab1..ca1e01c89f07c4ffc3979a6a6c3728328e0a1819 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -153,6 +153,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place, auto src_gpu_place = boost::get(src_place); auto dst_gpu_place = boost::get(dst_place); memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, nullptr); + } else if (platform::is_cuda_pinned_place(src_place) && + platform::is_gpu_place(dst_place)) { + auto src_pinned_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); + memory::Copy(dst_gpu_place, dst_ptr, src_pinned_place, src_ptr, size, + nullptr); } #endif } diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index 18cdca3a658a6a89e6ab637a7f38825756acfea8..a588cb417aebe94bd4aeda02b1bc8ba07a04b960 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -25,7 +25,6 @@ DEFINE_int32(dist_threadpool_size, 0, namespace paddle { namespace framework { - std::unique_ptr ThreadPool::threadpool_(nullptr); std::once_flag ThreadPool::init_flag_; @@ -47,8 +46,7 @@ void ThreadPool::Init() { } } -ThreadPool::ThreadPool(int num_threads) - : total_threads_(num_threads), idle_threads_(num_threads), running_(true) { +ThreadPool::ThreadPool(int num_threads) : running_(true) { threads_.resize(num_threads); for (auto& thread : threads_) { // TODO(Yancey1989): binding the thread on the specify CPU number @@ -59,6 +57,7 @@ ThreadPool::ThreadPool(int num_threads) ThreadPool::~ThreadPool() { { // notify all threads to stop running + std::lock_guard l(mutex_); running_ = false; scheduled_.notify_all(); } @@ -69,36 +68,24 @@ ThreadPool::~ThreadPool() { } } -void ThreadPool::Wait() { - std::unique_lock lock(mutex_); - completed_.wait(lock, [=] { return Done() == true; }); -} - void ThreadPool::TaskLoop() { - while (running_) { + while (true) { std::unique_lock lock(mutex_); - scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); - if (!running_) { - break; + scheduled_.wait( + lock, [this] { return !this->tasks_.empty() || !this->running_; }); + + if (!running_ || tasks_.empty()) { + return; } + // pop a task from the task queue auto task = std::move(tasks_.front()); tasks_.pop(); - - --idle_threads_; lock.unlock(); // run the task task(); - - { - std::unique_lock lock(mutex_); - ++idle_threads_; - if (Done()) { - completed_.notify_all(); - } - } } } diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 94111ee335b1a5df327b3e46d62069b4735c54f6..0687e628aaa4fb7b2e67938fa09a319c8bb35aff 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -57,15 +57,6 @@ class ThreadPool { ~ThreadPool(); - // Returns the number of threads created by the constructor. - size_t Threads() const { return total_threads_; } - - // Returns the number of currently idle threads. - size_t IdleThreads() { - std::unique_lock lock(mutex_); - return idle_threads_; - } - // Run pushes a function to the task queue and returns a std::future // object. To wait for the completion of the task, call // std::future::wait(). @@ -94,25 +85,13 @@ class ThreadPool { }); std::future> f = task.get_future(); tasks_.push(std::move(task)); - lock.unlock(); scheduled_.notify_one(); return f; } - // Wait until all the tasks are completed. - void Wait(); - private: DISABLE_COPY_AND_ASSIGN(ThreadPool); - // If the task queue is empty and avaialbe is equal to the number of - // threads, means that all tasks are completed. Note: this function - // is not thread-safe. Returns true if all tasks are completed. - // Note: don't delete the data member total_threads_ and use - // threads_.size() instead; because you'd need to lock the mutex - // before accessing threads_. - bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; } - // The constructor starts threads to run TaskLoop, which retrieves // and runs tasks from the queue. void TaskLoop(); @@ -125,14 +104,11 @@ class ThreadPool { static std::once_flag init_flag_; std::vector> threads_; - const size_t total_threads_; - size_t idle_threads_; std::queue tasks_; std::mutex mutex_; bool running_; std::condition_variable scheduled_; - std::condition_variable completed_; }; class ThreadPoolIO : ThreadPool { diff --git a/paddle/fluid/framework/threadpool_test.cc b/paddle/fluid/framework/threadpool_test.cc index 27a4ffd4fcbf293a3dea1744b29384d0bee0c137..884d61e23428a0ad758946295ca9c470767e93ef 100644 --- a/paddle/fluid/framework/threadpool_test.cc +++ b/paddle/fluid/framework/threadpool_test.cc @@ -19,10 +19,11 @@ limitations under the License. */ namespace framework = paddle::framework; -void do_sum(framework::ThreadPool* pool, std::atomic* sum, int cnt) { - std::vector> fs; +void do_sum(std::vector>* fs, std::mutex* mu, + std::atomic* sum, int cnt) { for (int i = 0; i < cnt; ++i) { - fs.push_back(framework::Async([sum]() { sum->fetch_add(1); })); + std::lock_guard l(*mu); + fs->push_back(framework::Async([sum]() { sum->fetch_add(1); })); } } @@ -40,18 +41,21 @@ TEST(ThreadPool, ConcurrentInit) { } TEST(ThreadPool, ConcurrentRun) { - framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); std::atomic sum(0); std::vector threads; + std::vector> fs; + std::mutex fs_mu; int n = 50; // sum = (n * (n + 1)) / 2 for (int i = 1; i <= n; ++i) { - std::thread t(do_sum, pool, &sum, i); + std::thread t(do_sum, &fs, &fs_mu, &sum, i); threads.push_back(std::move(t)); } for (auto& t : threads) { t.join(); } - pool->Wait(); + for (auto& t : fs) { + t.wait(); + } EXPECT_EQ(sum, ((n + 1) * n) / 2); } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index e099e40f121ff13657e563eb608feecbca0551be..2de6233a9e0d320ec9a06d547db3575eb61925c0 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -36,7 +36,7 @@ using Attribute = boost::variant, std::vector, std::vector, bool, std::vector, BlockDesc*, int64_t, - std::vector>; + std::vector, std::vector>; using AttributeMap = std::unordered_map; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 9794a193bcfaae19552b1f6fbdf2dab2898033d5..d31c8e3b7d66a0cdb2c4725783c9a24f049c666d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -1,3 +1,6 @@ +if(WITH_TESTING) + include(test.cmake) # some generic cmake funtion for inference +endif() # analysis and tensorrt must be added before creating static library, # otherwise, there would be undefined reference to them in static library. add_subdirectory(analysis) @@ -30,7 +33,7 @@ if (WITH_GPU AND TENSORRT_FOUND) endif() # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor) +cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. @@ -40,7 +43,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} - DEPS ${fluid_modules} paddle_fluid_api) + DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) if(NOT APPLE) diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index d4d2fd4634f9e11f3f002e11e177c332ced49885..0354f9e6e9588af601210b8a71ae98c1f90d62f0 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -20,22 +20,17 @@ cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis paddle_fluid) -function (inference_analysis_test TARGET) - if(WITH_TESTING) - set(options "") - set(oneValueArgs "") - set(multiValueArgs SRCS ARGS EXTRA_DEPS) - cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(mem_opt "") - if(WITH_GPU) - set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") - endif() - cc_test(${TARGET} - SRCS "${analysis_test_SRCS}" - DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt} ${analysis_test_ARGS}) - set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) - endif(WITH_TESTING) +function(inference_analysis_test TARGET) + if(WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS EXTRA_DEPS) + cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + inference_base_test(${TARGET} + SRCS ${analysis_test_SRCS} + DEPS analysis pass ${GLOB_PASS_LIB} ${analysis_test_EXTRA_DEPS} + ARGS --inference_model_dir=${WORD2VEC_MODEL_DIR} ${analysis_test_ARGS}) + endif() endfunction(inference_analysis_test) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 2e79d495d5ff00000000029ac0f6eb486aaea94a..ef4142f334e503380dc7ccd74c348404ffe52ee6 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -107,6 +107,9 @@ void Analyzer::Run(Argument* argument) { passes.push_back("mkldnn_placement_pass"); } #endif + // infer_clean_graph_pass should be the first default pass + // after mkldnn_placement_pass. + passes.push_back("infer_clean_graph_pass"); for (auto& pass : ir_passes_) { if (!disabled_ir_passes_.count(pass)) { passes.push_back(pass); diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index c51a4fdb2f6b27e54637481c23bf6f1f6ec97718..3af1d572dfd81197797dd7e57d87ba12c2f3548e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -67,7 +67,6 @@ class Analyzer : public OrderedRegistry { // larger fusion. const std::vector all_ir_passes_{{ // Manual update the passes here. - "infer_clean_graph_pass", // "attention_lstm_fuse_pass", // "seqconv_eltadd_relu_fuse_pass", // "embedding_fc_lstm_fuse_pass", // @@ -80,6 +79,7 @@ class Analyzer : public OrderedRegistry { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN + "depthwise_conv_mkldnn_pass", // "conv_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass", // diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 1682011c3d8cc9927a4b026b370671798cace625..50ce20621fb289023ecccf7bb39d98169765d5ee 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/ut_helper.h" @@ -130,6 +131,8 @@ void SetOp(framework::ProgramDesc* prog, const std::string& type, op->SetType(type); op->SetInput("Xs", inputs); op->SetOutput("Xs", outputs); + op->SetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(framework::OpRole::kForward)); } TEST(DataFlowGraph, Build_IR_Graph) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 0ddd5d53f836131fe37d412fc867cb38f11ee2b5..49a9ebe3ddec1e4fd59ae1155a706859e249d25c 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -17,32 +17,14 @@ if(APPLE) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move") endif(APPLE) - set(inference_deps paddle_inference_api paddle_fluid_api analysis pass ir_pass_manager naive_executor ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} paddle_inference_tensorrt_subgraph_engine analysis_predictor) endif() -function(inference_api_test TARGET_NAME) - if (WITH_TESTING) - set(options "") - set(oneValueArgs SRC) - set(multiValueArgs ARGS) - cmake_parse_arguments(inference_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - - cc_test(${TARGET_NAME} - SRCS ${inference_test_SRC} - DEPS "${inference_deps}" - ARGS --dirname=${PYTHON_TESTS_DIR}/book/) - if(inference_test_ARGS) - set_tests_properties(${TARGET_NAME} - PROPERTIES DEPENDS "${inference_test_ARGS}") - endif() - endif(WITH_TESTING) -endfunction(inference_api_test) - -cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope) +cc_library(reset_tensor_array SRCS details/reset_tensor_array.cc DEPS lod_tensor scope) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS reset_tensor_array lod_tensor scope) cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor) cc_library(zero_copy_tensor SRCS details/zero_copy_tensor.cc DEPS paddle_inference_api) cc_library(zero_copy_tensor_dummy SRCS details/zero_copy_tensor_dummy.cc DEPS paddle_inference_api) @@ -50,10 +32,11 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) -inference_api_test(test_api_impl SRC api_impl_tester.cc - ARGS test_word2vec test_image_classification) - -set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) +if(WITH_TESTING) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) +endif() cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api ARGS --dirname=${PYTHON_TESTS_DIR}/book) @@ -61,8 +44,10 @@ if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter zero_copy_tensor_dummy) - -inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec) + if(WITH_TESTING) + inference_base_test(test_api_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine_tester.cc DEPS ${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) + endif() endif() if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index eec665767164dc6e79738890947c54d7f7217037..54c37fe64590aa82d7100c93c4c5c4ee36491421 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -82,6 +82,7 @@ bool AnalysisPredictor::Init( // Get the feed_target_names and fetch_target_names PrepareFeedFetch(); + return true; } @@ -109,6 +110,10 @@ bool AnalysisPredictor::Run(const std::vector &inputs, return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } @@ -322,6 +327,9 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { executor_->Run(); + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5a9f4d36959d4ee7ca16dec769d9d1283b8787cb..b7dc2067332278c1c38df4beefb5059efe76417f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" @@ -88,6 +89,7 @@ class AnalysisPredictor : public PaddlePredictor { // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, so cache them. std::vector feed_tensors_; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 7cda9c5d8a8366bd097491f37f5352a10e4fb16c..d06ab8f8c8e3c0adf4a4074eb1450012126e83ea 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/profiler.h" @@ -157,6 +158,10 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return false; } VLOG(3) << "predict cost: " << timer.toc() << "ms"; + + // Fix TensorArray reuse not cleaned bug. + tensor_array_batch_cleaner_.CollectTensorArrays(scope_.get()); + tensor_array_batch_cleaner_.ResetTensorArray(); return true; } diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 7882f6a53c7ce9a2486158ea9b50c018d1814091..4e4ab47ca9c5e37f2714ebd48d250c23c7e9b117 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -26,11 +26,11 @@ limitations under the License. */ #include #include -#include "paddle/fluid/inference/api/paddle_inference_api.h" - #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" @@ -77,6 +77,7 @@ class NativePaddlePredictor : public PaddlePredictor { std::vector fetchs_; // Do not use unique_ptr, use parent scope to delete framework::Scope *sub_scope_{nullptr}; + details::TensorArrayBatchCleaner tensor_array_batch_cleaner_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index b7b8ee6ea08fe907f3f052ae1118f782ac853ca7..5152b8670ddb206f0927c03149684af4a096df42 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -27,7 +27,9 @@ limitations under the License. */ #define ACC_DIFF 1e-3 #endif -DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_string(word2vec_dirname, "", + "Directory of the word2vec inference model."); +DEFINE_string(book_dirname, "", "Directory of the book inference model."); namespace paddle { @@ -49,7 +51,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { NativeConfig GetConfig() { NativeConfig config; - config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.model_dir = FLAGS_word2vec_dirname; LOG(INFO) << "dirname " << config.model_dir; config.fraction_of_gpu_memory = 0.15; #ifdef PADDLE_WITH_CUDA @@ -116,7 +118,7 @@ void MainImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; const bool is_combined = false; std::vector> feed_target_shapes = @@ -187,7 +189,7 @@ void MainThreadsWord2Vec(bool use_gpu) { std::vector threads; for (int tid = 0; tid < num_jobs; ++tid) { threads.emplace_back([&, tid]() { - auto predictor = main_predictor->Clone(); + auto predictor = CreatePaddlePredictor(config); auto& local_inputs = paddle_tensor_feeds[tid]; std::vector local_outputs; ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); @@ -220,7 +222,7 @@ void MainThreadsImageClassification(bool use_gpu) { NativeConfig config = GetConfig(); config.use_gpu = use_gpu; config.model_dir = - FLAGS_dirname + "/image_classification_resnet.inference.model"; + FLAGS_book_dirname + "/image_classification_resnet.inference.model"; auto main_predictor = CreatePaddlePredictor(config); std::vector jobs(num_jobs); @@ -245,7 +247,7 @@ void MainThreadsImageClassification(bool use_gpu) { std::vector threads; for (int tid = 0; tid < num_jobs; ++tid) { threads.emplace_back([&, tid]() { - auto predictor = main_predictor->Clone(); + auto predictor = CreatePaddlePredictor(config); auto& local_inputs = paddle_tensor_feeds[tid]; std::vector local_outputs; ASSERT_TRUE(predictor->Run(local_inputs, &local_outputs)); @@ -271,7 +273,7 @@ TEST(inference_api_native, word2vec_cpu_threads) { MainThreadsWord2Vec(false /*use_gpu*/); } TEST(inference_api_native, image_classification_cpu) { - MainThreadsImageClassification(false /*use_gpu*/); + MainImageClassification(false /*use_gpu*/); } TEST(inference_api_native, image_classification_cpu_threads) { MainThreadsImageClassification(false /*use_gpu*/); @@ -279,15 +281,17 @@ TEST(inference_api_native, image_classification_cpu_threads) { #ifdef PADDLE_WITH_CUDA TEST(inference_api_native, word2vec_gpu) { MainWord2Vec(true /*use_gpu*/); } -TEST(inference_api_native, word2vec_gpu_threads) { - MainThreadsWord2Vec(true /*use_gpu*/); -} +// Turn off temporarily for the unstable result. +// TEST(inference_api_native, word2vec_gpu_threads) { +// MainThreadsWord2Vec(true /*use_gpu*/); +// } TEST(inference_api_native, image_classification_gpu) { - MainThreadsImageClassification(true /*use_gpu*/); -} -TEST(inference_api_native, image_classification_gpu_threads) { - MainThreadsImageClassification(true /*use_gpu*/); + MainImageClassification(true /*use_gpu*/); } +// Turn off temporarily for the unstable result. +// TEST(inference_api_native, image_classification_gpu_threads) { +// MainThreadsImageClassification(true /*use_gpu*/); +// } #endif diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 702158ea3bcab854eece3ccd40724d92efcbae67..89c9a65cb06ba565f0e0cbdb9b6031c6adbcb64e 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -29,13 +29,13 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { //# 1. Create PaddlePredictor with a config. NativeConfig config0; - config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.model_dir = FLAGS_dirname; config0.use_gpu = true; config0.fraction_of_gpu_memory = 0.3; config0.device = 0; MixedRTConfig config1; - config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.model_dir = FLAGS_dirname; config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; config1.device = 0; diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 03f0f726eb61c2619c7719a865383090f86b5b7f..49683eab07a2f5bc008272038a27bdb277396284 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -52,6 +52,7 @@ include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") if (NOT WIN32) include_directories("${PADDLE_LIB}/third_party/install/snappy/include") include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") @@ -61,8 +62,8 @@ endif(NOT WIN32) include_directories("${PADDLE_LIB}/third_party/boost") include_directories("${PADDLE_LIB}/third_party/eigen3") -if (NOT WIN32) - if (USE_TENSORRT AND WITH_GPU) +if (NOT WIN32) + if (USE_TENSORRT AND WITH_GPU) include_directories("${TENSORRT_INCLUDE_DIR}") link_directories("${TENSORRT_LIB_DIR}") endif() @@ -77,13 +78,14 @@ endif(NOT WIN32) link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/paddle/lib") add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") - set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} ${PADDLE_LIB}/third_party/install/mklml/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) set(MKLDNN_PATH "${PADDLE_LIB}/third_party/install/mkldnn") if(EXISTS ${MKLDNN_PATH}) @@ -107,7 +109,7 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf snappystream snappy z + glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} @@ -120,7 +122,7 @@ endif(NOT WIN32) if(WITH_GPU) if(NOT WIN32) - if (USE_TENSORRT) + if (USE_TENSORRT) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer${CMAKE_STATIC_LIBRARY_SUFFIX}) set(DEPS ${DEPS} ${TENSORRT_LIB_DIR}/libnvinfer_plugin${CMAKE_STATIC_LIBRARY_SUFFIX}) endif() diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 6e682b69583e00ab1bbe1c0d22e21ae114a61a76..ff718077c1ba6b10fe87aac10d84f96a23ad6bba 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -16,7 +16,7 @@ if [ $2 == ON ]; then fi if [ $3 == ON ]; then use_gpu_list='true false' -else +else use_gpu_list='false' fi @@ -62,7 +62,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB make -j - word2vec_model=${PADDLE_ROOT}'/build/python/paddle/fluid/tests/book/word2vec.inference.model' + word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' if [ -d $word2vec_model ]; then for use_gpu in $use_gpu_list; do ./simple_on_word2vec \ @@ -83,7 +83,7 @@ for WITH_STATIC_LIB in ON OFF; do -DWITH_STATIC_LIB=$WITH_STATIC_LIB make -j for use_gpu in $use_gpu_list; do - for vis_demo_name in $vis_demo_list; do + for vis_demo_name in $vis_demo_list; do ./vis_demo \ --modeldir=$DATA_DIR/$vis_demo_name/model \ --data=$DATA_DIR/$vis_demo_name/data.txt \ @@ -95,7 +95,7 @@ for WITH_STATIC_LIB in ON OFF; do fi done done - + # --------tensorrt mobilenet------ if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then rm -rf * @@ -107,7 +107,7 @@ for WITH_STATIC_LIB in ON OFF; do -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR - make -j + make -j ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ --data=$DATA_DIR/mobilenet/data.txt \ diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.cc b/paddle/fluid/inference/api/details/reset_tensor_array.cc new file mode 100644 index 0000000000000000000000000000000000000000..4ae6c6dc9f44650c1c62f5be5448864d817513b1 --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/details/reset_tensor_array.h" + +namespace paddle { +namespace details { + +// Should be called after the parameters are loaded. +void TensorArrayBatchCleaner::CollectTensorArrays(framework::Scope *scope) { + if (flag_) { + for (auto &var_name : scope->LocalVarNames()) { + auto *var = scope->FindVar(var_name); + // TODO(Superjomn) should avoid the case when a TensorArray is a + // parameter. + if (var_name == "feed" || var_name == "fetch") continue; + if (var->Type() == typeid(framework::LoDTensorArray)) { + VLOG(4) << "collect " << var_name; + arrays_.push_back(var->GetMutable()); + } + } + for (auto *kid : scope->kids()) { + CollectTensorArrays(kid); + } + + VLOG(3) << "Collect " << arrays_.size() << " arrays"; + flag_ = false; + } +} + +// Should be called when `Run` finished. +void TensorArrayBatchCleaner::ResetTensorArray() { + for (auto *arr : arrays_) { + arr->clear(); + } +} + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/api/details/reset_tensor_array.h b/paddle/fluid/inference/api/details/reset_tensor_array.h new file mode 100644 index 0000000000000000000000000000000000000000..a39449ff0e67786815dfb8d2d30d79dcdba757d7 --- /dev/null +++ b/paddle/fluid/inference/api/details/reset_tensor_array.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace details { + +// Clean the TensorArray each batch to make the behavior the same with the +// training phase. +struct TensorArrayBatchCleaner { + // Fix the tensor array not clear in the inference scenarios. + void CollectTensorArrays(framework::Scope *scope); + void ResetTensorArray(); + + private: + bool flag_{true}; + std::vector arrays_; +}; + +} // namespace details +} // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 24f59cf43a9700ff1732e1ef6ad82e1a6294eede..e46dc1326951f68fd030f2208b9bea1647d0026d 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -160,7 +160,8 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid - << ", latency: " << latency << "ms ======"; + << ", latency: " << latency << "ms, fps: " << 1 / (latency / 1000.f) + << " ======"; if (epoch > 1) { int samples = batch_size * epoch; LOG(INFO) << "====== sample number: " << samples diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 07ee6e72d1053d2271b8f8d69ce38003f5e038a0..a755ccb93bdee018dfeaf91157e7971b4d4cd832 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -124,7 +124,7 @@ class ZeroCopyTensor { std::vector> lod() const; protected: - ZeroCopyTensor(void* scope) : scope_{scope} {} + explicit ZeroCopyTensor(void* scope) : scope_{scope} {} void SetName(const std::string& name) { name_ = name; } void* FindTensor() const; @@ -259,12 +259,6 @@ struct AnalysisConfig : public NativeConfig { kExclude // Specify the disabled passes in `ir_passes`. }; - void SetIncludeMode() { - ir_mode = IrPassMode::kInclude; - // this pass has to be run at the beginning of all fuse passes - ir_passes = {"infer_clean_graph_pass"}; - } - // Determine whether to perform graph optimization. bool enable_ir_optim = true; // Manually determine the IR passes to run. diff --git a/paddle/fluid/inference/test.cmake b/paddle/fluid/inference/test.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ab3a30ce6bba14a7d5ec700a159d90031e6b5dc7 --- /dev/null +++ b/paddle/fluid/inference/test.cmake @@ -0,0 +1,31 @@ +set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") +set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING + "A path setting inference demo download directories.") +function (inference_download install_dir url filename) + message(STATUS "Download inference test stuff from ${url}/${filename}") + execute_process(COMMAND bash -c "mkdir -p ${install_dir}") + execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") + message(STATUS "finish downloading ${filename}") +endfunction() + +function (inference_download_and_uncompress install_dir url filename) + inference_download(${install_dir} ${url} ${filename}) + execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") +endfunction() + +set(WORD2VEC_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/word2vec") +if (NOT EXISTS ${WORD2VEC_INSTALL_DIR}) + inference_download_and_uncompress(${WORD2VEC_INSTALL_DIR} ${INFERENCE_URL} "word2vec.inference.model.tar.gz") +endif() +set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") + +function (inference_base_test TARGET) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS ARGS DEPS) + cmake_parse_arguments(base_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() + cc_test(${TARGET} SRCS ${base_test_SRCS} DEPS ${base_test_DEPS} ARGS ${mem_opt} ${base_test_ARGS}) +endfunction() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index c3dd1f433691e1c96e9f38ef7b595befad26408f..71fdc67068b3d92a774db82f569d212f6cffad78 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -1,18 +1,4 @@ -set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com") -set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING - "A path setting inference demo download directories.") set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor) -function (inference_download install_dir url filename) - message(STATUS "Download inference test stuff from ${url}/${filename}") - execute_process(COMMAND bash -c "mkdir -p ${install_dir}") - execute_process(COMMAND bash -c "cd ${install_dir} && wget -q ${url}/${filename}") - message(STATUS "finish downloading ${filename}") -endfunction() - -function (inference_download_and_uncompress install_dir url filename) - inference_download(${install_dir} ${url} ${filename}) - execute_process(COMMAND bash -c "cd ${install_dir} && tar xzf ${filename}") -endfunction() function(download_model_and_data install_dir model_name data_name) if (NOT EXISTS ${install_dir}) diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 6399476680c0af83a6d26aea952c58543bdce9ae..e0416ff953b61f56a2ca1a45cb382d40a6cffa4a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -228,6 +228,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_rnn1, profile) { contrib::AnalysisConfig cfg; SetConfig(&cfg); + cfg.use_gpu = false; std::vector outputs; std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 5589b58b0618523b22f169f9ba9930c3ff3e3c48..19c3f532d5dcb7588793fa21fa179f6b48649103 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -139,6 +139,9 @@ void TestMultiThreadPrediction( } for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { +#ifdef PADDLE_WITH_MKLDNN + platform::set_cur_thread_id(static_cast(tid) + 1); +#endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector> inputs_tid = inputs; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 78ef6f207eadea6799864fe22889103b468d1780..919ad96f7adfc5025d9a8367c467f639c6fe3101 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -268,6 +268,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() +op_library(hash_op DEPS xxhash) op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) @@ -300,6 +301,7 @@ op_library(flatten_op DEPS reshape_op) op_library(sequence_pad_op DEPS sequence_padding) op_library(unstack_op DEPS stack_op) op_library(fake_quantize_op DEPS memory) +op_library(crf_decoding_op DEPS jit_kernel) op_library(fusion_lstm_op DEPS jit_kernel) if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) diff --git a/paddle/fluid/operators/add_position_encoding_op.cc b/paddle/fluid/operators/add_position_encoding_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..8127e554bed1aae7a5ce8837bcadf1b7f13f1ac2 --- /dev/null +++ b/paddle/fluid/operators/add_position_encoding_op.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/add_position_encoding_op.h" + +namespace paddle { +namespace operators { + +class AddPositionEncodingOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "X(Input) of add_position_encoding_op should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Out(Output) of add_position_encoding_op should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class AddPositionEncodingOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Out"), "Out must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Out@GRAD must not be null."); + + auto out_dims = ctx->GetInputDim("Out"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), out_dims); + } + } +}; + +class AddPositionEncodingOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Input of AddPositionEncoding operator"); + AddOutput("Out", "Output of AddPositionEncoding operator"); + AddAttr("alpha", "The scale of Original Embedding.") + .SetDefault(1.0f) + .AddCustomChecker([](const float& alpha) { + PADDLE_ENFORCE(alpha >= 0.0f, "'alpha' must be above 0.0."); + }); + AddAttr("beta", "The scale of Position Embedding.") + .SetDefault(1.0f) + .AddCustomChecker([](const float& beta) { + PADDLE_ENFORCE(beta >= 0.0f, "'beta' must be between 0.0."); + }); + AddComment(R"DOC( + Add Position Encoding Operator. + + The add position encoding calculates the output based on the input, alpha, beta. + The size of each dimension of the parameters checked in the infer-shape. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plt = paddle::platform; + +REGISTER_OPERATOR(add_position_encoding, ops::AddPositionEncodingOp, + ops::AddPositionEncodingOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(add_position_encoding_grad, ops::AddPositionEncodingOpGrad); + +REGISTER_OP_CPU_KERNEL( + add_position_encoding, + ops::AddPositionEncodingKernel, + ops::AddPositionEncodingKernel); + +REGISTER_OP_CPU_KERNEL( + add_position_encoding_grad, + ops::AddPositionEncodingGradKernel, + ops::AddPositionEncodingGradKernel); diff --git a/paddle/fluid/operators/add_position_encoding_op.h b/paddle/fluid/operators/add_position_encoding_op.h new file mode 100644 index 0000000000000000000000000000000000000000..5f371235f160c416058e877dbba2d9fe89abf7db --- /dev/null +++ b/paddle/fluid/operators/add_position_encoding_op.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +template +class AddPositionEncodingKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* X = context.Input("X"); + auto& x_lod = X->lod(); + auto* src_ptr = X->data(); + + auto* Out = context.Output("Out"); + auto* dst_ptr = Out->mutable_data(context.GetPlace()); + + float alpha = context.Attr("alpha"); + float beta = context.Attr("beta"); + + auto x_dim = X->dims(); + int batch_size = 0; + int max_seq_len = 0; + int enc_size = 0; + + if (x_lod.empty()) { + PADDLE_ENFORCE( + x_dim.size() == 3UL, + "The input X of Add Position Encoding should be 3-D Tensor!"); + batch_size = x_dim[0]; + max_seq_len = x_dim[1]; + enc_size = x_dim[2]; + } else { + PADDLE_ENFORCE( + x_dim.size() == 2UL, + "The input X of Add Position Encoding should be 2-D LoDTensor!"); + PADDLE_ENFORCE( + x_lod.size() == 1UL, + "The Add Position Encoding Op only supports lod_level == 1!"); + batch_size = x_lod[0].size() - 1; + max_seq_len = -1; + enc_size = x_dim[1]; + } + + PADDLE_ENFORCE(enc_size % 2 == 0, "Only support even encode size!"); + + const int half_size = enc_size / 2; + for (int i = 0; i < batch_size; ++i) { + const int max_length = + x_lod.empty() ? max_seq_len : x_lod[0][i + 1] - x_lod[0][i]; + for (int j = 0; j < max_length; ++j) { + for (int k = 0; k < half_size; ++k) { + const double val = (half_size > 1) + ? j / pow(10000.0, double(k) / (half_size - 1)) + : j / 10000.0; + dst_ptr[k] = src_ptr[k] * alpha + sin(val) * beta; + dst_ptr[half_size + k] = + src_ptr[half_size + k] * alpha + cos(val) * beta; + } + src_ptr += enc_size; + dst_ptr += enc_size; + } + } + } +}; + +template +class AddPositionEncodingGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* dOut = + context.Input(framework::GradVarName("Out")); + auto dout = framework::EigenVector::Flatten(*dOut); + + auto* dX = + context.Output(framework::GradVarName("X")); + dX->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dX); + + float alpha = context.Attr("alpha"); + + auto* place = + context.template device_context().eigen_device(); + dx.device(*place) = dout * static_cast(alpha); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..ed71594ba5781590f3291d56c4ba1a4443003bd5 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using ScopedSpatialTransformerDescriptor = + platform::ScopedSpatialTransformerDescriptor; + +template +class CUDNNAffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto* theta = ctx.Input("Theta"); + auto* output = ctx.Output("Output"); + const T* theta_data = theta->data(); + + int n = theta->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + T* output_data = output->mutable_data( + {n, h_size_data[2], h_size_data[3], 2}, ctx.GetPlace()); + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorForward( + handle, cudnn_st_desc, theta_data, output_data)); + } +}; + +template +class CUDNNAffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + Tensor h_sizes; + int* h_size_data; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + h_size_data = h_sizes.data(); + } else { + h_size_data = h_sizes.mutable_data({4}, platform::CPUPlace()); + h_size_data[0] = n; + h_size_data[1] = size_attr[1]; + h_size_data[2] = size_attr[2]; + h_size_data[3] = size_attr[3]; + } + + ScopedSpatialTransformerDescriptor st_desc; + cudnnSpatialTransformerDescriptor_t cudnn_st_desc = + st_desc.descriptor(4, h_size_data); + + const T* output_grad_data = output_grad->data(); + T* theta_grad_data = theta_grad->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(platform::dynload::cudnnSpatialTfGridGeneratorBackward( + handle, cudnn_st_desc, output_grad_data, theta_grad_data)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace plat = paddle::platform; +REGISTER_OP_KERNEL(affine_grid, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridOpKernel, + paddle::operators::CUDNNAffineGridOpKernel); +REGISTER_OP_KERNEL(affine_grid_grad, CUDNN, plat::CUDAPlace, + paddle::operators::CUDNNAffineGridGradOpKernel, + paddle::operators::CUDNNAffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ea28265a245c9cd1a35a79324a33f7cf208a159 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -0,0 +1,233 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/affine_grid_op.h" +#include +#include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx) { + Tensor numbers; + T* number_data = numbers.mutable_data({count}, platform::CPUPlace()); + T slice = (end - start) / (T)(count - 1); + for (int i = 0; i < count; ++i) { + number_data[i] = start + (T)i * slice; + } + return numbers; + } +}; + +class AffineGridOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Theta"), + "Input(Theta) of AffineGridOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Output"), + "Output(Output) of AffineGridOp should not be null."); + auto theta_dims = ctx->GetInputDim("Theta"); + PADDLE_ENFORCE(theta_dims.size() == 3, + "AffineGrid's Input(Theta) should be 3-D tensor."); + + auto output_shape = ctx->Attrs().Get>("output_shape"); + if (output_shape.size() == 0) { + PADDLE_ENFORCE(ctx->HasInput("OutputShape"), + "Input(OutputShape) of AffineGridOp should not be null if " + "attr(output_shape) is not configured."); + auto output_shape_dims = ctx->GetInputDim("OutputShape"); + PADDLE_ENFORCE(output_shape_dims.size() == 1, + "AffineGrid's Input(OutputShape) should be 1-D tensor."); + } else { + PADDLE_ENFORCE(output_shape.size() == 4, + "The size of attr(output_shape) should be 4."); + } + + PADDLE_ENFORCE(theta_dims[1] == 2, "Input(theta) dims[1] should be 2."); + PADDLE_ENFORCE(theta_dims[2] == 3, "Input(theta) dims[2] should be 3."); + // N * H * W * 2 + ctx->SetOutputDim("Output", + framework::make_ddim({theta_dims[0], -1, -1, 2})); + ctx->ShareLoD("Theta", "Output"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library = framework::LibraryType::kCUDNN; + } +#endif + auto data_type = framework::ToDataType(ctx.Input("Theta")->type()); + return framework::OpKernelType(data_type, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library); + } +}; + +class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Theta", + "(Tensor) A batch of affine transform parameters with shape [N, 2, 3]. " + "It is used to transform coordinate (x_0, y_0) to coordinate (x_1, " + "y_1)."); + AddInput("OutputShape", + "(Tensor) The shape of target image with format [N, C, H, W].") + .AsDispensable(); + AddOutput("Output", "(Tensor) Output Tensor with shape [N, H, W, 2]."); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(true); + AddAttr>( + "output_shape", + "The target output image shape with format [N, C, H, W].") + .SetDefault(std::vector()); + + AddComment(R"DOC( + It generates a grid of (x,y) coordinates using the parameters of the + affine transformation that correspond to a set of points where the input + feature map should be sampled to produce the transformed output feature map. + + Given: + Theta = [[[x_11, x_12, x_13] + [x_14, x_15, x_16]] + [[x_21, x_22, x_23] + [x_24, x_25, x_26]]] + + OutputShape = [2, 3, 5, 5] + + Step 1: + + Generate relative coordinates according to OutputShape. + The values of relative coordinates are in the interval between -1 and 1. + The shape of the relative coordinates is [2, H, W] as below: + + C = [[[-1. -1. -1. -1. -1. ] + [-0.5 -0.5 -0.5 -0.5 -0.5] + [ 0. 0. 0. 0. 0. ] + [ 0.5 0.5 0.5 0.5 0.5] + [ 1. 1. 1. 1. 1. ]] + [[-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ] + [-1. -0.5 0. 0.5 1. ]]] + C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. + + Step2: + Tanspose and reshape C to shape [H * W, 2] and append ones to last dimension. The we get: + C_ = [[-1. -1. 1. ] + [-0.5 -1. 1. ] + [ 0. -1. 1. ] + [ 0.5 -1. 1. ] + [ 1. -1. 1. ] + [-1. -0.5 1. ] + [-0.5 -0.5 1. ] + [ 0. -0.5 1. ] + [ 0.5 -0.5 1. ] + [ 1. -0.5 1. ] + [-1. 0. 1. ] + [-0.5 0. 1. ] + [ 0. 0. 1. ] + [ 0.5 0. 1. ] + [ 1. 0. 1. ] + [-1. 0.5 1. ] + [-0.5 0.5 1. ] + [ 0. 0.5 1. ] + [ 0.5 0.5 1. ] + [ 1. 0.5 1. ] + [-1. 1. 1. ] + [-0.5 1. 1. ] + [ 0. 1. 1. ] + [ 0.5 1. 1. ] + [ 1. 1. 1. ]] + Step3: + Compute output by equation $$Output[i] = C_ * Theta[i]^T$$ + )DOC"); + } +}; + +class AffineGridOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + auto theta_dims = ctx->GetInputDim("Theta"); + if (ctx->HasOutput(framework::GradVarName("Theta"))) { + ctx->SetOutputDim(framework::GradVarName("Theta"), theta_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_CUDA + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; + } +#endif + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Theta")->type()), + ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_); + } +}; + +class AffineGridGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("affine_grid_grad"); + op->SetInput("Theta", Input("Theta")); + op->SetInput("OutputShape", Input("OutputShape")); + op->SetInput(framework::GradVarName("Output"), OutputGrad("Output")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("Theta"), InputGrad("Theta")); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(affine_grid, ops::AffineGridOp, ops::AffineGridOpMaker, + ops::AffineGridGradMaker); +REGISTER_OPERATOR(affine_grid_grad, ops::AffineGridOpGrad); + +REGISTER_OP_CPU_KERNEL( + affine_grid, + ops::AffineGridOpKernel, + ops::AffineGridOpKernel); +REGISTER_OP_CPU_KERNEL( + affine_grid_grad, + ops::AffineGridGradOpKernel, + ops::AffineGridGradOpKernel); diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h new file mode 100644 index 0000000000000000000000000000000000000000..07e26c292c3bafc4d98bd392a9e1e21a9eb383a8 --- /dev/null +++ b/paddle/fluid/operators/affine_grid_op.h @@ -0,0 +1,190 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; +using Array3 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +/** + *Return a tensor with evenly spaced numbers over a specified interval. + */ +template +struct Linspace { + framework::Tensor operator()(T start, T end, int count, + const framework::ExecutionContext& ctx); +}; + +template +class AffineGridOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto* theta = ctx.Input("Theta"); + int n = theta->dims()[0]; + + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + auto* output = ctx.Output("Output"); + output->mutable_data({n, h, w, 2}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), output, + static_cast(0)); + + Linspace linspace; + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3}); + Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2}); + blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out, + T(0)); + } + } +}; + +template +class AffineGridGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& place = *ctx.template device_context().eigen_device(); + auto output_grad = ctx.Input(framework::GradVarName("Output")); + auto theta_grad = ctx.Output(framework::GradVarName("Theta")); + + int n = output_grad->dims()[0]; + auto size_attr = ctx.Attr>("output_shape"); + int h = 0; + int w = 0; + if (size_attr.size() == 0) { + auto* output_shape = ctx.Input("OutputShape"); + Tensor h_sizes; + framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes); + const int* h_size_data = h_sizes.data(); + h = h_size_data[2]; + w = h_size_data[3]; + } else { + h = size_attr[2]; + w = size_attr[3]; + } + + theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); + + math::SetConstant()( + ctx.template device_context(), theta_grad, + static_cast(0)); + + Linspace linspace; + + // Get indexes of height with shape [height, width, 1] + auto h_idx = linspace((T)-1, (T)1, h, ctx); + auto h_idx_t = EigenTensor::From(h_idx); + // Get indexes of width with shape [height, width, 1] + auto w_idx = linspace((T)-1, (T)1, w, ctx); + auto w_idx_t = EigenTensor::From(w_idx); + // Get constant ones tensor with shape [height, width, 1] + Tensor ones; + ones.mutable_data({h, w, 1}, ctx.GetPlace()); + auto ones_t = EigenTensor::From(ones).setConstant((T)1); + // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and + // ones + Tensor grid; + grid.mutable_data({n, h, w, 3}, ctx.GetPlace()); + auto grid_t = EigenTensor::From(grid); + grid_t.device(place) = w_idx_t.reshape(Array2(1, w)) + .broadcast(Array2(h, 1)) + .reshape(Array3(h, w, 1)) + .concatenate(h_idx_t.reshape(Array2(1, h)) + .broadcast(Array2(w, 1)) + .shuffle(Array2(1, 0)) + .reshape(Array3(h, w, 1)), + 2) + .eval() + .concatenate(ones_t, 2) + .reshape(Array4(1, h, w, 3)) + .broadcast(Array4(n, 1, 1, 1)); + // output = grid * theta.T + // TODO(wanghaoshuang): Refine batched matrix multiply + auto blas = math::GetBlas(ctx); + for (int i = 0; i < n; ++i) { + Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3}); + Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2}); + Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3}); + blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1), + &sliced_theta_grad, T(0)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index b6cb935814e25b31d4104f9ce24fe952680cb491..0d32cae0e1e5ff274793df50e854283d8e2f7bf8 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -79,6 +79,9 @@ struct BeamSearchDecodeFunctor { bool tensor_on_gpu_; size_t beam_size_; int end_id_; + // TODO(Superjomn) Here might result serious performance issue in the + // concurrency + // scenarios. const LoDTensorArray& step_ids_origin_; const LoDTensorArray& step_scores_origin_; LoDTensorArray step_ids_ = LoDTensorArray(); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 8181897c3d3844bda5574e85a08b2af038fcd664..e9d2e84a434d7084c526a6e75363a65577197262 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/jit_kernel.h" #include "paddle/fluid/operators/math/math_function.h" namespace paddle { @@ -69,9 +70,6 @@ class CRFDecodingOpKernel : public framework::OpKernel { auto emission_dims = emission_weights.dims(); const size_t seq_len = emission_dims[0]; const size_t tag_num = emission_dims[1]; - - const size_t state_trans_base_idx = 2; - const T* x = emission_weights.data(); const T* w = transition_weights.data(); int64_t* path = decoded_path->data(); @@ -84,221 +82,10 @@ class CRFDecodingOpKernel : public framework::OpKernel { Tensor track; int* track_value = track.mutable_data(emission_dims, platform::CPUPlace()); - -#ifdef __AVX__ -// It use the AVX or AVX512 instruction to deal the data as the vector of 8 or -// 16 elements per iteration. Then it can implement the parallel processing. -// Only optimize for float type. -#ifdef __AVX512F__ - size_t step_size = 16; -#else - size_t step_size = 8; -#endif - if (std::is_same::value && (tag_num >= step_size)) { - size_t steps = tag_num / step_size; - size_t remain = tag_num % step_size; - int last_offset = static_cast(remain) - static_cast(step_size); - - // Setup the alpha initial value. - size_t i_offset = 0; - for (size_t i = 0; i <= steps; ++i) { -#ifdef __AVX512F__ - // Declare the variable for the content of weights, input and alpha - // values. - __m512 w_content, x_content, alpha_content; - - // Load the relevant data into the variables from un-aligned address. - w_content = _mm512_loadu_ps((const float*)(w + i_offset)); - x_content = _mm512_loadu_ps((const float*)(x + i_offset)); - alpha_content = _mm512_add_ps(w_content, x_content); - - // Save the alpha value. - _mm512_storeu_ps(reinterpret_cast(alpha_value + i_offset), - alpha_content); -#else - // Declare the variable for the content of weights, input and alpha - // values. - __m256 w_content, x_content, alpha_content; - - // Load the relevant data into the variables from un-aligned address. - w_content = _mm256_loadu_ps((const float*)(w + i_offset)); - x_content = _mm256_loadu_ps((const float*)(x + i_offset)); - alpha_content = _mm256_add_ps(w_content, x_content); - - // Save the alpha value. - _mm256_storeu_ps(reinterpret_cast(alpha_value + i_offset), - alpha_content); -#endif - i_offset += step_size; - if (i == steps - 1) { - if (remain > 0) { - i_offset += last_offset; - } else { - break; - } - } - } - - // Use the column-major strategy to get the location of maximum score. - size_t seq_offset = 0; - for (size_t k = 1; k < seq_len; ++k) { - size_t j_offset = 0; - for (size_t j = 0; j <= steps; ++j) { -#ifdef __AVX512F__ - // Initialize the variables of maximum score and location. - __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); - __m512i max_j = _mm512_setzero_si512(); -#else - // Initialize the variables of maximum score and location. - __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); - __m256i max_j = _mm256_set1_epi32(0); -#endif - // Calculate the offset of transition_weights. - size_t trans_offset = state_trans_base_idx * tag_num + j_offset; - for (size_t i = 0; i < tag_num; ++i) { -#ifdef __AVX512F__ - // Initalize the content of alpha variable with related offset. - __m512 alpha_content = - _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i)); - // Obtain the content of weights from un-aligned address. - __m512 w_content = - _mm512_loadu_ps((const float*)(w + trans_offset)); - - __m512 score_v = _mm512_add_ps(alpha_content, w_content); - - __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); - - // According to the mask value, it update the index of the max_score - // location. - max_j = _mm512_mask_set1_epi32(max_j, mask, i); - - // Update the max_score value. - max_score = _mm512_max_ps(max_score, score_v); -#else - // Initalize the content of alpha variable with related offset. - __m256 alpha_content = _mm256_broadcast_ss( - (const float*)(alpha_value + seq_offset + i)); - // Obtain the content of weights from un-aligned address. - __m256 w_content = - _mm256_loadu_ps((const float*)(w + trans_offset)); - __m256 score_v = _mm256_add_ps(alpha_content, w_content); - - __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); - -#ifdef __AVX2__ - // According to the mask value, it update the index of the max_score - // location. - max_j = _mm256_or_si256( - _mm256_andnot_si256((__m256i)mask, max_j), - _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); -#else - __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); - __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); - __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); - __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); - - lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); - hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); - lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); - hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); - - lo_max_j = _mm_or_si128(lo_mask, lo_max_j); - hi_max_j = _mm_or_si128(hi_mask, hi_max_j); - - // According to the mask value, it update the index of the max_score - // location. - max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); - max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); -#endif - - // Update the max_score value. - max_score = _mm256_max_ps(max_score, score_v); -#endif - trans_offset += tag_num; - } - -#ifdef __AVX512F__ - // Update the alpha and track values. - __m512 x_content = _mm512_loadu_ps( - (const float*)(x + seq_offset + tag_num + j_offset)); - max_score = _mm512_add_ps(max_score, x_content); - _mm512_storeu_ps(reinterpret_cast(alpha_value + seq_offset + - tag_num + j_offset), - max_score); - _mm512_storeu_si512( - reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num + - j_offset), - max_j); -#else - // Update the alpha and track values. - __m256 x_content = _mm256_loadu_ps( - (const float*)(x + seq_offset + tag_num + j_offset)); - max_score = _mm256_add_ps(max_score, x_content); - _mm256_storeu_ps(reinterpret_cast(alpha_value + seq_offset + - tag_num + j_offset), - max_score); - _mm256_storeu_si256( - reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num + - j_offset), - max_j); -#endif - - // Calculate the offset of next step - j_offset += step_size; - if (j == steps - 1) { - if (remain > 0) { - j_offset += last_offset; - } else { - break; - } - } - } - - seq_offset += tag_num; - } - } else { - for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; - - for (size_t k = 1; k < seq_len; ++k) { - for (size_t i = 0; i < tag_num; ++i) { - T max_score = -std::numeric_limits::max(); - int max_j = 0; - for (size_t j = 0; j < tag_num; ++j) { - T score = alpha_value[(k - 1) * tag_num + j] + - w[(j + state_trans_base_idx) * tag_num + i]; - if (score > max_score) { - max_score = score; - max_j = j; - } - } - - alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; - track_value[k * tag_num + i] = max_j; - } - } - } -#else - for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i]; - - for (size_t k = 1; k < seq_len; ++k) { - for (size_t i = 0; i < tag_num; ++i) { - T max_score = -std::numeric_limits::max(); - int max_j = 0; - for (size_t j = 0; j < tag_num; ++j) { - T score = alpha_value[(k - 1) * tag_num + j] + - w[(j + state_trans_base_idx) * tag_num + i]; - if (score > max_score) { - max_score = score; - max_j = j; - } - } - - alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i]; - track_value[k * tag_num + i] = max_j; - } - } - -#endif + const auto& ker = math::jitkernel::KernelPool::Instance() + .template Get>( + static_cast(tag_num)); + ker->Compute(static_cast(seq_len), x, w, alpha_value, track_value); T max_score = -std::numeric_limits::max(); int max_i = 0; for (size_t i = 0; i < tag_num; ++i) { diff --git a/paddle/fluid/operators/delete_var_op.cc b/paddle/fluid/operators/delete_var_op.cc index d7a9bfbc437dbf4c723b9c87ff62ec6b62c38638..89416f7ab5d07ddac5b540b9bb361f831c1ef360 100644 --- a/paddle/fluid/operators/delete_var_op.cc +++ b/paddle/fluid/operators/delete_var_op.cc @@ -32,6 +32,11 @@ class DeleteVarOp : public framework::OperatorBase { } }; +class DeleteVarOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override {} +}; + class DeleteVarOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -48,4 +53,5 @@ It should not be configured by users directly. REGISTER_OPERATOR(delete_var, paddle::operators::DeleteVarOp, paddle::framework::EmptyGradOpMaker, - paddle::operators::DeleteVarOpInfoMaker); + paddle::operators::DeleteVarOpInfoMaker, + paddle::operators::DeleteVarOpShapeInference); diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 339e63a2be13cec7b641b3a9eeb083480fc4b86e..fddd6884017c35112ba48f245759f5d846b55f9a 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -439,31 +439,88 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - // TODO(buxingyuan): Add Document - AddInput("RpnRois", "RpnRois."); - AddInput("GtClasses", "GtClasses."); - AddInput("IsCrowd", "IsCrowd."); - AddInput("GtBoxes", "GtBoxes."); - AddInput("ImInfo", "ImInfo."); - - AddOutput("Rois", "Rois."); - AddOutput("LabelsInt32", "LabelsInt32."); - AddOutput("BboxTargets", "BboxTargets."); - AddOutput("BboxInsideWeights", "BboxInsideWeights."); - AddOutput("BboxOutsideWeights", "BboxOutsideWeights."); - - AddAttr("batch_size_per_im", "batch_size_per_im"); - AddAttr("fg_fraction", "fg_fraction"); - AddAttr("fg_thresh", "fg_thresh"); - AddAttr("bg_thresh_hi", "bg_thresh_hi"); - AddAttr("bg_thresh_lo", "bg_thresh_lo"); - AddAttr>("bbox_reg_weights", "bbox_reg_weights"); - AddAttr("class_nums", "class_nums"); - AddAttr("use_random", "use_random").SetDefault(true); + AddInput( + "RpnRois", + "(LoDTensor), This input is a 2D LoDTensor with shape [N, 4]. " + "N is the number of the GenerateProposalOp's output, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddInput("GtClasses", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a class label of groundtruth."); + AddInput( + "IsCrowd", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 1]. " + "M is the number of groundtruth, " + "each element is a flag indicates whether a groundtruth is crowd."); + AddInput( + "GtBoxes", + "(LoDTensor), This input is a 2D LoDTensor with shape [M, 4]. " + "M is the number of groundtruth, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddInput("ImInfo", + "(Tensor), This input is a 2D Tensor with shape [B, 3]. " + "B is the number of input images, " + "each element consists of im_height, im_width, im_scale."); + + AddOutput( + "Rois", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4]. " + "P usuall equal to batch_size_per_im * batch_size, " + "each element is a bounding box with [xmin, ymin, xmax, ymax] format."); + AddOutput("LabelsInt32", + "(LoDTensor), This output is a 2D LoDTensor with shape [P], " + "each element repersents a class label of a roi"); + AddOutput("BboxTargets", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element repersents a box label of a roi"); + AddOutput( + "BboxInsideWeights", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element indicates whether a box should contribute to loss."); + AddOutput( + "BboxOutsideWeights", + "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " + "class_nums], " + "each element indicates whether a box should contribute to loss."); + + AddAttr("batch_size_per_im", "Batch size of rois per images."); + AddAttr("fg_fraction", + "Foreground fraction in total batch_size_per_im."); + AddAttr( + "fg_thresh", + "Overlap threshold which is used to chose foreground sample."); + AddAttr("bg_thresh_hi", + "Overlap threshold upper bound which is used to chose " + "background sample."); + AddAttr("bg_thresh_lo", + "Overlap threshold lower bound which is used to chose " + "background sample."); + AddAttr>("bbox_reg_weights", "Box regression weights."); + AddAttr("class_nums", "Class number."); + AddAttr( + "use_random", + "Use random sampling to choose foreground and background boxes.") + .SetDefault(true); AddComment(R"DOC( -Generate Proposals Labels Operator. -)DOC"); +This operator can be, for given the GenerateProposalOp output bounding boxes and groundtruth, +to sample foreground boxes and background boxes, and compute loss target. + +RpnRois is the output boxes of RPN and was processed by generate_proposal_op, these boxes +were combined with groundtruth boxes and sampled according to batch_size_per_im and fg_fraction, +If an instance with a groundtruth overlap greater than fg_thresh, then it was considered as a foreground sample. +If an instance with a groundtruth overlap greater than bg_thresh_lo and lower than bg_thresh_hi, +then it was considered as a background sample. +After all foreground and background boxes are chosen (so called Rois), +then we apply random sampling to make sure +the number of foreground boxes is no more than batch_size_per_im * fg_fraction. + +For each box in Rois, we assign the classification (class label) and regression targets (box label) to it. +Finally BboxInsideWeights and BboxOutsideWeights are used to specify whether it would contribute to training loss. + )DOC"); } }; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index a69d9c9a529f26b3981ca8d1ba226fda71b8820a..709c2dfc4b7c67d7d04074c58ce6da85b6e790fe 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -284,7 +284,7 @@ static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, selected_indices.push_back(idx); ++selected_num; } - sorted_indices.erase(sorted_indices.end()); + sorted_indices.erase(sorted_indices.end() - 1); if (flag && eta < 1 && adaptive_threshold > 0.5) { adaptive_threshold *= eta; } diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 07322e720f26213ea777be3cd22f2fead28507f0..3c28ef30922e6d6ba09b96282619eef15867631e 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/dropout_op.h" +#include namespace paddle { namespace operators { @@ -57,6 +58,29 @@ class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { "will be dropped.") .SetDefault(false); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "There are two kinds of ways to implement dropout" + "(the mask below is a tensor have the same shape with input" + "the value of mask is 0 or 1, the ratio of 0 is dropout_prob)" + "1. downgrade_in_infer(default), downgrade the outcome at inference " + "time" + " train: out = input * mask" + " inference: out = input * dropout_prob" + "2. upscale_in_train, upscale the outcome at training time, do nothing " + "in inference" + " train: out = input * mask / ( 1.0 - dropout_prob )" + " inference: out = input" + " dropout op can be removed from the program. the program will be " + "efficient") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string& type) { + PADDLE_ENFORCE( + type == "downgrade_in_infer" || type == "upscale_in_train", + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train"); + }); AddComment(R"DOC( Dropout Operator. @@ -104,7 +128,9 @@ REGISTER_OPERATOR(dropout, ops::DropoutOp, ops::DropoutOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, ops::CPUDropoutKernel, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( dropout_grad, - ops::DropoutGradKernel); + ops::DropoutGradKernel, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 1dd66e0280c46c0624ff70e822cb6fa6f06b7aa9..e011f47e086183a4ef3a3373c17acd6c21b6cf7e 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/float16.h" @@ -26,7 +27,8 @@ namespace operators { template __global__ void RandomGenerator(const size_t n, const int seed, const float dropout_prob, const T* src, - T* mask_data, T* dst) { + T* mask_data, T* dst, + bool is_upscale_in_train) { thrust::minstd_rand rng; rng.seed(seed); thrust::uniform_real_distribution dist(0, 1); @@ -47,7 +49,11 @@ __global__ void RandomGenerator(const size_t n, const int seed, if (dist(rng) < dropout_prob) { mask = static_cast(0); } else { - mask = static_cast(1); + if (is_upscale_in_train) { + mask = static_cast(1.0f / (1.0f - dropout_prob)); + } else { + mask = static_cast(1); + } } dest = s * mask; mask_data[idx] = mask; @@ -67,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel { y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); + auto dropout_implementation = + context.Attr("dropout_implementation"); auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); @@ -83,11 +91,16 @@ class GPUDropoutKernel : public framework::OpKernel { int grid = (x->numel() + threads - 1) / threads; RandomGenerator< T><<>>( - size, seed, dropout_prob, x_data, mask_data, y_data); + size, seed, dropout_prob, x_data, mask_data, y_data, + (dropout_implementation == "upscale_in_train")); } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - Y.device(place) = X * static_cast(1.0f - dropout_prob); + if (dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - dropout_prob); + } } } }; @@ -99,6 +112,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL( dropout, ops::GPUDropoutKernel, - ops::GPUDropoutKernel); -REGISTER_OP_CUDA_KERNEL(dropout_grad, - ops::DropoutGradKernel); + ops::GPUDropoutKernel, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, ops::DropoutGradKernel, + ops::DropoutGradKernel); diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index 0628b4b826d2730a8e3fb4842e4ae550b8c00569..6c629b7b6d255828023ed25680675ca104a33e12 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -36,6 +37,8 @@ class CPUDropoutKernel : public framework::OpKernel { auto* y_data = y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); + auto dropout_implementation = + context.Attr("dropout_implementation"); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -49,14 +52,20 @@ class CPUDropoutKernel : public framework::OpKernel { engine.seed(seed); std::uniform_real_distribution dist(0, 1); + size_t size = framework::product(mask->dims()); for (size_t i = 0; i < size; ++i) { if (dist(engine) < dropout_prob) { mask_data[i] = 0; y_data[i] = 0; } else { - mask_data[i] = 1; - y_data[i] = x_data[i]; + if (dropout_implementation == "upscale_in_train") { + mask_data[i] = 1.0f / static_cast(1.0f - dropout_prob); + y_data[i] = x_data[i] / static_cast(1.0f - dropout_prob); + } else { + mask_data[i] = 1; + y_data[i] = x_data[i]; + } } } } else { @@ -64,7 +73,11 @@ class CPUDropoutKernel : public framework::OpKernel { auto Y = EigenMatrix::Reshape(*y, 1); auto& place = *context.template device_context().eigen_device(); - Y.device(place) = X * (1.0f - dropout_prob); + if (dropout_implementation == "upscale_in_train") { + Y.device(place) = X; + } else { + Y.device(place) = X * static_cast(1.0f - dropout_prob); + } } } }; diff --git a/paddle/fluid/operators/fake_init_op.cc b/paddle/fluid/operators/fake_init_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..28ebdcb03ea83f3ec701106111a7cc5f0f7ed7dc --- /dev/null +++ b/paddle/fluid/operators/fake_init_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +class FakeInitInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of FakeInitOp should not be null."); + auto &shape = ctx->Attrs().Get>("shape"); + ctx->SetOutputDim("Out", framework::make_ddim(shape)); + } +}; + +class FakeInitOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + framework::Tensor *tensor = nullptr; + + auto &out_var = *scope.FindVar(Output("Out")); + + if (out_var.IsType()) { + tensor = out_var.GetMutable(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else if (out_var.IsType()) { + tensor = out_var.GetMutable()->mutable_value(); + tensor->Resize(framework::make_ddim(Attr>("shape"))); + } else { + PADDLE_THROW( + "fake init op's output only" + "supports SelectedRows and LoDTensor"); + } + } +}; + +class FakeInitOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; + +class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddAttr>("shape", + "(vector) The shape of the output"); + AddOutput("Out", + "(Tensor) Tensor of specified shape will be filled " + "with the specified value"); + AddComment(R"DOC( +FakeInit Operator. + +Init an variable but not alloc memory for it, it is used for init the +table parameter at trainer side in distributed lookup table. + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fake_init, ops::FakeInitOp, ops::FakeInitInferShape, + ops::FakeInitOpMaker, paddle::framework::EmptyGradOpMaker, + ops::FakeInitOpVarTypeInference); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index e04a68717b351ddb0be5a7e70aa9297e5eb0125f..252f313440296bd9e5eebf26f67b08bbe7decce8 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -24,7 +24,7 @@ class FillConstantInferShape : public framework::InferShapeBase { void operator()(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillConstantOp should not be null."); - auto &shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); ctx->SetOutputDim("Out", framework::make_ddim(shape)); } }; @@ -47,10 +47,10 @@ class FillConstantOp : public framework::OperatorBase { if (out_var.IsType()) { tensor = out_var.GetMutable(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); + tensor->Resize(framework::make_ddim(Attr>("shape"))); } else if (out_var.IsType()) { tensor = out_var.GetMutable()->mutable_value(); - tensor->Resize(framework::make_ddim(Attr>("shape"))); + tensor->Resize(framework::make_ddim(Attr>("shape"))); } else { PADDLE_THROW( "fill constant op's output only" @@ -83,7 +83,8 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { "(int, default 5 (FP32)) " "Output data type") .SetDefault(framework::proto::VarType::FP32); - AddAttr>("shape", "(vector) The shape of the output"); + AddAttr>("shape", + "(vector) The shape of the output"); AddAttr("value", "(float, default 0) The value to be filled") .SetDefault(0.0f); AddAttr("force_cpu", diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 089b541a0a61adb5efda6b2e027c913d5808dff0..f84ff206fffddef1030b7ed439e887bdfef342a6 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -102,7 +102,9 @@ REGISTER_OPERATOR(gather, ops::GatherOp, ops::GatherOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(gather_grad, ops::GatherGradOp); REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel, - ops::GatherOpKernel, ops::GatherOpKernel); + ops::GatherOpKernel, ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel); diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 7e014dd1cb47ee0575308dc13ba7bc7617baebff..9f4aef08cd58e72ce344a640e6564b9e360ce169 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -61,5 +61,11 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/gaussian_random_op.cc b/paddle/fluid/operators/gaussian_random_op.cc index 1488aab1926b5b4ba7bceed582700f5a11fc6c93..c70d5b8bc7569c38cbc003aca7c62dc503df11cf 100644 --- a/paddle/fluid/operators/gaussian_random_op.cc +++ b/paddle/fluid/operators/gaussian_random_op.cc @@ -52,7 +52,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of GaussianRandomOp should not be null."); - auto shape = ctx->Attrs().Get>("shape"); + auto shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -88,9 +88,9 @@ class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "Output matrix of gaussian random op"); - AddAttr>("shape", - "(vector) " - "The dimension of random tensor."); + AddAttr>("shape", + "(vector) " + "The dimension of random tensor."); AddAttr("mean", "(float, default 0.0) " "mean of random tensor.") diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9ebe71a3d7ae270a10a45f4805652415078b363 --- /dev/null +++ b/paddle/fluid/operators/hash_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/hash_op.h" +#include +#include + +namespace paddle { +namespace operators { + +class HashOp : public framework::OperatorWithKernel { + public: + HashOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of HashOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of HashOp should not be null."); + + auto dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(dims.size(), 2UL, + "The input of hash_op's dimensions must be 2"); + std::vector out_dims; + out_dims.reserve(dims.size() + 1); + // copy all dims except the last one + for (size_t i = 0u; i != dims.size() - 1; ++i) { + out_dims.emplace_back(dims[i]); + } + int num_hash = ctx->Attrs().Get("num_hash"); + out_dims.emplace_back(num_hash); + // keep the last dim to 1 + out_dims.emplace_back(1); + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + ctx->ShareLoD("X", /*->*/ "Out"); + } +}; + +class HashOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input tensor of scale operator."); + AddOutput("Out", "(Tensor) Output tensor of scale operator."); + AddComment(R"DOC( +**Hash Operator** +$$Out = scale * X$$ +)DOC"); + AddAttr("num_hash", "").SetDefault(1); + AddAttr("mod_by", "").SetDefault(100000); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(hash, ops::HashOp, ops::HashOpMaker); +REGISTER_OP_CPU_KERNEL(hash, ops::HashKerel, ops::HashKerel); diff --git a/paddle/fluid/operators/hash_op.h b/paddle/fluid/operators/hash_op.h new file mode 100644 index 0000000000000000000000000000000000000000..9781bb0f453642cefb3eb59a05389c339a7de39d --- /dev/null +++ b/paddle/fluid/operators/hash_op.h @@ -0,0 +1,56 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +extern "C" { +#include +} +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { +// template +template +class HashKerel : public framework::OpKernel { + public: + virtual void Compute(const framework::ExecutionContext& context) const { + auto* out_t = context.Output("Out"); + auto* in_t = context.Input("X"); + int mod_by = context.Attr("mod_by"); + int num_hash = context.Attr("num_hash"); + auto* output = out_t->mutable_data(context.GetPlace()); + + auto in_dims = in_t->dims(); + auto in_lod = in_t->lod(); + PADDLE_ENFORCE_EQ( + static_cast(in_dims[0]), in_lod[0].back(), + "The actual input data's size mismatched with LoD information."); + + auto seq_length = in_dims[0]; + auto last_dim = in_dims[in_dims.size() - 1]; + auto* input = in_t->data(); + for (int idx = 0; idx < seq_length; ++idx) { + for (int ihash = 0; ihash != num_hash; ++ihash) { + output[idx * num_hash + ihash] = + XXH64(input, sizeof(int) * last_dim, ihash) % mod_by; + } + input += last_dim; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lars_momentum_op.cc b/paddle/fluid/operators/lars_momentum_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8dda93902448fa1bd21b719ffd9c9b500caf755 --- /dev/null +++ b/paddle/fluid/operators/lars_momentum_op.cc @@ -0,0 +1,86 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/lars_momentum_op.h" +#include "paddle/fluid/operators/momentum_op.h" + +namespace paddle { +namespace operators { + +class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", + "(LoDTensor, default LoDTensor) " + "Input parameter that has to be updated"); + AddInput("Grad", + "(LoDTensor, default LoDTensor) " + "Input gradient of the parameter"); + AddInput("Velocity", + "(LoDTensor, default LoDTensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated"); + AddInput("LearningRate", + "(LoDTensor, default LoDTensor) " + "Input learning rate"); + + AddOutput("ParamOut", + "(LoDTensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(LoDTensor) This output is updated velocity. " + "It shared memory with Input(Velocity)."); + + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") + .SetDefault(0.001); + AddAttr("lars_weight_decay", + "(float, default 0.0005) LARS weight decay") + .SetDefault(0.0005); + + AddComment(R"DOC( +Lars Momentum Optimizer. + +This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each +weight using a local learning rate: + +$$ +local\_lr = \eta * + \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\ +velocity = mu * velocity + + local\_lr * (grad + \beta * param) \\ +param = param - velocity. \\ +$$ + +Note that we use lars_weight_decay here to decay weights, you may need not to +use L2 regularizers in case of using LARS. + +)DOC"); + } +}; + +class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override {} +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::LarsMomentumOpVarTypeInference); +REGISTER_OP_CPU_KERNEL(lars_momentum, ops::LarsMomentumOpKernel, + ops::LarsMomentumOpKernel); diff --git a/paddle/fluid/operators/lars_momentum_op.cu b/paddle/fluid/operators/lars_momentum_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..eb346851a2f690fa05422c84ddcb08307539048f --- /dev/null +++ b/paddle/fluid/operators/lars_momentum_op.cu @@ -0,0 +1,94 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/lars_momentum_op.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumLarsKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, const T lars_coeff, + const T lars_weight_decay, const T* p_norm, + const T* g_norm, T* p_out, T* v_out) { + T lr = learning_rate[0]; + T local_lr = learning_rate[0]; + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + if (p_norm[0] > 0 && g_norm[0] > 0) { + local_lr = lr * lars_coeff * p_norm[0] / + (g_norm[0] + lars_weight_decay * p_norm[0]); + } + T v_new = v[i] * mu + local_lr * (g[i] + lars_weight_decay * p[i]); + v_out[i] = v_new; + p_out[i] = p[i] - v_new; + } +} + +template +class LarsMomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + T lars_coeff = ctx.Attr("lars_coeff"); + T lars_weight_decay = ctx.Attr("lars_weight_decay"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + + auto eigen_p = framework::EigenVector::Flatten(*param); + auto eigen_g = framework::EigenVector::Flatten(*grad); + // calculate norms using eigein and launch the kernel. + framework::Tensor p_norm_t, g_norm_t; + p_norm_t.Resize({1}); + g_norm_t.Resize({1}); + auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); + auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); + + auto* place = ctx.template device_context().eigen_device(); + ep_norm.device(*place) = eigen_p.square().sum().sqrt(); + eg_norm.device(*place) = eigen_g.square().sum().sqrt(); + MomentumLarsKernel<<>>( + p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, + p_norm_data, g_norm_data, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + lars_momentum, + ops::LarsMomentumOpCUDAKernel, + ops::LarsMomentumOpCUDAKernel); diff --git a/paddle/fluid/operators/lars_momentum_op.h b/paddle/fluid/operators/lars_momentum_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e85be99fc42522e461a7915847d82144d8195a96 --- /dev/null +++ b/paddle/fluid/operators/lars_momentum_op.h @@ -0,0 +1,72 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class LarsMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto learning_rate = ctx.Input("LearningRate"); + auto* grad_var = ctx.InputVar("Grad"); + // only support dense for now. + PADDLE_ENFORCE(grad_var->IsType()); + auto grad = ctx.Input("Grad"); + + param_out->mutable_data(ctx.GetPlace()); + velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + T lars_coeff = ctx.Attr("lars_coeff"); + T lars_weight_decay = ctx.Attr("lars_weight_decay"); + + auto p_out = framework::EigenVector::Flatten(*param_out); + auto v_out = framework::EigenVector::Flatten(*velocity_out); + + auto p = framework::EigenVector::Flatten(*param); + auto v = framework::EigenVector::Flatten(*velocity); + auto g = framework::EigenVector::Flatten(*grad); + auto* lr = learning_rate->data(); + + framework::Tensor p_norm_t, g_norm_t; + p_norm_t.Resize({1}); + g_norm_t.Resize({1}); + p_norm_t.mutable_data(ctx.GetPlace()); + g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); + + ep_norm = p.square().sum().sqrt(); + eg_norm = g.square().sum().sqrt(); + T local_lr = lr[0]; + if (ep_norm(0) > 0 && eg_norm(0) > 0) { + local_lr = lr[0] * lars_coeff * ep_norm(0) / + (eg_norm(0) + lars_weight_decay * ep_norm(0)); + } + v_out = v * mu + local_lr * (g + lars_weight_decay * p); + p_out = p - v_out; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 26f09c46c2224a4a46d302dff4b2ec594f0be103..a038bad701ba8ede3065af9f352f1f21784a50b7 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -27,6 +27,10 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" +DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send"); +DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get"); +DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch"); + namespace paddle { namespace operators { @@ -332,11 +336,14 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, sync_mode, checkpoint_block_id)); rpc_service_->RegisterRPC(distributed::kRequestSend, - request_send_handler_.get()); + request_send_handler_.get(), + FLAGS_rpc_send_thread_num); rpc_service_->RegisterRPC(distributed::kRequestGet, - request_get_handler_.get()); + request_get_handler_.get(), + FLAGS_rpc_get_thread_num); rpc_service_->RegisterRPC(distributed::kRequestPrefetch, - request_prefetch_handler_.get()); + request_prefetch_handler_.get(), + FLAGS_rpc_prefetch_thread_num); rpc_service_->RegisterRPC(distributed::kRequestCheckpoint, request_checkpoint_handler_.get()); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index b9ac54e446811889b647397ae1fbb11c28f46777..3226a727b1f5f6de9e97ce2068381be7c9b69ff3 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -81,6 +81,12 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") .SetDefault(kNoPadding); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); AddComment(R"DOC( Lookup Table Operator. @@ -115,7 +121,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out")); return framework::OpKernelType(data_type, ctx.device_context()); } }; diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 58463dc4d6fd7cc3454de766814a947fee161070..e504c4f0cd5c0feaef4a251fad57b389a10a2ce7 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -68,6 +69,7 @@ class LookupTableKernel : public framework::OpKernel { const auto *table = table_t.value().data(); auto *output = output_t->mutable_data(context.GetPlace()); + auto blas = math::GetBlas(context); for (int64_t i = 0; i < ids_numel; ++i) { if (padding_idx != kNoPadding && ids[i] == padding_idx) { memset(output + i * row_width, 0, row_width * sizeof(T)); @@ -75,8 +77,8 @@ class LookupTableKernel : public framework::OpKernel { PADDLE_ENFORCE_GE(ids[i], 0); auto id_index = table_t.Index(ids[i]); PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - memcpy(output + i * row_width, table + id_index * row_width, - row_width * sizeof(T)); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); } } } @@ -111,27 +113,37 @@ class LookupTableGradKernel : public framework::OpKernel { auto *ids_data = ids->data(); int64_t ids_num = ids->numel(); - framework::Vector new_rows; - new_rows.reserve(ids_num); - for (int64_t i = 0; i < ids_num; i++) { - new_rows.push_back(ids_data[i]); - } + std::vector new_rows; + new_rows.resize(ids_num); + std::memcpy(&new_rows[0], ids_data, ids_num * sizeof(int64_t)); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_num, table_dim[1]}); - d_table_value->mutable_data(context.GetPlace()); - - d_table->set_height(table_dim[0]); - - auto *d_output_data = d_output->data(); - auto *d_table_data = d_table_value->data(); - - auto d_output_dims = d_output->dims(); - PADDLE_ENFORCE_EQ( - d_table_value->dims(), - framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); - memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + // FIXME(minqiyang): + // memory optimization will NOT reuse Tensor with SelectedRows + // so we could just share the tensor here directly. + // However, the InferVarType method will infer the output SelectedRows + // to Tensor sometimes, which is a bug, so we will add an attribute + // here to indicate the inplace and remove this attribute after + // the InferVarType's bug was fixed + bool grad_inplace = context.Attr("grad_inplace"); + if (grad_inplace) { + d_table_value->ShareDataWith(*d_output); + } else { + d_table_value->mutable_data(context.GetPlace()); + + d_table->set_height(table_dim[0]); + + auto *d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); + + auto d_output_dims = d_output->dims(); + PADDLE_ENFORCE_EQ( + d_table_value->dims(), + framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1)); + memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel()); + } } else { auto *ids = context.Input("Ids"); auto *d_output = context.Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 55e2ea760158cda631ec07e2c7d318ec1cf79b77..17b675fba8067851f6149edafcc9096690a3fd34 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -76,6 +76,6 @@ endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) cc_library(jit_kernel - SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc + SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc DEPS cpu_info cblas) cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/algorithm.h b/paddle/fluid/operators/math/algorithm.h index 262469beea7449eb5820b86de1ac4f790a833e79..2e75b6abce5e1f43742ee15bff1dac4801186cd4 100644 --- a/paddle/fluid/operators/math/algorithm.h +++ b/paddle/fluid/operators/math/algorithm.h @@ -39,6 +39,52 @@ HOSTDEVICE inline int64_t BinarySearch(const T *x, int64_t num, const T &val) { return -1; } +template +HOSTDEVICE inline size_t LowerBound(const T *x, size_t num, const T &val) { +#ifdef __CUDA_ARCH__ + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/lower_bound + auto *first = x; + int64_t count = static_cast(num); + while (count > 0) { + int64_t step = (count >> 1); + auto *it = first + step; + if (*it < val) { + first = ++it; + count -= (step + 1); + } else { + count = step; + } + } + return static_cast(first - x); +#else + return static_cast(std::lower_bound(x, x + num, val) - x); +#endif +} + +template +HOSTDEVICE inline size_t UpperBound(const T *x, size_t num, const T &val) { +#ifdef __CUDA_ARCH__ + // The following code is from + // https://en.cppreference.com/w/cpp/algorithm/upper_bound + auto *first = x; + int64_t count = static_cast(num); + while (count > 0) { + auto step = (count >> 1); + auto *it = first + step; + if (val < *it) { + count = step; + } else { + first = ++it; + count -= (step + 1); + } + } + return static_cast(first - x); +#else + return static_cast(std::upper_bound(x, x + num, val) - x); +#endif +} + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 9088d0c7a6307c3fbd9707c719ec9e6f6c85fbdb..48e180b1fd43b06cc13f7a4b00c73aff2eb940ac 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -151,6 +151,13 @@ class GRUKernel : public Kernel { virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; }; +template +class CRFDecodeKernel : public Kernel { + public: + virtual void Compute(const int seq_len, const T *x, const T *w, T *alpha, + int *track) const = 0; +}; + } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc new file mode 100644 index 0000000000000000000000000000000000000000..e481d1921a7dc4fd6da3fffbc3959eafa7b4b461 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -0,0 +1,296 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/jit_kernel.h" +#include +#include +#include "paddle/fluid/operators/math/jit_kernel_macro.h" +#ifdef __AVX__ +#include +#endif + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +namespace jit = platform::jit; + +/* CRF Decode JitKernel */ +template +class CRFDecodeKernelImpl : public CRFDecodeKernel { + public: + explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { + this->num_ = tag_num; + } + void Compute(const int seq_len, const T* x, const T* w, T* alpha, + int* track) const override { + constexpr int state_trans_base_idx = 2; + for (int i = 0; i < this->num_; ++i) { + alpha[i] = w[i] + x[i]; + } + for (int k = 1; k < seq_len; ++k) { + for (int i = 0; i < this->num_; ++i) { + T max_score = -std::numeric_limits::max(); + int max_j = 0; + for (int j = 0; j < this->num_; ++j) { + T score = alpha[(k - 1) * this->num_ + j] + + w[(j + state_trans_base_idx) * this->num_ + i]; + if (score > max_score) { + max_score = score; + max_j = j; + } + } + alpha[k * this->num_ + i] = max_score + x[k * this->num_ + i]; + track[k * this->num_ + i] = max_j; + } + } + } +}; + +#define INIT_ALPHA(step_size) \ + /* Setup the alpha initial value.*/ \ + int i_offset = 0; \ + int last_offset = this->rest_ - step_size; \ + for (int i = 0; i <= this->end_; ++i) { \ + /* weights, input and alpha values. */ \ + __m256 w_content, x_content, alpha_content; \ + /* Load the relevant data into the variables from un-aligned address.*/ \ + w_content = _mm256_loadu_ps(w + i_offset); \ + x_content = _mm256_loadu_ps(x + i_offset); \ + alpha_content = _mm256_add_ps(w_content, x_content); \ + _mm256_storeu_ps(alpha + i_offset, alpha_content); \ + i_offset += step_size; \ + if (i == this->end_ - 1) { \ + if (this->rest_ > 0) { \ + i_offset += last_offset; \ + } else { \ + break; \ + } \ + } \ + } + +#define UPDATE_ALPHA(step_size) \ + /* Update the alpha and track values. */ \ + __m256 x_content = _mm256_loadu_ps(x + seq_offset + this->num_ + j_offset); \ + max_score = _mm256_add_ps(max_score, x_content); \ + _mm256_storeu_ps(alpha + seq_offset + this->num_ + j_offset, max_score); \ + _mm256_storeu_si256( \ + reinterpret_cast<__m256i*>(track + seq_offset + this->num_ + j_offset), \ + max_j); \ + /* Calculate the offset of next step*/ \ + j_offset += step_size; \ + if (j == this->end_ - 1) { \ + if (this->rest_ > 0) { \ + j_offset += last_offset; \ + } else { \ + break; \ + } \ + } + +#define INTRIAVX_FLOAT(block) \ + template <> \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + int tag_num) \ + : CRFDecodeKernel() { \ + this->num_ = tag_num; \ + this->end_ = this->num_ / AVX_FLOAT_BLOCK; \ + this->rest_ = this->num_ % AVX_FLOAT_BLOCK; \ + } \ + template <> \ + void CRFDecodeKernelImpl::Compute( \ + const int seq_len, const float* x, const float* w, float* alpha, \ + int* track) const { \ + INIT_ALPHA(AVX_FLOAT_BLOCK) \ + /* Use the column-major strategy to get the location of maximum score.*/ \ + int seq_offset = 0; \ + constexpr int state_trans_base_idx = 2; \ + for (int k = 1; k < seq_len; ++k) { \ + int j_offset = 0; \ + for (int j = 0; j <= this->end_; ++j) { \ + /* Initialize the variables of maximum score and location.*/ \ + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); \ + __m256i max_j = _mm256_set1_epi32(0); \ + /* Calculate the offset of transition_weights.*/ \ + int trans_offset = state_trans_base_idx * this->num_ + j_offset; \ + for (int i = 0; i < this->num_; ++i) { \ + /* Initalize the content of alpha variable with related offset.*/ \ + __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); \ + /* Obtain the content of weights from un-aligned address.*/ \ + __m256 w_content = _mm256_loadu_ps(w + trans_offset); \ + __m256 score_v = _mm256_add_ps(alpha_content, w_content); \ + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); \ + /* According to the mask value, update the index of the max_score.*/ \ + /* AVX instructions.*/ \ + __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \ + __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \ + __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0); \ + __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1); \ + lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \ + hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \ + lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \ + hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i)); \ + lo_max_j = _mm_or_si128(lo_mask, lo_max_j); \ + hi_max_j = _mm_or_si128(hi_mask, hi_max_j); \ + max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0); \ + max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1); \ + /* AVX done*/ \ + /* Update the max_score value.*/ \ + max_score = _mm256_max_ps(max_score, score_v); \ + trans_offset += this->num_; \ + } \ + UPDATE_ALPHA(AVX_FLOAT_BLOCK) \ + } \ + seq_offset += this->num_; \ + } \ + } + +#define INTRIAVX2_FLOAT(isa, block) \ + template <> \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl(int tag_num) \ + : CRFDecodeKernel() { \ + this->num_ = tag_num; \ + this->end_ = this->num_ / AVX2_FLOAT_BLOCK; \ + this->rest_ = this->num_ % AVX2_FLOAT_BLOCK; \ + } \ + template <> \ + void CRFDecodeKernelImpl::Compute( \ + const int seq_len, const float* x, const float* w, float* alpha, \ + int* track) const { \ + INIT_ALPHA(AVX2_FLOAT_BLOCK) \ + /* Use the column-major strategy to get the location of maximum score.*/ \ + int seq_offset = 0; \ + constexpr int state_trans_base_idx = 2; \ + for (int k = 1; k < seq_len; ++k) { \ + int j_offset = 0; \ + for (int j = 0; j <= this->end_; ++j) { \ + /* Initialize the variables of maximum score and location.*/ \ + __m256 max_score = _mm256_set1_ps(-std::numeric_limits::max()); \ + __m256i max_j = _mm256_set1_epi32(0); \ + /* Calculate the offset of transition_weights.*/ \ + int trans_offset = state_trans_base_idx * this->num_ + j_offset; \ + for (int i = 0; i < this->num_; ++i) { \ + /* Initalize the content of alpha variable with related offset.*/ \ + __m256 alpha_content = _mm256_broadcast_ss(alpha + seq_offset + i); \ + /* Obtain the content of weights from un-aligned address.*/ \ + __m256 w_content = _mm256_loadu_ps(w + trans_offset); \ + __m256 score_v = _mm256_add_ps(alpha_content, w_content); \ + __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS); \ + /* According to the mask value, update the index of the max_score.*/ \ + /* AVX2 instructions.*/ \ + max_j = _mm256_or_si256( \ + _mm256_andnot_si256((__m256i)mask, max_j), \ + _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i))); \ + /* Update the max_score value.*/ \ + max_score = _mm256_max_ps(max_score, score_v); \ + trans_offset += this->num_; \ + } \ + UPDATE_ALPHA(AVX2_FLOAT_BLOCK) \ + } \ + seq_offset += this->num_; \ + } \ + } + +#define INTRIAVX512_FLOAT(block) \ + template <> \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + int tag_num) \ + : CRFDecodeKernel() { \ + this->num_ = tag_num; \ + this->end_ = this->num_ / AVX512_FLOAT_BLOCK; \ + this->rest_ = this->num_ % AVX512_FLOAT_BLOCK; \ + } \ + template <> \ + void CRFDecodeKernelImpl::Compute( \ + const int seq_len, const float* x, const float* w, float* alpha, \ + int* track) const { \ + INIT_ALPHA(AVX512_FLOAT_BLOCK) \ + /* Use the column-major strategy to get the location of maximum score.*/ \ + int seq_offset = 0; \ + constexpr int state_trans_base_idx = 2; \ + for (int k = 1; k < seq_len; ++k) { \ + int j_offset = 0; \ + for (int j = 0; j <= this->end_; ++j) { \ + /* Initialize the variables of maximum score and location.*/ \ + __m512 max_score = _mm512_set1_ps(-std::numeric_limits::max()); \ + __m512i max_j = _mm512_setzero_si512(); \ + /* Calculate the offset of transition_weights.*/ \ + int trans_offset = state_trans_base_idx * this->num_ + j_offset; \ + for (int i = 0; i < this->num_; ++i) { \ + /* Initalize the content of alpha variable with related offset.*/ \ + __m512 alpha_content = _mm512_set1_ps(*(alpha + seq_offset + i)); \ + /* Obtain the content of weights from un-aligned address.*/ \ + __m512 w_content = _mm512_loadu_ps(w + trans_offset); \ + __m512 score_v = _mm512_add_ps(alpha_content, w_content); \ + __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS); \ + /* AVX512 instructions.*/ \ + max_j = _mm512_mask_set1_epi32(max_j, mask, i); \ + /* Update the max_score value.*/ \ + max_score = _mm512_max_ps(max_score, score_v); \ + trans_offset += this->num_; \ + } \ + /* Update the alpha and track values.*/ \ + __m512 x_content = \ + _mm512_loadu_ps(x + seq_offset + this->num_ + j_offset); \ + max_score = _mm512_add_ps(max_score, x_content); \ + _mm512_storeu_ps(alpha + seq_offset + this->num_ + j_offset, \ + max_score); \ + _mm512_storeu_si512(reinterpret_cast<__m512i*>(track + seq_offset + \ + this->num_ + j_offset), \ + max_j); \ + /* Calculate the offset of next step*/ \ + j_offset += AVX512_FLOAT_BLOCK; \ + if (j == this->end_ - 1) { \ + if (this->rest_ > 0) { \ + j_offset += last_offset; \ + } else { \ + break; \ + } \ + } \ + } \ + seq_offset += this->num_; \ + } \ + } + +#ifdef __AVX__ +INTRIAVX_FLOAT(kEQ8); +INTRIAVX_FLOAT(kGT8LT16); +INTRIAVX_FLOAT(kEQ16); +INTRIAVX_FLOAT(kGT16); +#endif +#ifdef __AVX2__ +INTRIAVX2_FLOAT(jit::avx2, kEQ8); +INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); +INTRIAVX2_FLOAT(jit::avx2, kEQ16); +INTRIAVX2_FLOAT(jit::avx2, kGT16); +#endif +#ifdef __AVX512F__ +INTRIAVX2_FLOAT(jit::avx512f, kEQ8); +INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX512_FLOAT(kEQ16); +INTRIAVX512_FLOAT(kGT16); +#endif + +#undef INTRIAVX512_FLOAT +#undef INTRIAVX2_FLOAT +#undef INTRIAVX_FLOAT +#undef INIT_ALPHA +#undef UPDATE_ALPHA + +REGISTER_JITKERNEL(crf_decode, CRFDecodeKernel); + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 08f57dd45ad76946cbcafb98a3414003ed9d67a9..75946740375d74043960b68e94eb048b3bab4b79 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include -#include +#include #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -230,8 +229,24 @@ template struct SelectedRowsAddToTensor; // add or mul. namespace scatter { -size_t FindPos(const std::vector& rows, int64_t value) { - return std::find(rows.begin(), rows.end(), value) - rows.begin(); +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_add_to(const DeviceContext& ctx, BlasT* blas, + size_t data_len, const T* in, T* out) { + blas->AXPY(data_len, 1., in, out); +} + +template +typename std::enable_if< + !std::is_floating_point::value && + std::is_same::value>::type +elementwise_add_to(const DeviceContext& ctx, BlasT* blas, + size_t data_len, const T* in, T* out) { + for (int64_t i = 0; i < data_len; i++) { + out[i] += in[i]; + } } template @@ -246,48 +261,84 @@ struct MergeAdd { void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, framework::SelectedRows* output) { - framework::SelectedRows& out = *output; - std::vector input_rows(input.rows()); + std::vector inputs; + inputs.push_back(&input); + (*this)(context, inputs, output); + } - std::map> merge_row_map; - for (size_t i = 0; i < input_rows.size(); ++i) { - merge_row_map[input_rows[i]].push_back(i); + void operator()(const platform::CPUDeviceContext& context, + const std::vector& inputs, + framework::SelectedRows* output) { + if (inputs.size() == 0) { + VLOG(3) << "no input! return"; + return; } - - std::vector merge_rows(merge_row_map.size()); - size_t idx = 0; - int64_t input_width = input.value().dims()[1]; - out.set_height(input.height()); - - T* out_data = out.mutable_value()->mutable_data( + const framework::SelectedRows* has_value_input = nullptr; + for (auto* in : inputs) { + if (in->rows().size() > 0) { + has_value_input = in; + break; + } + } + if (has_value_input == nullptr) { + VLOG(3) << "no input has value! just return" << std::endl; + return; + } + auto input_width = has_value_input->value().dims()[1]; + auto input_height = has_value_input->height(); + framework::SelectedRows& out = *output; + std::set merged_row_set; + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], + "all input should have same " + "dimension except for the first one"); + PADDLE_ENFORCE_EQ(input_height, input->height(), + "all input should have same height"); + merged_row_set.insert(input->rows().begin(), input->rows().end()); + } + std::vector merge_rows(merged_row_set.begin(), + merged_row_set.end()); + std::unordered_map rows_to_id; + for (size_t i = 0; i < merge_rows.size(); ++i) { + rows_to_id[merge_rows[i]] = i; + } + out.set_rows(merge_rows); + out.set_height(input_height); + out.mutable_value()->mutable_data( framework::make_ddim( {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - const T* in_data = input.value().data(); - - for (auto& row_pair : merge_row_map) { - auto* out_ptr = out_data + idx * input_width; - auto& rows = row_pair.second; - merge_rows[idx] = row_pair.first; - ++idx; - // rows.size() is always larger than 0 - std::memcpy(out_ptr, in_data + rows[0] * input_width, - sizeof(T) * input_width); - - for (size_t i = 1; i < rows.size(); ++i) { - auto* in_ptr = in_data + rows[i] * input_width; - for (int64_t j = 0; j < input_width; ++j) { - out_ptr[j] += in_ptr[j]; - } + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + + auto blas = math::GetBlas(context); + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + + for (size_t i = 0; i < input_rows.size(); i++) { + size_t out_i = rows_to_id[input_rows[i]]; + elementwise_add_to( + context, &blas, static_cast(input_width), + &input_data[i * input_width], &out_data[out_i * input_width]); } } - - out.set_rows(merge_rows); } }; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; +template struct MergeAdd; template struct UpdateToTensor { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index ba8eccf82042b679f69a32f9d053f05ac8fb9a99..10f39822b9c904ce236a1a2a3806d70693bd2e63 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -267,10 +267,15 @@ struct MergeAdd { void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, framework::SelectedRows* output) { - framework::SelectedRows& out = *output; framework::Vector input_rows(input.rows()); + if (input_rows.size() == 0) { + return; + } + + framework::SelectedRows& out = *output; std::set row_set(input_rows.begin(), input_rows.end()); - std::vector merge_rows(row_set.begin(), row_set.end()); + std::vector merge_rows_cpu(row_set.begin(), row_set.end()); + framework::Vector merge_rows(merge_rows_cpu); auto input_width = input.value().dims()[1]; @@ -296,6 +301,73 @@ struct MergeAdd { out.mutable_rows()->CUDAMutableData(context.GetPlace()), out.rows().size(), input_width); } + + void operator()(const platform::CUDADeviceContext& context, + const std::vector& inputs, + framework::SelectedRows* output) { + if (inputs.size() == 0) { + VLOG(3) << "no input! return"; + return; + } + const framework::SelectedRows* has_value_input = nullptr; + for (auto* in : inputs) { + if (in->rows().size() > 0) { + has_value_input = in; + break; + } + } + if (has_value_input == nullptr) { + VLOG(3) << "no input has value! just return" << std::endl; + return; + } + auto input_width = has_value_input->value().dims()[1]; + auto input_height = has_value_input->height(); + framework::SelectedRows& out = *output; + std::set merged_row_set; + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + PADDLE_ENFORCE_EQ(input_width, input->value().dims()[1], + "all input should have same " + "dimension except for the first one"); + PADDLE_ENFORCE_EQ(input_height, input->height(), + "all input should have same height"); + merged_row_set.insert(input->rows().begin(), input->rows().end()); + } + std::vector merge_rows_cpu(merged_row_set.begin(), + merged_row_set.end()); + framework::Vector merge_rows(merge_rows_cpu); + + out.set_rows(merge_rows); + out.set_height(input_height); + out.mutable_value()->mutable_data( + framework::make_ddim( + {static_cast(merge_rows.size()), input_width}), + context.GetPlace()); + + math::SetConstant constant_functor; + constant_functor(context, out.mutable_value(), 0.0); + + auto* out_data = out.mutable_value()->data(); + + const int block_size = 256; + dim3 threads(block_size, 1); + + for (auto* input : inputs) { + if (input->rows().size() == 0) { + continue; + } + auto* input_data = input->value().data(); + auto& input_rows = input->rows(); + dim3 grid1(input_rows.size(), 1); + + MergeAddKernel<<>>( + input_data, input_rows.CUDAData(context.GetPlace()), out_data, + out.mutable_rows()->CUDAMutableData(context.GetPlace()), + out.rows().size(), input_width); + } + } }; template struct MergeAdd; diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 900be86f91c6658a5265189a6745316c6471209e..521c53dd0d71707c13c4364c5ee59943a03d4a2d 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -83,104 +83,9 @@ struct MergeAdd { void operator()(const DeviceContext& context, const framework::SelectedRows& input, framework::SelectedRows* output); -}; - -template <> -struct MergeAdd { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { - framework::SelectedRows out; - (*this)(context, input, &out); - return out; - } - - void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output) { - framework::SelectedRows& out = *output; - std::vector input_rows(input.rows()); - - std::map> merge_row_map; - for (size_t i = 0; i < input_rows.size(); ++i) { - merge_row_map[input_rows[i]].push_back(i); - } - - std::vector merge_rows(merge_row_map.size()); - size_t idx = 0; - int64_t input_width = input.value().dims()[1]; - out.set_height(input.height()); - - auto* out_data = out.mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), input_width}), - context.GetPlace()); - auto* in_data = input.value().data(); - - auto blas = GetBlas(context); - for (auto& row_pair : merge_row_map) { - auto* out_ptr = out_data + idx * input_width; - auto& rows = row_pair.second; - merge_rows[idx] = row_pair.first; - ++idx; - // rows.size() is always larger than 0 - blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); - - for (size_t i = 1; i < rows.size(); ++i) { - blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); - } - } - - out.set_rows(merge_rows); - } -}; - -template <> -struct MergeAdd { - framework::SelectedRows operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input) { - framework::SelectedRows out; - (*this)(context, input, &out); - return out; - } - - void operator()(const platform::CPUDeviceContext& context, - const framework::SelectedRows& input, - framework::SelectedRows* output) { - framework::SelectedRows& out = *output; - std::vector input_rows(input.rows()); - - std::map> merge_row_map; - for (size_t i = 0; i < input_rows.size(); ++i) { - merge_row_map[input_rows[i]].push_back(i); - } - - std::vector merge_rows(merge_row_map.size()); - size_t idx = 0; - int64_t input_width = input.value().dims()[1]; - out.set_height(input.height()); - - auto* out_data = out.mutable_value()->mutable_data( - framework::make_ddim( - {static_cast(merge_rows.size()), input_width}), - context.GetPlace()); - auto* in_data = input.value().data(); - - auto blas = GetBlas(context); - for (auto& row_pair : merge_row_map) { - auto* out_ptr = out_data + idx * input_width; - auto& rows = row_pair.second; - merge_rows[idx] = row_pair.first; - ++idx; - // rows.size() is always larger than 0 - blas.VCOPY(input_width, in_data + rows[0] * input_width, out_ptr); - - for (size_t i = 1; i < rows.size(); ++i) { - blas.AXPY(input_width, 1., in_data + rows[i] * input_width, out_ptr); - } - } - - out.set_rows(merge_rows); - } + void operator()(const DeviceContext& context, + const std::vector& inputs, + framework::SelectedRows* output); }; template diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 835589356042b44c9fa5988aed726434fd66910a..f15b37a1e3f0ae9c7612c4f74470472393ff4ad6 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -302,6 +302,64 @@ TEST(selected_rows_functor, cpu_merge_add_int) { EXPECT_EQ(out_data[1 * row_numel], 2); EXPECT_EQ(out_data[2 * row_numel], 1); } + +TEST(selected_rows_functor, cpu_merge_add_multi) { + paddle::platform::CPUPlace cpu_place; + paddle::platform::CPUDeviceContext ctx(cpu_place); + paddle::operators::math::SetConstant + set_const; + + int64_t height = 10; + int64_t row_numel = 8; + + std::vector rows1{5, 2, 5, 3, 5}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + cpu_place); + set_const(ctx, in1_value, 1.0); + + std::vector rows2{2, 5, 3, 5, 3}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + cpu_place); + set_const(ctx, in2_value, 1.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + output->set_height(height); + paddle::operators::math::scatter::MergeAdd + merge_add_functor; + + std::vector inputs; + inputs.push_back(selected_rows1.get()); + inputs.push_back(selected_rows2.get()); + merge_add_functor(ctx, inputs, output.get()); + + EXPECT_EQ(output->height(), height); + EXPECT_EQ(output->value().dims(), + paddle::framework::make_ddim({3, row_numel})); + + std::vector ret_rows{2, 3, 5}; + EXPECT_EQ(output->rows(), ret_rows); + + auto* out_data = output->value().data(); + for (size_t i = 0; i < ret_rows.size(); ++i) { + for (size_t j = 0; j < row_numel; ++j) { + EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); + } + } +} + TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); @@ -318,6 +376,7 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::framework::make_ddim( {static_cast(rows1.size()), row_numel}), cpu_place); + functor(ctx, in1_value, 1.0); std::vector rows2{0, 5, 7, 9}; std::unique_ptr selected_rows2{ @@ -327,6 +386,7 @@ TEST(selected_rows_functor, cpu_sum_to) { paddle::framework::make_ddim( {static_cast(rows2.size()), row_numel}), cpu_place); + functor(ctx, in2_value, 2.0); std::unique_ptr output{ new paddle::framework::SelectedRows()}; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu b/paddle/fluid/operators/math/selected_rows_functor_test.cu index 5fc50aba25d8e69480a17f0f80877b0d03e17276..17af3e3999ca688c584f636f4c00386f886f9bbf 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu @@ -241,3 +241,67 @@ TEST(selected_rows_functor, gpu_add_to) { // row9: 2.0 + 3.0 EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0); } + +TEST(selected_rows_functor, gpu_merge_add) { + paddle::platform::CUDAPlace gpu_place(0); + paddle::platform::CPUPlace cpu_place; + paddle::platform::CUDADeviceContext& ctx = + *reinterpret_cast( + paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); + paddle::operators::math::SetConstant + set_const; + + int64_t height = 10; + int64_t row_numel = 8; + + std::vector rows1{5, 2, 5, 3, 5}; + std::unique_ptr selected_rows1{ + new paddle::framework::SelectedRows(rows1, height)}; + auto* in1_value = selected_rows1->mutable_value(); + in1_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows1.size()), row_numel}), + gpu_place); + set_const(ctx, in1_value, 1.0); + + std::vector rows2{2, 5, 3, 5, 3}; + std::unique_ptr selected_rows2{ + new paddle::framework::SelectedRows(rows2, height)}; + auto* in2_value = selected_rows2->mutable_value(); + in2_value->mutable_data( + paddle::framework::make_ddim( + {static_cast(rows2.size()), row_numel}), + gpu_place); + set_const(ctx, in2_value, 1.0); + + std::unique_ptr output{ + new paddle::framework::SelectedRows()}; + output->set_height(height); + paddle::operators::math::scatter::MergeAdd< + paddle::platform::CUDADeviceContext, float> + merge_add_functor; + + std::vector inputs; + inputs.push_back(selected_rows1.get()); + inputs.push_back(selected_rows2.get()); + merge_add_functor(ctx, inputs, output.get()); + + paddle::framework::Tensor output_cpu; + paddle::framework::TensorCopy(output->value(), cpu_place, ctx, &output_cpu); + ctx.Wait(); + + EXPECT_EQ(output->height(), height); + EXPECT_EQ(output->value().dims(), + paddle::framework::make_ddim({3, row_numel})); + + std::vector ret_rows{2, 3, 5}; + EXPECT_EQ(output->rows(), ret_rows); + + auto* out_data = output_cpu.data(); + for (size_t i = 0; i < ret_rows.size(); ++i) { + for (size_t j = 0; j < row_numel; ++j) { + EXPECT_EQ(out_data[i * row_numel + j], ret_rows[i]); + } + } +} diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 7be8539a7b0f1890898fd386a3056601fda8a7c3..6d491dbf1ed162ef07fda4c07e95cc57108486fd 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -31,7 +31,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, @@ -70,7 +70,41 @@ class MaxSeqPoolFunctor { } } }; +// Instantisation of Max Sequence Pooling for test phase eg. no need to fill +// index buffer +template +class MaxSeqPoolFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::LoDTensor& input, framework::Tensor* output, + framework::Tensor* index) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + PADDLE_ENFORCE_GT(in_dims.size(), 1); + PADDLE_ENFORCE_GT(out_dims.size(), 1); + for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); + } + + auto starts = input.lod()[0]; + const T* in_data = input.data(); + T* out_data = output->data(); + int64_t num_seq = out_dims[0]; + int64_t dim = output->numel() / num_seq; + for (int64_t i = 0; i < num_seq; ++i) { + std::memcpy(&out_data[i * dim], &in_data[starts[i] * dim], + dim * sizeof(T)); + for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) { + for (int64_t k = 0; k < dim; ++k) { + if (in_data[j * dim + k] > out_data[i * dim + k]) { + out_data[i * dim + k] = in_data[j * dim + k]; + } + } + } + } + } +}; template class MaxSeqPoolGradFunctor { public: @@ -188,11 +222,16 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const platform::CPUDeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, - framework::Tensor* output, + framework::Tensor* output, bool is_test, framework::Tensor* index = nullptr) { if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; - max_pool(context, input, output, index); + if (is_test) { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, output, index); + } else { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, output, index); + } return; } if (pooltype == "LAST") { @@ -200,6 +239,7 @@ class SequencePoolFunctor { last_pool(context, input, output); return; } + if (pooltype == "FIRST") { math::FirstSeqPoolFunctor first_pool; first_pool(context, input, output); diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index a92aef805a0434f2ebcbc62d4e5eaef0cfb21bfa..0015fafbc892912424dfa6dbd1778438d384ca19 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -133,7 +133,7 @@ class SequencePoolFunctor { public: void operator()(const platform::CUDADeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, - framework::Tensor* output, + framework::Tensor* output, bool is_test, framework::Tensor* index = nullptr) { auto& lod = input.lod()[0]; const size_t item_dim = output->numel() / output->dims()[0]; diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index 8dcbee65d0b63a137e5f422ec8667cc950641b4a..a1046ea2160d0ae9c2251612c97d3f2640b0aad1 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -28,7 +28,7 @@ class SequencePoolFunctor { /* max pool has index output */ void operator()(const DeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, framework::Tensor* output, - framework::Tensor* index = nullptr); + bool is_test = false, framework::Tensor* index = nullptr); }; template diff --git a/paddle/fluid/operators/merge_ids_op.cc b/paddle/fluid/operators/merge_ids_op.cc index c6ec4ab047d5e91625e646fd26108d2e477cdce5..6e0e13698097ade36449f2e8ff6ab981a1b24311 100644 --- a/paddle/fluid/operators/merge_ids_op.cc +++ b/paddle/fluid/operators/merge_ids_op.cc @@ -20,13 +20,16 @@ namespace operators { class MergeIdsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); - AddInput( - "X", - "(LoDTensors) multi input tensor with shape{batch_num, N}, N is the " - "size of embedding table") + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}") + .AsDuplicable(); + AddInput("Rows", "(LoDTensor) the input ids with shape{row_size, 1}, ") + .AsDuplicable(); + AddInput("X", + "(LoDTensors) multi input tensor with shape{Rows, N}, N is the " + "size of embedding table") + .AsDuplicable(); + AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors.") .AsDuplicable(); - AddOutput("Out", "(LoDTensor) The merged outputs of the input tensors."); AddComment(R"DOC( Merge multi LoDTensor's into one according to Ids's shard num. @@ -79,15 +82,19 @@ class MergeIdsOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Ids"), "MergeIdsOp must has input Ids."); - PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has input X."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), "MergeIdsOp must has output Out."); + PADDLE_ENFORCE(ctx->HasInputs("Ids"), + "MergeIdsOp must has multi input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("Rows"), + "MergeIdsOp must has multi input Rows."); + PADDLE_ENFORCE(ctx->HasInputs("X"), "MergeIdsOp must has multi input X."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), + "MergeIdsOp must has multi output Out."); auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - auto ids_dims = ctx->GetInputDim("Ids"); + auto ids_dims = ctx->GetInputsDim("Ids"); if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[0][1], 1); } auto x_var_type = ctx->GetInputsVarType("X"); for (auto &var_type : x_var_type) { diff --git a/paddle/fluid/operators/merge_ids_op.h b/paddle/fluid/operators/merge_ids_op.h index 83712a8519c6817151e1922c606c0fdd4682a2db..fef9e023d02f45e21ec409ad398ba7d9bdd36880 100644 --- a/paddle/fluid/operators/merge_ids_op.h +++ b/paddle/fluid/operators/merge_ids_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" @@ -30,59 +32,70 @@ class MergeIdsOpKernel : public framework::OpKernel { if (!platform::is_cpu_place(place)) { PADDLE_THROW("MergeIds do not support GPU kernel"); } - VLOG(3) << "run in MergeIdsOpKernel"; - const auto *ids_var = ctx.InputVar("Ids"); - PADDLE_ENFORCE(ids_var->IsType(), - "only support to merge Ids of LoDTensor"); + const auto ids = ctx.MultiInput("Ids"); + const auto row_ids = ctx.MultiInput("Rows"); + const auto x_tensors = ctx.MultiInput("X"); + auto outs = ctx.MultiOutput("Out"); - const auto &ids_tensor = ids_var->Get(); - const auto &ids_dims = ids_tensor.dims(); - const int64_t *ids = ids_tensor.data(); + PADDLE_ENFORCE_EQ(row_ids.size(), x_tensors.size(), + "the number of Rows and X should be the same"); + PADDLE_ENFORCE_EQ(ids.size(), outs.size(), + "the number of Ids and Out should be the same"); - auto x_tensors = ctx.MultiInput("X"); + int row_ids_size = 0; + int row_size = 0; + int embedding_size = 0; - auto *out = ctx.Output("Out"); + for (int i = 0; i < x_tensors.size(); ++i) { + const auto *x_tensor = x_tensors[i]; + const auto *row_id = row_ids[i]; - int batch_size = 0; - int embedding_size = 0; - for (auto &input : x_tensors) { - if (framework::product(input->dims()) != 0) { - if (embedding_size == 0) { - embedding_size = input->dims()[1]; - } - PADDLE_ENFORCE_EQ(embedding_size, input->dims()[1], - "embedding size of all input should be the same"); - batch_size += input->dims()[0]; + if (embedding_size == 0) { + embedding_size = x_tensor->dims()[1]; } + PADDLE_ENFORCE_EQ(embedding_size, x_tensor->dims()[1], + "embedding size of all input should be the same"); + row_size += x_tensor->dims()[0]; + row_ids_size += row_id->dims()[0]; } + PADDLE_ENFORCE_EQ( - batch_size, ids_dims[0], - "the batch size of ids and merged embedding value should be the same"); + row_size, row_ids_size, + "the merged X dim[0] and merged Rows dim[0] should be the same"); + + std::unordered_map> + selected_rows_idx_map; + for (int i = 0; i < x_tensors.size(); ++i) { + const auto *row_id = row_ids[i]; + + for (int j = 0; j < row_id->numel(); ++j) { + int64_t key = row_id->data()[j]; + std::tuple val = std::make_tuple(i, j); + selected_rows_idx_map.insert(std::make_pair(key, val)); + } + } + PADDLE_ENFORCE_EQ(row_ids_size, selected_rows_idx_map.size(), + "the rows and tensor map size should be the same"); + + for (int i = 0; i < outs.size(); ++i) { + auto *out_ids = ids[i]; + auto *out = outs[i]; - const size_t shard_num = x_tensors.size(); + out->set_lod(out_ids->lod()); - if (shard_num == 1) { - VLOG(3) << "only one shard, we can copy the data directly"; - TensorCopy(*x_tensors[0], place, out); - } else { - std::vector in_indexs(shard_num, 0); + int nums = static_cast(out_ids->dims()[0]); auto *out_data = out->mutable_data( - framework::make_ddim({batch_size, embedding_size}), place); - // copy data from ins[shard_num] to out. - for (int i = 0; i < ids_dims[0]; ++i) { - int64_t id = ids[i]; - size_t shard_id = static_cast(id) % shard_num; - int index = in_indexs[shard_id]; - memcpy(out_data + embedding_size * i, - x_tensors[shard_id]->data() + index * embedding_size, + framework::make_ddim({nums, embedding_size}), place); + for (int j = 0; j < nums; ++j) { + int id = out_ids->data()[j]; + auto row_tuple = selected_rows_idx_map[id]; + int64_t row_idx = std::get<1>(row_tuple); + const auto *x_tensor = x_tensors[std::get<0>(row_tuple)]; + + memcpy(out_data + embedding_size * j, + x_tensor->data() + row_idx * embedding_size, sizeof(T) * embedding_size); - in_indexs[shard_id] += 1; - } - - for (size_t i = 0; i < shard_num; ++i) { - PADDLE_ENFORCE_EQ(in_indexs[i], x_tensors[i]->dims()[0], - "after merge, all data in x_tensor should be used"); } } } diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index 12b916fcebd425bd4a03d920f947829098a924a1..7f0b51580aa2591ac7338ad7c29ee4756d909925 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -19,54 +19,6 @@ namespace operators { using Tensor = framework::Tensor; -class MomentumOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Param"), - "Input(param) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Grad"), - "Input(grad) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Velocity"), - "Input(velocity) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasInput("LearningRate"), - "Input(LearningRate) of Momentum should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("Param").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); - - PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), - "Output(ParamOut) of Momentum should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), - "Output(VelocityOut) of Momentum should not be null."); - - auto param_dim = ctx->GetInputDim("Param"); - if (ctx->GetInputsVarType("Grad")[0] == - framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Grad"), - "Param and Grad input of MomentumOp should have the same dimension."); - PADDLE_ENFORCE_EQ( - param_dim, ctx->GetInputDim("Velocity"), - "Param and Velocity of MomentumOp should have the same dimension."); - } - PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, - "Learning_rate should be a scalar"); - - ctx->SetOutputDim("ParamOut", param_dim); - ctx->SetOutputDim("VelocityOut", param_dim); - } - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); - return framework::OpKernelType(input_data_type, ctx.GetPlace()); - } -}; - class MomentumOpInferVarType : public framework::VarTypeInference { public: void operator()(const framework::OpDesc& op_desc, diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 6b4d00f56ca06c402c07ecf770a390e88ae3edf1..71f079e4d97f5259359ee6572f584894551452ca 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -28,6 +28,54 @@ using framework::SelectedRows; struct NoNesterov; struct UseNesterov; +class MomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Param"), + "Input(param) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Grad"), + "Input(grad) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Velocity"), + "Input(velocity) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasInput("LearningRate"), + "Input(LearningRate) of Momentum should not be null."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("Param").front() == + framework::proto::VarType::LOD_TENSOR, + "The input var's type should be LoDTensor, but the received is %s", + ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front()); + + PADDLE_ENFORCE(ctx->HasOutput("ParamOut"), + "Output(ParamOut) of Momentum should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"), + "Output(VelocityOut) of Momentum should not be null."); + + auto param_dim = ctx->GetInputDim("Param"); + if (ctx->GetInputsVarType("Grad")[0] == + framework::proto::VarType::LOD_TENSOR) { + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Grad"), + "Param and Grad input of MomentumOp should have the same dimension."); + PADDLE_ENFORCE_EQ( + param_dim, ctx->GetInputDim("Velocity"), + "Param and Velocity of MomentumOp should have the same dimension."); + } + PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1, + "Learning_rate should be a scalar"); + + ctx->SetOutputDim("ParamOut", param_dim); + ctx->SetOutputDim("VelocityOut", param_dim); + } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + template class CPUDenseMomentumFunctor { private: diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc index 15d3f064eb7b025dc9a85b2aabad24186061cbd4..217bb1610fd3f02f0f72d3b7750ebcdfad243f48 100644 --- a/paddle/fluid/operators/sequence_pool_op.cc +++ b/paddle/fluid/operators/sequence_pool_op.cc @@ -47,6 +47,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) This tensor is used for the sequence max-pooling " "to record the max indexes.") .AsIntermediate(); + AddAttr("is_test", "").SetDefault(false); AddAttr( "pooltype", "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h index 2aa20792f24305a106c500a3d7a6e3d363bc31d8..f2e4a55dee49664b2fc09813f6dba5f68aaf11d5 100644 --- a/paddle/fluid/operators/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -32,10 +32,6 @@ class SequencePoolKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); std::string pooltype = context.Attr("pooltype"); - Tensor* index = nullptr; - if (pooltype == "MAX") { - index = context.Output("MaxIndex"); - } auto dims = in->dims(); auto lod = in->lod(); @@ -48,13 +44,22 @@ class SequencePoolKernel : public framework::OpKernel { dims[0] = lod[0].size() - 1; out->Resize({dims}); out->mutable_data(context.GetPlace()); - if (pooltype == "MAX") { + Tensor* index = nullptr; + + const bool is_test = context.Attr("is_test"); + + // Do not create index buffer for inference (is_test) mode + // TODO(jczaja): Skip index buffer creation for other devices eg. GPU + if (pooltype == "MAX" && + (is_test == false || + platform::is_cpu_place(context.GetPlace()) == false)) { + index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); } math::SequencePoolFunctor pool; pool(context.template device_context(), pooltype, *in, out, - index); + is_test, index); } }; diff --git a/paddle/fluid/operators/sequence_reverse_op.cc b/paddle/fluid/operators/sequence_reverse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1428cca1a6bf6150594f9cb72dbf00cd0eff7df5 --- /dev/null +++ b/paddle/fluid/operators/sequence_reverse_op.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_reverse_op.h" + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sequence_reverse, ops::SequenceReverseOp, + ops::SequenceReverseOpMaker, + ops::SequenceReverseGradOpDescMaker); + +REGISTER_OP_CPU_KERNEL( + sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_reverse_op.cu b/paddle/fluid/operators/sequence_reverse_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ce65f4799e8661adca60d212eaa9c3f0f92c4c29 --- /dev/null +++ b/paddle/fluid/operators/sequence_reverse_op.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/sequence_reverse_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + sequence_reverse, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel, + ops::SequenceReverseOpKernel); diff --git a/paddle/fluid/operators/sequence_reverse_op.h b/paddle/fluid/operators/sequence_reverse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..39dad2311b2bcf29f808723caf7bfaef4c88cef2 --- /dev/null +++ b/paddle/fluid/operators/sequence_reverse_op.h @@ -0,0 +1,157 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/algorithm.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +class SequenceReverseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); + + auto x_dim = ctx->GetInputDim("X"); + PADDLE_ENFORCE_GE(x_dim.size(), 2, + "Rank of Input(X) must be not less than 2."); + + ctx->SetOutputDim("Y", x_dim); + ctx->ShareLoD("X", "Y"); + } +}; + +class SequenceReverseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input LoDTensor of sequence_reverse op."); + AddOutput("Y", "The output LoDTensor of sequence_reverse op."); + AddComment(R"DOC( +SequenceReverse Operator. + +Reverse each sequence in input X along dim 0. + +Assuming X is a LoDTensor with dims [5, 4] and lod [[0, 2, 5]], where: + +X.data() = [ + [1, 2, 3, 4], + [5, 6, 7, 8], # the 0-th sequence with length 2 + [9, 10, 11, 12], + [13, 14, 15, 16], + [17, 18, 19, 20] # the 1-st sequence with length 3 +] + +The output Y would be a LoDTensor sharing the same dims and lod with input X, +and: + +Y.data() = [ + [5, 6, 7, 8], + [1, 2, 3, 4], # the reversed 0-th sequence with length 2 + [17, 18, 19, 20], + [13, 14, 15, 16], + [9, 10, 11, 12] # the reversed 1-st sequence with length 3 +] + +This Operator is useful to build a reverse dynamic RNN network. + +This Operator only supports one-level lod currently. + )DOC"); + } +}; + +template +struct SequenceReverseFunctor { + SequenceReverseFunctor(const T *x, T *y, const size_t *lod, size_t lod_count, + size_t row_numel) + : x_(x), y_(y), lod_(lod), lod_count_(lod_count), row_numel_(row_numel) {} + + HOSTDEVICE void operator()(size_t idx_x) const { + auto row_idx_x = idx_x / row_numel_; + auto lod_idx = math::UpperBound(lod_, lod_count_, row_idx_x); + auto row_idx_y = lod_[lod_idx - 1] + (lod_[lod_idx] - 1 - row_idx_x); + auto idx_y = row_idx_y * row_numel_ + idx_x % row_numel_; + y_[idx_y] = x_[idx_x]; + } + + const T *x_; + T *y_; + const size_t *lod_; + size_t lod_count_; + size_t row_numel_; +}; + +template +class SequenceReverseOpKernel : public framework::OpKernel { + using LoDTensor = framework::LoDTensor; + + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto &x = *ctx.Input("X"); + auto *y = ctx.Output("Y"); + + PADDLE_ENFORCE_EQ(x.lod().size(), 1, + "SequenceReverse Op only support one level lod."); + + auto &dev_ctx = ctx.template device_context(); + const size_t *lod; + size_t lod_count = x.lod()[0].size(); + +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(ctx.GetPlace())) { + lod = x.lod()[0].CUDAData(ctx.GetPlace()); + } else { +#endif + lod = x.lod()[0].data(); +#ifdef PADDLE_WITH_CUDA + } +#endif + + size_t limit = static_cast(x.numel()); + size_t row_numel = static_cast(limit / x.dims()[0]); + auto *x_data = x.data(); + auto *y_data = y->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_NE(x_data, y_data, + "SequenceReverse Op does not support in-place operation"); + + SequenceReverseFunctor functor(x_data, y_data, lod, lod_count, + row_numel); + platform::ForRange for_range(dev_ctx, limit); + for_range(functor); + } +}; + +class SequenceReverseGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("sequence_reverse"); + op->SetInput("X", OutputGrad("Y")); + op->SetOutput("Y", InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.cc b/paddle/fluid/operators/softmax_cudnn_op.cu.cc index 2bdb23e999621b10799b5163f326bc4b66a437e6..f6e241af0634650f4a32be6a4547617f8ec3ee60 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/softmax_cudnn_op.cu.cc @@ -76,6 +76,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, ops::SoftmaxCUDNNKernel, + ops::SoftmaxCUDNNKernel, ops::SoftmaxCUDNNKernel); REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel); + ops::SoftmaxGradCUDNNKernel, + ops::SoftmaxGradCUDNNKernel); diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc index c867c46873ae7ddbdbda280351e4ab28235bcc08..243f81e296fb95a2c7e9f717950b8a958ad98852 100644 --- a/paddle/fluid/operators/split_ids_op.cc +++ b/paddle/fluid/operators/split_ids_op.cc @@ -20,20 +20,27 @@ namespace operators { class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); - AddOutput("Out", "(LoDTensor) The outputs of the input Ids.") + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}") + .AsDuplicable(); + + AddOutput("Out", "(LoDTensors) The outputs of the input Ids.") .AsDuplicable(); AddComment(R"DOC( Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number Example: Input: - X = [1,2,3,4,5,6] + X = [[1,2,3,4,5,6],[2,3]] Out(3 output): - out0 = [3, 6] - out1 = [1, 4] - out2 = [2, 5] + if compress is True: + out0 = [3, 3, 6] + out1 = [1, 4] + out2 = [2, 2, 5] + else: + out0 = [3, 6] + out1 = [1, 4] + out2 = [2, 5] )DOC"); } }; @@ -43,16 +50,24 @@ class SplitIdsOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids."); + PADDLE_ENFORCE(ctx->HasInputs("Ids"), "SplitIdsOp must has input Ids."); PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out."); auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - auto ids_dims = ctx->GetInputDim("Ids"); + auto ids_dims = ctx->GetInputsDim("Ids"); if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); + PADDLE_ENFORCE_EQ(ids_dims[0].size(), 2); } } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType( + ctx.MultiInput("Ids").front()->type()), + ctx.GetPlace()); + } }; class SplitIdsOpInferVarType : public framework::VarTypeInference { @@ -66,12 +81,28 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference { } }; +class SplitIdsOpGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto grad = new framework::OpDesc(); + grad->SetType("concat"); + grad->SetInput("X", OutputGrad("Out")); + grad->SetOutput("Out", InputGrad("Ids")); + grad->SetAttr("axis", 0); + return std::unique_ptr(grad); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, - ops::SplitIdsOpInferVarType); + ops::SplitIdsOpGradMaker, ops::SplitIdsOpInferVarType); + REGISTER_OP_CPU_KERNEL( split_ids, ops::SplitIdsOpKernel, ops::SplitIdsOpKernel); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index c4af5a65fc5f81c1af7c1fdcca637ca37c940637..69ac6c5a6b9a8b318520eb9a3ff89a3a6be48339 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once +#include +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -31,19 +33,39 @@ class SplitIdsOpKernel : public framework::OpKernel { PADDLE_THROW("SplitIds do not support GPU kernel"); } - const auto *ids_var = ctx.InputVar("Ids"); + const auto ids_vars = ctx.MultiInputVar("Ids"); + + PADDLE_ENFORCE_GT(ids_vars.size(), 0, "The number of Ids should > 0"); + auto *ids_var = ids_vars[0]; + if (ids_var->IsType()) { - const auto &ids_dims = ctx.Input("Ids")->dims(); - const T *ids = ctx.Input("Ids")->data(); + int batch_size = 0; + const auto ids_tensors = ctx.MultiInput("Ids"); + for (size_t i = 0; i < ids_tensors.size(); ++i) { + batch_size += ids_tensors[i]->dims()[0]; + } + VLOG(4) << "Get Total BatchSize is: " << batch_size; + + std::vector all_ids(batch_size); + int offset = 0; + for (size_t i = 0; i < ids_tensors.size(); ++i) { + const auto *ids = ids_tensors[i]; + std::memcpy(all_ids.data() + offset, ids->data(), + ids->numel() * sizeof(T)); + offset += ids->numel(); + } + + std::set st(all_ids.begin(), all_ids.end()); + all_ids.assign(st.begin(), st.end()); + auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); - std::vector> out_ids; out_ids.resize(outs.size()); // split id by their shard_num. - for (int i = 0; i < ids_dims[0]; ++i) { - T id = ids[i]; + for (int i = 0; i < all_ids.size(); ++i) { + T id = all_ids[i]; size_t shard_id = static_cast(id) % shard_num; out_ids[shard_id].push_back(id); } @@ -64,7 +86,7 @@ class SplitIdsOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(ids_dims[0], static_cast(ids_selected_rows->rows().size()), ""); - const T *ids = ids_selected_rows->value().data(); + const T *ids_data = ids_selected_rows->value().data(); const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); @@ -87,7 +109,7 @@ class SplitIdsOpKernel : public framework::OpKernel { T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { memcpy(output + i * row_width, - ids + id_to_index[out->rows()[i]] * row_width, + ids_data + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc index 76615a9405d7a8e3fa9dba8d01a956209e02ae8f..0e7b1463d1ba81aed53e0e3f3a90d2a1fbf0ffbc 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cc +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -22,9 +22,9 @@ class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input SelectedRows."); AddOutput("Out", "The outputs of the input SelectedRows.").AsDuplicable(); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); AddComment(R"DOC( Split a SelectedRows with a specified rows section. diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h index 0e9ce165b98845f4745ee70b028513ea31cc6657..af64607fafc6544047714e731846a2440be219b8 100644 --- a/paddle/fluid/operators/split_selected_rows_op.h +++ b/paddle/fluid/operators/split_selected_rows_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -static int FindOutIdx(int row, const std::vector& abs_sections) { +static int FindOutIdx(int row, const std::vector& abs_sections) { for (size_t i = 1; i < abs_sections.size(); ++i) { if (row < abs_sections[i]) { return i - 1; @@ -30,9 +30,9 @@ static int FindOutIdx(int row, const std::vector& abs_sections) { return abs_sections.size() - 1; } -static std::vector ToAbsoluteSection( - const std::vector& height_sections) { - std::vector abs_sections; +static std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; abs_sections.resize(height_sections.size()); abs_sections[0] = 0; for (size_t i = 1; i < height_sections.size(); ++i) { @@ -47,7 +47,7 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto outs = ctx.MultiOutput("Out"); - auto height_sections = ctx.Attr>("height_sections"); + auto height_sections = ctx.Attr>("height_sections"); auto abs_sections = ToAbsoluteSection(height_sections); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 34dbac2ab8dcc9bd2b91e2daa2f42806057f5f56..d19ac9839c90a116265b761e3b1b3f855e2d95e8 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -67,6 +67,7 @@ class SumOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); + auto x_vars_name = ctx.Inputs("X"); framework::LibraryType library{framework::LibraryType::kPlain}; framework::DataLayout layout{framework::DataLayout::kAnyLayout}; @@ -81,15 +82,18 @@ class SumOp : public framework::OperatorWithKernel { if (x_vars[0]->IsType()) { int dtype = -1; - for (auto& x_var : x_vars) { - auto& lod_tensor = x_var->Get(); - if (lod_tensor.numel() == 0) { + for (size_t idx = 0; idx < x_vars.size(); ++idx) { + PADDLE_ENFORCE(x_vars[idx] != nullptr, + "Input var[%s] should not be nullptr", x_vars_name[idx]); + // FIXME(zcd): The input x_var may be SelectedRows or LoDTensor. + auto tensor = framework::GetTensorFromVar(*x_vars[idx]); + if (tensor->numel() == 0) { continue; } if (dtype == -1) { - dtype = framework::ToDataType(lod_tensor.type()); + dtype = framework::ToDataType(tensor->type()); } else { - PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(tensor->type())); } } PADDLE_ENFORCE_NE(dtype, -1, diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 11987c61aebaad00f8a71f1b909c83c44ddc8b0e..f6e12dfc76c6ce73f10e707387f6a9cedacde3c8 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -83,79 +83,54 @@ class SumKernel : public framework::OpKernel { } } } else if (out_var->IsType()) { - std::unique_ptr in0; - if (in_place) { - // If is in_place, we store the input[0] to in0 - auto &in_sel0 = in_vars[0]->Get(); - auto &rows = in_sel0.rows(); -#ifdef PADDLE_WITH_CUDA - std::vector rows_in_cpu; - rows_in_cpu.reserve(rows.size()); - for (auto item : rows) { - rows_in_cpu.push_back(item); - } - in0.reset(new framework::SelectedRows(rows_in_cpu, in_sel0.height())); -#else - in0.reset(new framework::SelectedRows(rows, in_sel0.height())); -#endif - in0->mutable_value()->ShareDataWith(in_sel0.value()); + if (in_place && in_vars.size() < 2) { + return; } - auto get_selected_row = [&](size_t i) -> const SelectedRows & { - if (i == 0 && in0) { - return *in0.get(); - } else { - return in_vars[i]->Get(); + std::vector inputs; + SelectedRows temp_in0; + + if (in_place) { + auto &in0 = in_vars[0]->Get(); + temp_in0.set_height(in0.height()); + temp_in0.set_rows(in0.rows()); + framework::TensorCopy(in0.value(), in0.place(), + context.device_context(), + temp_in0.mutable_value()); + inputs.push_back(&temp_in0); + for (size_t i = 1; i < in_vars.size(); ++i) { + auto &in = in_vars[i]->Get(); + if (in.rows().size() > 0) { + inputs.push_back(&in); + } + } + } else { + for (auto &in_var : in_vars) { + auto &in = in_var->Get(); + if (in.rows().size() > 0) { + inputs.push_back(&in_var->Get()); + } } - }; + } auto *out = context.Output("Out"); out->mutable_rows()->clear(); - auto *out_value = out->mutable_value(); - - // Runtime InferShape - size_t first_dim = 0; - for (size_t i = 0; i < in_num; i++) { - auto &sel_row = get_selected_row(i); - first_dim += sel_row.rows().size(); - } - std::vector in_dim; - for (size_t i = 0; i < in_num; i++) { - auto &sel_row = get_selected_row(i); - if (sel_row.rows().size() > 0) { - in_dim = framework::vectorize(sel_row.value().dims()); + bool has_data = false; + for (auto &in : inputs) { + if (in->rows().size() > 0) { + has_data = true; break; } } - if (in_dim.empty()) { - VLOG(3) << "WARNING: all the inputs are empty"; - in_dim = - framework::vectorize(get_selected_row(in_num - 1).value().dims()); + if (has_data) { + math::scatter::MergeAdd merge_add; + merge_add(context.template device_context(), inputs, + out); } else { - in_dim[0] = static_cast(first_dim); - } - - out_value->Resize(framework::make_ddim(in_dim)); - out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to - // merge these vars. - if (first_dim == 0UL) { - return; - } - - math::SelectedRowsAddTo functor; - - int64_t offset = 0; - for (size_t i = 0; i < in_num; i++) { - auto &sel_row = get_selected_row(i); - if (sel_row.rows().size() == 0) { - continue; - } - PADDLE_ENFORCE_EQ(out->height(), sel_row.height()); - functor(context.template device_context(), sel_row, - offset, out); - offset += sel_row.value().numel(); + // no data, just set a empty out tensor. + out->mutable_value()->mutable_data(framework::make_ddim({0}), + context.GetPlace()); } } else if (out_var->IsType()) { auto &out_array = *out_var->GetMutable(); diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 6a9fc6611a8f8eaa6749aefac0673ccabaebbcfe..bbd71db6062107f6ba40343c84d942b54b3958e6 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -210,18 +210,21 @@ REGISTER_OPERATOR(transpose, ops::TransposeOp, ops::TransposeOpMaker, REGISTER_OPERATOR(transpose_grad, ops::TransposeOpGrad); REGISTER_OP_CPU_KERNEL( - transpose, ops::TransposeKernel); + transpose, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OPERATOR(transpose2, ops::Transpose2Op, ops::Transpose2OpMaker, ops::Transpose2GradMaker); REGISTER_OPERATOR(transpose2_grad, ops::Transpose2OpGrad); REGISTER_OP_CPU_KERNEL( - transpose2, - ops::TransposeKernel); + transpose2, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose2_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/transpose_op.cu.cc b/paddle/fluid/operators/transpose_op.cu.cc index c1b5a8b31be243fab3af06a18c8e51986c953700..b4025350fa9f3610bde43eee91cd059f3063813f 100644 --- a/paddle/fluid/operators/transpose_op.cu.cc +++ b/paddle/fluid/operators/transpose_op.cu.cc @@ -16,15 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( - transpose, - ops::TransposeKernel); + transpose, ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); REGISTER_OP_CUDA_KERNEL( transpose2, - ops::TransposeKernel); + ops::TransposeKernel, + ops::TransposeKernel); REGISTER_OP_CUDA_KERNEL( transpose2_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel, + ops::TransposeGradKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index aa907595cb7cf165974caa69fe8eb0370471732d..e3132ae76f624f3338d749e4fcebbd0ecd7ffe79 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -29,7 +29,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { - auto shape = ctx.Attr>("shape"); + auto shape = ctx.Attr>("shape"); auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); @@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto &shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -94,7 +94,7 @@ This operator initializes a tensor with random values sampled from a uniform distribution. The random result is in set [min, max]. )DOC"); - AddAttr>("shape", "The shape of the output tensor"); + AddAttr>("shape", "The shape of the output tensor"); AddAttr("min", "Minimum value of uniform random. [default -1.0].") .SetDefault(-1.0f); AddAttr("max", "Maximun value of uniform random. [default 1.0].") diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index bbb692b0ddfc18e8a62c0d2a6bac88f9932f6704..2bb0ecc139f7096d1b61150e0a2d4fb095338749 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -48,7 +48,7 @@ class GPUUniformRandomKernel : public framework::OpKernel { if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { - auto shape = context.Attr>("shape"); + auto shape = context.Attr>("shape"); tensor = out_var->GetMutable()->mutable_value(); tensor->Resize(framework::make_ddim(shape)); } else { diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index bb8b14bb9fa41942c3aa653ca224c0842fbf9a00..1ad66f0525e4857533d5cf5ef3ee041aec297794 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -341,6 +341,28 @@ class ScopedPoolingDescriptor { DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); }; +class ScopedSpatialTransformerDescriptor { + public: + ScopedSpatialTransformerDescriptor() { + PADDLE_ENFORCE(dynload::cudnnCreateSpatialTransformerDescriptor(&desc_)); + } + ~ScopedSpatialTransformerDescriptor() { + PADDLE_ENFORCE(dynload::cudnnDestroySpatialTransformerDescriptor(desc_)); + } + + template + inline cudnnSpatialTransformerDescriptor_t descriptor(const int nbDims, + const int dimA[]) { + PADDLE_ENFORCE(dynload::cudnnSetSpatialTransformerNdDescriptor( + desc_, CUDNN_SAMPLER_BILINEAR, CudnnDataType::type, nbDims, dimA)); + return desc_; + } + + private: + cudnnSpatialTransformerDescriptor_t desc_; + DISABLE_COPY_AND_ASSIGN(ScopedSpatialTransformerDescriptor); +}; + inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) { bool use_cudnn = ctx.Attr("use_cudnn"); use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace()); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 7d1cf57253819b34fedfb292ad1635650f53f20f..924810bd61841139bc1849a000aaa57b07b71f0c 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -32,23 +32,25 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { "'Place' is not supported, Please re-compile with WITH_GPU " "option"); } - return it->second.get(); + return it->second.get().get(); } -const std::vector -DeviceContextPool::GetAllDeviceContexts() const { - std::vector all_device_ctx; - all_device_ctx.reserve(device_contexts_.size()); - for (auto& dev_ctx : device_contexts_) { - all_device_ctx.emplace_back(dev_ctx.second.get()); - } - return all_device_ctx; +template +inline void EmplaceDeviceContext( + std::map>>* + map_ptr, + platform::Place p) { + using PtrType = std::unique_ptr; + map_ptr->emplace(p, std::async(std::launch::deferred, [=] { + // lazy evaluation. i.e., only create device context at + // first `Get` + return PtrType(new DevCtx(boost::get(p))); + })); } DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); - using PtrType = std::unique_ptr; std::set set; for (auto& p : places) { set.insert(p); @@ -57,16 +59,13 @@ DeviceContextPool::DeviceContextPool( for (auto& p : set) { if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN - device_contexts_.emplace( - p, PtrType(new MKLDNNDeviceContext(boost::get(p)))); + EmplaceDeviceContext(&device_contexts_, p); #else - device_contexts_.emplace( - p, PtrType(new CPUDeviceContext(boost::get(p)))); + EmplaceDeviceContext(&device_contexts_, p); #endif } else if (platform::is_gpu_place(p)) { #ifdef PADDLE_WITH_CUDA - device_contexts_.emplace( - p, PtrType(new CUDADeviceContext(boost::get(p)))); + EmplaceDeviceContext(&device_contexts_, p); #else PADDLE_THROW( "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " @@ -74,9 +73,8 @@ DeviceContextPool::DeviceContextPool( #endif } else if (platform::is_cuda_pinned_place(p)) { #ifdef PADDLE_WITH_CUDA - device_contexts_.emplace( - p, - PtrType(new CUDAPinnedDeviceContext(boost::get(p)))); + EmplaceDeviceContext( + &device_contexts_, p); #else PADDLE_THROW( "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " @@ -296,38 +294,73 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } #ifdef PADDLE_WITH_MKLDNN MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) - : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() { - p_blobs_.reset(new std::unordered_map>()); + : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobmap_() { + p_blobmap_.reset(new BlobMap()); + p_mutex_.reset(new std::mutex()); +} + +namespace { +// Current thread's id. +thread_local int cur_thread_id = 0; } +void set_cur_thread_id(int tid) { cur_thread_id = tid; } +int get_cur_thread_id(void) { return cur_thread_id; } + void MKLDNNDeviceContext::SetBlob(const std::string& name, std::shared_ptr data) const { - std::unordered_map>* p; - p = p_blobs_.get(); + BlobMap* pMap = p_blobmap_.get(); + std::shared_ptr pBlob = nullptr; + + int tid = platform::get_cur_thread_id(); + + std::lock_guard lock(*p_mutex_.get()); - auto it = p->find(name); + // Find KeyBlob for current thread + auto map_it = pMap->find(tid); - if (it == p->end()) { - (*p)[name] = data; // create new blob + if (map_it == pMap->end()) { + // 1st time to set blob in current thread + pBlob = std::shared_ptr(new KeyBlob()); + (*pMap)[tid] = pBlob; } else { - it->second = data; // set data to existing blob + pBlob = map_it->second; } + // Find Key in found (or newly created) KeyBlob + auto key_it = pBlob->find(name); + + if (key_it == pBlob->end()) { + (*pBlob)[name] = data; // create new blob + } else { + key_it->second = data; // set data to existing blob + } + + // lock will be automatically released when out of scope return; } std::shared_ptr MKLDNNDeviceContext::GetBlob( const std::string& name) const { - std::unordered_map>* p; - p = p_blobs_.get(); + BlobMap* pMap = p_blobmap_.get(); + std::shared_ptr pBlob = nullptr; - auto it = p->find(name); + int tid = platform::get_cur_thread_id(); - if (it != p->end()) { - return it->second; - } + std::lock_guard lock(*p_mutex_.get()); + + // Find KeyBlob for current thread firstly + auto map_it = pMap->find(tid); + if (map_it == pMap->end()) return nullptr; + pBlob = map_it->second; + + // Find Blob via name + auto key_it = pBlob->find(name); + + if (key_it == pBlob->end()) return nullptr; - return nullptr; + // lock will be automatically released when out of scope + return key_it->second; } #endif diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 999bbe00f1659881050cb0dc89570b74b201aca7..0240b9380f3213b2a030061007e04abe1d73c6e3 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include // NOLINT #include #include // NOLINT #include @@ -176,6 +177,12 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_MKLDNN +using KeyBlob = std::unordered_map>; +using BlobMap = std::unordered_map>; + +void set_cur_thread_id(int); +int get_cur_thread_id(void); + class MKLDNNDeviceContext : public CPUDeviceContext { public: explicit MKLDNNDeviceContext(CPUPlace place); @@ -191,8 +198,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { private: mkldnn::engine engine_; - std::shared_ptr>> - p_blobs_; + std::shared_ptr p_blobmap_; + std::shared_ptr p_mutex_; }; #endif @@ -217,9 +224,6 @@ class DeviceContextPool { /*! \brief Return handle of single device context. */ platform::DeviceContext* Get(const platform::Place& place); - /*! \brief Return all the device contexts. */ - const std::vector GetAllDeviceContexts() const; - template const typename DefaultDeviceContextType::TYPE* GetByPlace( const Place& place) { @@ -231,7 +235,8 @@ class DeviceContextPool { private: static DeviceContextPool* pool; - std::map> device_contexts_; + std::map>> + device_contexts_; DISABLE_COPY_AND_ASSIGN(DeviceContextPool); }; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index e6353f67ef118072a2d8e49111e8ecc486589998..d3d754b6f58d25a9dfacafaf55d50b353a71ee6d 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -65,44 +65,51 @@ extern void EnforceCUDNNLoaded(const char* fn_name); * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor); \ - __macro(cudnnSetTensor4dDescriptorEx); \ - __macro(cudnnSetTensorNdDescriptor); \ - __macro(cudnnGetTensorNdDescriptor); \ - __macro(cudnnGetConvolutionNdForwardOutputDim); \ - __macro(cudnnGetConvolutionForwardAlgorithm); \ - __macro(cudnnCreateTensorDescriptor); \ - __macro(cudnnDestroyTensorDescriptor); \ - __macro(cudnnCreateFilterDescriptor); \ - __macro(cudnnSetFilter4dDescriptor); \ - __macro(cudnnSetFilterNdDescriptor); \ - __macro(cudnnGetFilterNdDescriptor); \ - __macro(cudnnSetPooling2dDescriptor); \ - __macro(cudnnSetPoolingNdDescriptor); \ - __macro(cudnnGetPoolingNdDescriptor); \ - __macro(cudnnDestroyFilterDescriptor); \ - __macro(cudnnCreateConvolutionDescriptor); \ - __macro(cudnnCreatePoolingDescriptor); \ - __macro(cudnnDestroyPoolingDescriptor); \ - __macro(cudnnSetConvolution2dDescriptor); \ - __macro(cudnnDestroyConvolutionDescriptor); \ - __macro(cudnnSetConvolutionNdDescriptor); \ - __macro(cudnnGetConvolutionNdDescriptor); \ - __macro(cudnnDeriveBNTensorDescriptor); \ - __macro(cudnnCreate); \ - __macro(cudnnDestroy); \ - __macro(cudnnSetStream); \ - __macro(cudnnActivationForward); \ - __macro(cudnnConvolutionForward); \ - __macro(cudnnConvolutionBackwardBias); \ - __macro(cudnnGetConvolutionForwardWorkspaceSize); \ - __macro(cudnnTransformTensor); \ - __macro(cudnnPoolingForward); \ - __macro(cudnnPoolingBackward); \ - __macro(cudnnSoftmaxBackward); \ - __macro(cudnnSoftmaxForward); \ - __macro(cudnnGetVersion); \ +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnSetTensorNdDescriptor); \ + __macro(cudnnGetTensorNdDescriptor); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetFilterNdDescriptor); \ + __macro(cudnnGetFilterNdDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnSetPoolingNdDescriptor); \ + __macro(cudnnGetPoolingNdDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnSetConvolutionNdDescriptor); \ + __macro(cudnnGetConvolutionNdDescriptor); \ + __macro(cudnnDeriveBNTensorDescriptor); \ + __macro(cudnnCreateSpatialTransformerDescriptor); \ + __macro(cudnnSetSpatialTransformerNdDescriptor); \ + __macro(cudnnDestroySpatialTransformerDescriptor); \ + __macro(cudnnSpatialTfGridGeneratorForward); \ + __macro(cudnnSpatialTfGridGeneratorBackward); \ + __macro(cudnnSpatialTfSamplerForward); \ + __macro(cudnnSpatialTfSamplerBackward); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ __macro(cudnnGetErrorString); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 3b22718a8c6f994dbc2dc3e7aaa19a7163f716ba..d3b0d4a22954c1d67dc9551b997dcffa0625cbeb 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -57,6 +57,18 @@ struct variant_caster> { auto caster = make_caster(); if (!load_success_ && caster.load(src, convert)) { load_success_ = true; + + if (std::is_same>::value) { + auto caster_ints = make_caster>(); + if (caster_ints.load(src, convert)) { + VLOG(4) << "This value are floats and int64_ts satisfy " + "simultaneously, will set it's type to " + "std::vector"; + value = cast_op>(caster_ints); + return true; + } + } + value = cast_op(caster); return true; } @@ -259,6 +271,8 @@ void BindOpDesc(pybind11::module *m) { pybind11::enum_(*m, "AttrType", "") .value("INT", pd::proto::AttrType::INT) .value("INTS", pd::proto::AttrType::INTS) + .value("LONG", pd::proto::AttrType::LONG) + .value("LONGS", pd::proto::AttrType::LONGS) .value("FLOAT", pd::proto::AttrType::FLOAT) .value("FLOATS", pd::proto::AttrType::FLOATS) .value("STRING", pd::proto::AttrType::STRING) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 339a7c98c6a2bba2cd46790cecc169ef447c63ce..5f15a29f4c3e9b1412912fe4723642d1ede60346 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -645,9 +645,13 @@ All parameter, weight, gradient are variables in Paddle. py::class_> pass(m, "Pass"); pass.def(py::init()) - .def("set_str", [](ir::Pass &self, const std::string &name, - const std::string &attr) { - self.Set(name, new std::string(attr)); + .def( + "set_str", + [](ir::Pass &self, const std::string &name, const std::string &attr) { + self.Set(name, new std::string(attr)); + }) + .def("set_int", [](ir::Pass &self, const std::string &name, int val) { + self.Set(name, new int(val)); }); py::class_> pb( diff --git a/paddle/fluid/train/demo/CMakeLists.txt b/paddle/fluid/train/demo/CMakeLists.txt index 78d6e5ff554b9cd9facae85be166a697e0b75337..eabb51d370aff709e289e1fc727aa2dbb551d82e 100644 --- a/paddle/fluid/train/demo/CMakeLists.txt +++ b/paddle/fluid/train/demo/CMakeLists.txt @@ -15,6 +15,7 @@ include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") include_directories("${PADDLE_LIB}/third_party/install/snappy/include") include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") include_directories("${PADDLE_LIB}/third_party/install/zlib/include") @@ -27,6 +28,7 @@ link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") link_directories("${PADDLE_LIB}/third_party/install/glog/lib") link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") add_executable(demo_trainer demo_trainer.cc) @@ -62,5 +64,5 @@ target_link_libraries(demo_trainer ${ARCHIVE_END} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf snappystream snappy z + glog gflags protobuf snappystream snappy z xxhash ${EXTERNAL_LIB}) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 85493c10549c290330ed09b9f28accb7a980de6a..d7676f89ab5e781f910f98d03e72d5f7c1023a9a 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -95,9 +95,9 @@ function cmake_gen() { exit 1 fi fi - else + else if [ "$1" != "" ]; then - echo "using python abi: $1" + echo "using python abi: $1" if [ "$1" == "cp27-cp27m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} export PATH=/opt/python/cp27-cp27m/bin/:${PATH} @@ -119,7 +119,7 @@ function cmake_gen() { fi fi fi - + if [ "$SYSTEM" == "Darwin" ]; then WITH_DISTRIBUTE=${WITH_DISTRIBUTE:-ON} WITH_AVX=${WITH_AVX:-ON} @@ -127,7 +127,7 @@ function cmake_gen() { else INFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR:-/root/.cache/inference_demo} fi - + cat <